GOAL: Elite Classifier basing on User Basic Information 

Models to Compare On: 

    -Logistic Regression

    -Random Forest

    -Gradient Boosting

    -AdaBoost Classifier
    
    - Support vector machines.



In [64]:
%pylab inline
import pandas as pd
import numpy as np
import seaborn as sns
import re
import string
from collections import Counter
from collections import defaultdict


# Evaluation - Optimization Tools
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, classification_report
from sklearn.preprocessing import normalize, scale, StandardScaler, Normalizer
from sklearn.grid_search import GridSearchCV

# Classification Algorithms
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier


Populating the interactive namespace from numpy and matplotlib


In [2]:
user = pd.read_pickle('data/user')

In [3]:
user.columns

Index(['yelping_since', 'compliments_plain', 'review_count',
       'compliments_cute', 'compliments_writer', 'fans', 'compliments_note',
       'compliments_hot', 'compliments_cool', 'compliments_profile',
       'average_stars', 'compliments_more', 'elite', 'user_id', 'votes_cool',
       'compliments_list', 'votes_funny', 'compliments_photos',
       'compliments_funny', 'votes_useful', 'yelping_period', 'elite_status',
       'n_friends', 'tip_count'],
      dtype='object')

In [60]:
len(user)

70995

In [4]:
# aggregate all compliment features to one feature
comp_col = [i for i in list(user.columns) if 'compliments' in i]
user['compliments'] =user[comp_col].sum(axis = 1)

In [5]:
features = [ 'review_count','fans', 'average_stars','yelping_period','n_friends','tip_count',
            'votes_cool','votes_funny', 'votes_useful', 'compliments']
X = user[features]
y = user.elite_status
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [6]:
scale = StandardScaler()
X_train = scale.fit_transform(X_train)
X_test = scale.fit_transform(X_test)

# Choose Best Models on CV

In [51]:
def cross_val(models,X_train,y_train):
    for model in models:
        model.fit(X_train,y_train)
        
        print(str(model).split('(')[0],'CV:')
        print('-- Accuracy:', round((cross_val_score(model, X_train, y_train, cv=5,scoring='accuracy').mean()), 3),
              '| Recall:', round(cross_val_score(model, X_train, y_train, cv=5, scoring='recall').mean(), 3),
              '| Precision:', round(cross_val_score(model, X_train, y_train, cv=5, scoring='precision').mean(), 3),
              '| f1:', round(cross_val_score(model, X_train, y_train, cv=5, scoring='f1').mean(), 3))
            

In [52]:
clf = [LogisticRegression(),RandomForestClassifier(),AdaBoostClassifier(),
       KNeighborsClassifier(),DecisionTreeClassifier(),SGDClassifier()]

In [53]:
cross_val(clf,X_train,y_train)

LogisticRegression CV:
-- Accuracy: 0.772 | Recall: 0.636 | Precision: 0.844 | f1: 0.726
RandomForestClassifier CV:
-- Accuracy: 0.815 | Recall: 0.763 | Precision: 0.829 | f1: 0.795
AdaBoostClassifier CV:
-- Accuracy: 0.822 | Recall: 0.808 | Precision: 0.813 | f1: 0.811
KNeighborsClassifier CV:
-- Accuracy: 0.775 | Recall: 0.742 | Precision: 0.774 | f1: 0.758
DecisionTreeClassifier CV:
-- Accuracy: 0.751 | Recall: 0.741 | Precision: 0.737 | f1: 0.738


LogisticRegression CV:
-- Accuracy: 0.772 | Recall: 0.636 | Precision: 0.844 | f1: 0.726
RandomForestClassifier CV:
-- Accuracy: 0.815 | Recall: 0.763 | Precision: 0.829 | f1: 0.795
AdaBoostClassifier CV:
-- Accuracy: 0.822 | Recall: 0.808 | Precision: 0.813 | f1: 0.811
KNeighborsClassifier CV:
-- Accuracy: 0.775 | Recall: 0.742 | Precision: 0.774 | f1: 0.758
DecisionTreeClassifier CV:
-- Accuracy: 0.751 | Recall: 0.741 | Precision: 0.737 | f1: 0.738

# We see the AdaBoostClassifier and RandomForestClassifier have the highest score for accuracy, recall and precision and f1 out of all the estimators 

In [46]:
new_clf = [RandomForestClassifier(),AdaBoostClassifier()]

# Optimize Chosen Models

In [70]:
def gs_rf(X_train,y_train):
    scores = ['accuracy', 'recall','precision', 'f1']
    output = defaultdict(list)
    
    param =  [{'max_features': ['auto', 'log2', None],'max_depth':list(range(6,14,3)), 
                    'min_samples_leaf': [2,5,8] }]
    print ('Random Forest')
    for score in scores:
        print (score, '\n')
        grid_search = GridSearchCV(RandomForestClassifier(), param_grid = param, verbose = 3, cv = 5,n_jobs= -1, scoring = score)
        grid_search.fit(X_train, y_train)
        output[score] = grid_search.best_score_, grid_search.best_params_
    return output 
                   

In [71]:
gs_rf(X_train,y_train)

Random Forest
accuracy 

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[CV] max_features=auto, max_depth=6, min_samples_leaf=2 ..............
[CV] max_features=auto, max_depth=6, min_samples_leaf=2 ..............
[CV] max_features=auto, max_depth=6, min_samples_leaf=2 ..............
[CV] max_features=auto, max_depth=6, min_samples_leaf=2 ..............
[CV]  max_features=auto, max_depth=6, min_samples_leaf=2, score=0.820511 -   1.2s
[CV] max_features=auto, max_depth=6, min_samples_leaf=2 ..............
[CV]  max_features=auto, max_depth=6, min_samples_leaf=2, score=0.820759 -   1.2s
[CV]  max_features=auto, max_depth=6, min_samples_leaf=2, score=0.818750 -   1.2s
[CV] max_features=auto, max_depth=6, min_samples_leaf=5 ..............
[CV]  max_features=auto, max_depth=6, min_samples_leaf=2, score=0.820231 -   1.2s
[CV] max_features=auto, max_depth=6, min_samples_leaf=5 ..............
[CV] max_features=auto, max_depth=6, min_samples_leaf=5 ..............
[CV]  max_feature

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    6.2s


[CV]  max_features=log2, max_depth=6, min_samples_leaf=5, score=0.817662 -   0.8s
[CV] max_features=log2, max_depth=6, min_samples_leaf=8 ..............
[CV]  max_features=log2, max_depth=6, min_samples_leaf=8, score=0.822255 -   0.7s
[CV]  max_features=log2, max_depth=6, min_samples_leaf=8, score=0.822975 -   0.7s
[CV] max_features=log2, max_depth=6, min_samples_leaf=8 ..............
[CV]  max_features=log2, max_depth=6, min_samples_leaf=8, score=0.816637 -   0.7s
[CV] max_features=None, max_depth=6, min_samples_leaf=2 ..............
[CV] max_features=None, max_depth=6, min_samples_leaf=2 ..............
[CV]  max_features=log2, max_depth=6, min_samples_leaf=8, score=0.819702 -   0.7s
[CV]  max_features=log2, max_depth=6, min_samples_leaf=8, score=0.817133 -   0.7s
[CV] max_features=None, max_depth=6, min_samples_leaf=2 ..............
[CV] max_features=None, max_depth=6, min_samples_leaf=2 ..............
[CV]  max_features=None, max_depth=6, min_samples_leaf=2, score=0.820070 -   1.7s


[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   36.4s


[CV]  max_features=None, max_depth=8, min_samples_leaf=2, score=0.829049 -   2.5s
[CV] max_features=None, max_depth=8, min_samples_leaf=2 ..............
[CV]  max_features=None, max_depth=8, min_samples_leaf=2, score=0.822799 -   2.8s
[CV] max_features=None, max_depth=8, min_samples_leaf=5 ..............
[CV]  max_features=None, max_depth=8, min_samples_leaf=2, score=0.827097 -   2.5s
[CV] max_features=None, max_depth=8, min_samples_leaf=5 ..............
[CV]  max_features=None, max_depth=8, min_samples_leaf=2, score=0.823048 -   2.5s
[CV] max_features=None, max_depth=8, min_samples_leaf=5 ..............
[CV]  max_features=None, max_depth=8, min_samples_leaf=2, score=0.822680 -   2.5s
[CV] max_features=None, max_depth=8, min_samples_leaf=5 ..............
[CV]  max_features=None, max_depth=8, min_samples_leaf=5, score=0.827905 -   2.5s
[CV] max_features=None, max_depth=8, min_samples_leaf=5 ..............
[CV]  max_features=None, max_depth=8, min_samples_leaf=5, score=0.824208 -   2.6s


[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  1.8min


[CV]  max_features=auto, max_depth=12, min_samples_leaf=8, score=0.831866 -   1.3s
[CV] max_features=auto, max_depth=12, min_samples_leaf=8 .............
[CV]  max_features=auto, max_depth=12, min_samples_leaf=8, score=0.824208 -   1.2s
[CV] max_features=log2, max_depth=12, min_samples_leaf=2 .............
[CV]  max_features=auto, max_depth=12, min_samples_leaf=8, score=0.827802 -   1.2s
[CV] max_features=log2, max_depth=12, min_samples_leaf=2 .............
[CV]  max_features=auto, max_depth=12, min_samples_leaf=8, score=0.827450 -   1.2s
[CV] max_features=log2, max_depth=12, min_samples_leaf=2 .............
[CV]  max_features=auto, max_depth=12, min_samples_leaf=8, score=0.824353 -   1.2s
[CV] max_features=log2, max_depth=12, min_samples_leaf=2 .............
[CV]  max_features=log2, max_depth=12, min_samples_leaf=2, score=0.822711 -   1.1s
[CV] max_features=log2, max_depth=12, min_samples_leaf=2 .............
[CV]  max_features=log2, max_depth=12, min_samples_leaf=2, score=0.825528 - 

[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:  2.8min finished


recall 

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[CV] max_features=auto, max_depth=6, min_samples_leaf=2 ..............
[CV] max_features=auto, max_depth=6, min_samples_leaf=2 ..............
[CV] max_features=auto, max_depth=6, min_samples_leaf=2 ..............
[CV] max_features=auto, max_depth=6, min_samples_leaf=2 ..............
[CV]  max_features=auto, max_depth=6, min_samples_leaf=2, score=0.793937 -   0.7s
[CV] max_features=auto, max_depth=6, min_samples_leaf=2 ..............
[CV]  max_features=auto, max_depth=6, min_samples_leaf=2, score=0.789621 -   0.7s
[CV]  max_features=auto, max_depth=6, min_samples_leaf=2, score=0.794681 -   0.8s
[CV] max_features=auto, max_depth=6, min_samples_leaf=5 ..............
[CV] max_features=auto, max_depth=6, min_samples_leaf=5 ..............
[CV]  max_features=auto, max_depth=6, min_samples_leaf=2, score=0.791295 -   0.7s
[CV] max_features=auto, max_depth=6, min_samples_leaf=5 ..............
[CV]  max_features=auto, max_dept

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    5.5s


[CV]  max_features=log2, max_depth=6, min_samples_leaf=8, score=0.791333 -   0.8s
[CV]  max_features=log2, max_depth=6, min_samples_leaf=5, score=0.795201 -   0.8s
[CV] max_features=log2, max_depth=6, min_samples_leaf=8 ..............
[CV] max_features=log2, max_depth=6, min_samples_leaf=8 ..............
[CV]  max_features=log2, max_depth=6, min_samples_leaf=8, score=0.789846 -   0.8s
[CV] max_features=None, max_depth=6, min_samples_leaf=2 ..............
[CV]  max_features=log2, max_depth=6, min_samples_leaf=8, score=0.797061 -   0.7s
[CV] max_features=None, max_depth=6, min_samples_leaf=2 ..............
[CV]  max_features=log2, max_depth=6, min_samples_leaf=8, score=0.796875 -   0.7s
[CV] max_features=None, max_depth=6, min_samples_leaf=2 ..............
[CV]  max_features=log2, max_depth=6, min_samples_leaf=8, score=0.787016 -   0.7s
[CV] max_features=None, max_depth=6, min_samples_leaf=2 ..............
[CV]  max_features=None, max_depth=6, min_samples_leaf=2, score=0.801748 -   1.9s


[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   44.0s


[CV]  max_features=None, max_depth=8, min_samples_leaf=2, score=0.815139 -   3.9s
[CV] max_features=None, max_depth=8, min_samples_leaf=2 ..............
[CV]  max_features=None, max_depth=8, min_samples_leaf=2, score=0.802492 -   4.0s
[CV] max_features=None, max_depth=8, min_samples_leaf=5 ..............
[CV]  max_features=None, max_depth=8, min_samples_leaf=2, score=0.809524 -   3.8s
[CV] max_features=None, max_depth=8, min_samples_leaf=5 ..............
[CV]  max_features=None, max_depth=8, min_samples_leaf=2, score=0.809524 -   4.0s
[CV] max_features=None, max_depth=8, min_samples_leaf=5 ..............
[CV]  max_features=None, max_depth=8, min_samples_leaf=2, score=0.799851 -   3.7s
[CV]  max_features=None, max_depth=8, min_samples_leaf=5, score=0.809745 -   3.6s
[CV] max_features=None, max_depth=8, min_samples_leaf=5 ..............
[CV] max_features=None, max_depth=8, min_samples_leaf=5 ..............
[CV]  max_features=None, max_depth=8, min_samples_leaf=5, score=0.809896 -   3.0s


[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  2.2min


[CV]  max_features=auto, max_depth=12, min_samples_leaf=8, score=0.812163 -   2.0s
[CV] max_features=auto, max_depth=12, min_samples_leaf=8 .............
[CV]  max_features=auto, max_depth=12, min_samples_leaf=8, score=0.798587 -   1.8s
[CV] max_features=log2, max_depth=12, min_samples_leaf=2 .............
[CV]  max_features=auto, max_depth=12, min_samples_leaf=8, score=0.811198 -   1.6s
[CV] max_features=log2, max_depth=12, min_samples_leaf=2 .............
[CV]  max_features=auto, max_depth=12, min_samples_leaf=8, score=0.805990 -   1.5s
[CV] max_features=log2, max_depth=12, min_samples_leaf=2 .............
[CV]  max_features=auto, max_depth=12, min_samples_leaf=8, score=0.807292 -   1.4s
[CV] max_features=log2, max_depth=12, min_samples_leaf=2 .............
[CV]  max_features=log2, max_depth=12, min_samples_leaf=2, score=0.810117 -   1.4s
[CV] max_features=log2, max_depth=12, min_samples_leaf=2 .............
[CV]  max_features=log2, max_depth=12, min_samples_leaf=2, score=0.802678 - 

[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:  3.4min finished


precision 

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[CV] max_features=auto, max_depth=6, min_samples_leaf=2 ..............
[CV] max_features=auto, max_depth=6, min_samples_leaf=2 ..............
[CV] max_features=auto, max_depth=6, min_samples_leaf=2 ..............
[CV] max_features=auto, max_depth=6, min_samples_leaf=2 ..............
[CV]  max_features=auto, max_depth=6, min_samples_leaf=2, score=0.821311 -   0.8s
[CV]  max_features=auto, max_depth=6, min_samples_leaf=2, score=0.815749 -   0.7s
[CV] max_features=auto, max_depth=6, min_samples_leaf=2 ..............
[CV] max_features=auto, max_depth=6, min_samples_leaf=5 ..............
[CV]  max_features=auto, max_depth=6, min_samples_leaf=2, score=0.817869 -   0.8s
[CV]  max_features=auto, max_depth=6, min_samples_leaf=2, score=0.821894 -   0.8s
[CV] max_features=auto, max_depth=6, min_samples_leaf=5 ..............
[CV] max_features=auto, max_depth=6, min_samples_leaf=5 ..............
[CV]  max_features=auto, max_d

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    5.4s


[CV]  max_features=log2, max_depth=6, min_samples_leaf=5, score=0.820523 -   0.9s
[CV] max_features=log2, max_depth=6, min_samples_leaf=8 ..............
[CV]  max_features=log2, max_depth=6, min_samples_leaf=8, score=0.817957 -   0.9s
[CV] max_features=log2, max_depth=6, min_samples_leaf=8 ..............
[CV]  max_features=log2, max_depth=6, min_samples_leaf=8, score=0.821693 -   0.9s
[CV] max_features=None, max_depth=6, min_samples_leaf=2 ..............
[CV]  max_features=log2, max_depth=6, min_samples_leaf=8, score=0.823041 -   0.9s
[CV] max_features=None, max_depth=6, min_samples_leaf=2 ..............
[CV]  max_features=log2, max_depth=6, min_samples_leaf=8, score=0.824008 -   0.9s
[CV] max_features=None, max_depth=6, min_samples_leaf=2 ..............
[CV]  max_features=log2, max_depth=6, min_samples_leaf=8, score=0.818673 -   0.8s
[CV] max_features=None, max_depth=6, min_samples_leaf=2 ..............
[CV]  max_features=None, max_depth=6, min_samples_leaf=2, score=0.815977 -   2.0s


[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   41.9s


[CV]  max_features=None, max_depth=8, min_samples_leaf=2, score=0.824532 -   2.6s
[CV] max_features=None, max_depth=8, min_samples_leaf=2 ..............
[CV]  max_features=None, max_depth=8, min_samples_leaf=2, score=0.817253 -   2.5s
[CV] max_features=None, max_depth=8, min_samples_leaf=5 ..............
[CV]  max_features=None, max_depth=8, min_samples_leaf=2, score=0.821874 -   2.5s
[CV] max_features=None, max_depth=8, min_samples_leaf=5 ..............
[CV]  max_features=None, max_depth=8, min_samples_leaf=2, score=0.820839 -   2.3s
[CV] max_features=None, max_depth=8, min_samples_leaf=5 ..............
[CV]  max_features=None, max_depth=8, min_samples_leaf=2, score=0.816207 -   2.1s
[CV] max_features=None, max_depth=8, min_samples_leaf=5 ..............
[CV]  max_features=None, max_depth=8, min_samples_leaf=5, score=0.820585 -   2.2s
[CV] max_features=None, max_depth=8, min_samples_leaf=5 ..............
[CV]  max_features=None, max_depth=8, min_samples_leaf=5, score=0.817733 -   2.2s


[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  2.0min


[CV]  max_features=auto, max_depth=12, min_samples_leaf=8, score=0.825963 -   1.2s
[CV] max_features=auto, max_depth=12, min_samples_leaf=8 .............
[CV]  max_features=auto, max_depth=12, min_samples_leaf=8, score=0.817489 -   1.2s
[CV] max_features=log2, max_depth=12, min_samples_leaf=2 .............
[CV]  max_features=auto, max_depth=12, min_samples_leaf=8, score=0.823730 -   1.2s
[CV]  max_features=auto, max_depth=12, min_samples_leaf=8, score=0.827973 -   1.2s
[CV] max_features=log2, max_depth=12, min_samples_leaf=2 .............
[CV] max_features=log2, max_depth=12, min_samples_leaf=2 .............
[CV]  max_features=auto, max_depth=12, min_samples_leaf=8, score=0.823869 -   1.4s
[CV] max_features=log2, max_depth=12, min_samples_leaf=2 .............
[CV]  max_features=log2, max_depth=12, min_samples_leaf=2, score=0.822117 -   1.5s
[CV] max_features=log2, max_depth=12, min_samples_leaf=2 .............
[CV]  max_features=log2, max_depth=12, min_samples_leaf=2, score=0.823116 - 

[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:  3.4min finished


f1 

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[CV] max_features=auto, max_depth=6, min_samples_leaf=2 ..............
[CV] max_features=auto, max_depth=6, min_samples_leaf=2 ..............
[CV] max_features=auto, max_depth=6, min_samples_leaf=2 ..............
[CV] max_features=auto, max_depth=6, min_samples_leaf=2 ..............
[CV]  max_features=auto, max_depth=6, min_samples_leaf=2, score=0.812441 -   1.0s
[CV]  max_features=auto, max_depth=6, min_samples_leaf=2, score=0.808160 -   1.0s
[CV]  max_features=auto, max_depth=6, min_samples_leaf=2, score=0.810639 -   1.0s
[CV] max_features=auto, max_depth=6, min_samples_leaf=2 ..............
[CV] max_features=auto, max_depth=6, min_samples_leaf=5 ..............
[CV] max_features=auto, max_depth=6, min_samples_leaf=5 ..............
[CV]  max_features=auto, max_depth=6, min_samples_leaf=2, score=0.810770 -   1.0s
[CV] max_features=auto, max_depth=6, min_samples_leaf=5 ..............
[CV]  max_features=auto, max_depth=6,

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    6.0s


[CV]  max_features=log2, max_depth=6, min_samples_leaf=8, score=0.808671 -   0.9s
[CV]  max_features=log2, max_depth=6, min_samples_leaf=5, score=0.804924 -   1.0s
[CV] max_features=log2, max_depth=6, min_samples_leaf=8 ..............
[CV] max_features=log2, max_depth=6, min_samples_leaf=8 ..............
[CV]  max_features=log2, max_depth=6, min_samples_leaf=8, score=0.805114 -   0.9s
[CV] max_features=None, max_depth=6, min_samples_leaf=2 ..............
[CV]  max_features=log2, max_depth=6, min_samples_leaf=8, score=0.807521 -   1.0s
[CV] max_features=None, max_depth=6, min_samples_leaf=2 ..............
[CV]  max_features=log2, max_depth=6, min_samples_leaf=8, score=0.807364 -   0.9s
[CV] max_features=None, max_depth=6, min_samples_leaf=2 ..............
[CV]  max_features=log2, max_depth=6, min_samples_leaf=8, score=0.805048 -   0.8s
[CV] max_features=None, max_depth=6, min_samples_leaf=2 ..............
[CV]  max_features=None, max_depth=6, min_samples_leaf=2, score=0.804036 -   2.0s


[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   40.6s


[CV]  max_features=None, max_depth=8, min_samples_leaf=2, score=0.817364 -   3.2s
[CV] max_features=None, max_depth=8, min_samples_leaf=2 ..............
[CV]  max_features=None, max_depth=8, min_samples_leaf=2, score=0.812064 -   3.1s
[CV] max_features=None, max_depth=8, min_samples_leaf=5 ..............
[CV]  max_features=None, max_depth=8, min_samples_leaf=2, score=0.819051 -   3.2s
[CV] max_features=None, max_depth=8, min_samples_leaf=5 ..............
[CV]  max_features=None, max_depth=8, min_samples_leaf=2, score=0.812588 -   3.3s
[CV] max_features=None, max_depth=8, min_samples_leaf=5 ..............
[CV]  max_features=None, max_depth=8, min_samples_leaf=2, score=0.813098 -   3.2s
[CV] max_features=None, max_depth=8, min_samples_leaf=5 ..............
[CV]  max_features=None, max_depth=8, min_samples_leaf=5, score=0.816614 -   3.4s
[CV] max_features=None, max_depth=8, min_samples_leaf=5 ..............
[CV]  max_features=None, max_depth=8, min_samples_leaf=5, score=0.810304 -   3.2s


[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  2.0min


[CV]  max_features=auto, max_depth=12, min_samples_leaf=8, score=0.815191 -   1.7s
[CV] max_features=auto, max_depth=12, min_samples_leaf=8 .............
[CV]  max_features=auto, max_depth=12, min_samples_leaf=8, score=0.807689 -   1.7s
[CV] max_features=log2, max_depth=12, min_samples_leaf=2 .............
[CV]  max_features=auto, max_depth=12, min_samples_leaf=8, score=0.817247 -   1.8s
[CV] max_features=log2, max_depth=12, min_samples_leaf=2 .............
[CV]  max_features=auto, max_depth=12, min_samples_leaf=8, score=0.815287 -   2.0s
[CV] max_features=log2, max_depth=12, min_samples_leaf=2 .............
[CV]  max_features=auto, max_depth=12, min_samples_leaf=8, score=0.812087 -   2.2s
[CV] max_features=log2, max_depth=12, min_samples_leaf=2 .............
[CV]  max_features=log2, max_depth=12, min_samples_leaf=2, score=0.812477 -   2.3s
[CV] max_features=log2, max_depth=12, min_samples_leaf=2 .............
[CV]  max_features=log2, max_depth=12, min_samples_leaf=2, score=0.806009 - 

[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:  3.3min finished


defaultdict(list,
            {'accuracy': (0.8272413550250017,
              {'max_depth': 11,
               'max_features': 'log2',
               'min_samples_leaf': 5}),
             'f1': (0.815686795194465,
              {'max_depth': 11, 'max_features': None, 'min_samples_leaf': 5}),
             'precision': (0.8255398737454445,
              {'max_depth': 14,
               'max_features': 'log2',
               'min_samples_leaf': 8}),
             'recall': (0.8106168614543637,
              {'max_depth': 9, 'max_features': None, 'min_samples_leaf': 5})})

In [72]:
defaultdict(list,
            {'accuracy': (0.8272413550250017,
              {'max_depth': 11,
               'max_features': 'log2',
               'min_samples_leaf': 5}),
             'f1': (0.815686795194465,
              {'max_depth': 11, 'max_features': None, 'min_samples_leaf': 5}),
             'precision': (0.8255398737454445,
              {'max_depth': 14,
               'max_features': 'log2',
               'min_samples_leaf': 8}),
             'recall': (0.8106168614543637,
              {'max_depth': 9, 'max_features': None, 'min_samples_leaf': 5})})

defaultdict(list,
            {'accuracy': (0.8272413550250017,
              {'max_depth': 11,
               'max_features': 'log2',
               'min_samples_leaf': 5}),
             'f1': (0.815686795194465,
              {'max_depth': 11, 'max_features': None, 'min_samples_leaf': 5}),
             'precision': (0.8255398737454445,
              {'max_depth': 14,
               'max_features': 'log2',
               'min_samples_leaf': 8}),
             'recall': (0.8106168614543637,
              {'max_depth': 9, 'max_features': None, 'min_samples_leaf': 5})})

In [None]:
boost_importances = opt_boost.feature_importances_
boost_indices = np.argsort(boost_importances)[::-1]
for f,index in enumerate(boost_indices):
    print("%2d) %-*s %f" % (f + 1, 30,feat_labels[index], boost_importances[index]))

In [92]:
def gs_abc(X_train,y_train):
    scores = ['accuracy', 'recall','precision', 'f1']
    output = defaultdict(list)
    param = [{'learning_rate': [0.5,0.8,1],'n_estimators': [100,500,800]}]
    
    print ('AdaBoosting Classifer')
    for score in scores:
        print (score, '\n')
        grid_search = GridSearchCV(AdaBoostClassifier(), param_grid = param, verbose = 1, cv = 5,n_jobs= -1, scoring = score)
        grid_search.fit(X_train, y_train)
        output[score] = grid_search.best_score_, grid_search.best_params_
    return output 
    
                      

In [93]:
outcome = gs_abc(X_train,y_train)

AdaBoosting Classifer
accuracy 

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  6.3min finished


recall 

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  6.0min finished


precision 

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  6.8min finished


f1 

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  7.0min finished


In [79]:
outcome

defaultdict(list,
            {'accuracy': (0.8234558771744489,
              {'learning_rate': 1, 'n_estimators': 70}),
             'f1': (0.8129551167989895,
              {'learning_rate': 1, 'n_estimators': 70}),
             'precision': (0.8152954960974463,
              {'learning_rate': 1, 'n_estimators': 70}),
             'recall': (0.8106541245341439,
              {'learning_rate': 1, 'n_estimators': 70})})

In [94]:
outcome

defaultdict(list,
            {'accuracy': (0.8261497288541446,
              {'learning_rate': 0.8, 'n_estimators': 800}),
             'f1': (0.8156364736219641,
              {'learning_rate': 0.8, 'n_estimators': 800}),
             'precision': (0.8187658548023466,
              {'learning_rate': 0.8, 'n_estimators': 800}),
             'recall': (0.8129603337782988,
              {'learning_rate': 0.8, 'n_estimators': 500})})

In [None]:
boost_importances = opt_boost.feature_importances_
boost_indices = np.argsort(boost_importances)[::-1]
for f,index in enumerate(boost_indices):
    print("%2d) %-*s %f" % (f + 1, 30,feat_labels[index], boost_importances[index]))

In [None]:
pd.DataFrame({'feature':feature_cols, 'importance':treeclf.feature_importances_})



In [80]:
def fit_model(model,X_train,y_train):
    "Fit Model --- Return Evaluation Metrics"
    
    model = model.fit(X_train, y_train)
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    print(str(model).split('(')[0],':')
    print('Train | Accuracy:', round(accuracy_score(y_train, train_pred),3),
          '| Recall',round(recall_score(y_train, train_pred),3),
          '| Precision',round(precision_score(y_train, train_pred),3),
          '| f1',round(f1_score(y_train, train_pred),3))

    print('Test | Accuracy:', round(accuracy_score(y_test, test_pred),3),
          '| Recall',round(recall_score(y_test, test_pred),3),
          '| Precision',round(precision_score(y_test, test_pred),3),
          '| f1',round(f1_score(y_test, test_pred),3))


    

In [90]:
model = RandomForestClassifier(max_depth= 11,
               max_features= 'log2',
               min_samples_leaf= 5)

In [91]:
fit_model(model,X_train,y_train)

RandomForestClassifier :
Train | Accuracy: 0.86 | Recall 0.841 | Precision 0.86 | f1 0.85
Test | Accuracy: 0.777 | Recall 0.605 | Precision 0.886 | f1 0.719


In [95]:
boost_importances = model.feature_importances_
boost_indices = np.argsort(boost_importances)[::-1]
for f,index in enumerate(boost_indices):
    print("%2d) %-*s %f" % (f + 1, 30,features[index], boost_importances[index]))

 1) votes_cool                     0.275151
 2) compliments                    0.227877
 3) votes_useful                   0.179726
 4) votes_funny                    0.099195
 5) yelping_period                 0.050572
 6) n_friends                      0.048973
 7) review_count                   0.044602
 8) average_stars                  0.036300
 9) fans                           0.030872
10) tip_count                      0.006732


In [98]:
model = AdaBoostClassifier(learning_rate=0.8,n_estimators=2000)

In [99]:
fit_model(model,X_train,y_train)

AdaBoostClassifier :
Train | Accuracy: 0.83 | Recall 0.815 | Precision 0.823 | f1 0.819
Test | Accuracy: 0.758 | Recall 0.545 | Precision 0.906 | f1 0.681


In [100]:
boost_importances = model.feature_importances_
boost_indices = np.argsort(boost_importances)[::-1]
for f,index in enumerate(boost_indices):
    print("%2d) %-*s %f" % (f + 1, 30,features[index], boost_importances[index]))

 1) votes_useful                   0.191333
 2) compliments                    0.138000
 3) votes_funny                    0.136667
 4) votes_cool                     0.134667
 5) review_count                   0.112000
 6) average_stars                  0.072000
 7) n_friends                      0.066667
 8) fans                           0.054000
 9) yelping_period                 0.051667
10) tip_count                      0.043000


In [None]:
pd.DataFrame({'feature':feature_cols, 'importance':treeclf.feature_importances_})



In [None]:
def evaluation(model, X_train, y_train):
    model.fit(X_train,y_train)
    print('CV    | MSE:', round((cross_val_score(model, X_train, y_train, cv=5,scoring='accuracy').mean()), 3),\
          '| R2:', round(cross_val_score(model, X_train, y_train, cv=5, scoring='r2').mean(), 3))
    print('Train | MSE:', round(mean_squared_error(y_train, model.predict(X_train)),3),
          '| R2:', round(r2_score(y_train,model.predict(X_train)),3))
    print('Test  | MSE:', round(mean_squared_error(y_test, model.predict(X_test)),3),
          '| R2:', round(r2_score(y_test, model.predict(X_test)),3))

In [9]:
def test_final_model_nopl(xtrain, xtest, ytrain, ytest, model):
    """Get accuracy, precision, recall, and f1 scores for model. 
    Include scaling outside of pipeline.
    Return model."""
    standard_scaler = StandardScaler()
    standard_scaler.fit(xtrain)
    X_train_scaled = standard_scaler.transform(xtrain)
    m = model.fit(X_train_scaled, ytrain)
    X_test_scaled = standard_scaler.transform(xtest)
    test_pred = m.predict(X_test_scaled)
    print ('accuracy:', accuracy_score(ytest, test_pred))
    print ('precision:', precision_score(ytest, test_pred))
    print ('recall:', recall_score(ytest, test_pred))
    print ('f1:', f1_score(ytest, test_pred))
    return m

In [13]:
test_final_model_nopl(X_train, X_test, y_train, y_test, RandomForestClassifier())


accuracy: 0.813155856046
precision: 0.831027185414
recall: 0.759672619048
f1: 0.79374951411


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [31]:
model = AdaBoostClassifier()

In [36]:
print(model.split('\('))

AttributeError: 'AdaBoostClassifier' object has no attribute 'split'

In [20]:
model = 

# # check the accuracy on the training set
model.score(X_test, y_test)

0.81864920064793301

In [28]:
# instantiate a logistic regression model, and fit with X and y
model = LogisticRegression()
model = model.fit(X_train, y_train)

# # check the accuracy on the training set
model.score(X_test, y_test)

0.76787097682935423

In [29]:
model.score(X_train, y_train)

0.77357560391576874

In [31]:
scores = cross_val_score(LogisticRegression(), X, y, scoring='accuracy', cv=10)

In [32]:
scores.mean()

0.77037808950768338

In [50]:
treeclf = DecisionTreeClassifier(max_depth=3, random_state=1)
treeclf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=1, splitter='best')

In [54]:
treeclf.score(X_train,y_train)

0.80722233960138035

In [None]:
pd.DataFrame({'feature':features, 'importance':treeclf.feature_importances_})

In [None]:
def evaluation(model, X_train, y_train):
    model.fit(X_train,y_train)
    print('CV    | MSE:', round((cross_val_score(model, X_train, y_train, cv=5,scoring='accuracy').mean()), 3),\
          '| R2:', round(cross_val_score(model, X_train, y_train, cv=5, scoring='r2').mean(), 3))
    print('Train | MSE:', round(mean_squared_error(y_train, model.predict(X_train)),3),
          '| R2:', round(r2_score(y_train,model.predict(X_train)),3))
    print('Test  | MSE:', round(mean_squared_error(y_test, model.predict(X_test)),3),
          '| R2:', round(r2_score(y_test, model.predict(X_test)),3))

In [36]:
def eval_models(features, df, estimators):
    """Get cross-validated accuracy, precision, recall, f1, and auc scores for list of models. 
    Args: 
    features (list) -- list of features
    df (df) -- df 
    estimators (list) -- list of model types
    Returns: 
    (model, accuracy, precision, recall, f1, auc)
    """
    acc_scores = []
    p_scores = []
    r_scores = []
    f1_scores = []
    auc_scores = []
    for est in estimators: 
        xtrain, xtest, ytrain, ytest = train_test_split(
            df[features], df.elite_status, test_size = .25)
        m = est.fit(xtrain, ytrain)
        acc_scores.append(cross_val_score(est, xtrain, ytrain, cv = 10).mean())
        p_scores.append(cross_val_score(est, xtrain, ytrain, cv = 10, scoring = 'precision').mean())
#         r_scores.append(cross_val_score(est, xtrain, ytrain, cv = 10, scoring = 'recall').mean())
#         f1_scores.append(cross_val_score(est, xtrain, ytrain, cv = 10, scoring = 'f1').mean())
#         auc_scores.append(cross_val_score(est, xtrain, ytrain, cv = 10, scoring = 'roc_auc').mean())
        print ('done with %s' %str(est).split('(')[0])
    return zip([str(x).split('(')[0] for x in estimators], acc_scores, p_scores, r_scores, f1_scores, auc_scores)

In [None]:
eval_models(features, user, estimators)

done with LogisticRegression
