In [35]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

import sys 
sys.path.append('../')
import src.model.feature_cleaning as feature_cleaning

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


#sklearn models
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
#sklearn other
import graphviz 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, f1_score, log_loss, accuracy_score
from mlxtend.plotting import plot_confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

In [3]:
df, fieldofdegree_df, SOCP_labels, schl_labels, major_majors, NAICSP_labels_df, MAJ_NAICSP_labels_df = feature_cleaning.load_dfs()

In [4]:
youngemp_df = feature_cleaning.clean_that_target(df, SOCP_labels)
youngemp_df = feature_cleaning.single_occ_target(youngemp_df)
edu_df = feature_cleaning.create_edu_df(youngemp_df, fieldofdegree_df, schl_labels, major_majors)

  SOCPdf = df.dropna(axis='index', subset=['SOCP'])[df.SOCP != '999920']


Number of employed people: 218454
Percent employed people: 0.5785711448056677
Number of young employed people: 77406
Percent young employed people(out of all PUMS): 0.20500827650135733
Number of emp cats: 23
Number of degree fields present (max 173): 173


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  edu_df['SCHL_labels'] = edu_df.SCHL.map(schl_labels)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  edu_df['SCHL_ord'] = edu_df.SCHL.astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats

before dummies:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 77406 entries, 0 to 77405
Data columns (total 14 columns):
SERIALNO            77406 non-null int64
SOCP                77406 non-null object
MAJ_SOCP            77406 non-null object
MAJ_SOCP_labels     77406 non-null object
MAJ_SOCP_15         77406 non-null int64
FOD1P               77406 non-null object
FOD2P               77406 non-null object
FOD1P_labels        77406 non-null object
FOD2P_labels        77406 non-null object
SCHL                77406 non-null object
SCHL_labels         77406 non-null object
SCHL_ord            77406 non-null int64
FOD1P_MAJ           77406 non-null int64
FOD1P_MAJ_labels    77406 non-null object
dtypes: int64(4), object(10)
memory usage: 53.1 MB
None


In [None]:
edu_df.sample(3)

In [5]:
# split the data
X = edu_df.drop(columns=[ 'SERIALNO', 'FOD1P', 'FOD2P','SOCP','MAJ_SOCP','MAJ_SOCP_labels', 
                'MAJ_SOCP_15','FOD1P_labels','FOD2P_labels','SCHL',
                'SCHL_labels','FOD1P_MAJ_labels', 'FOD1P_MAJ'])
y = edu_df.loc[:,'MAJ_SOCP_15']

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3, 
                                                    random_state=42)

In [None]:
X_train.shape

In [None]:
X_train.sample(5)

In [None]:
y_train.shape

In [41]:

# model pipelines
#-----------------------------------
pipe_lr_pca = Pipeline([('scl', StandardScaler()),
			('pca', PCA(n_components=20)),
			('clf', LogisticRegression(random_state=42))])

pipe_sgd_pca = Pipeline([('scl', StandardScaler()),
			('pca', PCA(n_components=2)),
			('clf', SGDClassifier(random_state=42))])

pipe_rf_pca = Pipeline([('scl', StandardScaler()),
			('pca', PCA(n_components=2)),
			('clf', RandomForestClassifier(random_state=42))])

pipe_svm_pca = Pipeline([('scl', StandardScaler()),
			('pca', PCA(n_components=2)),
			('clf', SVC(random_state=42))])

#-----------------------------------

In [42]:

# grid search params
param_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
param_range_fl = [1.0, 0.5, 0.1]
max_depth = [10,100,1000,10000]
alpha_range = [.1, .001, .00001, .000001]
gamma_range = [.1, 1, 10]

#-------------linear
grid_params_lr = [{'clf__penalty': ['l1'],
		'clf__C': param_range_fl,
		'clf__solver': ['liblinear', ],  #,'saga'
        #'clf__multi_class': ['ovr', 'multinomial', 'auto'],
        'clf__class_weight': [None, 'balanced']}] 

grid_params_lr_l2 = [{'clf__penalty': ['l2'],
		'clf__C': param_range_fl,
		'clf__solver': ['newton-cg', 'lbfgs', 'liblinear'],  #, 'sag'
        #'clf__multi_class': ['ovr', 'multinomial', 'auto'],
        'clf__class_weight': [None, 'balanced']}]

grid_params_sgd = [{'clf__loss': ['hinge', 'log', 'perceptron'],
		'clf__alpha': alpha_range,
		'clf__penalty': ['l1', 'l2', 'elasticnet'],
        'clf__class_weight': [None, 'balanced']}] 

#-------------trees
grid_params_dt = [{'clf__criterion': ['gini', 'entropy'],
		'clf__min_samples_leaf': param_range,
		'clf__max_depth': max_depth,
		'clf__min_samples_split': param_range[1:],
        'clf__class_weight': [None, 'balanced']}]

grid_params_rf = [{'clf__criterion': ['gini', 'entropy'],
		'clf__min_samples_leaf': param_range,
		'clf__max_depth': max_depth,
		'clf__min_samples_split': param_range[1:],
        'clf__class_weight': [None, 'balanced', 'balanced_subsample']}]

grid_params_gb = [{'clf__loss': ['deviance', 'exponential'],
		'clf__learning_rate': alpha_range,
		'clf__n_estimators': max_depth,
		'clf__subsample': param_range_fl}]

#-------------SVM
grid_params_svm = [{'clf__kernel': ['linear', 'rbf', 'poly'],
        'clf__degree': param_range[1:],
        'clf__gamma': gamma_range,
        'clf__C': gamma_range,
        'clf__class_weight': [None, 'balanced']}]

#-------------KNN
grid_params_knn = [{'clf__n_neighbors': param_range}]

In [47]:
# Construct grid searches
jobs = -1
verbose = 1

gs_lr_pca = GridSearchCV(estimator=pipe_lr_pca,
			param_grid=grid_params_lr,
			scoring='f1_micro',
			cv=10,
            verbose=verbose)

gs_lr_pca_l2 = GridSearchCV(estimator=pipe_lr_pca,
			param_grid=grid_params_lr_l2,
			scoring='f1_micro',
			cv=10,
            verbose=verbose)

gs_sgd_pca = GridSearchCV(estimator=pipe_sgd_pca,
			param_grid=grid_params_sgd,
			scoring='f1_micro',
			cv=10,
            verbose=verbose)

gs_rf_pca = GridSearchCV(estimator=pipe_rf_pca,
			param_grid=grid_params_rf,
			scoring='f1_micro',
			cv=10, 
			n_jobs=jobs,
            verbose=verbose)

gs_svm_pca = GridSearchCV(estimator=pipe_svm_pca,
			param_grid=grid_params_svm,
			scoring='f1_micro',
			cv=10,
			n_jobs=jobs,
            verbose=verbose)


In [None]:
# List of pipelines for ease of iteration
grids = [gs_rf_pca]

# Dictionary of pipelines and classifier types for ease of reference
grid_dict = {0: 'rf pca'}

# Fit the grid search objects
print('Performing model optimizations...')
best_f1_micro = 0.0
best_clf = 0
best_gs = ''
for idx, gs in enumerate(grids):
	print('\nEstimator: %s' % grid_dict[idx])
	# Fit grid search
	gs.fit(X_train, y_train)
    
	# Best params
	print('Best params: %s' % gs.best_params_)
    
	# Best training data f1
	print('Best training f1: %.3f' % gs.best_score_)
    
	# Predict on test data with best params
	y_pred = gs.predict(X_test)
    
    #accuracy
    print('test accuracy:' accuracy_score(y_train, y_pred))
    
	# Test data accuracy of model with best params
	print('Test set f1 score for best params: %.3f ' % f1_score(y_test, y_pred))
    
	# Track best (highest test f1) model
	if f1_score(y_test, y_pred) > best_f1_micro:
		best_f1_micro = f1_score(y_test, y_pred)
		best_gs = gs
		best_clf = idx
print('\nClassifier with best test set f1: %s' % grid_dict[best_clf])




# Save best grid search pipeline to file
# dump_file = 'best_model_no_feat_sel_extr_occ_15.pkl'
# joblib.dump(best_gs, dump_file, compress=1)
# print('\nSaved %s grid search pipeline to file: %s' % (grid_dict[best_clf], dump_file))

Performing model optimizations...

Estimator: rf pca
Fitting 10 folds for each of 2160 candidates, totalling 21600 fits
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=2 
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=2 
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=2 
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=2 
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=2 
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=2 
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=2 
[CV] clf__class_weight=None, clf__criter

[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   11.8s


[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=2, score=0.9642000369071785, total=  11.5s
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=3 
[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=2, score=0.9651227163683337, total=  11.9s
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=3 
[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=2, score=0.9643845727994095, total=  12.0s
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=3 
[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=2, score=0.9640155010149474, total=  12

[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   14.2s


[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=4 
[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=2, score=0.9640022152482924, total=  12.4s
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=4 
[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=3, score=0.9634618933382543, total=  12.5s
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=4 
[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=3, score=0.966599003506182, total=   8.0s
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=4 
[CV]  clf__class_weight=None, clf__cri

[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   26.1s


[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=3, score=0.966402067565073, total=  10.6s
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=4 
[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=4, score=0.9640155010149474, total=  10.5s
[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=3, score=0.9641868192726601, total=  10.6s
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=5 
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=5 
[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=4, score=0.9675216829673371, total=  10.

[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   35.7s


[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=5 
[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=4, score=0.9641934293097084, total=   8.8s
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=5 
[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=4, score=0.9636330071995569, total=  10.2s
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=6 
[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=4, score=0.9636330071995569, total=  10.5s
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=6 
[CV]  clf__class_weight=None, clf__cr

[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   47.2s


[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=6 
[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=5, score=0.9640022152482924, total=   8.5s
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=7 
[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=5, score=0.9645560273213956, total=   9.5s
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=7 
[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=6, score=0.9636464292304854, total=   9.9s
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=7 
[CV]  clf__class_weight=None, clf__cr

[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:   50.3s


[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=6, score=0.9640022152482924, total=   6.9s
[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=6, score=0.9640022152482924, total=   7.1s
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=8 
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=8 
[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=7, score=0.9636464292304854, total=   7.7s
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=8 
[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=7, score=0.9645691086916405, total=   8

[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed:  1.2min


[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=8, score=0.9660453958294888, total=   7.9s
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=9 
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=9 
[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=8, score=0.9658608599372578, total=   8.9s
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=9 
[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=8, score=0.9590330319247093, total=   9.8s
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=9 
[CV]  clf__class_weight=None, clf__cr

[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:  1.3min


[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=10 
[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=9, score=0.9638309651227164, total=   9.4s
[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=9, score=0.9640155010149474, total=   9.0s
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=10 
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=10 
[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=9, score=0.9641934293097084, total=   8.9s
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=10 
[CV]  clf__class_weight=None, clf

[Parallel(n_jobs=-1)]: Done  89 tasks      | elapsed:  1.6min


[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=1, clf__min_samples_split=10, score=0.9641868192726601, total=   8.7s
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=2, clf__min_samples_split=3 
[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=2, clf__min_samples_split=2, score=0.9640155010149474, total=  10.0s
[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=2, clf__min_samples_split=2, score=0.9642000369071785, total=  10.1s
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=2, clf__min_samples_split=3 
[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=2, clf__min_samples_split=2, score=0.9642000369071785, total=  10.0s
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=2, clf__min_samples_spli

[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:  1.8min


[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=2, clf__min_samples_split=3, score=0.9636464292304854, total=  10.1s
[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=2, clf__min_samples_split=3, score=0.9638309651227164, total=  10.2s
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=2, clf__min_samples_split=4 
[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=2, clf__min_samples_split=3, score=0.9643779992617202, total=  10.1s
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=2, clf__min_samples_split=4 
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=2, clf__min_samples_split=4 
[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=2, clf__min_samples_split=3, score=0.9640088593576965, total=  10

[Parallel(n_jobs=-1)]: Done 121 tasks      | elapsed:  2.1min


[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=2, clf__min_samples_split=5, score=0.9664144676139509, total=   9.1s
[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=2, clf__min_samples_split=5, score=0.9642000369071785, total=   9.2s
[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=2, clf__min_samples_split=5, score=0.9645691086916405, total=   8.7s
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=2, clf__min_samples_split=6 
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=2, clf__min_samples_split=6 
[CV] clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=2, clf__min_samples_split=6 
[CV]  clf__class_weight=None, clf__criterion=gini, clf__max_depth=10, clf__min_samples_leaf=2, clf__min_samples_split=5, score=0.9642000369071785, total=   9

In [32]:
y_pred = gs.predict(X_train)

In [36]:
accuracy_score(y_train, y_pred)

0.9640484275800975

In [16]:
gs.scoring()

TypeError: 'list' object is not callable

In [None]:
PCA






------------------








In [None]:
ssh -i ~/.ssh/first_key.pem ubuntu@ec2-54-91-234-129.compute-1.amazonaws.com
[ec2-user ~]$ git clone https://github.com/DMSaunders/capstone
#[ec2-user ~]$ wget https://s3.amazonaws.com/galv-dsi-2018-ds/psam_p06.csv