# Senate Modeling

### Load Libraries, Set Options and Styling

In [62]:
# import libraries
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

import seaborn as sns

plt.style.use('fivethirtyeight')

%matplotlib inline

pd.set_option('display.max_columns', 500)

### Import Data

In [37]:
df = pd.read_csv('./data/sens_final.csv')
df.drop('Unnamed: 0', inplace=True, axis=1)
df.drop('percent_female', inplace=True, axis=1)

In [38]:
df.head()

Unnamed: 0,race_id,office,loc_date_id,state,abbrev,year,GOP_win,winner,rival,pred_GOP,pred_DEM,unopposed,inc_GOP_running,inc_DEM_running,prez_GOP,approval_effects_GOP,approval_effects_DEM,nat_UR_effects_GOP,nat_UR_effects_DEM,state_UR_effects_GOP,state_UR_effects_DEM,total_male,total_female,age_under18,age_18to29,age_30to59,age_60over,race_white_nh,race_black,race_natamer,race_asian,race_hispanic,percent_male,percent_under18,percent_age_18to29,percent_age_30to59,percent_age_60over,percent_race_white,percent_race_black,percent_race_natamer,percent_race_asian,percent_race_hispanic
0,MN_sen_1996,MN_sen,MN_1996,Minnesota,MN,1996,0,Paul Wellstone Democratic,Rudy Boschwitz,0,0,0,0,1,0.0,0.0,21.0,0.0,5.2,0.0,3.8,2319452,2382275,1238850,803983,1908266,750628,4184744,141016,52944,165795,107583,0.493319,0.263488,0.170997,0.405865,0.159649,0.890044,0.029992,0.011261,0.035263,0.022882
1,AL_sen_1996,AL_sen,AL_1996,Alabama,AL,1996,1,Jeff Sessions,Roger Bedford,0,1,0,0,0,0.0,0.0,21.0,0.0,5.2,0.0,5.1,2062367,2222128,1097568,749644,1694035,743248,3032654,1101840,20060,48083,55350,0.481356,0.256172,0.174967,0.395387,0.173474,0.707821,0.257169,0.004682,0.011223,0.012919
2,AR_sen_1996,AR_sen,AR_1996,Arkansas,AR,1996,1,Tim Hutchinson,Winston Bryant,0,1,0,0,0,0.0,0.0,21.0,0.0,5.2,0.0,5.3,1236046,1308284,656674,429484,980270,477902,2000986,400935,15794,45098,60070,0.485804,0.258093,0.1688,0.385276,0.18783,0.786449,0.15758,0.006208,0.017725,0.023609
3,DE_sen_1996,DE_sen,DE_1996,Delaware,DE,1996,0,Joe Biden,Raymond J Clatworthy,0,1,0,0,1,0.0,0.0,21.0,0.0,5.2,0.0,5.3,357512,379115,182089,130136,299793,124610,536207,135384,2446,26076,28694,0.485337,0.247193,0.176665,0.406981,0.169163,0.727922,0.183789,0.003321,0.035399,0.038953
4,GA_sen_1996,GA_sen,GA_1996,Georgia,GA,1996,0,Max Cleland,Guy Millner,0,1,0,0,0,0.0,0.0,21.0,0.0,5.2,0.0,4.8,3674069,3829089,1992462,1419104,3091725,999868,4731723,2108351,18381,271485,304705,0.48967,0.26555,0.189134,0.412056,0.13326,0.630631,0.280995,0.00245,0.036183,0.04061


### Lists to Use

In [39]:
state_codes = df.abbrev.unique()
state_names = df.state.unique()
years = list(range(1976, 2018))

id_cols = ['race_id', 'office', 'loc_date_id', 'state', 'abbrev', 'year', 'winner', 'rival']

percent_columns = ['percent_male', 'percent_under18', 'percent_age_18to29', 
                   'percent_age_30to59', 'percent_age_60over', 'percent_race_white', 'percent_race_black',
                   'percent_race_natamer', 'percent_race_asian', 'percent_race_hispanic']
amnt_columns = ['total_male', 'total_female', 'age_under18', 'age_18to29', 'age_30to59',
       'age_60over', 'race_white_nh', 'race_black', 'race_natamer', 'race_asian', 'race_hispanic']

### Set Up Variables

In [40]:
#basic setup
y = df['GOP_win']
X = df.drop('GOP_win', axis=1)

# drop non-analysis vars
X.drop(id_cols, inplace=True, axis=1)

## Model with Amounts

### Set Up

In [41]:
#drop percent vars from X
X.drop(percent_columns, axis=1, inplace=True
X.head()

Unnamed: 0,pred_GOP,pred_DEM,unopposed,inc_GOP_running,inc_DEM_running,prez_GOP,approval_effects_GOP,approval_effects_DEM,nat_UR_effects_GOP,nat_UR_effects_DEM,state_UR_effects_GOP,state_UR_effects_DEM,total_male,total_female,age_under18,age_18to29,age_30to59,age_60over,race_white_nh,race_black,race_natamer,race_asian,race_hispanic
0,0,0,0,0,1,0.0,0.0,21.0,0.0,5.2,0.0,3.8,2319452,2382275,1238850,803983,1908266,750628,4184744,141016,52944,165795,107583
1,0,1,0,0,0,0.0,0.0,21.0,0.0,5.2,0.0,5.1,2062367,2222128,1097568,749644,1694035,743248,3032654,1101840,20060,48083,55350
2,0,1,0,0,0,0.0,0.0,21.0,0.0,5.2,0.0,5.3,1236046,1308284,656674,429484,980270,477902,2000986,400935,15794,45098,60070
3,0,1,0,0,1,0.0,0.0,21.0,0.0,5.2,0.0,5.3,357512,379115,182089,130136,299793,124610,536207,135384,2446,26076,28694
4,0,1,0,0,0,0.0,0.0,21.0,0.0,5.2,0.0,4.8,3674069,3829089,1992462,1419104,3091725,999868,4731723,2108351,18381,271485,304705


In [43]:
#train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y)

# scale data 
ss = StandardScaler()
ss.fit(X_train, y_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

# initiate all models
logreg_class = LogisticRegression()

knn_class = KNeighborsClassifier()

dt_class = DecisionTreeClassifier()

bagged_class = BaggingClassifier()

rf_class = RandomForestClassifier()

adaboost_class = AdaBoostClassifier()

support_vector_class = SVC()

#list of models
models = [logreg_class, knn_class, cart_class, bagged_class, random_forest_class, adaboost_class, support_vector_class]

### Initial Model Evaluation

For this particular model, we want to use Accuracy as our metric to optimize each model.
The model is a two-class classifcation problem where a classification of 1 indicates a win for the GOP candidate and a classification of 0 indicates a win for the Democratic candidate. Since this study is intended to be unbiased, we value a false negative and false positive the same: a wrong prediction. If we were working for one party or saw unquantifiable signs in favor of one party, we may want to minimize false positives or negatives to be more pessimistic or optimistic in one direction. 

In [63]:
#all cross val scores
for model in models:
    cv_scores = cross_val_score(model, X_train_sc,  y_train, cv=5)
    print('\n\n Model:', model)
    print('Mean CV Scores:', cv_scores.mean())
    print('Std CV Scores:', cv_scores.std())



 Model: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Mean CV Scores: 0.8081185016379433
Std CV Scores: 0.02758398192840574


 Model: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
Mean CV Scores: 0.7980202250391681
Std CV Scores: 0.023456532070620876


 Model: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
Mean CV Scores: 0.7895598917533115
Std CV Scores: 

### Model Building with Grid Search

In [81]:
models = [logreg_class, knn_class, cart_class, bagged_class, rf_class, adaboost_class, support_vector_class]

#### Model 1: Logistic Regression

In [72]:
param_grid = { 'C' : [.001, .01, .1, 1, 10, 100, 1000],
               'penalty' : ['l1', 'l2']
             }

clf = GridSearchCV(logreg_class, param_grid, cv=3)
clf.fit(X_train_sc, y_train)
print(clf.best_params_)

C_opt = clf.best_params_['C']
penalty_opt = clf.best_params_['penalty']

clf_opt = LogisticRegression(penalty=penalty_opt, C=C_opt)
clf_opt.fit(X_train_sc, y_train)

y_hat_train = clf_opt.predict(X_train_sc)
y_hat_test = clf_opt.predict(X_test_sc)

print("Train Accuracy:  ", accuracy_score(y_train, y_hat_train))
print("Test Accuracy:  ", accuracy_score(y_test, y_hat_test))

{'C': 1, 'penalty': 'l2'}
Train Accuracy:   0.8181818181818182
Test Accuracy:   0.864321608040201


#### Model 2: KNN Classifier

In [74]:
param_grid = {'n_neighbors' : range(1,15),
              'weights' : ['uniform', 'distance'],
              'leaf_size' : range(1,10)
             }
clf = GridSearchCV(knn_class, param_grid, cv=3)
clf.fit(X_train_sc, y_train)
print(clf.best_params_)
n = clf.best_params_['n_neighbors']
w = clf.best_params_['weights']
l = clf.best_params_['leaf_size']

clf_opt = KNeighborsClassifier(n_neighbors=n, weights=w, leaf_size=l)
clf_opt.fit(X_train_sc, y_train)

y_hat_train = clf_opt.predict(X_train_sc)
y_hat_test = clf_opt.predict(X_test_sc)

print("Train Accuracy:  ", accuracy_score(y_train, y_hat_train))
print("Test Accuracy:  ", accuracy_score(y_test, y_hat_test))


{'leaf_size': 1, 'n_neighbors': 13, 'weights': 'distance'}
Train Accuracy:   0.9949494949494949
Test Accuracy:   0.8944723618090452


#### Model 3: Decision Tree Classifier

In [78]:
param_grid = {'criterion' : ['gini', 'entropy'],
              'max_depth' : [10, 15, 20, 50],
              'max_features' : ['auto', 'sqrt', 'log2', None]
             }
clf = GridSearchCV(dt_class, param_grid, cv=3)
clf.fit(X_train_sc, y_train)
print(clf.best_params_)
c = clf.best_params_['criterion']
md = clf.best_params_['max_depth']
mf = clf.best_params_['max_features']

clf_opt = DecisionTreeClassifier(criterion=c, max_depth=md, max_features=mf)
clf_opt.fit(X_train_sc, y_train)

y_hat_train = clf_opt.predict(X_train_sc)
y_hat_test = clf_opt.predict(X_test_sc)

print("Train Accuracy:  ", accuracy_score(y_train, y_hat_train))
print("Test Accuracy:  ", accuracy_score(y_test, y_hat_test))

{'criterion': 'gini', 'max_depth': 15, 'max_features': 'log2'}
Train Accuracy:   0.9949494949494949
Test Accuracy:   0.8190954773869347


#### Model 4: Bagging Classifier

In [135]:
param_grid = {'n_estimators' : range(7, 15),
              'max_samples' : range(1, 4), 
             }
clf = GridSearchCV(bagged_class, param_grid, cv=3)
clf.fit(X_train_sc, y_train)
print(clf.best_params_)
n = clf.best_params_['n_estimators']
ms = clf.best_params_['max_samples']

clf_opt = BaggingClassifier(n_estimators=n, max_samples=ms)
clf_opt.fit(X_train_sc, y_train)

y_hat_train = clf_opt.predict(X_train_sc)
y_hat_test = clf_opt.predict(X_test_sc)

print("Train Accuracy:  ", accuracy_score(y_train, y_hat_train))
print("Test Accuracy:  ", accuracy_score(y_test, y_hat_test))

{'max_samples': 3, 'n_estimators': 13}
Train Accuracy:   0.5572390572390572
Test Accuracy:   0.5678391959798995


#### Model 5: Random Forest Model

In [84]:
param_grid = {'n_estimators' : [5, 10, 15, 20, 25],
              'criterion' : ['gini', 'entropy'],
              'max_depth' : [None, 10, 25, 50],
              }
clf = GridSearchCV(rf_class, param_grid, cv=3)
clf.fit(X_train_sc, y_train)
print(clf.best_params_)
n = clf.best_params_['n_estimators']
c = clf.best_params_['criterion']
md = clf.best_params_['max_depth']

clf_opt = RandomForestClassifier(n_estimators=n, criterion=c, max_depth=md)
clf_opt.fit(X_train_sc, y_train)

y_hat_train = clf_opt.predict(X_train_sc)
y_hat_test = clf_opt.predict(X_test_sc)

print("Train Accuracy:  ", accuracy_score(y_train, y_hat_train))
print("Test Accuracy:  ", accuracy_score(y_test, y_hat_test))

{'criterion': 'entropy', 'max_depth': 50, 'n_estimators': 25}
Train Accuracy:   0.9949494949494949
Test Accuracy:   0.9045226130653267


#### Model 6: Adaboost Classifier

In [89]:
param_grid = {'n_estimators' : range(50, 100, 5),
              'learning_rate': [.5, 1, 1.5, 2, 2.5]
    
}
clf = GridSearchCV(adaboost_class, param_grid, cv=3)
clf.fit(X_train_sc, y_train)
print(clf.best_params_)
n = clf.best_params_['n_estimators']
lr = clf.best_params_['learning_rate']

clf_opt = AdaBoostClassifier(n_estimators=n, learning_rate=lr)
clf_opt.fit(X_train_sc, y_train)

y_hat_train = clf_opt.predict(X_train_sc)
y_hat_test = clf_opt.predict(X_test_sc)

print("Train Accuracy:  ", accuracy_score(y_train, y_hat_train))
print("Test Accuracy:  ", accuracy_score(y_test, y_hat_test))

{'learning_rate': 1.5, 'n_estimators': 95}
Train Accuracy:   0.9444444444444444
Test Accuracy:   0.864321608040201


#### Model 7: SVM Classifier

In [93]:
param_grid = {'C' : [0.001, 0.01, 0.1, 1, 10, 100],
              'gamma' : [.001, .01, .1, 1, 10, 100]
             }

clf = GridSearchCV(support_vector_class, param_grid, cv=3)
clf.fit(X_train_sc, y_train)
print(clf.best_params_)
c = clf.best_params_['C']
g = clf.best_params_['gamma']

clf_opt = SVC(C=c, gamma=g)
clf_opt.fit(X_train_sc, y_train)

y_hat_train = clf_opt.predict(X_train_sc)
y_hat_test = clf_opt.predict(X_test_sc)

print("Train Accuracy:  ", accuracy_score(y_train, y_hat_train))
print("Test Accuracy:  ", accuracy_score(y_test, y_hat_test))

{'C': 10, 'gamma': 0.1}
Train Accuracy:   0.9158249158249159
Test Accuracy:   0.8844221105527639


| Model | Train Accuracy | Test Accuracy|
| ----------- | ----------- |----------- |
| Logarithmic Regression | 0.81818 |0.86432|
| KNeighbors | 0.99495 |0.89447|
| Decision Tree| 0.99495 |0.81909|
| Bagging | 0.70539 |0.72864|
| Random Forest | 0.99495 |0.90452|
| Adaboost | 0.94444 |0.86432|
| SVM | 0.91582 |0.88442|



## Model with Percentages

### Set Up

In [104]:
#basic setup
y = df['GOP_win']
X = df.drop('GOP_win', axis=1)

# drop non-analysis vars
X.drop(id_cols, inplace=True, axis=1)


#drop amount vars from X

X.drop(amnt_columns, axis=1, inplace=True)
X.head()

Unnamed: 0,pred_GOP,pred_DEM,unopposed,inc_GOP_running,inc_DEM_running,prez_GOP,approval_effects_GOP,approval_effects_DEM,nat_UR_effects_GOP,nat_UR_effects_DEM,state_UR_effects_GOP,state_UR_effects_DEM,percent_male,percent_under18,percent_age_18to29,percent_age_30to59,percent_age_60over,percent_race_white,percent_race_black,percent_race_natamer,percent_race_asian,percent_race_hispanic
0,0,0,0,0,1,0.0,0.0,21.0,0.0,5.2,0.0,3.8,0.493319,0.263488,0.170997,0.405865,0.159649,0.890044,0.029992,0.011261,0.035263,0.022882
1,0,1,0,0,0,0.0,0.0,21.0,0.0,5.2,0.0,5.1,0.481356,0.256172,0.174967,0.395387,0.173474,0.707821,0.257169,0.004682,0.011223,0.012919
2,0,1,0,0,0,0.0,0.0,21.0,0.0,5.2,0.0,5.3,0.485804,0.258093,0.1688,0.385276,0.18783,0.786449,0.15758,0.006208,0.017725,0.023609
3,0,1,0,0,1,0.0,0.0,21.0,0.0,5.2,0.0,5.3,0.485337,0.247193,0.176665,0.406981,0.169163,0.727922,0.183789,0.003321,0.035399,0.038953
4,0,1,0,0,0,0.0,0.0,21.0,0.0,5.2,0.0,4.8,0.48967,0.26555,0.189134,0.412056,0.13326,0.630631,0.280995,0.00245,0.036183,0.04061


In [105]:
#train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y)

# scale data 
ss = StandardScaler()
ss.fit(X_train, y_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

# initiate all models
logreg_class = LogisticRegression()

knn_class = KNeighborsClassifier()

dt_class = DecisionTreeClassifier()

bagged_class = BaggingClassifier()

rf_class = RandomForestClassifier()

adaboost_class = AdaBoostClassifier()

support_vector_class = SVC()

#list of models
models = [logreg_class, knn_class, cart_class, bagged_class, random_forest_class, adaboost_class, support_vector_class]

## Initial Model Evaluation

In [106]:
#all cross val scores
for model in models:
    cv_scores = cross_val_score(model, X_train_sc,  y_train, cv=5)
    print('\n\n Model:', model)
    print('Mean CV Scores:', cv_scores.mean())
    print('Std CV Scores:', cv_scores.std())



 Model: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Mean CV Scores: 0.8030508474576271
Std CV Scores: 0.012084850093183355


 Model: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
Mean CV Scores: 0.8098022598870056
Std CV Scores: 0.016931158056140965


 Model: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
Mean CV Scores: 0.8282485875706215
Std CV Scores:

Based on these results, going to stay with the amounts.

# Final Model Tinkering and Selection

## Set Up Amounts Again

In [112]:
#basic setup
y = df['GOP_win']
X = df.drop('GOP_win', axis=1)

# drop non-analysis vars
X.drop(id_cols, inplace=True, axis=1)

In [113]:
#drop percent vars from X
X.drop(percent_columns, axis=1, inplace=True)
X.head()

Unnamed: 0,pred_GOP,pred_DEM,unopposed,inc_GOP_running,inc_DEM_running,prez_GOP,approval_effects_GOP,approval_effects_DEM,nat_UR_effects_GOP,nat_UR_effects_DEM,state_UR_effects_GOP,state_UR_effects_DEM,total_male,total_female,age_under18,age_18to29,age_30to59,age_60over,race_white_nh,race_black,race_natamer,race_asian,race_hispanic
0,0,0,0,0,1,0.0,0.0,21.0,0.0,5.2,0.0,3.8,2319452,2382275,1238850,803983,1908266,750628,4184744,141016,52944,165795,107583
1,0,1,0,0,0,0.0,0.0,21.0,0.0,5.2,0.0,5.1,2062367,2222128,1097568,749644,1694035,743248,3032654,1101840,20060,48083,55350
2,0,1,0,0,0,0.0,0.0,21.0,0.0,5.2,0.0,5.3,1236046,1308284,656674,429484,980270,477902,2000986,400935,15794,45098,60070
3,0,1,0,0,1,0.0,0.0,21.0,0.0,5.2,0.0,5.3,357512,379115,182089,130136,299793,124610,536207,135384,2446,26076,28694
4,0,1,0,0,0,0.0,0.0,21.0,0.0,5.2,0.0,4.8,3674069,3829089,1992462,1419104,3091725,999868,4731723,2108351,18381,271485,304705


In [114]:
#train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y)

# scale data 
ss = StandardScaler()
ss.fit(X_train, y_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

In [136]:
# initiate all models
logreg_class = LogisticRegression()

knn_class = KNeighborsClassifier()

dt_class = DecisionTreeClassifier()

bagged_class = BaggingClassifier()

rf_class = RandomForestClassifier()

adaboost_class = AdaBoostClassifier()

support_vector_class = SVC()

### Random Forest

In [137]:
rf_class = RandomForestClassifier()
param_grid = {'n_estimators' : [5, 10, 15, 20, 25],
              'criterion' : ['gini', 'entropy'],
              'max_depth' : range(20, 100, 10),
              }
clf = GridSearchCV(rf_class, param_grid, cv=3)
clf.fit(X_train_sc, y_train)
print(clf.best_params_)
n = clf.best_params_['n_estimators']
c = clf.best_params_['criterion']
md = clf.best_params_['max_depth']

clf_opt = RandomForestClassifier(n_estimators=n, criterion=c, max_depth=md)
clf_opt.fit(X_train_sc, y_train)

y_hat_train = clf_opt.predict(X_train_sc)
y_hat_test = clf_opt.predict(X_test_sc)

print("Train Accuracy:  ", accuracy_score(y_train, y_hat_train))
print("Test Accuracy:  ", accuracy_score(y_test, y_hat_test))

{'criterion': 'entropy', 'max_depth': 90, 'n_estimators': 20}
Train Accuracy:   0.9966329966329966
Test Accuracy:   0.8793969849246231


In [146]:
rf_final = RandomForestClassifier(n_estimators= 100,
                                   criterion= 'entropy',
                                   max_depth= 75)

rf_final.fit(X_train_sc, y_train)

y_hat_train = rf_final.predict(X_train_sc)
y_hat_test = rf_final.predict(X_test_sc)

print("Train Accuracy:  ", accuracy_score(y_train, y_hat_train))
print("Test Accuracy:  ", accuracy_score(y_test, y_hat_test))

Train Accuracy:   0.9983164983164983
Test Accuracy:   0.8844221105527639


### KNeighbors

In [145]:
param_grid = {'n_neighbors' : range(1,15),
              'weights' : ['uniform', 'distance'],
              'leaf_size' : range(1,10)
             }
clf = GridSearchCV(knn_class, param_grid, cv=3)
clf.fit(X_train_sc, y_train)
print(clf.best_params_)
n = clf.best_params_['n_neighbors']
w = clf.best_params_['weights']
l = clf.best_params_['leaf_size']

clf_opt = KNeighborsClassifier(n_neighbors=n, weights=w, leaf_size=l)
clf_opt.fit(X_train_sc, y_train)

y_hat_train = clf_opt.predict(X_train_sc)
y_hat_test = clf_opt.predict(X_test_sc)

print("Train Accuracy:  ", accuracy_score(y_train, y_hat_train))
print("Test Accuracy:  ", accuracy_score(y_test, y_hat_test))


{'leaf_size': 1, 'n_neighbors': 12, 'weights': 'distance'}
Train Accuracy:   0.9983164983164983
Test Accuracy:   0.8693467336683417


In [155]:
knn_final = KNeighborsClassifier(leaf_size= 1,
                                 n_neighbors= 50,
                                 weights = 'distance')
knn_final.fit(X_train_sc, y_train)

y_hat_train = knn_final.predict(X_train_sc)
y_hat_test = knn_final.predict(X_test_sc)

print("Train Accuracy:  ", accuracy_score(y_train, y_hat_train))
print("Test Accuracy:  ", accuracy_score(y_test, y_hat_test))

Train Accuracy:   0.9983164983164983
Test Accuracy:   0.8844221105527639
