# Senate Modeling

### Load Libraries, Set Options and Styling

In [1]:
# import libraries
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import random
import seaborn as sns

plt.style.use('fivethirtyeight')

%matplotlib inline

pd.set_option('display.max_columns', 500)

### Import Data

In [2]:
df = pd.read_csv('/Users/Alexz/CodeMaster/capstone/capstone_repo/data/sens_final.csv')
df.drop('Unnamed: 0', inplace=True, axis=1)

In [3]:
df.head()

Unnamed: 0,race_id,office,loc_date_id,state,abbrev,year,GOP_win,winner,rival,pred_GOP,pred_DEM,unopposed,inc_GOP_running,inc_DEM_running,prez_GOP,approval_effects_GOP,approval_effects_DEM,nat_UR_effects_GOP,nat_UR_effects_DEM,state_UR_effects_GOP,state_UR_effects_DEM,total_male,total_female,age_under18,age_18to29,age_30to59,age_60over,race_white_nh,race_black,race_natamer,race_asian,race_hispanic,percent_male,percent_female,percent_under18,percent_age_18to29,percent_age_30to59,percent_age_60over,percent_race_white,percent_race_black,percent_race_natamer,percent_race_asian,percent_race_hispanic
0,AK_sen_1978,AK_sen,AK_1978,Alaska,AK,1978,1,Ted Stevens,Donald W Hobbs,1,0,0,1,0,0.0,0.0,13.0,0.0,5.8,0.0,10.2,203084,178473,128568,101180,133656,18154,288403,12781,54741,14337,8525,0.532251,0.467749,0.336956,0.265177,0.350291,0.047579,0.755858,0.033497,0.143467,0.037575,0.022343
1,AL_sen_1978,AL_sen,AL_1978,Alabama,AL,1978,0,Howell Heflin,Jerome B Couch,0,1,0,0,0,0.0,0.0,13.0,0.0,5.8,0.0,6.5,1829615,1974328,1176502,788346,1256746,582349,2770988,977720,7932,9413,34409,0.480979,0.519021,0.309285,0.207244,0.33038,0.153091,0.728452,0.257028,0.002085,0.002475,0.009046
2,AL_sen2_1978,AL_sen,AL_1978,Alabama,AL,1978,0,Donald W Stewart,James D Martin,0,1,0,0,0,0.0,0.0,13.0,0.0,5.8,0.0,6.5,1829615,1974328,1176502,788346,1256746,582349,2770988,977720,7932,9413,34409,0.480979,0.519021,0.309285,0.207244,0.33038,0.153091,0.728452,0.257028,0.002085,0.002475,0.009046
3,AR_sen_1978,AR_sen,AR_1978,Arkansas,AR,1978,0,David Pryor,Tom Kelly,0,1,0,0,0,0.0,0.0,13.0,0.0,5.8,0.0,6.4,1070212,1143595,668101,427351,713077,405278,1805936,368909,10608,6370,19195,0.483426,0.516574,0.301788,0.193039,0.322104,0.183068,0.81576,0.16664,0.004792,0.002877,0.008671
4,CO_sen_1978,CO_sen,CO_1978,Colorado,CO,1978,1,William L Armstrong,Floyd K Haskell,0,1,0,0,1,0.0,0.0,13.0,0.0,5.8,0.0,5.1,1365310,1388113,801853,666524,951729,333317,2178611,94638,18579,31338,316875,0.495859,0.504141,0.29122,0.242071,0.345653,0.121056,0.791237,0.034371,0.006748,0.011381,0.115084


### Lists to Use

In [4]:
state_codes = df.abbrev.unique()
state_names = df.state.unique()
years = list(range(1976, 2018))

id_cols = ['office', 'loc_date_id', 'race_id', 'state', 'abbrev', 'year', 'winner', 'rival']

percent_columns = ['percent_male', 'percent_under18', 'percent_age_18to29', 
                   'percent_age_30to59', 'percent_age_60over', 'percent_race_white', 'percent_race_black',
                   'percent_race_natamer', 'percent_race_asian', 'percent_race_hispanic']
amnt_columns = ['total_male', 'total_female', 'age_under18', 'age_18to29', 'age_30to59',
       'age_60over', 'race_white_nh', 'race_black', 'race_natamer', 'race_asian', 'race_hispanic']

In [72]:
df.columns

Index(['race_id', 'office', 'loc_date_id', 'state', 'abbrev', 'year',
       'GOP_win', 'winner', 'rival', 'pred_GOP', 'pred_DEM', 'unopposed',
       'inc_GOP_running', 'inc_DEM_running', 'prez_GOP',
       'approval_effects_GOP', 'approval_effects_DEM', 'nat_UR_effects_GOP',
       'nat_UR_effects_DEM', 'state_UR_effects_GOP', 'state_UR_effects_DEM',
       'total_male', 'total_female', 'age_under18', 'age_18to29', 'age_30to59',
       'age_60over', 'race_white_nh', 'race_black', 'race_natamer',
       'race_asian', 'race_hispanic', 'percent_male', 'percent_female',
       'percent_under18', 'percent_age_18to29', 'percent_age_30to59',
       'percent_age_60over', 'percent_race_white', 'percent_race_black',
       'percent_race_natamer', 'percent_race_asian', 'percent_race_hispanic'],
      dtype='object')

## Model with Percentages

### Set Up

In [5]:
#basic setup
y = df['GOP_win']
X = df.drop('GOP_win', axis=1)

# drop non-analysis vars
X.drop(id_cols, inplace=True, axis=1)


# Option 1: drop amount vars from X
X.drop(amnt_columns, axis=1, inplace=True)
# drop out group from binary/collinear groups
X.drop(['percent_male', 'percent_under18', 'percent_race_natamer'], inplace=True, axis=1)



# Option 2: drop percentage vars
# X.drop(percent_columns, axis=1, inplace=True)



print(X.shape)
X.head()

(598, 20)


Unnamed: 0,pred_GOP,pred_DEM,unopposed,inc_GOP_running,inc_DEM_running,prez_GOP,approval_effects_GOP,approval_effects_DEM,nat_UR_effects_GOP,nat_UR_effects_DEM,state_UR_effects_GOP,state_UR_effects_DEM,percent_female,percent_age_18to29,percent_age_30to59,percent_age_60over,percent_race_white,percent_race_black,percent_race_asian,percent_race_hispanic
0,1,0,0,1,0,0.0,0.0,13.0,0.0,5.8,0.0,10.2,0.467749,0.265177,0.350291,0.047579,0.755858,0.033497,0.037575,0.022343
1,0,1,0,0,0,0.0,0.0,13.0,0.0,5.8,0.0,6.5,0.519021,0.207244,0.33038,0.153091,0.728452,0.257028,0.002475,0.009046
2,0,1,0,0,0,0.0,0.0,13.0,0.0,5.8,0.0,6.5,0.519021,0.207244,0.33038,0.153091,0.728452,0.257028,0.002475,0.009046
3,0,1,0,0,0,0.0,0.0,13.0,0.0,5.8,0.0,6.4,0.516574,0.193039,0.322104,0.183068,0.81576,0.16664,0.002877,0.008671
4,0,1,0,0,1,0.0,0.0,13.0,0.0,5.8,0.0,5.1,0.504141,0.242071,0.345653,0.121056,0.791237,0.034371,0.011381,0.115084


In [6]:
random.seed(19)
#train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2)

# scale data 
ss = StandardScaler()
ss.fit(X_train, y_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

# initiate all models
logreg_class = LogisticRegression()

knn_class = KNeighborsClassifier()

dt_class = DecisionTreeClassifier()

bagged_class = BaggingClassifier()

rf_class = RandomForestClassifier()

adaboost_class = AdaBoostClassifier()

support_vector_class = SVC()

#list of models
models = [logreg_class, knn_class,dt_class, rf_class, bagged_class, adaboost_class, support_vector_class]

### Initial Model Evaluation

For this particular model, we want to use Accuracy as our metric to optimize each model.
The model is a two-class classifcation problem where a classification of 1 indicates a win for the GOP candidate and a classification of 0 indicates a win for the Democratic candidate. Since this study is intended to be unbiased, we value a false negative and false positive the same: a wrong prediction. If we were working for one party or saw unquantifiable signs in favor of one party, we may want to minimize false positives or negatives to be more pessimistic or optimistic in one direction. 

In [7]:
#all cross val scores
for model in models:
    random.seed(19)
    cv_scores = cross_val_score(model, X_train_sc,  y_train, cv=5)
    print('\n\n Model:', model)
    print('Mean CV Scores:', cv_scores.mean())
    print('Std CV Scores:', cv_scores.std())



 Model: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Mean CV Scores: 0.8138076505697232
Std CV Scores: 0.02141117253551227


 Model: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
Mean CV Scores: 0.8219678061132212
Std CV Scores: 0.0366277956063477


 Model: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
Mean CV Scores: 0.7343947820582384
Std CV Scores: 0.

### Model Building with Grid Search

#### Model 1: Logistic Regression

In [73]:
param_grid = { 'C' : [.001, .01, .1, 1, 10, 100, 1000],
               'penalty' : ['l1', 'l2']
             }

clf = GridSearchCV(logreg_class, param_grid, cv=3)
clf.fit(X_train_sc, y_train)
print(clf.best_params_)

C_opt = clf.best_params_['C']
penalty_opt = clf.best_params_['penalty']

lr_opt = LogisticRegression(penalty=penalty_opt, C=C_opt)
lr_opt.fit(X_train_sc, y_train)

y_hat_train = lr_opt.predict(X_train_sc)
y_hat_test = lr_opt.predict(X_test_sc)

print("Train Accuracy:  ", accuracy_score(y_train, y_hat_train))
print("Test Accuracy:  ", accuracy_score(y_test, y_hat_test))

{'C': 0.1, 'penalty': 'l2'}
Train Accuracy:   0.8305439330543933
Test Accuracy:   0.85


In [75]:
lr_opt = LogisticRegression(penalty='l2', C=.5)
lr_opt.fit(X_train_sc, y_train)

y_hat_train = lr_opt.predict(X_train_sc)
y_hat_test = lr_opt.predict(X_test_sc)

print("Train Accuracy:  ", accuracy_score(y_train, y_hat_train))
print("Test Accuracy:  ", accuracy_score(y_test, y_hat_test))

Train Accuracy:   0.8326359832635983
Test Accuracy:   0.85


#### Model 2: KNN Classifier

In [48]:
param_grid = {'n_neighbors' : range(1, 25, 1),
              'leaf_size' : range(1, 10, 1)
             }

knn_class = KNeighborsClassifier(weights='distance',
                                 leaf_size = 1)
clf = GridSearchCV(knn_class, param_grid, cv=3)
clf.fit(X_train_sc, y_train)


print(clf.best_params_)

y_hat_train = clf.predict(X_train_sc)
y_hat_test = clf.predict(X_test_sc)

print("Train Accuracy:  ", accuracy_score(y_train, y_hat_train))
print("Test Accuracy:  ", accuracy_score(y_test, y_hat_test))


{'leaf_size': 1, 'n_neighbors': 19}
Train Accuracy:   0.99581589958159
Test Accuracy:   0.85


In [79]:
knn_opt = KNeighborsClassifier(n_neighbors=7, leaf_size=1, weights='distance')

knn_opt.fit(X_train_sc, y_train)

y_hat_train = knn_opt.predict(X_train_sc)
y_hat_test = knn_opt.predict(X_test_sc)

print("Train Accuracy:  ", accuracy_score(y_train, y_hat_train))
print("Test Accuracy:  ", accuracy_score(y_test, y_hat_test))


Train Accuracy:   0.99581589958159
Test Accuracy:   0.8666666666666667


#### Model 3: Decision Tree Classifier

In [83]:
random.seed(19)
param_grid = {'criterion' : ['gini', 'entropy'],
              'max_depth' : [1, 5, 10, 20, None],
              'max_features' : ['auto', 'sqrt', 'log2', ],
             }
clf = GridSearchCV(dt_class, param_grid, cv=3)
clf.fit(X_train_sc, y_train)
print(clf.best_params_)
c = clf.best_params_['criterion']
md = clf.best_params_['max_depth']
mf = clf.best_params_['max_features']

clf_opt = DecisionTreeClassifier(criterion=c, max_depth=None, max_features=mf)
clf_opt.fit(X_train_sc, y_train)

y_hat_train = clf_opt.predict(X_train_sc)
y_hat_test = clf_opt.predict(X_test_sc)

print("Train Accuracy:  ", accuracy_score(y_train, y_hat_train))
print("Test Accuracy:  ", accuracy_score(y_test, y_hat_test))

{'criterion': 'entropy', 'max_depth': 5, 'max_features': 'auto'}
Train Accuracy:   0.99581589958159
Test Accuracy:   0.7333333333333333


In [87]:
dt_opt = DecisionTreeClassifier(criterion='entropy', max_depth=4, max_features= 'auto', random_state=19)
dt_opt.fit(X_train_sc, y_train)

y_hat_train = dt_opt.predict(X_train_sc)
y_hat_test = dt_opt.predict(X_test_sc)

print("Train Accuracy:  ", accuracy_score(y_train, y_hat_train))
print("Test Accuracy:  ", accuracy_score(y_test, y_hat_test))
print("\nGap:", (accuracy_score(y_train, y_hat_train) - accuracy_score(y_test, y_hat_test)))

Train Accuracy:   0.8284518828451883
Test Accuracy:   0.8416666666666667

Gap: -0.013214783821478338


#### Model 4: Bagging Classifier

In [122]:
random.seed(19)
param_grid = {'n_estimators' : range(10, 25),
              'max_samples' : [.6, .75, .8, 1],
              
             }
clf = GridSearchCV(bagged_class, param_grid, cv=5)
clf.fit(X_train_sc, y_train)
print(clf.best_params_)
n = clf.best_params_['n_estimators']
ms = clf.best_params_['max_samples']

clf.best_params_

bag_opt = BaggingClassifier(n_estimators=n,
                            max_samples=ms,
                            base_estimator=dt_opt)
bag_opt.fit(X_train_sc, y_train)

y_hat_train = bag_opt.predict(X_train_sc)
y_hat_test = bag_opt.predict(X_test_sc)

print("Train Accuracy:  ", accuracy_score(y_train, y_hat_train))
print("Test Accuracy:  ", accuracy_score(y_test, y_hat_test))

{'max_samples': 0.6, 'n_estimators': 15}
Train Accuracy:   0.8389121338912134
Test Accuracy:   0.8416666666666667


In [123]:
bag_opt = BaggingClassifier(base_estimator= dt_opt,
                            n_estimators = 15,
                            max_samples= .6)
bag_opt.fit(X_train_sc, y_train)

y_hat_train = bag_opt.predict(X_train_sc)
y_hat_test = bag_opt.predict(X_test_sc)

print("Train Accuracy:  ", accuracy_score(y_train, y_hat_train))
print("Test Accuracy:  ", accuracy_score(y_test, y_hat_test))

Train Accuracy:   0.8284518828451883
Test Accuracy:   0.85


#### Model 5: Random Forest Model

In [15]:
random.seed(19)
param_grid = {'n_estimators' : [5, 8, 10, 15],
              'criterion' : ['gini', 'entropy'],
              'max_depth' : [10, 25, 40],
              }
clf = GridSearchCV(rf_class, param_grid, cv=3)
clf.fit(X_train_sc, y_train)
print(clf.best_params_)
n = clf.best_params_['n_estimators']
c = clf.best_params_['criterion']
md = clf.best_params_['max_depth']

rf_opt = RandomForestClassifier(n_estimators=n, criterion=c, max_depth=md)
rf_opt.fit(X_train_sc, y_train)

y_hat_train = rf_opt.predict(X_train_sc)
y_hat_test = rf_opt.predict(X_test_sc)

print("Train Accuracy:  ", accuracy_score(y_train, y_hat_train))
print("Test Accuracy:  ", accuracy_score(y_test, y_hat_test))

{'criterion': 'gini', 'max_depth': 25, 'n_estimators': 15}
Train Accuracy:   0.9937238493723849
Test Accuracy:   0.85


In [113]:
rf_opt = RandomForestClassifier(n_estimators=6, criterion='entropy', max_depth=8, random_state=19)
rf_opt.fit(X_train_sc, y_train)

y_hat_train = rf_opt.predict(X_train_sc)
y_hat_test = rf_opt.predict(X_test_sc)

print("Train Accuracy:  ", accuracy_score(y_train, y_hat_train))
print("Test Accuracy:  ", accuracy_score(y_test, y_hat_test))

Train Accuracy:   0.9184100418410042
Test Accuracy:   0.8583333333333333


#### Model 6: Adaboost Classifier

In [115]:
param_grid = {'n_estimators' : range(50, 100, 10),
              'learning_rate': [.1, .5, 1, 1.25, 1.5]
    
}
clf = GridSearchCV(adaboost_class, param_grid, cv=3)
clf.fit(X_train_sc, y_train)
print(clf.best_params_)
n = clf.best_params_['n_estimators']
lr = clf.best_params_['learning_rate']

clf_opt = AdaBoostClassifier(n_estimators=n, learning_rate=lr, random_state=19)
clf_opt.fit(X_train_sc, y_train)

y_hat_train = clf_opt.predict(X_train_sc)
y_hat_test = clf_opt.predict(X_test_sc)

print("Train Accuracy:  ", accuracy_score(y_train, y_hat_train))
print("Test Accuracy:  ", accuracy_score(y_test, y_hat_test))

{'learning_rate': 0.1, 'n_estimators': 50}
Train Accuracy:   0.8368200836820083
Test Accuracy:   0.8416666666666667


In [109]:
boost_opt = AdaBoostClassifier(n_estimators = 60,
                               learning_rate = .5,
                               random_state = 19)

boost_opt.fit(X_train_sc, y_train)

y_hat_train = boost_opt.predict(X_train_sc)
y_hat_test = boost_opt.predict(X_test_sc)

print("Train Accuracy:  ", accuracy_score(y_train, y_hat_train))
print("Test Accuracy:  ", accuracy_score(y_test, y_hat_test))

Train Accuracy:   0.8723849372384938
Test Accuracy:   0.8416666666666667


#### Model 7: SVM Classifier

In [117]:
random.seed(19)
param_grid = { 'C' : [0.001, 0.01, 0.1, 1, 10, 100],
              'gamma' : [.001, .01, .1, 1, 10, 100],
              'kernel' : ['linear', 'poly']
             }

clf = GridSearchCV(support_vector_class, param_grid, cv=3)
clf.fit(X_train_sc, y_train)
print(clf.best_params_)
c = clf.best_params_['C']
g = clf.best_params_['gamma']
k = clf.best_params_['kernel']

svm_opt = SVC(C= c, gamma = g, kernel = k, probability=True)
svm_opt.fit(X_train_sc, y_train)

y_hat_train = svm_opt.predict(X_train_sc)
y_hat_test = svm_opt.predict(X_test_sc)

print("Train Accuracy:  ", accuracy_score(y_train, y_hat_train))
print("Test Accuracy:  ", accuracy_score(y_test, y_hat_test))

{'C': 0.1, 'gamma': 0.1, 'kernel': 'poly'}
Train Accuracy:   0.8430962343096234
Test Accuracy:   0.8416666666666667


In [116]:
svm_opt = SVC(kernel = 'poly',
              C = .1, 
              gamma = .1,
              probability=True)

svm_opt.fit(X_train_sc, y_train)

y_hat_train = svm_opt.predict(X_train_sc)
y_hat_test = svm_opt.predict(X_test_sc)

print("Train Accuracy:  ", accuracy_score(y_train, y_hat_train))
print("Test Accuracy:  ", accuracy_score(y_test, y_hat_test))

Train Accuracy:   0.8430962343096234
Test Accuracy:   0.8416666666666667


# Project Results for 2018

### Import 2018 Data

In [134]:
this_year = pd.read_csv('/Users/Alexz/CodeMaster/capstone/capstone_repo/data/2018_final.csv')
# drop non-analysis vars
this_year.drop('Unnamed: 0', inplace=True, axis=1)


# Option 1: drop amount vars from X
this_year.drop(amnt_columns, axis=1, inplace=True)
# drop out group from binary/collinear groups
this_year.drop(['percent_male', 'percent_under18', 'percent_race_natamer'], inplace=True, axis=1)

id_cols = ['office', 'loc_date', 'race_id', 'state', 'abbrev', 'year', 'winner', 'rival']


print(this_year.shape)
this_year

(35, 28)


Unnamed: 0,race_id,office,loc_date,state,abbrev,year,winner,rival,pred_GOP,pred_DEM,unopposed,inc_GOP_running,inc_DEM_running,prez_GOP,approval_effects_GOP,approval_effects_DEM,nat_UR_effects_GOP,nat_UR_effects_DEM,state_UR_effects_GOP,state_UR_effects_DEM,percent_female,percent_age_18to29,percent_age_30to59,percent_age_60over,percent_race_white,percent_race_black,percent_race_asian,percent_race_hispanic
0,2018_AZ_sen,AZ_sen,AZ_2018,Arizona,AZ,2018,TBD,TBD,1,0,0,0,0,1.0,-8.5,0,3.7,0,4.7,0,0.504449,0.165003,0.540617,0.205617,0.395501,0.04582,0.1561,0.320867
1,2018_CA_sen,CA_sen,CA_2018,California,CA,2018,TBD,TBD,0,1,0,0,1,1.0,-8.5,0,3.7,0,4.1,0,0.503697,0.182023,0.590001,0.178872,0.150891,0.058239,0.319912,0.4117
2,2018_CT_sen,CT_sen,CT_2018,Connecticut,CT,2018,TBD,TBD,0,1,0,0,1,1.0,-8.5,0,3.7,0,4.2,0,0.511489,0.160426,0.572296,0.21469,0.582367,0.108996,0.113004,0.163416
3,2018_DE_sen,DE_sen,DE_2018,Delaware,DE,2018,TBD,TBD,0,1,0,0,1,1.0,-8.5,0,3.7,0,4.0,0,0.516418,0.167856,0.563881,0.223531,0.549541,0.227139,0.081994,0.103068
4,2018_FL_sen,FL_sen,FL_2018,Florida,FL,2018,TBD,TBD,0,1,0,0,1,1.0,-8.5,0,3.7,0,3.5,0,0.510712,0.161149,0.548872,0.241003,0.473384,0.167709,0.069587,0.259145
5,2018_HI_sen,HI_sen,HI_2018,Hawaii,HI,2018,TBD,TBD,0,1,0,0,1,1.0,-8.5,0,3.7,0,2.2,0,0.50024,0.169397,0.565171,0.225542,0.150909,0.014169,0.482271,0.099611
6,2018_IN_sen,IN_sen,IN_2018,Indiana,IN,2018,TBD,TBD,0,1,0,0,1,1.0,-8.5,0,3.7,0,3.5,0,0.507005,0.162597,0.573229,0.198976,0.742878,0.096451,0.054873,0.077844
7,2018_ME_sen,ME_sen,ME_2018,Maine,ME,2018,TBD,TBD,0,1,0,0,0,1.0,-8.5,0,3.7,0,3.3,0,0.508535,0.14519,0.567623,0.255889,0.922712,0.016676,0.016622,0.016771
8,2018_MD_sen,MD_sen,MD_2018,Maryland,MD,2018,TBD,TBD,0,1,0,0,1,1.0,-8.5,0,3.7,0,4.2,0,0.51603,0.174909,0.580261,0.195758,0.433594,0.305206,0.114523,0.108007
9,2018_MA_sen,MA_sen,MA_2018,Massachusetts,MA,2018,TBD,TBD,0,1,0,0,1,1.0,-8.5,0,3.7,0,3.6,0,0.514963,0.179568,0.599718,0.210936,0.655607,0.075622,0.119435,0.117319


### Define and Scale Variables

In [22]:
X_final = this_year.drop(id_cols, axis = 1)
print(X.shape)
print(X_final.shape)
X_final_sc = ss.transform(X_final)

(598, 20)
(35, 20)


### Build Voting Classifier 

In [150]:
# Build model
voter = VotingClassifier([('knn', knn_opt),
                          ('dtree', dt_opt),
                          ('bagging', bag_opt),
                          ('rf', rf_opt),
#                           ('svm', svm_opt),
                          ('logreg', lr_opt), 
#                           ('boost', boost_opt)
                         ],
                          voting='hard')

# Test Model Against original train and test data
voter.fit(X_train_sc, y_train)

y_hat_train = voter.predict(X_train_sc)
y_hat_test = voter.predict(X_test_sc)

print("Train Accuracy:  ", accuracy_score(y_train, y_hat_train))
print("Test Accuracy:  ", accuracy_score(y_test, y_hat_test))

Train Accuracy:   0.8661087866108786
Test Accuracy:   0.85


  if diff:
  if diff:


### Use to predict 2018 results

In [144]:
#train against all data before running on 2018
voter.fit(X, y)

results = voter.predict(X_final)

print(results)

[1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1]


  if diff:


In [145]:
this_year['predictions'] = results

In [146]:
predictions = this_year.loc[:, ['race_id', 'state', 'abbrev', 'predictions']]

In [147]:
predictions

Unnamed: 0,race_id,state,abbrev,predictions
0,2018_AZ_sen,Arizona,AZ,1
1,2018_CA_sen,California,CA,0
2,2018_CT_sen,Connecticut,CT,0
3,2018_DE_sen,Delaware,DE,0
4,2018_FL_sen,Florida,FL,0
5,2018_HI_sen,Hawaii,HI,0
6,2018_IN_sen,Indiana,IN,0
7,2018_ME_sen,Maine,ME,0
8,2018_MD_sen,Maryland,MD,0
9,2018_MA_sen,Massachusetts,MA,0


In [30]:
predictions.to_csv('/Users/Alexz/CodeMaster/capstone/capstone_repo/data/predictions.csv')

In [148]:
predictions.predictions.sum()

9