# Senate Modeling

### Load Libraries, Set Options and Styling

In [128]:
# import libraries
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import random
import seaborn as sns

plt.style.use('fivethirtyeight')

%matplotlib inline

pd.set_option('display.max_columns', 500)

### Import Data

In [129]:
df = pd.read_csv('/Users/Alexz/CodeMaster/capstone/capstone_repo/data/sens_final.csv')
df.drop('Unnamed: 0', inplace=True, axis=1)

In [130]:
df.head()

Unnamed: 0,race_id,office,loc_date_id,state,abbrev,year,GOP_win,winner,rival,pred_GOP,pred_DEM,unopposed,inc_GOP_running,inc_DEM_running,prez_GOP,approval_effects_GOP,approval_effects_DEM,nat_UR_effects_GOP,nat_UR_effects_DEM,state_UR_effects_GOP,state_UR_effects_DEM,total_male,total_female,age_under18,age_18to29,age_30to59,age_60over,race_white_nh,race_black,race_natamer,race_asian,race_hispanic,percent_male,percent_female,percent_under18,percent_age_18to29,percent_age_30to59,percent_age_60over,percent_race_white,percent_race_black,percent_race_natamer,percent_race_asian,percent_race_hispanic
0,MN_sen_1996,MN_sen,MN_1996,Minnesota,MN,1996,0,Paul Wellstone Democratic,Rudy Boschwitz,0,0,0,0,1,0.0,0.0,21.0,0.0,5.2,0.0,3.8,2319452,2382275,1238850,803983,1908266,750628,4184744,141016,52944,165795,107583,0.493319,0.506681,0.263488,0.170997,0.405865,0.159649,0.890044,0.029992,0.011261,0.035263,0.022882
1,AL_sen_1996,AL_sen,AL_1996,Alabama,AL,1996,1,Jeff Sessions,Roger Bedford,0,1,0,0,0,0.0,0.0,21.0,0.0,5.2,0.0,5.1,2062367,2222128,1097568,749644,1694035,743248,3032654,1101840,20060,48083,55350,0.481356,0.518644,0.256172,0.174967,0.395387,0.173474,0.707821,0.257169,0.004682,0.011223,0.012919
2,AR_sen_1996,AR_sen,AR_1996,Arkansas,AR,1996,1,Tim Hutchinson,Winston Bryant,0,1,0,0,0,0.0,0.0,21.0,0.0,5.2,0.0,5.3,1236046,1308284,656674,429484,980270,477902,2000986,400935,15794,45098,60070,0.485804,0.514196,0.258093,0.1688,0.385276,0.18783,0.786449,0.15758,0.006208,0.017725,0.023609
3,DE_sen_1996,DE_sen,DE_1996,Delaware,DE,1996,0,Joe Biden,Raymond J Clatworthy,0,1,0,0,1,0.0,0.0,21.0,0.0,5.2,0.0,5.3,357512,379115,182089,130136,299793,124610,536207,135384,2446,26076,28694,0.485337,0.514663,0.247193,0.176665,0.406981,0.169163,0.727922,0.183789,0.003321,0.035399,0.038953
4,GA_sen_1996,GA_sen,GA_1996,Georgia,GA,1996,0,Max Cleland,Guy Millner,0,1,0,0,0,0.0,0.0,21.0,0.0,5.2,0.0,4.8,3674069,3829089,1992462,1419104,3091725,999868,4731723,2108351,18381,271485,304705,0.48967,0.51033,0.26555,0.189134,0.412056,0.13326,0.630631,0.280995,0.00245,0.036183,0.04061


### Lists to Use

In [131]:
state_codes = df.abbrev.unique()
state_names = df.state.unique()
years = list(range(1976, 2018))

id_cols = ['office', 'loc_date_id', 'race_id', 'state', 'abbrev', 'year', 'winner', 'rival']

percent_columns = ['percent_male', 'percent_under18', 'percent_age_18to29', 
                   'percent_age_30to59', 'percent_age_60over', 'percent_race_white', 'percent_race_black',
                   'percent_race_natamer', 'percent_race_asian', 'percent_race_hispanic']
amnt_columns = ['total_male', 'total_female', 'age_under18', 'age_18to29', 'age_30to59',
       'age_60over', 'race_white_nh', 'race_black', 'race_natamer', 'race_asian', 'race_hispanic']

## Model with Percentages

### Set Up

In [132]:
#basic setup
y = df['GOP_win']
X = df.drop('GOP_win', axis=1)

# drop non-analysis vars
X.drop(id_cols, inplace=True, axis=1)


# Option 1: drop amount vars from X
X.drop(amnt_columns, axis=1, inplace=True)
# drop out group from binary/collinear groups
X.drop(['percent_male', 'percent_under18', 'percent_race_natamer'], inplace=True, axis=1)



# Option 2: drop percentage vars
# X.drop(percent_columns, axis=1, inplace=True)



print(X.shape)
X.head()

(793, 20)


Unnamed: 0,pred_GOP,pred_DEM,unopposed,inc_GOP_running,inc_DEM_running,prez_GOP,approval_effects_GOP,approval_effects_DEM,nat_UR_effects_GOP,nat_UR_effects_DEM,state_UR_effects_GOP,state_UR_effects_DEM,percent_female,percent_age_18to29,percent_age_30to59,percent_age_60over,percent_race_white,percent_race_black,percent_race_asian,percent_race_hispanic
0,0,0,0,0,1,0.0,0.0,21.0,0.0,5.2,0.0,3.8,0.506681,0.170997,0.405865,0.159649,0.890044,0.029992,0.035263,0.022882
1,0,1,0,0,0,0.0,0.0,21.0,0.0,5.2,0.0,5.1,0.518644,0.174967,0.395387,0.173474,0.707821,0.257169,0.011223,0.012919
2,0,1,0,0,0,0.0,0.0,21.0,0.0,5.2,0.0,5.3,0.514196,0.1688,0.385276,0.18783,0.786449,0.15758,0.017725,0.023609
3,0,1,0,0,1,0.0,0.0,21.0,0.0,5.2,0.0,5.3,0.514663,0.176665,0.406981,0.169163,0.727922,0.183789,0.035399,0.038953
4,0,1,0,0,0,0.0,0.0,21.0,0.0,5.2,0.0,4.8,0.51033,0.189134,0.412056,0.13326,0.630631,0.280995,0.036183,0.04061


In [133]:
random.seed(17)
#train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y)

# scale data 
ss = StandardScaler()
ss.fit(X_train, y_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

# initiate all models
logreg_class = LogisticRegression()

knn_class = KNeighborsClassifier()

dt_class = DecisionTreeClassifier()

bagged_class = BaggingClassifier()

rf_class = RandomForestClassifier()

adaboost_class = AdaBoostClassifier()

support_vector_class = SVC()

#list of models
models = [logreg_class, knn_class,dt_class, rf_class, bagged_class, adaboost_class, support_vector_class]

### Initial Model Evaluation

For this particular model, we want to use Accuracy as our metric to optimize each model.
The model is a two-class classifcation problem where a classification of 1 indicates a win for the GOP candidate and a classification of 0 indicates a win for the Democratic candidate. Since this study is intended to be unbiased, we value a false negative and false positive the same: a wrong prediction. If we were working for one party or saw unquantifiable signs in favor of one party, we may want to minimize false positives or negatives to be more pessimistic or optimistic in one direction. 

In [135]:
#all cross val scores
for model in models:
    random.seed(17)
    cv_scores = cross_val_score(model, X_train_sc,  y_train, cv=5)
    print('\n\n Model:', model)
    print('Mean CV Scores:', cv_scores.mean())
    print('Std CV Scores:', cv_scores.std())



 Model: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Mean CV Scores: 0.823296301571476
Std CV Scores: 0.016147078455273498


 Model: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
Mean CV Scores: 0.8165311209229454
Std CV Scores: 0.02254119395934345


 Model: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
Mean CV Scores: 0.8334510753453923
Std CV Scores: 0

### Model Building with Grid Search

#### Model 1: Logistic Regression

In [138]:
param_grid = { 'C' : [.001, .01, .1, 1, 10, 100, 1000],
               'penalty' : ['l1', 'l2']
             }

clf = GridSearchCV(logreg_class, param_grid, cv=3)
clf.fit(X_train_sc, y_train)
print(clf.best_params_)

C_opt = clf.best_params_['C']
penalty_opt = clf.best_params_['penalty']

clf_opt = LogisticRegression(penalty=penalty_opt, C=C_opt)
clf_opt.fit(X_train_sc, y_train)

y_hat_train = clf_opt.predict(X_train_sc)
y_hat_test = clf_opt.predict(X_test_sc)

print("Train Accuracy:  ", accuracy_score(y_train, y_hat_train))
print("Test Accuracy:  ", accuracy_score(y_test, y_hat_test))

{'C': 0.1, 'penalty': 'l1'}
Train Accuracy:   0.82996632996633
Test Accuracy:   0.7889447236180904


#### Model 2: KNN Classifier

In [139]:
param_grid = {'n_neighbors' : range(1,15),
              'weights' : ['uniform', 'distance'],
              'leaf_size' : range(1,10)
             }
clf = GridSearchCV(knn_class, param_grid, cv=3)
clf.fit(X_train_sc, y_train)
print(clf.best_params_)
n = clf.best_params_['n_neighbors']
w = clf.best_params_['weights']
l = clf.best_params_['leaf_size']

clf_opt = KNeighborsClassifier(n_neighbors=n, weights=w, leaf_size=l)
clf_opt.fit(X_train_sc, y_train)

y_hat_train = clf_opt.predict(X_train_sc)
y_hat_test = clf_opt.predict(X_test_sc)

print("Train Accuracy:  ", accuracy_score(y_train, y_hat_train))
print("Test Accuracy:  ", accuracy_score(y_test, y_hat_test))


{'leaf_size': 1, 'n_neighbors': 8, 'weights': 'distance'}
Train Accuracy:   0.9983164983164983
Test Accuracy:   0.8793969849246231


#### Model 3: Decision Tree Classifier

In [62]:
random.seed(17)
param_grid = {'criterion' : ['gini', 'entropy'],
              'max_depth' : [10, 15, 20, 50],
              'max_features' : ['auto', 'sqrt', 'log2', None]
             }
clf = GridSearchCV(dt_class, param_grid, cv=3)
clf.fit(X_train_sc, y_train)
print(clf.best_params_)
c = clf.best_params_['criterion']
md = clf.best_params_['max_depth']
mf = clf.best_params_['max_features']

clf_opt = DecisionTreeClassifier(criterion=c, max_depth=md, max_features=mf)
clf_opt.fit(X_train_sc, y_train)

y_hat_train = clf_opt.predict(X_train_sc)
y_hat_test = clf_opt.predict(X_test_sc)

print("Train Accuracy:  ", accuracy_score(y_train, y_hat_train))
print("Test Accuracy:  ", accuracy_score(y_test, y_hat_test))

{'criterion': 'gini', 'max_depth': 10, 'max_features': 'log2'}
Train Accuracy:   0.9797979797979798
Test Accuracy:   0.8090452261306532


#### Model 4: Bagging Classifier

In [63]:
random.seed(17)
param_grid = {'n_estimators' : range(1, 20),
              'max_samples' : range(1, 7), 
             }
clf = GridSearchCV(bagged_class, param_grid, cv=3)
clf.fit(X_train_sc, y_train)
print(clf.best_params_)
n = clf.best_params_['n_estimators']
ms = clf.best_params_['max_samples']

clf_opt = BaggingClassifier(n_estimators=n, max_samples=ms)
clf_opt.fit(X_train_sc, y_train)

y_hat_train = clf_opt.predict(X_train_sc)
y_hat_test = clf_opt.predict(X_test_sc)

print("Train Accuracy:  ", accuracy_score(y_train, y_hat_train))
print("Test Accuracy:  ", accuracy_score(y_test, y_hat_test))

{'max_samples': 6, 'n_estimators': 16}
Train Accuracy:   0.8232323232323232
Test Accuracy:   0.8140703517587939


#### Model 5: Random Forest Model

In [27]:
random.seed(17)
param_grid = {'n_estimators' : [5, 10, 15, 20, 25],
              'criterion' : ['gini', 'entropy'],
              'max_depth' : [None, 10, 25, 50],
              }
clf = GridSearchCV(rf_class, param_grid, cv=3)
clf.fit(X_train_sc, y_train)
print(clf.best_params_)
n = clf.best_params_['n_estimators']
c = clf.best_params_['criterion']
md = clf.best_params_['max_depth']

clf_opt = RandomForestClassifier(n_estimators=n, criterion=c, max_depth=md)
clf_opt.fit(X_train_sc, y_train)

y_hat_train = clf_opt.predict(X_train_sc)
y_hat_test = clf_opt.predict(X_test_sc)

print("Train Accuracy:  ", accuracy_score(y_train, y_hat_train))
print("Test Accuracy:  ", accuracy_score(y_test, y_hat_test))

{'criterion': 'entropy', 'max_depth': 50, 'n_estimators': 20}
Train Accuracy:   0.9949494949494949
Test Accuracy:   0.8944723618090452


#### Model 6: Adaboost Classifier

In [49]:
random.seed(17)
param_grid = {'n_estimators' : range(50, 100, 5),
              'learning_rate': [.5, 1, 1.5, 2, 2.5]
    
}
clf = GridSearchCV(adaboost_class, param_grid, cv=3)
clf.fit(X_train_sc, y_train)
print(clf.best_params_)
n = clf.best_params_['n_estimators']
lr = clf.best_params_['learning_rate']

clf_opt = AdaBoostClassifier(n_estimators=n, learning_rate=lr)
clf_opt.fit(X_train_sc, y_train)

y_hat_train = clf_opt.predict(X_train_sc)
y_hat_test = clf_opt.predict(X_test_sc)

print("Train Accuracy:  ", accuracy_score(y_train, y_hat_train))
print("Test Accuracy:  ", accuracy_score(y_test, y_hat_test))

{'learning_rate': 1.5, 'n_estimators': 60}
Train Accuracy:   0.9326599326599326
Test Accuracy:   0.8090452261306532


#### Model 7: SVM Classifier

In [42]:
random.seed(17)
param_grid = {'C' : [0.001, 0.01, 0.1, 1, 10, 100],
              'gamma' : [.001, .01, .1, 1, 10, 100],
              'kernel' : ['rbf', 'linear']
             }

clf = GridSearchCV(support_vector_class, param_grid, cv=3)
clf.fit(X_train_sc, y_train)
print(clf.best_params_)
c = clf.best_params_['C']
g = clf.best_params_['gamma']

clf_opt = SVC(C=c, gamma=g)
clf_opt.fit(X_train_sc, y_train)

y_hat_train = clf_opt.predict(X_train_sc)
y_hat_test = clf_opt.predict(X_test_sc)

print("Train Accuracy:  ", accuracy_score(y_train, y_hat_train))
print("Test Accuracy:  ", accuracy_score(y_test, y_hat_test))

{'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
Train Accuracy:   0.8585858585858586
Test Accuracy:   0.7989949748743719


# Results from Final Two Models

In [122]:
this_year = pd.read_csv('/Users/Alexz/CodeMaster/capstone/capstone_repo/data/2018_final.csv')
# drop non-analysis vars
this_year.drop('Unnamed: 0', inplace=True, axis=1)


# Option 1: drop amount vars from X
this_year.drop(amnt_columns, axis=1, inplace=True)
# drop out group from binary/collinear groups
this_year.drop(['percent_male', 'percent_under18', 'percent_race_natamer'], inplace=True, axis=1)

id_cols = ['office', 'loc_date', 'race_id', 'state', 'abbrev', 'year', 'winner', 'rival']


print(this_year.shape)
this_year.head()

(35, 28)


Unnamed: 0,race_id,office,loc_date,state,abbrev,year,winner,rival,pred_GOP,pred_DEM,unopposed,inc_GOP_running,inc_DEM_running,prez_GOP,approval_effects_GOP,approval_effects_DEM,nat_UR_effects_GOP,nat_UR_effects_DEM,state_UR_effects_GOP,state_UR_effects_DEM,percent_female,percent_age_18to29,percent_age_30to59,percent_age_60over,percent_race_white,percent_race_black,percent_race_asian,percent_race_hispanic
0,2018_AZ_sen,AZ_sen,AZ_2018,Arizona,AZ,2018,TBD,TBD,1,0,0,0,0,1.0,-8.5,0,3.7,0,4.7,0,0.504449,0.165003,0.540617,0.205617,0.395501,0.04582,0.1561,0.320867
1,2018_CA_sen,CA_sen,CA_2018,California,CA,2018,TBD,TBD,0,1,0,0,1,1.0,-8.5,0,3.7,0,4.1,0,0.503697,0.182023,0.590001,0.178872,0.150891,0.058239,0.319912,0.4117
2,2018_CT_sen,CT_sen,CT_2018,Connecticut,CT,2018,TBD,TBD,0,1,0,0,1,1.0,-8.5,0,3.7,0,4.2,0,0.511489,0.160426,0.572296,0.21469,0.582367,0.108996,0.113004,0.163416
3,2018_DE_sen,DE_sen,DE_2018,Delaware,DE,2018,TBD,TBD,0,1,0,0,1,1.0,-8.5,0,3.7,0,4.0,0,0.516418,0.167856,0.563881,0.223531,0.549541,0.227139,0.081994,0.103068
4,2018_FL_sen,FL_sen,FL_2018,Florida,FL,2018,TBD,TBD,0,1,0,0,1,1.0,-8.5,0,3.7,0,3.5,0,0.510712,0.161149,0.548872,0.241003,0.473384,0.167709,0.069587,0.259145


In [125]:
X_final = this_year.drop(id_cols, axis = 1)
print(X.shape)
print(X_final.shape)
X_final_sc = ss.transform(X_final)

(793, 20)
(35, 20)


In [None]:
random.seed(17)