# Senate Modeling

### Load Libraries, Set Options and Styling

In [1]:
# import libraries
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import random
import seaborn as sns

plt.style.use('fivethirtyeight')

%matplotlib inline

pd.set_option('display.max_columns', 500)

### Import Data

In [2]:
df = pd.read_csv('/Users/Alexz/CodeMaster/capstone/capstone_repo/data/sens_final.csv')
df.drop('Unnamed: 0', inplace=True, axis=1)

### Lists to Use

In [3]:
state_codes = df.abbrev.unique()
state_names = df.state.unique()
years = list(range(1976, 2018))

id_cols = ['office', 'loc_date_id', 'race_id', 'state', 'abbrev', 'year', 'winner', 'rival']

percent_columns = ['percent_male', 'percent_under18', 'percent_age_18to29', 
                   'percent_age_30to59', 'percent_age_60over', 'percent_race_white', 'percent_race_black',
                   'percent_race_natamer', 'percent_race_asian', 'percent_race_hispanic']
amnt_columns = ['total_male', 'total_female', 'age_under18', 'age_18to29', 'age_30to59',
       'age_60over', 'race_white_nh', 'race_black', 'race_natamer', 'race_asian', 'race_hispanic']

## Model with Percentages

### Set Up

In [4]:
df.columns

Index(['race_id', 'office', 'loc_date_id', 'state', 'abbrev', 'year',
       'GOP_win', 'winner', 'rival', 'pred_GOP', 'pred_DEM', 'unopposed',
       'inc_GOP_running', 'inc_DEM_running', 'prez_GOP',
       'approval_effects_GOP', 'approval_effects_DEM', 'nat_UR_effects_GOP',
       'nat_UR_effects_DEM', 'state_UR_effects_GOP', 'state_UR_effects_DEM',
       'total_male', 'total_female', 'age_under18', 'age_18to29', 'age_30to59',
       'age_60over', 'race_white_nh', 'race_black', 'race_natamer',
       'race_asian', 'race_hispanic', 'percent_male', 'percent_female',
       'percent_under18', 'percent_age_18to29', 'percent_age_30to59',
       'percent_age_60over', 'percent_race_white', 'percent_race_black',
       'percent_race_natamer', 'percent_race_asian', 'percent_race_hispanic'],
      dtype='object')

In [5]:
#basic setup
y = df['GOP_win']
X = df.drop('GOP_win', axis=1)

# drop non-analysis vars
X.drop(id_cols, inplace=True, axis=1)


# Option 1: drop amount vars from X
X.drop(amnt_columns, axis=1, inplace=True)
# drop out group from binary/collinear groups
X.drop(['percent_male', 'percent_under18', 'percent_race_natamer'], inplace=True, axis=1)



# Option 2: drop percentage vars
# X.drop(percent_columns, axis=1, inplace=True)


#Optional Drops
# X.drop(['inc_GOP_running', 'inc_DEM_running'], axis=1, inplace=True)
# X.drop(['pred_GOP', 'pred_DEM'], axis=1, inplace=True)
# X.drop(['pred_GOP', 'pred_DEM'], axis=1, inplace=True)
# X.drop(['nat_UR_effects_GOP', 'nat_UR_effects_DEM'], axis=1, inplace=True)



# Check
print(X.shape)
X.head()

(598, 20)


Unnamed: 0,pred_GOP,pred_DEM,unopposed,inc_GOP_running,inc_DEM_running,prez_GOP,approval_effects_GOP,approval_effects_DEM,nat_UR_effects_GOP,nat_UR_effects_DEM,state_UR_effects_GOP,state_UR_effects_DEM,percent_female,percent_age_18to29,percent_age_30to59,percent_age_60over,percent_race_white,percent_race_black,percent_race_asian,percent_race_hispanic
0,1,0,0,1,0,0.0,0.0,13.0,0.0,5.8,0.0,10.2,0.467749,0.265177,0.350291,0.047579,0.755858,0.033497,0.037575,0.022343
1,0,1,0,0,0,0.0,0.0,13.0,0.0,5.8,0.0,6.5,0.519021,0.207244,0.33038,0.153091,0.728452,0.257028,0.002475,0.009046
2,0,1,0,0,0,0.0,0.0,13.0,0.0,5.8,0.0,6.5,0.519021,0.207244,0.33038,0.153091,0.728452,0.257028,0.002475,0.009046
3,0,1,0,0,0,0.0,0.0,13.0,0.0,5.8,0.0,6.4,0.516574,0.193039,0.322104,0.183068,0.81576,0.16664,0.002877,0.008671
4,0,1,0,0,1,0.0,0.0,13.0,0.0,5.8,0.0,5.1,0.504141,0.242071,0.345653,0.121056,0.791237,0.034371,0.011381,0.115084


In [6]:
random.seed(19)
#train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2)

# scale data 
ss = StandardScaler()
ss.fit(X_train, y_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

# initiate basic models
logreg_class = LogisticRegression()

knn_class = KNeighborsClassifier()

dt_class = DecisionTreeClassifier()

bagged_class = BaggingClassifier()

rf_class = RandomForestClassifier()

adaboost_class = AdaBoostClassifier()

support_vector_class = SVC()


### Initial Model Evaluation

For this particular model, we want to use Accuracy as our metric to optimize each model.
The model is a two-class classifcation problem where a classification of 1 indicates a win for the GOP candidate and a classification of 0 indicates a win for the Democratic candidate. Since this study is intended to be unbiased, we value a false negative and false positive the same: a wrong prediction. If we were working for one party or saw unquantifiable signs in favor of one party, we may want to minimize false positives or negatives to be more pessimistic or optimistic in one direction. 

In [7]:
# #all cross val scores
# for model in models:
#     random.seed(19)
#     cv_scores = cross_val_score(model, X_train_sc,  y_train, cv=5)
#     print('\n\n Model:', model)
#     print('Mean CV Scores:', cv_scores.mean())
#     print('Std CV Scores:', cv_scores.std())

### Model Building with Grid Search

### Start New Modeling Session

In [8]:
# building 
report = open("model_report.txt", "a")
report.write("\n***START NEW SESSION***\n\n")

26

#### Model 1: Logistic Regression

In [9]:
#name the model
report.write("\nLogistic Regression\n\n")
report.write("Grid Search:\n")

param_grid = { 'C' : [.01, .1, .2, .3, .4, .6, 1],
               'penalty' : ['l1', 'l2']
             }

clf = GridSearchCV(logreg_class, param_grid, cv=3)
clf.fit(X_train_sc, y_train)
print(clf.best_params_)

C_opt = clf.best_params_['C']
penalty_opt = clf.best_params_['penalty']

report.write("Parameters: C: %f | penalty: %s \n" % (C_opt, penalty_opt))

lr_opt = LogisticRegression(penalty=penalty_opt, C=C_opt)
lr_opt.fit(X_train_sc, y_train)

y_hat_train = lr_opt.predict(X_train_sc)
y_hat_test = lr_opt.predict(X_test_sc)

train_score = accuracy_score(y_train, y_hat_train)
test_score = accuracy_score(y_test, y_hat_test)

print("Train Score: ", train_score)
print("Test Score: ", test_score)
report.write("Train Accuracy: %f     Test Accuracy:  %f \n" % (train_score, test_score))

{'C': 0.3, 'penalty': 'l2'}
Train Score:  0.8347280334728033
Test Score:  0.85


55

In [10]:
report.write("\nModded Version: \n")
lr_opt = LogisticRegression(penalty='l1', C=.4)
lr_opt.fit(X_train_sc, y_train)

y_hat_train = lr_opt.predict(X_train_sc)
y_hat_test = lr_opt.predict(X_test_sc)

train_score = accuracy_score(y_train, y_hat_train)
test_score = accuracy_score(y_test, y_hat_test)

C_opt = lr_opt.C
penalty_opt = lr_opt.penalty


report.write("Parameters: C: %f | penalty: %s \n" % (C_opt, penalty_opt))
print("Train Score: ", train_score)
print("Test Score: ", test_score)
report.write("Train Accuracy: %f   Test Accuracy:  %f \n" % (train_score, test_score))

Train Score:  0.8347280334728033
Test Score:  0.8583333333333333


53

#### Model 2: KNN Classifier

In [11]:
#name the model
report.write("\n\nKNN\n\n")
report.write("Grid Search:\n")

param_grid = {'n_neighbors' : range(1, 25, 1),
              'leaf_size' : range(1, 4, 1),
              'weights' : ['distance', 'uniform']
             }

knn_class = KNeighborsClassifier()
clf = GridSearchCV(knn_class, param_grid, cv=3)
clf.fit(X_train_sc, y_train)


print(clf.best_params_)
n = clf.best_params_['n_neighbors']
ls = clf.best_params_['leaf_size']
w = clf.best_params_['weights']
report.write("Parameters: N_neighbors: %f | Leaf Size: %d | weights: %s \n" % (n, ls, w))

y_hat_train = clf.predict(X_train_sc)
y_hat_test = clf.predict(X_test_sc)

train_score = accuracy_score(y_train, y_hat_train)
test_score = accuracy_score(y_test, y_hat_test)

print("Train Score: ", train_score)
print("Test Score: ", test_score)
report.write("Train Accuracy: %f    Test Accuracy:  %f \n" % (train_score, test_score))

{'leaf_size': 1, 'n_neighbors': 13, 'weights': 'distance'}
Train Score:  0.997907949790795
Test Score:  0.825


54

In [12]:
report.write("\nModded Version: \n")

knn_opt = KNeighborsClassifier(n_neighbors= 25, leaf_size=1, weights='uniform')
knn_opt.fit(X_train_sc, y_train)

n = knn_opt.n_neighbors
ls = knn_opt.leaf_size
w = knn_opt.weights
report.write("Parameters: N_neighbors: %f | Leaf Size: %d | weights: %s \n" % (n, ls, w))


y_hat_train = knn_opt.predict(X_train_sc)
y_hat_test = knn_opt.predict(X_test_sc)

train_score = accuracy_score(y_train, y_hat_train)
test_score = accuracy_score(y_test, y_hat_test)

print("Train Score: ", train_score)
print("Test Score: ", test_score)
report.write("Train Accuracy: %f    Test Accuracy:  %f \n" % (train_score, test_score))

Train Score:  0.8221757322175732
Test Score:  0.8333333333333334


54

#### Model 3: Decision Tree Classifier

In [13]:
report.write("\n\nDecision Tree\n\n")
report.write("Grid Search:\n")

random.seed(19)
param_grid = {'criterion' : ['gini', 'entropy'],
              'max_depth' : [1, 5, 10, 20, None],
              'max_leaf_nodes' : [None, 2, 5, 10, 50],
             }
clf = GridSearchCV(dt_class, param_grid, cv=3)
clf.fit(X_train_sc, y_train)
print(clf.best_params_)
c = clf.best_params_['criterion']
md = clf.best_params_['max_depth']
ml = clf.best_params_['max_leaf_nodes']

# report.write("Parameters: Criterion: %s | Max Depth: %f | Max Leaf Nodes: %s \n" % (c, md, ml))


clf_opt = DecisionTreeClassifier(criterion=c, max_depth=md, max_features=ml)
clf_opt.fit(X_train_sc, y_train)

y_hat_train = clf_opt.predict(X_train_sc)
y_hat_test = clf_opt.predict(X_test_sc)

train_score = accuracy_score(y_train, y_hat_train)
test_score = accuracy_score(y_test, y_hat_test)

print("Train Score: ", train_score)
print("Test Score: ", test_score)
report.write("Train Accuracy: %f    Test Accuracy:  %f \n" % (train_score, test_score))

{'criterion': 'gini', 'max_depth': 1, 'max_leaf_nodes': None}
Train Score:  0.8179916317991632
Test Score:  0.8166666666666667


54

In [14]:
report.write("\nModded Version: \n")

dt_opt = DecisionTreeClassifier(criterion='gini', max_depth=1, max_leaf_nodes = 5, random_state=19)
dt_opt.fit(X_train_sc, y_train)

c = dt_opt.criterion
md = dt_opt.max_depth
ml = dt_opt.max_leaf_nodes

report.write("Parameters: Criterion: %s | Max Depth: %f | Max Leaf Nodes: %s \n" % (c, md, ml))

y_hat_train = dt_opt.predict(X_train_sc)
y_hat_test = dt_opt.predict(X_test_sc)

train_score = accuracy_score(y_train, y_hat_train)
test_score = accuracy_score(y_test, y_hat_test)

print("Train Score: ", train_score)
print("Test Score: ", test_score)
report.write("Train Accuracy: %f    Test Accuracy:  %f \n" % (train_score, test_score))

Train Score:  0.8179916317991632
Test Score:  0.8166666666666667


54

#### Model 4: Bagging Classifier

In [15]:
report.write("\n\nBagging Classifier\n\n")
report.write("Grid Search:\n")

random.seed(19)
param_grid = {'n_estimators' : range(10, 30),
              'max_samples' : [.01, .1, .25, .6],
              
             }
clf = GridSearchCV(bagged_class, param_grid, cv=5)
clf.fit(X_train_sc, y_train)

print(clf.best_params_)
n = clf.best_params_['n_estimators']
ms = clf.best_params_['max_samples']
report.write("Parameters: N_estimators: %d | Max Samples: %f \n" % (n, ms))

bag_opt = BaggingClassifier(n_estimators=n,
                            max_samples=ms,
                            base_estimator=dt_opt)
bag_opt.fit(X_train_sc, y_train)

y_hat_train = bag_opt.predict(X_train_sc)
y_hat_test = bag_opt.predict(X_test_sc)

train_score = accuracy_score(y_train, y_hat_train)
test_score = accuracy_score(y_test, y_hat_test)

print("Train Score: ", train_score)
print("Test Score: ", test_score)
report.write("Train Accuracy: %f    Test Accuracy:  %f \n" % (train_score, test_score))

{'max_samples': 0.1, 'n_estimators': 17}
Train Score:  0.8305439330543933
Test Score:  0.8166666666666667


54

In [16]:
report.write("\nModded Version: \n")
bag_opt = BaggingClassifier(base_estimator= dt_opt,
                            n_estimators = 50,
                            max_samples= .2)
bag_opt.fit(X_train_sc, y_train)

n = bag_opt.n_estimators
ms = bag_opt.max_samples
report.write("Parameters: N_estimators: %d | Max Samples: %f \n" % (n, ms))

y_hat_train = bag_opt.predict(X_train_sc)
y_hat_test = bag_opt.predict(X_test_sc)

train_score = accuracy_score(y_train, y_hat_train)
test_score = accuracy_score(y_test, y_hat_test)

print("Train Score: ", train_score)
print("Test Score: ", test_score)
report.write("Train Accuracy: %f    Test Accuracy:  %f \n" % (train_score, test_score))

Train Score:  0.8200836820083682
Test Score:  0.8166666666666667


54

#### Model 5: Random Forest Model

In [17]:
report.write("\n\nRandom Forest\n\n")
report.write("Grid Search:\n")

random.seed(19)
param_grid = {'n_estimators' : range(1, 30, 2),
              'max_leaf_nodes' : [20, 50, 60, 75],
              'max_depth' : [10, 15, 20, 25, 30, 35, 40],
              }
clf = GridSearchCV(rf_class, param_grid, cv=3)
clf.fit(X_train_sc, y_train)

print(clf.best_params_)
n = clf.best_params_['n_estimators']
ml = clf.best_params_['max_leaf_nodes']
md = clf.best_params_['max_depth']
report.write("Parameters: N_estimators: %d | Max Leaf Nodes: %s | Max Depth: %f \n" % (n, str(ml), md))

rf_opt = RandomForestClassifier(n_estimators=n, criterion=c, max_depth=md)
rf_opt.fit(X_train_sc, y_train)

y_hat_train = rf_opt.predict(X_train_sc)
y_hat_test = rf_opt.predict(X_test_sc)

train_score = accuracy_score(y_train, y_hat_train)
test_score = accuracy_score(y_test, y_hat_test)

print("Train Score: ", train_score)
print("Test Score: ", test_score)
report.write("Train Accuracy: %f    Test Accuracy:  %f \n" % (train_score, test_score))

{'max_depth': 15, 'max_leaf_nodes': 20, 'n_estimators': 25}
Train Score:  0.99581589958159
Test Score:  0.8083333333333333


54

In [18]:
report.write("\nModded Version: \n")

rf_opt = RandomForestClassifier(n_estimators=4, max_leaf_nodes = 12, max_depth=2, random_state=19)
rf_opt.fit(X_train_sc, y_train)

n = rf_opt.n_estimators
c = rf_opt.criterion
md = rf_opt.max_depth

report.write("Parameters: N_estimators: %d | Criterion: %s | Max Depth: %f \n" % (n, c, md))

y_hat_train = rf_opt.predict(X_train_sc)
y_hat_test = rf_opt.predict(X_test_sc)

train_score = accuracy_score(y_train, y_hat_train)
test_score = accuracy_score(y_test, y_hat_test)

print("Train Score: ", train_score)
print("Test Score: ", test_score)
report.write("Train Accuracy: %f    Test Accuracy:  %f \n" % (train_score, test_score))

Train Score:  0.8263598326359832
Test Score:  0.8166666666666667


54

#### Model 6: Adaboost Classifier

In [19]:
report.write("\n\nAdaboost Classifier\n\n")
report.write("Grid Search:\n")

param_grid = {'n_estimators' : range(1, 20),
              'learning_rate': [.1, .2, .3, .4, .5, .6, .7, 1]
    
}
clf = GridSearchCV(adaboost_class, param_grid, cv=3)
clf.fit(X_train_sc, y_train)

print(clf.best_params_)
n = clf.best_params_['n_estimators']
lr = clf.best_params_['learning_rate']
report.write("Parameters: N_estimators: %f | Criterion: %f \n" % (n, lr))

clf_opt = AdaBoostClassifier(n_estimators=n, learning_rate=lr, random_state=19)
clf_opt.fit(X_train_sc, y_train)

y_hat_train = clf_opt.predict(X_train_sc)
y_hat_test = clf_opt.predict(X_test_sc)

train_score = accuracy_score(y_train, y_hat_train)
test_score = accuracy_score(y_test, y_hat_test)

print("Train Score: ", train_score)
print("Test Score: ", test_score)
report.write("Train Accuracy: %f    Test Accuracy:  %f \n" % (train_score, test_score))

{'learning_rate': 0.7, 'n_estimators': 4}
Train Score:  0.8263598326359832
Test Score:  0.85


54

In [20]:
report.write("\nModded Version: \n")
boost_opt = AdaBoostClassifier(n_estimators = 10,
                               learning_rate = .4,
                               random_state = 19)

boost_opt.fit(X_train_sc, y_train)

n = boost_opt.n_estimators
lr = boost_opt.learning_rate
report.write("Parameters: N_estimators: %f | Learning Rate: %f \n" % (n, lr))

y_hat_train = boost_opt.predict(X_train_sc)
y_hat_test = boost_opt.predict(X_test_sc)

train_score = accuracy_score(y_train, y_hat_train)
test_score = accuracy_score(y_test, y_hat_test)

print("Train Score: ", train_score)
print("Test Score: ", test_score)
report.write("Train Accuracy: %f    Test Accuracy:  %f \n" % (train_score, test_score))

Train Score:  0.8305439330543933
Test Score:  0.85


54

#### Model 7: SVM Classifier

In [21]:
report.write("\n\nSupport Vector Machine \n\n")
report.write("Grid Search:\n")

random.seed(19)
param_grid = { 'C' : [0.001, 0.01, 0.1, .2, .3, 1],
              'gamma' : [.001, .01, .1, 1, 10, 100],
              'kernel' : ['poly', 'rbf']
             }

clf = GridSearchCV(support_vector_class, param_grid, cv=3)
clf.fit(X_train_sc, y_train)

print(clf.best_params_)
c = clf.best_params_['C']
g = clf.best_params_['gamma']
k = clf.best_params_['kernel']
report.write("Parameters: C: %f | Gamma: %f | Kernel: %s \n" % (c, g, k))

svm_opt = SVC(C= c, gamma = g, kernel = k, probability=True)
svm_opt.fit(X_train_sc, y_train)

y_hat_train = svm_opt.predict(X_train_sc)
y_hat_test = svm_opt.predict(X_test_sc)

train_score = accuracy_score(y_train, y_hat_train)
test_score = accuracy_score(y_test, y_hat_test)

print("Train Score: ", train_score)
print("Test Score: ", test_score)
report.write("Train Accuracy: %f    Test Accuracy:  %f \n" % (train_score, test_score))

{'C': 0.1, 'gamma': 0.01, 'kernel': 'rbf'}
Train Score:  0.8200836820083682
Test Score:  0.8166666666666667


54

In [22]:
report.write("\nModded Version: \n")

svm_opt = SVC(kernel = 'poly',
              C = .2, 
              gamma = .1,
              probability=True)

svm_opt.fit(X_train_sc, y_train)

c = svm_opt.C
g = svm_opt.gamma
k = svm_opt.kernel
report.write("Parameters: C: %f | Gamma: %f | Kernel: %s \n" % (c, g, k))

y_hat_train = svm_opt.predict(X_train_sc)
y_hat_test = svm_opt.predict(X_test_sc)

train_score = accuracy_score(y_train, y_hat_train)
test_score = accuracy_score(y_test, y_hat_test)

print("Train Score: ", train_score)
print("Test Score: ", test_score)
report.write("Train Accuracy: %f    Test Accuracy:  %f \n" % (train_score, test_score))

Train Score:  0.8556485355648535
Test Score:  0.825


54

## Final Voting Classifier

In [51]:
#name the model
# report.write("\n\nVOTER MODEL\n")

# Build model
voter = VotingClassifier([('logreg', lr_opt),
#                           ('dtree', dt_opt),
#                           ('bagging', bag_opt),
                          ('knn', knn_opt),
                          ('svm', svm_opt),
#                           ('rf', rf_opt), 
                          ('boost', boost_opt)
                         ],
                          voting='soft')

# Test Model Against original train and test data
voter.fit(X_train_sc, y_train)

y_hat_train = voter.predict(X_train_sc)
y_hat_test = voter.predict(X_test_sc)

train_score = accuracy_score(y_train, y_hat_train)
test_score = accuracy_score(y_test, y_hat_test)

print("Train Score: ", train_score)
print("Test Score: ", test_score)
# report.write("Train Accuracy: %f    Test Accuracy:  %f \n" % (train_score, test_score))

Train Score:  0.8389121338912134
Test Score:  0.8416666666666667


  if diff:
  if diff:


In [36]:
report.close()

### 2018 Predictions

In [52]:
this_year = pd.read_csv('/Users/Alexz/CodeMaster/capstone/capstone_repo/data/2018_final.csv')
# drop non-analysis vars
this_year.drop('Unnamed: 0', inplace=True, axis=1)


# Option 1: drop amount vars from X
this_year.drop(amnt_columns, axis=1, inplace=True)
# drop out group from binary/collinear groups
this_year.drop(['percent_male', 'percent_under18', 'percent_race_natamer'], inplace=True, axis=1)

id_cols2 = ['office', 'loc_date', 'race_id', 'state', 'abbrev', 'year', 'GOP_candidate', 'DEM_candidate',
            'FTE_GOP', 'FTE_DEM', 'FTE_GOP_win', 'FTE_label']


print(this_year.shape)
this_year

(35, 32)


Unnamed: 0,race_id,office,loc_date,state,abbrev,year,GOP_candidate,DEM_candidate,FTE_GOP,FTE_DEM,FTE_GOP_win,FTE_label,pred_GOP,pred_DEM,unopposed,inc_GOP_running,inc_DEM_running,prez_GOP,approval_effects_GOP,approval_effects_DEM,nat_UR_effects_GOP,nat_UR_effects_DEM,state_UR_effects_GOP,state_UR_effects_DEM,percent_female,percent_age_18to29,percent_age_30to59,percent_age_60over,percent_race_white,percent_race_black,percent_race_asian,percent_race_hispanic
0,2018_AZ_sen,AZ_sen,AZ_2018,Arizona,AZ,2018,Martha McSally,Kyrsten Sinema,0.389,0.611,0,Lean D,1,0,0,0,0,1,-8.5,0,3.7,0,4.7,0,0.504449,0.165003,0.540617,0.205617,0.395501,0.04582,0.1561,0.320867
1,2018_CA_sen,CA_sen,CA_2018,California,CA,2018,none,Dianne Feinstein,0.0,0.956,0,Solid D,0,1,0,0,1,1,-8.5,0,3.7,0,4.1,0,0.503697,0.182023,0.590001,0.178872,0.150891,0.058239,0.319912,0.4117
2,2018_CT_sen,CT_sen,CT_2018,Connecticut,CT,2018,Matthew Corey,Christopher Murphy,0.003,0.997,0,Solid D,0,1,0,0,1,1,-8.5,0,3.7,0,4.2,0,0.511489,0.160426,0.572296,0.21469,0.582367,0.108996,0.113004,0.163416
3,2018_DE_sen,DE_sen,DE_2018,Delaware,DE,2018,Rob Arlett,Thomas Carper,0.001,0.999,0,Solid D,0,1,0,0,1,1,-8.5,0,3.7,0,4.0,0,0.516418,0.167856,0.563881,0.223531,0.549541,0.227139,0.081994,0.103068
4,2018_FL_sen,FL_sen,FL_2018,Florida,FL,2018,Rick Scott,Bill Nelson,0.298,0.702,0,Lean D,0,1,0,0,1,1,-8.5,0,3.7,0,3.5,0,0.510712,0.161149,0.548872,0.241003,0.473384,0.167709,0.069587,0.259145
5,2018_HI_sen,HI_sen,HI_2018,Hawaii,HI,2018,Ron Curtis,Mazie Hirono,0.001,0.999,0,Solid D,0,1,0,0,1,1,-8.5,0,3.7,0,2.2,0,0.50024,0.169397,0.565171,0.225542,0.150909,0.014169,0.482271,0.099611
6,2018_IN_sen,IN_sen,IN_2018,Indiana,IN,2018,Mike Braun,Joe Donnelly,0.281,0.719,0,Lean D,0,1,0,0,1,1,-8.5,0,3.7,0,3.5,0,0.507005,0.162597,0.573229,0.198976,0.742878,0.096451,0.054873,0.077844
7,2018_ME_sen,ME_sen,ME_2018,Maine,ME,2018,Eric Brakey,Angus King,0.01,0.99,0,Solid D,0,1,0,0,0,1,-8.5,0,3.7,0,3.3,0,0.508535,0.14519,0.567623,0.255889,0.922712,0.016676,0.016622,0.016771
8,2018_MD_sen,MD_sen,MD_2018,Maryland,MD,2018,Tony Campbell,Benjamin Cardin,0.001,0.999,0,Solid D,0,1,0,0,1,1,-8.5,0,3.7,0,4.2,0,0.51603,0.174909,0.580261,0.195758,0.433594,0.305206,0.114523,0.108007
9,2018_MA_sen,MA_sen,MA_2018,Massachusetts,MA,2018,Geoff Diehl,Elizabeth Warren,0.002,0.998,0,Solid D,0,1,0,0,1,1,-8.5,0,3.7,0,3.6,0,0.514963,0.179568,0.599718,0.210936,0.655607,0.075622,0.119435,0.117319


In [53]:
X_final = this_year.drop(id_cols2, axis = 1)
print(X_final.shape)
X_final_sc = ss.fit_transform(X_final)

(35, 20)


In [54]:
y = df['GOP_win']
X = df.drop('GOP_win', axis=1)

# drop non-analysis vars
X.drop(id_cols, inplace=True, axis=1)


# Option 1: drop amount vars from X
X.drop(amnt_columns, axis=1, inplace=True)
# drop out group from binary/collinear groups
X.drop(['percent_male', 'percent_under18', 'percent_race_natamer'], inplace=True, axis=1)

X_scaled = ss.fit_transform(X)
X_scaled.shape

(598, 20)

In [55]:
voter.fit(X_scaled, y)

VotingClassifier(estimators=[('logreg', LogisticRegression(C=0.4, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('knn', KN...thm='SAMME.R', base_estimator=None,
          learning_rate=0.4, n_estimators=10, random_state=19))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [56]:
results = voter.predict(X_final_sc)
results_per = voter.predict_proba(X_final_sc)

  if diff:


In [57]:
results_per[0][0]

0.26929327414772003

In [58]:
predictions = this_year

predictions['predictions'] = results



predictions['GOP_prob'] = 0
predictions['DEM_prob'] = 0

In [59]:
nrows = this_year.shape[0]
for i in range(nrows):
    predictions.loc[i, 'DEM_prob'] = results_per[i][0]
    predictions.loc[i, 'GOP_prob'] = results_per[i][1]

In [60]:
predictions

Unnamed: 0,race_id,office,loc_date,state,abbrev,year,GOP_candidate,DEM_candidate,FTE_GOP,FTE_DEM,FTE_GOP_win,FTE_label,pred_GOP,pred_DEM,unopposed,inc_GOP_running,inc_DEM_running,prez_GOP,approval_effects_GOP,approval_effects_DEM,nat_UR_effects_GOP,nat_UR_effects_DEM,state_UR_effects_GOP,state_UR_effects_DEM,percent_female,percent_age_18to29,percent_age_30to59,percent_age_60over,percent_race_white,percent_race_black,percent_race_asian,percent_race_hispanic,predictions,GOP_prob,DEM_prob
0,2018_AZ_sen,AZ_sen,AZ_2018,Arizona,AZ,2018,Martha McSally,Kyrsten Sinema,0.389,0.611,0,Lean D,1,0,0,0,0,1,-8.5,0,3.7,0,4.7,0,0.504449,0.165003,0.540617,0.205617,0.395501,0.04582,0.1561,0.320867,1,0.730707,0.269293
1,2018_CA_sen,CA_sen,CA_2018,California,CA,2018,none,Dianne Feinstein,0.0,0.956,0,Solid D,0,1,0,0,1,1,-8.5,0,3.7,0,4.1,0,0.503697,0.182023,0.590001,0.178872,0.150891,0.058239,0.319912,0.4117,0,0.187252,0.812748
2,2018_CT_sen,CT_sen,CT_2018,Connecticut,CT,2018,Matthew Corey,Christopher Murphy,0.003,0.997,0,Solid D,0,1,0,0,1,1,-8.5,0,3.7,0,4.2,0,0.511489,0.160426,0.572296,0.21469,0.582367,0.108996,0.113004,0.163416,0,0.256966,0.743034
3,2018_DE_sen,DE_sen,DE_2018,Delaware,DE,2018,Rob Arlett,Thomas Carper,0.001,0.999,0,Solid D,0,1,0,0,1,1,-8.5,0,3.7,0,4.0,0,0.516418,0.167856,0.563881,0.223531,0.549541,0.227139,0.081994,0.103068,0,0.288355,0.711645
4,2018_FL_sen,FL_sen,FL_2018,Florida,FL,2018,Rick Scott,Bill Nelson,0.298,0.702,0,Lean D,0,1,0,0,1,1,-8.5,0,3.7,0,3.5,0,0.510712,0.161149,0.548872,0.241003,0.473384,0.167709,0.069587,0.259145,0,0.311292,0.688708
5,2018_HI_sen,HI_sen,HI_2018,Hawaii,HI,2018,Ron Curtis,Mazie Hirono,0.001,0.999,0,Solid D,0,1,0,0,1,1,-8.5,0,3.7,0,2.2,0,0.50024,0.169397,0.565171,0.225542,0.150909,0.014169,0.482271,0.099611,0,0.202506,0.797494
6,2018_IN_sen,IN_sen,IN_2018,Indiana,IN,2018,Mike Braun,Joe Donnelly,0.281,0.719,0,Lean D,0,1,0,0,1,1,-8.5,0,3.7,0,3.5,0,0.507005,0.162597,0.573229,0.198976,0.742878,0.096451,0.054873,0.077844,0,0.387887,0.612113
7,2018_ME_sen,ME_sen,ME_2018,Maine,ME,2018,Eric Brakey,Angus King,0.01,0.99,0,Solid D,0,1,0,0,0,1,-8.5,0,3.7,0,3.3,0,0.508535,0.14519,0.567623,0.255889,0.922712,0.016676,0.016622,0.016771,1,0.571277,0.428723
8,2018_MD_sen,MD_sen,MD_2018,Maryland,MD,2018,Tony Campbell,Benjamin Cardin,0.001,0.999,0,Solid D,0,1,0,0,1,1,-8.5,0,3.7,0,4.2,0,0.51603,0.174909,0.580261,0.195758,0.433594,0.305206,0.114523,0.108007,0,0.308216,0.691784
9,2018_MA_sen,MA_sen,MA_2018,Massachusetts,MA,2018,Geoff Diehl,Elizabeth Warren,0.002,0.998,0,Solid D,0,1,0,0,1,1,-8.5,0,3.7,0,3.6,0,0.514963,0.179568,0.599718,0.210936,0.655607,0.075622,0.119435,0.117319,0,0.276002,0.723998


In [61]:
predictions.to_csv('/Users/Alexz/CodeMaster/capstone/capstone_repo/data/2018_predictions.csv')

In [48]:
def check_predictions(model):
    model.fit(X_scaled, y)
    new_predict = model.predict(X_final)
    for i in range(nrows):
        print(predictions.loc[i, 'race_id'], " ", new_predict[i])

In [49]:
check_predictions(lr_opt)

2018_AZ_sen   0
2018_CA_sen   0
2018_CT_sen   0
2018_DE_sen   0
2018_FL_sen   0
2018_HI_sen   0
2018_IN_sen   0
2018_ME_sen   0
2018_MD_sen   0
2018_MA_sen   0
2018_MI_sen   0
2018_MS_sen   0
2018_MS_sen   0
2018_MN_sen1   0
2018_MN_sen2   0
2018_MO_sen   0
2018_MT_sen   0
2018_NE_sen   1
2018_NV_sen   0
2018_NJ_sen   0
2018_NM_sen   0
2018_NY_sen   0
2018_ND_sen   0
2018_OH_sen   0
2018_PA_sen   0
2018_RI_sen   0
2018_TN_sen   0
2018_TX_sen   0
2018_UT_sen   0
2018_VT_sen   0
2018_VA_sen   0
2018_WA_sen   0
2018_WV_sen   0
2018_WI_sen   0
2018_WY_sen   0
