In [1]:
import pandas as pd
import numpy as np
from pickle import load
from pickle import dump
pd.set_option("max_columns", None)
pd.set_option('max_rows', None)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.metrics import accuracy_score, make_scorer, f1_score, precision_score, recall_score

In [2]:
df = load(open('results_race_with_alaska.pkl', 'rb'))

In [3]:
df = df.drop(['County_x', 'State_x', 'County_y'], axis = 1)

In [4]:
df.central_outlying.value_counts()

Rural       1319
Central     1291
Outlying     531
Name: central_outlying, dtype: int64

In [24]:
from tabulate import tabulate

In [25]:
table = table = [['', 'Counties'], ['Donald Trump', 2638], ['Hillary Clinton', 503]]

In [26]:
print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))


╒═════════════════╤════════════╕
│                 │   Counties │
╞═════════════════╪════════════╡
│ Donald Trump    │       2638 │
├─────────────────┼────────────┤
│ Hillary Clinton │        503 │
╘═════════════════╧════════════╛


In [5]:
df.Target.value_counts()

Trump      2638
Clinton     503
Name: Target, dtype: int64

In [6]:
df.head()

Unnamed: 0,id,total_pop,total_pop_one_race,pop_white,pop_african_american,pop_native,pop_asian,pop_islander,pop_other,total_pop_two_races,State_y,2016_total_votes,Obama,Romney,2012_total_votes,2010_land_area,Density,central_outlying,Target
0,0500000US01001,58805.0,55648.0,42160.0,11445.0,217.0,881.0,35.0,910.0,3157.0,Alabama,24661.0,6354.0,17366.0,23909.0,594,93.0,Central,Trump
1,0500000US01003,231767.0,216743.0,189399.0,18217.0,1582.0,2067.0,143.0,5335.0,15024.0,Alabama,94090.0,18329.0,65772.0,84988.0,1590,128.0,Central,Trump
2,0500000US01005,25223.0,24523.0,11317.0,11933.0,116.0,117.0,1.0,1039.0,700.0,Alabama,10390.0,5873.0,5539.0,11459.0,885,30.0,Central,Trump
3,0500000US01007,22293.0,21534.0,16555.0,4413.0,60.0,32.0,9.0,465.0,759.0,Alabama,8748.0,2200.0,6131.0,8391.0,623,36.0,Outlying,Trump
4,0500000US01009,59134.0,55478.0,50663.0,845.0,337.0,178.0,24.0,3431.0,3656.0,Alabama,25384.0,2961.0,20741.0,23980.0,645,89.0,Outlying,Trump


In [7]:
state_dummies = pd.get_dummies(df['State_y'], drop_first = True)
central_outlying = pd.get_dummies(df['central_outlying'], drop_first = True)

In [8]:
df = df.drop(['State_y', 'central_outlying', 'id'], axis = 1)

In [9]:
df.head()

Unnamed: 0,total_pop,total_pop_one_race,pop_white,pop_african_american,pop_native,pop_asian,pop_islander,pop_other,total_pop_two_races,2016_total_votes,Obama,Romney,2012_total_votes,2010_land_area,Density,Target
0,58805.0,55648.0,42160.0,11445.0,217.0,881.0,35.0,910.0,3157.0,24661.0,6354.0,17366.0,23909.0,594,93.0,Trump
1,231767.0,216743.0,189399.0,18217.0,1582.0,2067.0,143.0,5335.0,15024.0,94090.0,18329.0,65772.0,84988.0,1590,128.0,Trump
2,25223.0,24523.0,11317.0,11933.0,116.0,117.0,1.0,1039.0,700.0,10390.0,5873.0,5539.0,11459.0,885,30.0,Trump
3,22293.0,21534.0,16555.0,4413.0,60.0,32.0,9.0,465.0,759.0,8748.0,2200.0,6131.0,8391.0,623,36.0,Trump
4,59134.0,55478.0,50663.0,845.0,337.0,178.0,24.0,3431.0,3656.0,25384.0,2961.0,20741.0,23980.0,645,89.0,Trump


In [10]:
df = pd.concat([df, state_dummies, central_outlying], axis = 1)
df.head()

Unnamed: 0,total_pop,total_pop_one_race,pop_white,pop_african_american,pop_native,pop_asian,pop_islander,pop_other,total_pop_two_races,2016_total_votes,Obama,Romney,2012_total_votes,2010_land_area,Density,Target,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District of Columbia,Florida,Georgia,Hawaii,Idaho,Illinois,Indiana,Iowa,Kansas,Kentucky,Louisiana,Maine,Maryland,Massachusetts,Michigan,Minnesota,Mississippi,Missouri,Montana,Nebraska,Nevada,New Hampshire,New Jersey,New Mexico,New York,North Carolina,North Dakota,Ohio,Oklahoma,Oregon,Pennsylvania,Rhode Island,South Carolina,South Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming,Outlying,Rural
0,58805.0,55648.0,42160.0,11445.0,217.0,881.0,35.0,910.0,3157.0,24661.0,6354.0,17366.0,23909.0,594,93.0,Trump,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,231767.0,216743.0,189399.0,18217.0,1582.0,2067.0,143.0,5335.0,15024.0,94090.0,18329.0,65772.0,84988.0,1590,128.0,Trump,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,25223.0,24523.0,11317.0,11933.0,116.0,117.0,1.0,1039.0,700.0,10390.0,5873.0,5539.0,11459.0,885,30.0,Trump,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,22293.0,21534.0,16555.0,4413.0,60.0,32.0,9.0,465.0,759.0,8748.0,2200.0,6131.0,8391.0,623,36.0,Trump,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,59134.0,55478.0,50663.0,845.0,337.0,178.0,24.0,3431.0,3656.0,25384.0,2961.0,20741.0,23980.0,645,89.0,Trump,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [11]:
df.target = df.Target.map({'Trump': 0, 'Clinton': 1})

  df.target = df.Target.map({'Trump': 0, 'Clinton': 1})


In [12]:
df.head()

Unnamed: 0,total_pop,total_pop_one_race,pop_white,pop_african_american,pop_native,pop_asian,pop_islander,pop_other,total_pop_two_races,2016_total_votes,Obama,Romney,2012_total_votes,2010_land_area,Density,Target,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District of Columbia,Florida,Georgia,Hawaii,Idaho,Illinois,Indiana,Iowa,Kansas,Kentucky,Louisiana,Maine,Maryland,Massachusetts,Michigan,Minnesota,Mississippi,Missouri,Montana,Nebraska,Nevada,New Hampshire,New Jersey,New Mexico,New York,North Carolina,North Dakota,Ohio,Oklahoma,Oregon,Pennsylvania,Rhode Island,South Carolina,South Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming,Outlying,Rural
0,58805.0,55648.0,42160.0,11445.0,217.0,881.0,35.0,910.0,3157.0,24661.0,6354.0,17366.0,23909.0,594,93.0,Trump,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,231767.0,216743.0,189399.0,18217.0,1582.0,2067.0,143.0,5335.0,15024.0,94090.0,18329.0,65772.0,84988.0,1590,128.0,Trump,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,25223.0,24523.0,11317.0,11933.0,116.0,117.0,1.0,1039.0,700.0,10390.0,5873.0,5539.0,11459.0,885,30.0,Trump,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,22293.0,21534.0,16555.0,4413.0,60.0,32.0,9.0,465.0,759.0,8748.0,2200.0,6131.0,8391.0,623,36.0,Trump,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,59134.0,55478.0,50663.0,845.0,337.0,178.0,24.0,3431.0,3656.0,25384.0,2961.0,20741.0,23980.0,645,89.0,Trump,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [13]:
X = df.drop(['Target'], axis = 1)
y = df.Target

In [14]:
# scoring = {'accuracy' : make_scorer(accuracy_score), 
#            'macro_precision' : make_scorer(precision_score, average = 'macro'),
#            'macro_recall' : make_scorer(recall_score, average = 'macro'), 
#            'macro_f1_score' : make_scorer(f1_score, average = 'macro')}

In [15]:
accuracy = make_scorer(accuracy_score)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 42)

In [17]:
param = {'svc__kernel': ['rbf', 'poly', 'linear']}
pipe2 = make_pipeline(StandardScaler(), svm.SVC())
grid = GridSearchCV(pipe2, param, scoring= accuracy)

In [18]:
grid.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('svc', SVC())]),
             param_grid={'svc__kernel': ['rbf', 'poly', 'linear']},
             scoring=make_scorer(accuracy_score))

In [19]:
grid.best_score_

0.9197452229299363

In [20]:
grid.best_params_

{'svc__kernel': 'linear'}

In [21]:
grid.best_estimator_

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(kernel='linear'))])

In [22]:
y_hat_train = cross_val_predict(pipe2, X_train, y_train)

In [23]:
confusion_matrix(y_train, y_hat_train)


array([[ 125,  252],
       [  28, 1950]], dtype=int64)

In [29]:
table1 = [['', 'Clinton Predictions', 'Trump Predictions'], ['Clinton Actual', 125, 252], ['Trump Actual', 28, 1950]]

In [30]:
print(tabulate(table1, headers='firstrow', tablefmt='fancy_grid'))

╒════════════════╤═══════════════════════╤═════════════════════╕
│                │   Clinton Predictions │   Trump Predictions │
╞════════════════╪═══════════════════════╪═════════════════════╡
│ Clinton Actual │                   125 │                 252 │
├────────────────┼───────────────────────┼─────────────────────┤
│ Trump Actual   │                    28 │                1950 │
╘════════════════╧═══════════════════════╧═════════════════════╛
