Import tools.

In [1]:
pip install pipelinehelper

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from pickle import load
from pickle import dump
from sklearn.pipeline import Pipeline
from pipelinehelper import PipelineHelper
pd.set_option("max_columns", None)
pd.set_option('max_rows', None)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MaxAbsScaler
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.metrics import accuracy_score, make_scorer, f1_score, precision_score, recall_score

Bring in dataframe that has race, poverty, and results joined and has Alaska all tidied up.

In [3]:
df = load(open('race_poverty_results.pkl', 'rb'))

Look at and drop county and all but one state column for modeling. Drop id.

In [4]:
df.columns

Index(['id', 'total_pop', 'total_pop_one_race', 'pop_white',
       'pop_african_american', 'pop_native', 'pop_asian', 'pop_islander',
       'pop_other', 'total_pop_two_races', 'County_x', 'State_x', 'State_y',
       'County_y', '2016_total_votes', 'Obama', 'Romney', '2012_total_votes',
       '2010_land_area', 'Density', 'central_outlying', 'Target', 'state',
       'county', 'poverty_total', 'poverty_under_18',
       'median_household_income'],
      dtype='object')

In [5]:
df = df.drop(['County_x', 'id', 'State_x', 'County_y', 'State_y', 'county'], axis = 1)

Double check datatypes before starting. Categorical columns should be central_outlying, Target, and state. Target will be the target. The other two will be onehotencoded.

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3141 entries, 0 to 3140
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   total_pop                3141 non-null   float64
 1   total_pop_one_race       3141 non-null   float64
 2   pop_white                3141 non-null   float64
 3   pop_african_american     3141 non-null   float64
 4   pop_native               3141 non-null   float64
 5   pop_asian                3141 non-null   float64
 6   pop_islander             3141 non-null   float64
 7   pop_other                3141 non-null   float64
 8   total_pop_two_races      3141 non-null   float64
 9   2016_total_votes         3141 non-null   float64
 10  Obama                    3141 non-null   float64
 11  Romney                   3141 non-null   float64
 12  2012_total_votes         3141 non-null   float64
 13  2010_land_area           3141 non-null   int64  
 14  Density                 

Check for unique values. Important bits: state should be 51(includes DC) and central_outlying should be 3. Target should be 2.

In [7]:
df.nunique()

total_pop                  3083
total_pop_one_race         3070
pop_white                  3072
pop_african_american       2005
pop_native                 1198
pop_asian                  1275
pop_islander                461
pop_other                  1834
total_pop_two_races        2369
2016_total_votes           2991
Obama                      2808
Romney                     2919
2012_total_votes           3011
2010_land_area             1413
Density                     648
central_outlying              3
Target                        2
state                        51
poverty_total              2794
poverty_under_18           2353
median_household_income    2982
dtype: int64

Should be 2638 for Donnie and 503 for Hillary.

In [8]:
df.Target.value_counts()

Trump      2638
Clinton     503
Name: Target, dtype: int64

In [9]:
state_dummies = pd.get_dummies(df['state'], drop_first = True)
central_outlying = pd.get_dummies(df['central_outlying'], drop_first = True)

In [10]:
df = df.drop(['state', 'central_outlying'], axis = 1)

In [11]:
df = pd.concat([df, state_dummies, central_outlying], axis = 1)
df.head()

Unnamed: 0,total_pop,total_pop_one_race,pop_white,pop_african_american,pop_native,pop_asian,pop_islander,pop_other,total_pop_two_races,2016_total_votes,Obama,Romney,2012_total_votes,2010_land_area,Density,Target,poverty_total,poverty_under_18,median_household_income,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District of Columbia,Florida,Georgia,Hawaii,Idaho,Illinois,Indiana,Iowa,Kansas,Kentucky,Louisiana,Maine,Maryland,Massachusetts,Michigan,Minnesota,Mississippi,Missouri,Montana,Nebraska,Nevada,New Hampshire,New Jersey,New Mexico,New York,North Carolina,North Dakota,Ohio,Oklahoma,Oregon,Pennsylvania,Rhode Island,South Carolina,South Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming,Outlying,Rural
0,58805.0,55648.0,42160.0,11445.0,217.0,881.0,35.0,910.0,3157.0,24661.0,6354.0,17366.0,23909.0,594,93.0,Trump,6459.0,2530.0,53049.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,231767.0,216743.0,189399.0,18217.0,1582.0,2067.0,143.0,5335.0,15024.0,94090.0,18329.0,65772.0,84988.0,1590,128.0,Trump,24056.0,8357.0,47618.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,25223.0,24523.0,11317.0,11933.0,116.0,117.0,1.0,1039.0,700.0,10390.0,5873.0,5539.0,11459.0,885,30.0,Trump,6098.0,2145.0,33074.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,22293.0,21534.0,16555.0,4413.0,60.0,32.0,9.0,465.0,759.0,8748.0,2200.0,6131.0,8391.0,623,36.0,Trump,4316.0,1448.0,35472.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,59134.0,55478.0,50663.0,845.0,337.0,178.0,24.0,3431.0,3656.0,25384.0,2961.0,20741.0,23980.0,645,89.0,Trump,9358.0,3356.0,42906.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [12]:
df.Target = df.Target.map({'Trump': 0, 'Clinton': 1})

In [13]:
df.head()

Unnamed: 0,total_pop,total_pop_one_race,pop_white,pop_african_american,pop_native,pop_asian,pop_islander,pop_other,total_pop_two_races,2016_total_votes,Obama,Romney,2012_total_votes,2010_land_area,Density,Target,poverty_total,poverty_under_18,median_household_income,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District of Columbia,Florida,Georgia,Hawaii,Idaho,Illinois,Indiana,Iowa,Kansas,Kentucky,Louisiana,Maine,Maryland,Massachusetts,Michigan,Minnesota,Mississippi,Missouri,Montana,Nebraska,Nevada,New Hampshire,New Jersey,New Mexico,New York,North Carolina,North Dakota,Ohio,Oklahoma,Oregon,Pennsylvania,Rhode Island,South Carolina,South Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming,Outlying,Rural
0,58805.0,55648.0,42160.0,11445.0,217.0,881.0,35.0,910.0,3157.0,24661.0,6354.0,17366.0,23909.0,594,93.0,0,6459.0,2530.0,53049.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,231767.0,216743.0,189399.0,18217.0,1582.0,2067.0,143.0,5335.0,15024.0,94090.0,18329.0,65772.0,84988.0,1590,128.0,0,24056.0,8357.0,47618.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,25223.0,24523.0,11317.0,11933.0,116.0,117.0,1.0,1039.0,700.0,10390.0,5873.0,5539.0,11459.0,885,30.0,0,6098.0,2145.0,33074.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,22293.0,21534.0,16555.0,4413.0,60.0,32.0,9.0,465.0,759.0,8748.0,2200.0,6131.0,8391.0,623,36.0,0,4316.0,1448.0,35472.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,59134.0,55478.0,50663.0,845.0,337.0,178.0,24.0,3431.0,3656.0,25384.0,2961.0,20741.0,23980.0,645,89.0,0,9358.0,3356.0,42906.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [14]:
X = df.drop(['Target'], axis = 1)
y = df.Target

In [15]:
f1_scores = make_scorer(f1_score)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 42)

In [17]:
y_train.head()

201     0
683     0
959     0
1145    0
854     0
Name: Target, dtype: int64

https://stackoverflow.com/questions/23045318/scikit-grid-search-over-multiple-classifiers

In [18]:
pipe = Pipeline([
    ('scaler', PipelineHelper([
        ('std', StandardScaler()),
        ('max', MaxAbsScaler())
    ])),
    ('classifier', PipelineHelper([
        ('svm', LinearSVC()),
        ('rf', RandomForestClassifier()),
    ])),
])

In [19]:
params = {
    'scaler__selected_model': pipe.named_steps['scaler'].generate({
        'std__with_mean': [True, False],
        'std__with_std': [True, False],
        'max__copy': [True],  # just for displaying
    }),
    'classifier__selected_model': pipe.named_steps['classifier'].generate({
        'svm__C': [0.1, 1.0],
        'rf__n_estimators': [100, 20],
    })
}

In [20]:
grid = GridSearchCV(pipe, params, scoring = f1_scores, verbose = 1)

In [21]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   22.9s finished


GridSearchCV(estimator=Pipeline(steps=[('scaler',
                                        PipelineHelper(available_models={'max': MaxAbsScaler(),
                                                                         'std': StandardScaler()})),
                                       ('classifier',
                                        PipelineHelper(available_models={'rf': RandomForestClassifier(),
                                                                         'svm': LinearSVC()}))]),
             param_grid={'classifier__selected_model': [('svm', {'C': 0.1}),
                                                        ('svm', {'C': 1.0}),
                                                        ('rf',
                                                         {'n_estimators': 100}),
                                                        ('rf',
                                                         {'n_estimators': 20})],
                         'scaler__selected_model': [('

In [22]:
grid.best_score_

0.8618974873888513

In [23]:
grid.best_params_

{'classifier__selected_model': ('svm', {'C': 1.0}),
 'scaler__selected_model': ('std', {'with_mean': False, 'with_std': False})}

In [24]:
grid.best_estimator_

Pipeline(steps=[('scaler',
                 PipelineHelper(available_models={'max': MaxAbsScaler(),
                                                  'std': StandardScaler(with_mean=False,
                                                                        with_std=False)},
                                selected_model=StandardScaler(with_mean=False,
                                                              with_std=False))),
                ('classifier',
                 PipelineHelper(available_models={'rf': RandomForestClassifier(),
                                                  'svm': LinearSVC()},
                                selected_model=LinearSVC()))])

In [26]:
# cross_val_predict(pipe, X_train, y_train)

In [None]:
# y_hat_train = cross_val_predict(pipe, X_train, y_train)

In [None]:
confusion_matrix(y_train, y_hat_train)