## Exploring various classification models

We explore various classification models to find the one that performs best against the baseline model of a coin flip.





In [14]:
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

import pandas as pd
import numpy as np

## this is to suppress warnings I was getting in this code. 
import warnings
# Suppress FutureWarning messages
warnings.simplefilter(action='ignore', category=FutureWarning)


In [15]:
## load the training split of the survey responses (path is relative to the notebook's working directory)
survey_df = pd.read_csv(filepath_or_buffer='survey_data_train.csv')

In [16]:
## settings kept from the original cell; none of the models defined below
## reference them (presumably consumed elsewhere in the notebook -- TODO confirm)
replacement = True
estimators = 100
samples = .25

## one seed for every estimator that accepts `random_state`, so that a fresh
## "Restart Kernel -> Run All" reproduces the same accuracies and importances
## (same value as the KFold seed used in the cross-validation cell)
model_seed = 5555

## the models we will run for the training survey data

models = {
    'slc': LogisticRegression(penalty=None, max_iter=300),
    'svc': SVC(),
    # Pipeline documents `steps` as a *list* of (name, estimator) tuples
    'knn': Pipeline([('scale', StandardScaler()),
                     ('knnc', KNeighborsClassifier())]),
    'ada': AdaBoostClassifier(random_state=model_seed),
    'gbc': GradientBoostingClassifier(random_state=model_seed),
    'rfc': RandomForestClassifier(random_state=model_seed),
    'etc': ExtraTreesClassifier(random_state=model_seed),
    'xgbc': XGBClassifier(random_state=model_seed),
}

In [17]:
## candidate predictor columns we may want to include
features = [
    'S1',
    'S2',
    'Hid_Ethnicity_Buckets',
    'D4',
    'Fan_magnitude',
]

## VL1 response columns we want to predict, built from their item numbers
targets = ['VL1r' + str(item) for item in (1, 2, 4, 5, 7, 10, 11, 12, 13, 14)]

In [18]:
## Cross-validate every model on every VL1 target.
##
## Results written by this cell:
##   all_accuracy[target][model_name] -> mean hold-out accuracy over the CV folds
##   xgbc_importance / ada_importance / gbc_importance[target]
##       -> feature importances of that model, averaged over the CV folds
##          (one value per entry of `features`)

## create a dictionary that will keep the accuracies of all the VL1's
all_accuracy = {}

## initiate our cross validating sets
kfold = KFold(n_splits=5,
              shuffle=True,
              random_state=5555)

## The splitter is seeded with an int, so every call to split() yields the same
## folds; compute them once instead of once per (target, model) pair.
cv_splits = list(kfold.split(survey_df))

## initiate the dictionaries to keep track of feature importance

xgbc_importance = {} ## feature importance for XGBoost

ada_importance = {} ## feature importance for AdaBoost

gbc_importance = {} ## feature importance for GradientBoost

## which dictionary receives the importances of which model; models that are
## not listed here (e.g. the knn pipeline) are only scored
importance_store = {'xgbc': xgbc_importance,
                    'ada': ada_importance,
                    'gbc': gbc_importance}


## start to work through the models, first taking out a target VL1
for target in targets:
    print('Working on value question: ', target)

    accuracy = {} # mean accuracy per model for this specific VL1

    for name, model in models.items(): # grab one specific model
        # one slot per fold; sized from the splits so changing n_splits cannot
        # overflow the buffer or dilute the mean with unused zeros
        fold_accuracy = np.zeros(len(cv_splits))
        fold_importances = []

        for i, (train_index, test_index) in enumerate(cv_splits):
            fans_tt = survey_df.iloc[train_index] ## cv training set
            fans_ho = survey_df.iloc[test_index] ## cv hold out set

            model.fit(fans_tt[features].values, fans_tt[target].values) ## fitting the model

            pred = model.predict(fans_ho[features].values) ## predicting

            fold_accuracy[i] = accuracy_score(fans_ho[target].values, pred)

            # collect the importances of *this* fold's fit; reading them only
            # after the loop would report the last fold alone
            if name in importance_store:
                fold_importances.append(model.feature_importances_)

        ## storing the fold-averaged feature importances for the tracked models
        if name in importance_store:
            importance_store[name][target] = np.mean(fold_importances, axis=0)

        ## storing the average of the fold accuracies for the specific model used
        accuracy[name] = fold_accuracy.mean()

    ## storing the dictionary of average accuracies, with key VL1
    all_accuracy[target] = accuracy

Working on value question:  VL1r1
Working on value question:  VL1r2
Working on value question:  VL1r4
Working on value question:  VL1r5
Working on value question:  VL1r7
Working on value question:  VL1r10
Working on value question:  VL1r11
Working on value question:  VL1r12
Working on value question:  VL1r13
Working on value question:  VL1r14


In [19]:
# dump the per-model mean accuracies recorded for each VL1 question
for question, model_scores in all_accuracy.items():
    print('For question ', question)
    # three lines in one call: caption, the raw score dictionary, separator
    print('here are the accuracy scores', model_scores, '------', sep='\n')

For question  VL1r1
here are the accuracy scores
{'slc': 0.6537597923201857, 'svc': 0.6355422366492848, 'knn': 0.6179273076970675, 'ada': 0.6607569014882214, 'gbc': 0.65918787350226, 'rfc': 0.6193750532339681, 'etc': 0.6065872531126857, 'xgbc': 0.6367471260937079}
------
For question  VL1r2
here are the accuracy scores
{'slc': 0.6380745355632026, 'svc': 0.6348179634885958, 'knn': 0.6022436525090398, 'ada': 0.6438650809192715, 'gbc': 0.6449497070948776, 'rfc': 0.6087556318808317, 'etc': 0.6042910400224802, 'xgbc': 0.611411761194421}
------
For question  VL1r4
here are the accuracy scores
{'slc': 0.7999762676600277, 'svc': 0.7947881306268758, 'knn': 0.7740374752575796, 'ada': 0.7998551308081444, 'gbc': 0.7984077492641882, 'rfc': 0.77886424009557, 'etc': 0.7647486665118484, 'xgbc': 0.7868281145238281}
------
For question  VL1r5
here are the accuracy scores
{'slc': 0.6508633548647292, 'svc': 0.6497784374947677, 'knn': 0.6118966725948984, 'ada': 0.6477274100518835, 'gbc': 0.6515871912338851

In [20]:
## for each VL1, we want to print out the better performing model
## (the AdaBoost score is printed alongside it for comparison)

for question, scores in all_accuracy.items():
    print('For ', question)

    # Guard the empty case explicitly: without it the report would reuse the
    # winner left over from the previous question (or fail on the first one).
    if not scores:
        print('No accuracy scores were recorded')
        print('-----')
        continue

    if 'ada' in scores:
        print('Adaboost score was ', scores['ada'])

    # max() keeps the first entry among equal scores, i.e. ties resolve in the
    # dictionary's insertion order, exactly like a strict `>` scan would
    name, score = max(scores.items(), key=lambda entry: entry[1])

    print('The best performing model was: ', name)
    print('with accuracy score ', score)
    print('-----')


For  VL1r1
Adaboost score was  0.6607569014882214
The best performing model was:  ada
with accuracy score  0.6607569014882214
-----
For  VL1r2
Adaboost score was  0.6438650809192715
The best performing model was:  gbc
with accuracy score  0.6449497070948776
-----
For  VL1r4
Adaboost score was  0.7998551308081444
The best performing model was:  slc
with accuracy score  0.7999762676600277
-----
For  VL1r5
Adaboost score was  0.6477274100518835
The best performing model was:  gbc
with accuracy score  0.6515871912338851
-----
For  VL1r7
Adaboost score was  0.6081535875508589
The best performing model was:  slc
with accuracy score  0.6094818706034203
-----
For  VL1r10
Adaboost score was  0.7252981648203731
The best performing model was:  slc
with accuracy score  0.7300019728417585
-----
For  VL1r11
Adaboost score was  0.7727113761626845
The best performing model was:  gbc
with accuracy score  0.7730735491423234
-----
For  VL1r12
Adaboost score was  0.9185669888974873
The best performing mod