## In this notebook, I explore different models, determine the one with the best performance, and then tune the hyperparameters



In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

import pandas as pd
import numpy as np
import matplotlib.pyplot as plot

## this is to suppress warnings I was getting in this code. 
import warnings
# Suppress FutureWarning messages
warnings.simplefilter(action='ignore', category=FutureWarning)


In [4]:
# Load survey_data_train.csv into survey_df; the cross-validation cell below
# splits this frame directly.
survey_df = pd.read_csv('survey_data_train.csv')
# Disabled hold-out split, kept for reference (fan_tt / fan_val are therefore
# NOT defined by this cell):
# fan_tt, fan_val = train_test_split(survey_df, 
#                                       shuffle = True,
#                                       random_state = 5555,
#                                       test_size = .2,)

In [9]:

## TODO: we should run stratified k-fold splits here

In [5]:
# Settings shared by every BaggingClassifier below.
replacement = True   # passed as `bootstrap`
estimators = 100     # passed as `n_estimators`
samples = .25        # passed as `max_samples`

# Fixed seed for every estimator that draws random numbers, so that repeated
# runs of the notebook report the same accuracies and feature importances.
model_seed = 5555

# Candidate classifiers, keyed by the short name used in all result tables.
# NOTE(review): only 'knn' standardises its inputs; 'svc', 'bagged_svc' and
# 'bagged_knn' are fitted on unscaled features -- confirm this is intentional.
models = {
    'slc': LogisticRegression(penalty=None, max_iter=300),
    'svc': SVC(),
    'knn': Pipeline([('scale', StandardScaler()),
                     ('knnc', KNeighborsClassifier())]),
    'bagged_lr': BaggingClassifier(LogisticRegression(penalty=None, max_iter=300),
                                   bootstrap=replacement,
                                   n_estimators=estimators,
                                   max_samples=samples,
                                   random_state=model_seed),
    'ada': AdaBoostClassifier(random_state=model_seed),
    'bagged_svc': BaggingClassifier(SVC(),
                                    bootstrap=replacement,
                                    n_estimators=estimators,
                                    max_samples=samples,
                                    random_state=model_seed),
    'bagged_knn': BaggingClassifier(KNeighborsClassifier(),
                                    bootstrap=replacement,
                                    n_estimators=estimators,
                                    max_samples=samples,
                                    random_state=model_seed),
    'gbc': GradientBoostingClassifier(random_state=model_seed),
    'rfc': RandomForestClassifier(random_state=model_seed),
    'etc': ExtraTreesClassifier(random_state=model_seed),
    'xgbc': XGBClassifier(random_state=model_seed),
}

In [6]:
# from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

In [169]:
# fan_tt.columns

In [8]:
# Predictor columns handed to every model, in this order.
features = [
    'S2',
    'D4',
    'S12r3',
    'Team6_magnitude',
    'Fan_magnitude',
]
# removed: VL2, 'D6'

# Value-question columns to predict; each one is modelled separately.
targets = [f'VL1r{item}' for item in (1, 2, 4, 5, 7, 10, 11, 12, 13, 14)]

In [12]:
# Cross-validate every candidate model on every value question.
# all_accuracy maps target -> {model name -> mean accuracy over the folds}.
all_accuracy = {}

kfold = KFold(n_splits=5,
              shuffle=True,
              random_state=5555)

# Size the per-fold score buffer from the splitter itself so that changing
# n_splits above cannot overflow it or leave zero-filled slots in the mean.
n_folds = kfold.get_n_splits()

# Feature importances per target for the three boosted models.
xgbc_importance = {}

ada_importance = {}

gbc_importance = {}

# Model name -> dictionary that collects that model's importances.
importance_stores = {'xgbc': xgbc_importance,
                     'ada': ada_importance,
                     'gbc': gbc_importance}

for target in targets:
    print('Working on value question: ', target)

    accuracy = {}

    for name, model in models.items():
        print('Working on model: ', name)
        accu = np.zeros((1, n_folds))
        for i, (train_index, test_index) in enumerate(kfold.split(survey_df)):
            fans_tt = survey_df.iloc[train_index]
            fans_ho = survey_df.iloc[test_index]

            # The same estimator object is refitted on each fold's training rows.
            model.fit(fans_tt[features].values, fans_tt[target].values)

            pred = model.predict(fans_ho[features].values)

            accu[0, i] = accuracy_score(fans_ho[target].values, pred)

        # NOTE: read after the fold loop, so these importances describe the
        # fit on the final fold only, not an average over folds.
        if name in importance_stores:
            importance_stores[name][target] = model.feature_importances_
        accuracy[name] = accu.mean()
    all_accuracy[target] = accuracy

Working on value question:  VL1r1
Working on model:  slc
Working on model:  svc
Working on model:  knn
Working on model:  bagged_lr
Working on model:  ada
Working on model:  bagged_svc
Working on model:  bagged_knn
Working on model:  gbc
Working on model:  rfc
Working on model:  etc
Working on model:  xgbc
Working on value question:  VL1r2
Working on model:  slc
Working on model:  svc
Working on model:  knn
Working on model:  bagged_lr
Working on model:  ada
Working on model:  bagged_svc
Working on model:  bagged_knn
Working on model:  gbc
Working on model:  rfc
Working on model:  etc
Working on model:  xgbc
Working on value question:  VL1r4
Working on model:  slc
Working on model:  svc
Working on model:  knn
Working on model:  bagged_lr
Working on model:  ada
Working on model:  bagged_svc
Working on model:  bagged_knn
Working on model:  gbc
Working on model:  rfc
Working on model:  etc
Working on model:  xgbc
Working on value question:  VL1r5
Working on model:  slc
Working on model:  

In [13]:
# Display the nested results: {value question: {model name: mean fold accuracy}}.
all_accuracy

{'VL1r1': {'slc': 0.6544832646963972,
  'svc': 0.6394023090256418,
  'knn': 0.6319238555879833,
  'bagged_lr': 0.6542419373742859,
  'ada': 0.6584647651189929,
  'bagged_svc': 0.6367485820654852,
  'bagged_knn': 0.6431410989529379,
  'gbc': 0.6614804466630219,
  'rfc': 0.6442271811003215,
  'etc': 0.640970390629948,
  'xgbc': 0.6390415920177803},
 'VL1r2': {'slc': 0.6403680551056199,
  'svc': 0.6350605283867178,
  'knn': 0.5993487438239498,
  'bagged_lr': 0.6403676911126756,
  'ada': 0.646037682005572,
  'bagged_svc': 0.6355423822464625,
  'bagged_knn': 0.6286642987712325,
  'gbc': 0.6431415357444712,
  'rfc': 0.6337286782033017,
  'etc': 0.6202187160804075,
  'xgbc': 0.6182888982879955},
 'VL1r4': {'slc': 0.7981661307477216,
  'svc': 0.7947881306268758,
  'knn': 0.77572727610248,
  'bagged_lr': 0.7984073852712438,
  'ada': 0.802026567117023,
  'bagged_svc': 0.7947881306268758,
  'bagged_knn': 0.7958735575869598,
  'gbc': 0.798648712593355,
  'rfc': 0.7896008671767907,
  'etc': 0.78984

In [14]:
# Print the per-model accuracy dictionary for each value question,
# followed by a separator line.
for question in all_accuracy:
    print('For question ', question)
    print('here are the accuracy scores',
          all_accuracy[question],
          '------',
          sep='\n')

For question  VL1r1
here are the accuracy scores
{'slc': 0.6544832646963972, 'svc': 0.6394023090256418, 'knn': 0.6319238555879833, 'bagged_lr': 0.6542419373742859, 'ada': 0.6584647651189929, 'bagged_svc': 0.6367485820654852, 'bagged_knn': 0.6431410989529379, 'gbc': 0.6614804466630219, 'rfc': 0.6442271811003215, 'etc': 0.640970390629948, 'xgbc': 0.6390415920177803}
------
For question  VL1r2
here are the accuracy scores
{'slc': 0.6403680551056199, 'svc': 0.6350605283867178, 'knn': 0.5993487438239498, 'bagged_lr': 0.6403676911126756, 'ada': 0.646037682005572, 'bagged_svc': 0.6355423822464625, 'bagged_knn': 0.6286642987712325, 'gbc': 0.6431415357444712, 'rfc': 0.6337286782033017, 'etc': 0.6202187160804075, 'xgbc': 0.6182888982879955}
------
For question  VL1r4
here are the accuracy scores
{'slc': 0.7981661307477216, 'svc': 0.7947881306268758, 'knn': 0.77572727610248, 'bagged_lr': 0.7984073852712438, 'ada': 0.802026567117023, 'bagged_svc': 0.7947881306268758, 'bagged_knn': 0.79587355758695

In [42]:
# For each value question, report AdaBoost's accuracy and the model with the
# highest mean accuracy (the first one listed wins a tie).
for question, scores in all_accuracy.items():
    print('For ', question)

    if not scores:
        # Without this guard an empty score table would report the previous
        # question's winner (or fail on the first question).
        print('No accuracy scores recorded')
        print('-----')
        continue

    if 'ada' in scores:
        print('Adaboost score was ', scores['ada'])

    name, score = max(scores.items(), key=lambda entry: entry[1])
    print('The best performing model was: ', name)
    print('with accuracy score ', score)
    print('-----')


For  VL1r1
Adaboost score was  0.6584647651189929
The best performing model was:  gbc
with accuracy score  0.6614804466630219
-----
For  VL1r2
Adaboost score was  0.646037682005572
The best performing model was:  ada
with accuracy score  0.646037682005572
-----
For  VL1r4
Adaboost score was  0.802026567117023
The best performing model was:  ada
with accuracy score  0.802026567117023
-----
For  VL1r5
Adaboost score was  0.6502603641531013
The best performing model was:  gbc
with accuracy score  0.6524318732605687
-----
For  VL1r7
Adaboost score was  0.5997087328459225
The best performing model was:  gbc
with accuracy score  0.6063439602286749
-----
For  VL1r10
Adaboost score was  0.7301238376795305
The best performing model was:  slc
with accuracy score  0.7326558453990927
-----
For  VL1r11
Adaboost score was  0.7684884028207998
The best performing model was:  gbc
with accuracy score  0.7693335216390166
-----
For  VL1r12
Adaboost score was  0.9179637797900926
The best performing model w

In [20]:
# AdaBoost feature importances per value question.
# Array entries follow the feature order:
# ['S2', 'D4','S12r3','Team6_magnitude', 'Fan_magnitude']
for question in ada_importance:
    print('For ', question)
    print('the importance array is',
          ada_importance[question],
          '----',
          sep='\n')

For  VL1r1
the importance array is
[0.14 0.2  0.18 0.08 0.4 ]
----
For  VL1r2
the importance array is
[0.24 0.1  0.14 0.16 0.36]
----
For  VL1r4
the importance array is
[0.26 0.06 0.16 0.12 0.4 ]
----
For  VL1r5
the importance array is
[0.14 0.06 0.3  0.08 0.42]
----
For  VL1r7
the importance array is
[0.22 0.06 0.16 0.24 0.32]
----
For  VL1r10
the importance array is
[0.28 0.08 0.16 0.08 0.4 ]
----
For  VL1r11
the importance array is
[0.18 0.12 0.2  0.22 0.28]
----
For  VL1r12
the importance array is
[0.2  0.12 0.14 0.24 0.3 ]
----
For  VL1r13
the importance array is
[0.18 0.12 0.16 0.12 0.42]
----
For  VL1r14
the importance array is
[0.3  0.06 0.16 0.16 0.32]
----


In [21]:
# Gradient-boosting feature importances per value question.
# Array entries follow the feature order:
# ['S2', 'D4','S12r3','Team6_magnitude', 'Fan_magnitude']
for question in gbc_importance:
    print('For ', question)
    print('the importance array is',
          gbc_importance[question],
          '----',
          sep='\n')

For  VL1r1
the importance array is
[0.18227069 0.18109599 0.0940722  0.05824218 0.48431894]
----
For  VL1r2
the importance array is
[0.3479413  0.097618   0.0371088  0.09155629 0.4257756 ]
----
For  VL1r4
the importance array is
[0.2017354  0.04331803 0.10043807 0.0716396  0.5828689 ]
----
For  VL1r5
the importance array is
[0.12317968 0.06912027 0.07294377 0.07379971 0.66095657]
----
For  VL1r7
the importance array is
[0.22060782 0.04381869 0.09893242 0.1513415  0.48529957]
----
For  VL1r10
the importance array is
[0.3198282  0.03413372 0.04218711 0.05690625 0.54694472]
----
For  VL1r11
the importance array is
[0.30204036 0.07978521 0.0959685  0.04317995 0.47902598]
----
For  VL1r12
the importance array is
[0.18870323 0.09712926 0.1078432  0.08260891 0.52371539]
----
For  VL1r13
the importance array is
[0.16796939 0.07144975 0.04838857 0.13888596 0.57330633]
----
For  VL1r14
the importance array is
[0.29896727 0.03361259 0.08344747 0.05929675 0.52467592]
----


In [22]:
# XGBoost feature importances per value question.
# Array entries follow the feature order:
# ['S2', 'D4','S12r3','Team6_magnitude', 'Fan_magnitude']
for question in xgbc_importance:
    print('For ', question)
    print('the importance array is',
          xgbc_importance[question],
          '----',
          sep='\n')

For  VL1r1
the importance array is
[0.17846404 0.23923336 0.1674664  0.17083648 0.24399978]
----
For  VL1r2
the importance array is
[0.22671977 0.18591028 0.16887118 0.17759718 0.2409016 ]
----
For  VL1r4
the importance array is
[0.18850158 0.18027602 0.17106912 0.17190938 0.2882439 ]
----
For  VL1r5
the importance array is
[0.1835938  0.17577393 0.16572104 0.16918705 0.30572417]
----
For  VL1r7
the importance array is
[0.1955036  0.1788336  0.18694891 0.20167771 0.2370362 ]
----
For  VL1r10
the importance array is
[0.23477803 0.15210143 0.16049273 0.15889888 0.2937289 ]
----
For  VL1r11
the importance array is
[0.23551014 0.18737839 0.1653772  0.149922   0.26181227]
----
For  VL1r12
the importance array is
[0.1899572  0.20415819 0.18482003 0.17180638 0.24925816]
----
For  VL1r13
the importance array is
[0.18661036 0.18541257 0.16404298 0.19274868 0.2711854 ]
----
For  VL1r14
the importance array is
[0.2550368  0.16265246 0.15630457 0.15083262 0.27517363]
----
