# Capstone Part 4 - Hyperparameter Optimization and Model Evaluation

For this notebook we will build, optimize and evalate our random forest classifier for the completely clean match data. The first step, as alawys, is to import the pertinent libraries.

In [1]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [2]:
# import the Dataset
Data_df = pd.read_csv("MatchDataClean_complete.csv")
Data_df

Unnamed: 0,stats.win,stats.kills,stats.deaths,stats.assists,stats.largestKillingSpree,stats.largestMultiKill,stats.killingSprees,stats.longestTimeSpentLiving,stats.doubleKills,stats.tripleKills,...,SummonAery,TasteOfBlood,TimeWarpTonic,Transcendence,Triumph,UltimateHunter,Unflinching,UnsealedSpellbook,Waterwalking,ZombieWard
0,0,1,5,1,0,1,0,405,0,0,...,1,0,0,1,0,0,0,0,0,0
1,1,3,3,2,0,1,0,556,0,0,...,1,0,0,1,0,0,0,0,0,0
2,0,4,10,4,0,1,0,484,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,6,6,4,2,2,2,1260,2,0,...,0,0,0,0,1,0,1,0,0,0
4,0,3,7,0,0,1,0,326,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12260,0,11,8,6,5,2,2,827,2,0,...,0,0,0,0,1,0,0,0,0,0
12261,0,4,9,9,4,2,1,928,1,0,...,0,0,0,0,1,0,0,0,0,0
12262,0,8,7,3,2,1,3,690,0,0,...,0,0,0,0,1,0,0,0,0,0
12263,0,1,3,0,0,1,0,547,0,0,...,0,0,0,0,1,0,0,0,0,0


## Hyperparameter Optimization


As stated above we will be using the Random Forest Classifier to develope a model for the dataset. We will be focussing on optimized the following hyperparameters:

1. n_estimators

2. max_depth

3. min_samples_leaf

In order to optimize the hyperparameters we will use a grid search using a 5 fold K cross validation with a test set of 20%.

In [3]:
# import Libraries for Hyperparameter Optimization
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [4]:
# Set target and features
y = Data_df['stats.win']
X = Data_df.drop('stats.win', axis=1)

In [19]:
# Split data into test and remainder set (for CV)
X_remain, X_test, y_remain, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# 1. Make the pipeline (instantiate a placeholder)
mypipeline = Pipeline([('classifier', RandomForestClassifier())])

In [None]:
# Set up hyperparameter lists
n_estimators_list = [50, 100, 150, 200, 250, 300, 350, 400, 450, 500]
max_depth_list = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150]
min_samples_leaf_list = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

In [None]:
# 2. Setup the param grid
param_grid = [
    {
        'classifier':[RandomForestClassifier()],
        'classifier__n_estimators':n_estimators_list,
        'classifier__max_depth':max_depth_list,
        'classifier__min_samples_leaf':min_samples_leaf_list
    }
    ]

In [None]:
# 3. Make the gridsearch estimator (instantiate)
mygs = GridSearchCV(mypipeline, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

In [None]:
# 4. Fit the grid serach
mygs.fit(X_remain, y_remain)

In [None]:
# Check the best estimator
mygs.best_estimator_

In [None]:
# 5. Score
print(f"The accuracy score on the test set is: {mygs.score(X_test, y_test)}")

In [None]:
# Output best model to pkl file
import joblib
joblib.dump(mygs.best_estimator_, 'Optimized Random Forest Classifier Model.pkl')

In [5]:
# Test a prediction score which will be important for the app
# import Model
import joblib
def load_model(path):
    model = joblib.load(path)
    return model

# Load the optimized model
model = load_model('Optimized Random Forest Classifier Model.pkl')



In [6]:
# Get first row
data = Data_df.drop('stats.win', axis=1).loc[0:1]
data

Unnamed: 0,stats.kills,stats.deaths,stats.assists,stats.largestKillingSpree,stats.largestMultiKill,stats.killingSprees,stats.longestTimeSpentLiving,stats.doubleKills,stats.tripleKills,stats.quadraKills,...,SummonAery,TasteOfBlood,TimeWarpTonic,Transcendence,Triumph,UltimateHunter,Unflinching,UnsealedSpellbook,Waterwalking,ZombieWard
0,1,5,1,0,1,0,405,0,0,0,...,1,0,0,1,0,0,0,0,0,0
1,3,3,2,0,1,0,556,0,0,0,...,1,0,0,1,0,0,0,0,0,0


In [7]:
# Check what order the classes are
model.classes_

array([0, 1], dtype=int64)

In [8]:
# Get probability score for win outcome
Performance_score = round((model.predict_proba(data)[1][1]*100),2)
print(Performance_score)

68.92


In [9]:
# In order to create the app, I will need top specify the columns correct columns to be used. So i will now pull a list of the feature columns and copy and paste it into my app
features_list =[]
for col in data.columns:
    features_list.append(col)
    
features_list

['stats.kills',
 'stats.deaths',
 'stats.assists',
 'stats.largestKillingSpree',
 'stats.largestMultiKill',
 'stats.killingSprees',
 'stats.longestTimeSpentLiving',
 'stats.doubleKills',
 'stats.tripleKills',
 'stats.quadraKills',
 'stats.pentaKills',
 'stats.magicDamageDealt',
 'stats.physicalDamageDealt',
 'stats.trueDamageDealt',
 'stats.largestCriticalStrike',
 'stats.totalHeal',
 'stats.totalUnitsHealed',
 'stats.damageSelfMitigated',
 'stats.damageDealtToObjectives',
 'stats.visionScore',
 'stats.timeCCingOthers',
 'stats.totalDamageTaken',
 'stats.goldEarned',
 'stats.turretKills',
 'stats.inhibitorKills',
 'stats.totalMinionsKilled',
 'stats.neutralMinionsKilled',
 'stats.neutralMinionsKilledTeamJungle',
 'stats.neutralMinionsKilledEnemyJungle',
 'stats.totalTimeCrowdControlDealt',
 'stats.visionWardsBoughtInGame',
 'stats.wardsPlaced',
 'stats.wardsKilled',
 'stats.firstBloodKill',
 'stats.firstBloodAssist',
 'stats.firstTowerKill',
 'stats.firstTowerAssist',
 'stats.firstInhi

In [10]:
for feature in features_list:
    print(feature)

stats.kills
stats.deaths
stats.assists
stats.largestKillingSpree
stats.largestMultiKill
stats.killingSprees
stats.longestTimeSpentLiving
stats.doubleKills
stats.tripleKills
stats.quadraKills
stats.pentaKills
stats.magicDamageDealt
stats.physicalDamageDealt
stats.trueDamageDealt
stats.largestCriticalStrike
stats.totalHeal
stats.totalUnitsHealed
stats.damageSelfMitigated
stats.damageDealtToObjectives
stats.visionScore
stats.timeCCingOthers
stats.totalDamageTaken
stats.goldEarned
stats.turretKills
stats.inhibitorKills
stats.totalMinionsKilled
stats.neutralMinionsKilled
stats.neutralMinionsKilledTeamJungle
stats.neutralMinionsKilledEnemyJungle
stats.totalTimeCrowdControlDealt
stats.visionWardsBoughtInGame
stats.wardsPlaced
stats.wardsKilled
stats.firstBloodKill
stats.firstBloodAssist
stats.firstTowerKill
stats.firstTowerAssist
stats.firstInhibitorKill
stats.firstInhibitorAssist
gameDuration
Cleanse
Exhaust
Flash
Ghost
Heal
Ignite
Smite
Teleport
'Your Cut'
Abyssal Mask
Adaptive Helm
Aegis o

In [11]:
# see length of this list
len(features_list)

278

## Model Evaluation: Feature Importance using the default SKlearn Package

In [13]:
# Create a dictionary for coefficients of the features dataframe
Coefficients = {'Feature':  list(X.columns),
        'Coefficients': model.named_steps['classifier'].feature_importances_}

In [14]:
# create the coefficient dataframe
coef_df = pd.DataFrame(Coefficients, columns = ['Feature','Coefficients'])

# check
coef_df.head()

Unnamed: 0,Feature,Coefficients
0,stats.kills,0.022469
1,stats.deaths,0.079365
2,stats.assists,0.031587
3,stats.largestKillingSpree,0.032314
4,stats.largestMultiKill,0.00726


In [15]:
# See Feature length
coef_df.shape

(278, 2)

In [16]:
# Sort the Dataframe and show top 10 contributing features
display(coef_df.sort_values(['Coefficients'], ascending=False).head(10))


Unnamed: 0,Feature,Coefficients
18,stats.damageDealtToObjectives,0.128662
23,stats.turretKills,0.119157
1,stats.deaths,0.079365
24,stats.inhibitorKills,0.079263
28,stats.neutralMinionsKilledEnemyJungle,0.050105
215,Fraction of damage dealt to turrets,0.040033
38,stats.firstInhibitorAssist,0.038421
3,stats.largestKillingSpree,0.032314
2,stats.assists,0.031587
21,stats.totalDamageTaken,0.031007


In [17]:
# Drop the rows values with coefficients of 0
# Find indexes to drop
indexNames = coef_df[coef_df['Coefficients'] == 0 ].index

In [18]:
# Create a list of non-contributing features
Non_contributing_features = coef_df['Feature'].iloc[indexNames].tolist()
# See how many features out of 278 are non contributing
len(Non_contributing_features)

101

## Model Evaluation: Feature Importance using the Permutation Feature Importance

In [20]:
from sklearn.metrics import r2_score
from rfpimp import permutation_importances

def r2(rf, X_train, y_train):
    return r2_score(y_train, rf.predict(X_train))

perm_imp_rfpimp = permutation_importances(rf, X_remain, y_remain, r2)

ModuleNotFoundError: No module named 'rfpimp'