In [None]:
# Import necessary libraries
import pandas as pd
import os
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import joblib
import warnings

In [None]:
warnings.filterwarnings("ignore")

In [3]:
# Set display options to show all columns if requested
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 1000)

In [4]:
# Load the dataset
df = pd.read_csv('prepared_dataset_unfinished_game.csv')

# View the first 20 rows of the dataset
print(df.head(20))

    veteran  hotStreak  win.x   item0  firstBloodKill  firstTowerKill  firstTowerAssist  creepsPerMinDeltas_010  xpPerMinDeltas_010  goldPerMinDeltas_010  csDiffPerMinDeltas_010  damageTakenPerMinDeltas_010  damageTakenDiffPerMinDeltas_010  creepsPerMinDeltas_1020  xpPerMinDeltas_1020  goldPerMinDeltas_1020  csDiffPerMinDeltas_1020  damageTakenPerMinDeltas_1020  damageTakenDiffPerMinDeltas_1020  firstBlood  firstTower  firstDragon  firstRiftHerald  spell1Id  spell2Id  tier_CHALLENGER  tier_DIAMOND  tier_GOLD  tier_GRANDMASTER  tier_IRON  tier_MASTER  tier_PLATINUM  tier_SILVER  role_DUO_CARRY  role_DUO_SUPPORT  role_NONE  role_SOLO  lane_JUNGLE  lane_MIDDLE  lane_TOP
0       0.0        0.0    0.0  3157.0             0.0             0.0               0.0                     0.8               269.4                 227.6                  -0.065                        464.0                             8.15                      2.0                333.1                  283.4                

## Prepare data before training models

In [5]:
# Normalize the data to later use it in the models
# Not all of the columns need to be normalized, so it will be selected the ones that need it
cols_to_normalize = [
    'item0', 'creepsPerMinDeltas_010', 'xpPerMinDeltas_010', 'goldPerMinDeltas_010', 'csDiffPerMinDeltas_010', 'damageTakenPerMinDeltas_010', 'damageTakenDiffPerMinDeltas_010',
    'creepsPerMinDeltas_1020', 'xpPerMinDeltas_1020', 'goldPerMinDeltas_1020', 'csDiffPerMinDeltas_1020', 'damageTakenPerMinDeltas_1020', 'damageTakenDiffPerMinDeltas_1020',
    'spell1Id', 'spell2Id', 
]

# Normalize the selected columns using RobustScaler in order to reduce the influence of outliers
scaler = RobustScaler()
df[cols_to_normalize] = scaler.fit_transform(df[cols_to_normalize])

# View the first 20 rows of the dataset after normalization
print(df.head(20))

    veteran  hotStreak  win.x     item0  firstBloodKill  firstTowerKill  firstTowerAssist  creepsPerMinDeltas_010  xpPerMinDeltas_010  goldPerMinDeltas_010  csDiffPerMinDeltas_010  damageTakenPerMinDeltas_010  damageTakenDiffPerMinDeltas_010  creepsPerMinDeltas_1020  xpPerMinDeltas_1020  goldPerMinDeltas_1020  csDiffPerMinDeltas_1020  damageTakenPerMinDeltas_1020  damageTakenDiffPerMinDeltas_1020  firstBlood  firstTower  firstDragon  firstRiftHerald  spell1Id  spell2Id  tier_CHALLENGER  tier_DIAMOND  tier_GOLD  tier_GRANDMASTER  tier_IRON  tier_MASTER  tier_PLATINUM  tier_SILVER  role_DUO_CARRY  role_DUO_SUPPORT  role_NONE  role_SOLO  lane_JUNGLE  lane_MIDDLE  lane_TOP
0       0.0        0.0    0.0  0.049397             0.0             0.0               0.0               -0.737705           -0.495935             -0.504172               -0.056522                     0.292319                         0.062289                -0.545455            -0.812842              -0.692815            

In [6]:
# Split the dataset into training and testing sets using cross-validation
# The target variable is the win.x column, which indicates whether the player won or lost the game
# The features are all the other columns in the dataset
target = 'win.x'
X = df.drop(columns=[target])
y = df[target]

# 80% of the data will be used for training and 20% for testing
# The X_train and X_test variables will contain the features for the training and testing sets, respectively
# The y_train and y_test variables will contain the target variable for the training and testing sets, respectively
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of the X training and testing sets
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

X_train shape: (194699, 39)
X_test shape: (48675, 39)


## Training of the base models

In [8]:
# Training base models with little tuning

# Logistic Regression
logistic_regression = LogisticRegression(random_state=42)
logistic_regression.fit(X_train, y_train) # Train the model

# Random Forest Classifier
random_forest = RandomForestClassifier(n_estimators=500, random_state=42)
random_forest.fit(X_train, y_train) # Train the model

# LightGBM
lgb_model = lgb.LGBMClassifier(n_estimators=1000, random_state=42)
lgb_model.fit(X_train, y_train) # Train the model

# MLP Classifier
mlpc = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42, early_stopping=True)
mlpc.fit(X_train, y_train) # Train the model

[LightGBM] [Info] Number of positive: 97470, number of negative: 97229
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004977 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3103
[LightGBM] [Info] Number of data points in the train set: 194699, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500619 -> initscore=0.002476
[LightGBM] [Info] Start training from score 0.002476


In [None]:
# Create directories to save the models
folders = ["base", "randomized", "gridsearch"]
for folder in folders:
    os.makedirs(f'./models/{folder}', exist_ok=True)

# Save the trained base models to the './models/base/' folder
joblib.dump(logistic_regression, './models/base/logistic_regression_base_model.pkl')
joblib.dump(random_forest, './models/base/random_forest_base_model.pkl')
joblib.dump(lgb_model, './models/base/lgbm_base_model.pkl')
joblib.dump(mlpc, './models/base/mlp_base_model.pkl')

['./models/base/mlp_base_model.pkl']

## Search best model configuration using RandomizedSearchCV

In [None]:
# Define a parameter grid to explore with RandomizedSearchCV
param_grid_lr_randomized = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
    'max_iter': [100, 200, 500, 1000],
}

# Perform RandomizedSearchCV for Logistic Regression
search_lr_randomized = RandomizedSearchCV(LogisticRegression(random_state=42), param_distributions=param_grid_lr_randomized, n_iter=10, cv=3, verbose=1, scoring='accuracy')
search_lr_randomized.fit(X_train, y_train)

# Show the best score and parameters for Logistic Regression
print("---------------------")
print("Best score for Logistic Regression:")
print(search_lr_randomized.best_score_)
print("Best parameters for Logistic Regression:")
print(search_lr_randomized.best_params_)

# Store the best Logistic Regression model
best_lr_model_randomized = search_lr_randomized.best_estimator_

Fitting 3 folds for each of 10 candidates, totalling 30 fits
---------------------
Best score for Logistic Regression:
0.7069425149123303
Best parameters for Logistic Regression:
{'solver': 'liblinear', 'penalty': 'l1', 'max_iter': 500, 'C': 1}


In [None]:
# Define a parameter grid to explore with RandomizedSearchCV
param_grid_rf_randomized = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 3, 4],
    'bootstrap': [True, False]
}

# Perform RandomizedSearchCV for Random Forest
search_rf_randomized = RandomizedSearchCV(RandomForestClassifier(random_state=42), param_distributions=param_grid_rf_randomized, n_iter=10, cv=5, verbose=1, scoring='accuracy')
search_rf_randomized.fit(X_train, y_train)

# Show the best score and parameters for Random Forest
print("---------------------")
print("Best score for Random Forest:")
print(search_rf_randomized.best_score_)
print("Best parameters for Random Forest:")
print(search_rf_randomized.best_params_)

# Store the best Random Forest model
best_rf_model_randomized = search_rf_randomized.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
---------------------
Best score for Random Forest:
0.8298399131730319
Best parameters for Random Forest:
{'n_estimators': 300, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_depth': None, 'bootstrap': False}


In [None]:
# Define a parameter grid to explore with RandomizedSearchCV
param_grid_lgb_randomized = {
    'n_estimators': [50, 100, 200, 250, 300, 400, 500, 600, 700, 800, 900, 1000],
    'boosting_type': ['gbdt', 'dart', 'goss'],
    'objective': ['binary'],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'num_leaves': [10, 20, 30, 50, 70, 100],
    'max_depth': [-1, 10, 20],
}

# Perform RandomizedSearchCV for LightGBM
search_lgb_randomized = RandomizedSearchCV(lgb.LGBMClassifier(random_state=42), param_distributions=param_grid_lgb_randomized, n_iter=10, cv=5, verbose=1, scoring='accuracy')
search_lgb_randomized.fit(X_train, y_train)

# Show the best score and parameters for LightGBM
print("---------------------")
print("Best score for LightGBM:")
print(search_lgb_randomized.best_score_)
print("Best parameters for LightGBM:")
print(search_lgb_randomized.best_params_)

# Store the best LightGBM model
best_lgb_model_randomized = search_lgb_randomized.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[LightGBM] [Info] Number of positive: 77976, number of negative: 77783
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003485 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3101
[LightGBM] [Info] Number of data points in the train set: 155759, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500620 -> initscore=0.002478
[LightGBM] [Info] Start training from score 0.002478
[LightGBM] [Info] Number of positive: 77976, number of negative: 77783
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003415 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3103
[LightGBM] [Info] Number of data points in the trai

In [None]:
# Define a parameter grid to explore with RandomizedSearchCV
param_grid_mlp_randomized = {
    'hidden_layer_sizes': [(100,), (50, 50), (100, 50), (150, 100), (100, 100, 50)],
    'activation': ['relu', 'tanh', 'logistic'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.001, 0.01, 0.1],
    'max_iter': [300, 500, 800, 1000],
}

# Perform RandomizedSearchCV for MLP Classifier
search_mlp_randomized = RandomizedSearchCV(MLPClassifier(early_stopping=True, random_state=42), param_distributions=param_grid_mlp_randomized, n_iter=10, cv=5, verbose=1, scoring='accuracy')
search_mlp_randomized.fit(X_train, y_train)

# Show the best score and parameters for MLP Classifier
print("---------------------")
print("Best score for MLP Classifier:")
print(search_mlp_randomized.best_score_)
print("Best parameters for MLP Classifier:")
print(search_mlp_randomized.best_params_)

# Store the best MLP Classifier model
best_mlp_model_randomized = search_mlp_randomized.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
---------------------
Best score for MLP Classifier:
0.7212723126803907
Best parameters for MLP Classifier:
{'solver': 'adam', 'max_iter': 800, 'hidden_layer_sizes': (150, 100), 'alpha': 0.0001, 'activation': 'relu'}


In [None]:
# Save the best models explored with RandomizedSearchCV to the './models/randomized/' folder
joblib.dump(best_lr_model_randomized, './models/randomized/best_lr_model_randomized.pkl')
joblib.dump(best_rf_model_randomized, './models/randomized/best_rf_model_randomized.pkl')
joblib.dump(best_lgb_model_randomized, './models/randomized/best_lgb_model_randomized.pkl')
joblib.dump(best_mlp_model_randomized, './models/randomized/best_mlp_model_randomized.pkl')

['./models/randomized/best_mlp_model_randomized.pkl']

## Search best model configuration using GridSearchCV

In [None]:
# Define a parameter grid to explore with GridSearchCV
param_grid_lr_gridsearch = {
    'C': [0.5, 0.75, 1, 1.25, 1.5],
    'penalty': ['l1'],
    'solver': ['liblinear'],
    'max_iter': [400, 500, 600, 700, 800]
}

# Perform GridSearchCV for Logistic Regression
search_lr_gridsearch = GridSearchCV(LogisticRegression(random_state=42), param_grid=param_grid_lr_gridsearch, cv=3, verbose=1, scoring='accuracy')
search_lr_gridsearch.fit(X_train, y_train)

# Show the best score and parameters for Logistic Regression
print("---------------------")
print("Best score for Logistic Regression:")
print(search_lr_gridsearch.best_score_)
print("Best parameters for Logistic Regression:")
print(search_lr_gridsearch.best_params_)

# Store the best Logistic Regression model
best_lr_model_gridsearch = search_lr_gridsearch.best_estimator_

Fitting 3 folds for each of 25 candidates, totalling 75 fits
---------------------
Best score for Logistic Regression:
0.7069425149123303
Best parameters for Logistic Regression:
{'C': 1, 'max_iter': 400, 'penalty': 'l1', 'solver': 'liblinear'}


In [None]:
# Define a parameter grid to explore with GridSearchCV
param_grid_rf_gridsearch = {
    'n_estimators': [250, 300, 350],
    'max_depth': [None, 20, 30],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [1, 2],
    'bootstrap': [False]
}

# Perform GridSearchCV for Random Forest
search_rf_gridsearch = GridSearchCV(RandomForestClassifier(random_state=42), param_grid=param_grid_rf_gridsearch, cv=3, verbose=1, scoring='accuracy')
search_rf_gridsearch.fit(X_train, y_train)

# Show the best score and parameters for Random Forest
print("---------------------")
print("Best score for Random Forest:")
print(search_rf_gridsearch.best_score_)
print("Best parameters for Random Forest:")
print(search_rf_gridsearch.best_params_)

# Store the best Random Forest model
best_rf_model_gridsearch = search_rf_gridsearch.best_estimator_

Fitting 3 folds for each of 36 candidates, totalling 108 fits
---------------------
Best score for Random Forest:
0.8155563190191312
Best parameters for Random Forest:
{'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300}


In [None]:
# Define a parameter grid to explore with GridSearchCV
param_grid_lgb_gridsearch = {
    'n_estimators': [800, 1000, 1200],
    'boosting_type': ['gbdt'],
    'objective': ['binary'],
    'learning_rate': [0.1, 0.15, 0.2, 0.25],
    'num_leaves': [80, 100, 120],
    'max_depth': [-1, 8, 10, 12],
}

# Perform GridSearchCV for LightGBM
search_lgb_gridsearch = GridSearchCV(lgb.LGBMClassifier(random_state=42), param_grid=param_grid_lgb_gridsearch, cv=3, verbose=1, scoring='accuracy')
search_lgb_gridsearch.fit(X_train, y_train)

# Show the best score and parameters for LightGBM
print("---------------------")
print("Best score for LightGBM:")
print(search_lgb_gridsearch.best_score_)
print("Best parameters for LightGBM:")
print(search_lgb_gridsearch.best_params_)

# Store the best LightGBM model
best_lgb_model_gridsearch = search_lgb_gridsearch.best_estimator_

Fitting 3 folds for each of 144 candidates, totalling 432 fits
[LightGBM] [Info] Number of positive: 64980, number of negative: 64819
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003634 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3099
[LightGBM] [Info] Number of data points in the train set: 129799, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500620 -> initscore=0.002481
[LightGBM] [Info] Start training from score 0.002481
[LightGBM] [Info] Number of positive: 64980, number of negative: 64819
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003149 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3097
[LightGBM] [Info] Number of data points in the tr

In [8]:
# Define a parameter grid to explore with GridSearchCV
param_grid_mlp_gridsearch = {
    'hidden_layer_sizes': [(100, 100), (150, 100), (150, 75), (150, 100, 50)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.0005, 0.001, 0.05],
    'max_iter': [700, 800, 900, 1000]
}

# Perform GridSearchCV for MLP Classifier
search_mlp_gridsearch = GridSearchCV(MLPClassifier(early_stopping=True, random_state=42), param_grid=param_grid_mlp_gridsearch, cv=3, verbose=1, scoring='accuracy')
search_mlp_gridsearch.fit(X_train, y_train)

# Show the best score and parameters for MLP Classifier
print("---------------------")
print("Best score for MLP Classifier:")
print(search_mlp_gridsearch.best_score_)
print("Best parameters for MLP Classifier:")
print(search_mlp_gridsearch.best_params_)

# Store the best MLP Classifier model
best_mlp_model_gridsearch = search_mlp_gridsearch.best_estimator_

Fitting 3 folds for each of 128 candidates, totalling 384 fits
---------------------
Best score for MLP Classifier:
0.7191819221480356
Best parameters for MLP Classifier:
{'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (150, 100), 'max_iter': 700, 'solver': 'adam'}


In [None]:
# Save the best models explored with GridSearchCV to the './models/gridsearch/' folder
joblib.dump(best_lr_model_gridsearch, './models/gridsearch/best_lr_model_gridsearch.pkl')
joblib.dump(best_rf_model_gridsearch, './models/gridsearch/best_rf_model_gridsearch.pkl')
joblib.dump(best_lgb_model_gridsearch, './models/gridsearch/best_lgb_model_gridsearch.pkl')
joblib.dump(best_mlp_model_gridsearch, './models/gridsearch/best_mlp_model_gridsearch.pkl')

['./models/gridsearch/best_mlp_model_gridsearch.pkl']