# Importing Dependencies

In [1]:
import pandas as pd
import numpy as np

import os
import sys

path = os.getcwd().replace('\\notebooks', '')
script_path = os.path.join(path, 'utils')

sys.path.append(script_path)

from preprocess import *


### Loading data

In [2]:
full_train = get_data('processed_data', 'full_train.csv')

full_train.head()

Unnamed: 0,game_id,nickname,score,bot_nickname,bot_score,bot_rating,rating,first,time_control_name,game_end_reason,...,bot_std,turn_number,Player_Exchanged,Player_Passed,Player_Six_Rule,Player_Challenged,Bot_Exchanged,Bot_Passed,Bot_Six_Rule,Bot_Challenged
0,1,stevy,429,BetterBot,335,1637,1500,1,regular,STANDARD,...,16.457794,27,0,0,0,0,0,0,0,0
1,3,davidavid,440,BetterBot,318,2071,1811,1,regular,STANDARD,...,18.05831,27,0,0,0,0,1,0,0,0
2,4,Inandoutworker,119,BetterBot,478,1936,1473,1,regular,RESIGNED,...,20.945053,29,1,0,0,0,0,0,0,0
3,5,stevy,325,STEEBot,427,1844,1500,1,regular,STANDARD,...,20.346522,31,1,0,0,0,0,0,0,0
4,6,HivinD,378,STEEBot,427,2143,2029,0,regular,STANDARD,...,23.572039,25,0,0,0,0,1,0,0,0


## Splitting data into train and test set

In [3]:
X_train, X_test, y_train, y_test = split_data(full_train)

In [4]:
X_train.head()

Unnamed: 0,game_id,nickname,score,bot_nickname,bot_score,bot_rating,first,time_control_name,game_end_reason,winner,...,bot_std,turn_number,Player_Exchanged,Player_Passed,Player_Six_Rule,Player_Challenged,Bot_Exchanged,Bot_Passed,Bot_Six_Rule,Bot_Challenged
19142,27623,BB-8,440,HastyBot,333,1664,1,regular,STANDARD,1,...,19.49589,27,0,0,0,0,1,1,0,0
8167,11732,jodel,380,BetterBot,425,1966,1,regular,STANDARD,0,...,16.829679,28,0,0,0,0,0,0,0,0
32417,46774,BB-8,430,HastyBot,307,1640,1,regular,STANDARD,1,...,20.559568,33,0,0,0,0,1,1,0,0
30826,44462,Yuki86,260,STEEBot,630,2071,0,regular,STANDARD,0,...,35.405345,27,0,0,0,0,0,0,0,0
20960,30222,PVMG,395,BetterBot,364,1810,1,blitz,STANDARD,1,...,22.391619,29,0,0,0,0,0,0,0,0


In [5]:
def drop_columns(df):
  df_new = df.drop(columns=['game_id', 'nickname'])
  return df_new

train_x = drop_columns(X_train)

train_x.head()

Unnamed: 0,score,bot_nickname,bot_score,bot_rating,first,time_control_name,game_end_reason,winner,lexicon,initial_time_seconds,...,bot_std,turn_number,Player_Exchanged,Player_Passed,Player_Six_Rule,Player_Challenged,Bot_Exchanged,Bot_Passed,Bot_Six_Rule,Bot_Challenged
19142,440,HastyBot,333,1664,1,regular,STANDARD,1,ECWL,900,...,19.49589,27,0,0,0,0,1,1,0,0
8167,380,BetterBot,425,1966,1,regular,STANDARD,0,CSW21,1200,...,16.829679,28,0,0,0,0,0,0,0,0
32417,430,HastyBot,307,1640,1,regular,STANDARD,1,ECWL,900,...,20.559568,33,0,0,0,0,1,1,0,0
30826,260,STEEBot,630,2071,0,regular,STANDARD,0,CSW21,900,...,35.405345,27,0,0,0,0,0,0,0,0
20960,395,BetterBot,364,1810,1,blitz,STANDARD,1,CSW21,300,...,22.391619,29,0,0,0,0,0,0,0,0


In [6]:
train_x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35287 entries, 19142 to 15725
Data columns (total 31 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   score                  35287 non-null  int64  
 1   bot_nickname           35287 non-null  object 
 2   bot_score              35287 non-null  int64  
 3   bot_rating             35287 non-null  int64  
 4   first                  35287 non-null  int64  
 5   time_control_name      35287 non-null  object 
 6   game_end_reason        35287 non-null  object 
 7   winner                 35287 non-null  int64  
 8   lexicon                35287 non-null  object 
 9   initial_time_seconds   35287 non-null  int64  
 10  increment_seconds      35287 non-null  int64  
 11  rating_mode            35287 non-null  object 
 12  max_overtime_minutes   35287 non-null  int64  
 13  game_duration_seconds  35287 non-null  float64
 14  game_created_time      35287 non-null  float64
 15

Since we will be using a tree based ML model, we donot really need to standardize our numeric features

In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

Preprocess = ColumnTransformer([
    ('oneHotEncoding', OneHotEncoder(), [1,5,6,8,11])
], remainder= 'passthrough')

training_data = Preprocess.fit_transform(train_x)

## Using Random Forest Regressor

In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

rf = RandomForestRegressor(random_state=123, criterion='squared_error')


In [9]:
n_estimators = [int(i) for i in np.linspace(100, 2000, num = 15)]
max_features = ['auto', 'sqrt']
max_depth = [int(i) for i in np.linspace(10,100, num = 10)]
min_sample_split = [2,5,10]
min_sample_leaf = [2,4,6]
bootstrap = [True, False]

param_grid = {'n_estimators':n_estimators,
'max_features': max_features,
'max_depth':max_depth,
'min_samples_split': min_sample_split,
'min_samples_leaf': min_sample_leaf,
'bootstrap': bootstrap
}

rf_random = RandomizedSearchCV(rf, param_distributions=param_grid, n_iter = 10, cv = 5, random_state=123)

In [10]:
rf_random.fit(training_data, y_train)

In [11]:
rf_random.best_params_

{'n_estimators': 642,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 30,
 'bootstrap': False}

In [12]:
best_model = rf_random.best_estimator_

save_model_or_pipeline(best_model, 'full_rf_model', 'models')

In [21]:
test_x = drop_columns(X_test)

testing_data = Preprocess.fit_transform(test_x)

In [25]:
def evaluate_model(test_data,true_labels, model):
    """
        evaluate model and return the RMSE of the model
    """
    # test_X = Preprocess.transform(test_data)
    predictions = model.predict(test_data)
    error = abs(predictions - true_labels)
    rmse = np.sqrt(np.mean(error))
    return rmse

In [26]:
evaluate_model(testing_data, y_test, best_model)

8.651530003176926

In [34]:
test_data = get_data('processed_data', 'full_test.csv')

test = drop_columns(test_data)

testing_df = Preprocess.fit_transform(test)

#testing_df = get_full_test_set(test_data) 

In [None]:
save_submission('full_rf_prediction', best_model, testing_df)