# Loading Required dependencies

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from collections import defaultdict

import os

# Function to load data

All the data are saved in the `data` folder. It contains not only the raw data, but also all the preprocessed data since I donot want to have to keep running the prepocessing steps. So I would like to create a function that loads all the data from the subfolders in the data folder as needed.

In [6]:
def get_data(folder_name, file_name):
    """
        loads the 'file_name' csv file from the 'folder_name' folder
        get_data(folder_name, file_name) : Str Str -> Pandas DataFrame 
    """
    source_path = os.path.join('..\data', folder_name)
    if file_name == 'sample_submission.csv':
        path_data = os.path.join(source_path, file_name)
        data_df = pd.read_csv(path_data)
    elif file_name == 'train.csv':
        path_data = os.path.join(source_path, file_name)
        data_df = pd.read_csv(path_data)
    elif file_name == 'games.csv':
        path_data = os.path.join(source_path, file_name)
        data_df = pd.read_csv(path_data, parse_dates = ['created_at'])
    elif file_name == 'turns.csv':
        path_data = os.path.join(source_path, file_name)
        data_df = pd.read_csv(path_data)
    else:
        path_data = os.path.join(source_path, file_name)
        data_df = pd.read_csv(path_data)
    return data_df

## Test
train_data = get_data('main_data', 'train.csv')
train_data.head()

Unnamed: 0,game_id,nickname,score,rating
0,1,BetterBot,335,1637
1,1,stevy,429,1500
2,3,davidavid,440,1811
3,3,BetterBot,318,2071
4,4,Inandoutworker,119,1473


# Creating a custom Scikit-Learn Transformer:

we need to predict the ratings for the human players but we are given data for both human and bot players for each game. So, the data for the bot players will clearly be helpful in predicting the human players ratings, so I would like to extract the data for the bots and add them as feature columns for the human players

In [17]:
from sklearn.base import BaseEstimator, TransformerMixin

class AddBotFeatures (BaseEstimator, TransformerMixin):
    def __init__(self):
        return None
    def fit(self, X):
        return self
    def transform (self, X):
        bot_names = ['BetterBot', 'STEEBot', 'HastyBot']
        bot_data = X.loc[X['nickname'].isin(bot_names)].copy()
        bot_data.rename(columns={'nickname':'bot_nickname', 'score': 'bot_score', 'rating': 'bot_rating'}, inplace= True )
        human_data = X.loc[~X['nickname'].isin(bot_names)].copy()
        
        # Join the two dataframe
        new_df = human_data.join(bot_data.set_index('game_id'), on='game_id')

        # Move the rating column to the end
        column_to_move = new_df.pop("rating")

        new_df.insert(6, "rating", column_to_move)

        return new_df


# Test
new_train = AddBotFeatures()

training_data = new_train.fit_transform(train_data)

training_data.head()
        

Unnamed: 0,game_id,nickname,score,bot_nickname,bot_score,bot_rating,rating
1,1,stevy,429,BetterBot,335,1637,1500
2,3,davidavid,440,BetterBot,318,2071,1811
4,4,Inandoutworker,119,BetterBot,478,1936,1473
6,5,stevy,325,STEEBot,427,1844,1500
8,6,HivinD,378,STEEBot,427,2143,2029


# Split data into train-test-val

In [21]:
from sklearn.model_selection import train_test_split


def split_data(df):
    """Splits df into training, testing and validation sets
        split_data: Pandas DataFrame -> Pandas DataFrame, Pandas DataFrame, Pandas DataFrame, Pandas DataFrame
    """
    X_data = df.drop(columns = "rating")
    train_y = df["rating"].copy()
    X_train, X_test, y_train, y_test = train_test_split(X_data, train_y, test_size=0.3, random_state=123)
    return X_train, X_test, y_train, y_test

# Test
X_train, X_test, y_train, y_test = split_data(training_data)

X_test.shape                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    

(15123, 6)

In [22]:
X_train

Unnamed: 0,game_id,nickname,score,bot_nickname,bot_score,bot_rating
38284,27623,BB-8,440,HastyBot,333,1664
16334,11732,jodel,380,BetterBot,425,1966
64835,46774,BB-8,430,HastyBot,307,1640
61653,44462,Yuki86,260,STEEBot,630,2071
41920,30222,PVMG,395,BetterBot,364,1810


In [51]:
# Standardize the score, bot_score and bot_rating
Preprocess = ColumnTransformer([
    ('StandardScaler', StandardScaler(), [2,4]),
    ('oneHotEncoding', OneHotEncoder(), [3])
])

train_X = Preprocess.fit_transform(X_train)

train_X

array([[ 0.8784874 , -1.26119438,  0.        ,  1.        ,  0.        ],
       [ 0.05168372,  0.04482351,  1.        ,  0.        ,  0.        ],
       [ 0.74068679, -1.63028639,  0.        ,  1.        ,  0.        ],
       ...,
       [-1.2574221 , -0.09713496,  0.        ,  1.        ,  0.        ],
       [ 1.04384814, -0.12552665,  0.        ,  1.        ,  0.        ],
       [-0.71999971,  1.43601647,  0.        ,  0.        ,  1.        ]])

In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

rf = RandomForestRegressor(random_state=123, criterion='squared_error')


In [9]:
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 123,
 'verbose': 0,
 'warm_start': False}

In [10]:
n_estimators = [int(i) for i in np.linspace(100, 2000, num = 15)]
max_features = ['auto', 'sqrt']
max_depth = [int(i) for i in np.linspace(10,100, num = 10)]
min_sample_split = [2,5,10]
min_sample_leaf = [2,4,6]
bootstrap = [True, False]

param_grid = {'n_estimators':n_estimators,
'max_features': max_features,
'max_depth':max_depth,
'min_samples_split': min_sample_split,
'min_samples_leaf': min_sample_leaf,
'bootstrap': bootstrap
}

rf_random = RandomizedSearchCV(rf, param_distributions=param_grid, n_iter = 100, cv = 5, random_state=123, n_jobs=-1)

In [68]:
rf_random.fit(train_X, y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(random_state=123),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [2, 4, 6],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [100, 235, 371, 507,
                                                         642, 778, 914, 1050,
                                                         1185, 1321, 1457, 1592,
                                                         1728, 1864, 2000]},
                   random_state=123)

In [37]:
from sklearn.model_selection import GridSearchCV

n_estimators = [int(i) for i in np.linspace(1700, 1850, num = 4)]
n_estimators.append(1728)
max_features = ['auto', 'sqrt']
max_depth = [int(i) for i in np.linspace(2,12, num = 4)]
min_sample_split = [3,4,5,6]
min_sample_leaf = [1,2,3,4,5]
bootstrap = [True, False]

grid_param = {'n_estimators':n_estimators,
'max_features': max_features,
'max_depth':max_depth,
'min_samples_split': min_sample_split,
'min_samples_leaf': min_sample_leaf,
'bootstrap': bootstrap
}

rf_search = RandomizedSearchCV(estimator = rf, param_distributions = grid_param, n_iter = 10, cv = 3, scoring = 'neg_mean_squared_error', return_train_score = True)

In [38]:
rf_search.fit(train_X, y_train)

RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(random_state=123),
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [2, 5, 8, 12],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 3, 4, 5],
                                        'min_samples_split': [3, 4, 5, 6],
                                        'n_estimators': [1700, 1750, 1800, 1850,
                                                         1728]},
                   return_train_score=True, scoring='neg_mean_squared_error')

In [62]:
rf_search.best_params_

{'n_estimators': 1728,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 5,
 'bootstrap': True}

In [None]:
test_X = Preprocess.transform(X_test)

best_model = rf_search.best_estimator_

predictions = best_model.predict(test_X)

error = abs(predictions - y_test)

rmse = np.sqrt(np.mean(error))

rmse

In [80]:
def evaluate_model(test_data,true_labels, model):
    """
        evaluate model return the RMSE of the model
    """
    test_X = Preprocess.transform(test_data)
    best_model = model.best_estimator_
    predictions = best_model.predict(test_X)
    error = abs(predictions - true_labels)
    rmse = np.sqrt(np.mean(error))
    return rmse

# Test
evaluate_model(X_test, y_test, rf_search)

12.843616581358512

In [42]:
import joblib

def save_model(model, model_name):
    """
        save_model saves the model in the model folder
        save_model(model,model_name): ML model Str -> None
    """
    model_n = model_name + '.pkl'
    model_path = os.path.join('../model', model_n)
    joblib.dump(model, model_path)

In [73]:
def load_model(model_name):
    """
        load_model loads the model named 'model_name'
    """
    model_n = model_name + '.pkl'
    model_path = os.path.join('../model', model_n)
    load_rf = joblib.load(model_path)
    return load_rf

In [74]:
# Test
new_model = load_model('random_forest')

In [43]:
save_model(best_model, 'new_rf_model')

In [83]:
def save_submission(submission_name, model):

    sample_submission = get_data('main_data', 'sample_submission.csv')
    
    test_data = get_data('main_data', 'test.csv')

    testing_data = new_train.transform(test_data)
    testing_data.drop(columns = 'rating', inplace = True)

    scaled_testing = Preprocess.transform(testing_data)

    rating = model.predict(scaled_testing)
    sample_submission['rating'] = rating

    file_name = submission_name + '.csv'
    prediction_path = os.path.join('../predictions', file_name)
    sample_submission.to_csv(prediction_path, index = False)

    return sample_submission

In [84]:
# Test
save_submission('baseline', best_model)

Unnamed: 0,game_id,rating
0,2,1904.567030
1,7,1807.950146
2,11,1938.687407
3,14,1803.219463
4,27,1929.676496
...,...,...
22358,72760,1977.090705
22359,72761,1690.101873
22360,72762,1739.582866
22361,72768,1852.863651
