In [None]:
# Install the Kaggle CLI into the environment of the running kernel.
%pip install -q kaggle
# -p: do not fail if ~/.kaggle already exists (e.g. when this cell is re-run).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the API token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

!kaggle competitions download -c scrabble-player-rating
# -o: overwrite previously extracted files instead of prompting on re-run.
!unzip -o scrabble-player-rating

Downloading scrabble-player-rating.zip to /content
 88% 33.0M/37.3M [00:00<00:00, 88.5MB/s]
100% 37.3M/37.3M [00:00<00:00, 92.8MB/s]
Archive:  scrabble-player-rating.zip
  inflating: games.csv               
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               
  inflating: turns.csv               


# Importing Dependencies

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from collections import defaultdict

## Loading data

In [None]:
# NOTE(review): 'full_train.csv' is not one of the files extracted from the
# competition archive (games.csv, sample_submission.csv, test.csv, train.csv,
# turns.csv); presumably it is produced by a separate feature-engineering
# step -- TODO confirm and document where this file comes from.
full_train = pd.read_csv('full_train.csv')

full_train.head()

Unnamed: 0,game_id,nickname,score,bot_nickname,bot_score,bot_rating,rating,first,time_control_name,game_end_reason,...,bot_std,turn_number,Player_Exchanged,Player_Passed,Player_Six_Rule,Player_Challenged,Bot_Exchanged,Bot_Passed,Bot_Six_Rule,Bot_Challenged
0,1,stevy,429,BetterBot,335,1637,1500,1,regular,STANDARD,...,16.457794,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3,davidavid,440,BetterBot,318,2071,1811,1,regular,STANDARD,...,18.05831,27.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,4,Inandoutworker,119,BetterBot,478,1936,1473,1,regular,RESIGNED,...,20.945053,29.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,stevy,325,STEEBot,427,1844,1500,1,regular,STANDARD,...,20.346522,31.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6,HivinD,378,STEEBot,427,2143,2029,0,regular,STANDARD,...,23.572039,25.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


## Splitting data into train and test set

In [None]:
from sklearn.model_selection import train_test_split

def split_data(df, test_size=0.3, random_state=123):
    """Split ``df`` into train/test features and targets.

    The ``rating`` column is used as the target; all remaining columns are
    returned as features. No separate validation set is produced.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing a ``rating`` column.
    test_size : float or int, default 0.3
        Forwarded to ``train_test_split``; size of the held-out test set.
    random_state : int or None, default 123
        Forwarded to ``train_test_split``; seed controlling the shuffle.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Feature rows for the training and test sets.
    y_train, y_test : pandas.Series
        ``rating`` values aligned with ``X_train`` and ``X_test``.

    Raises
    ------
    KeyError
        If ``df`` has no ``rating`` column.
    """
    features = df.drop(columns="rating")
    # Copy so the returned targets are independent of the caller's frame.
    target = df["rating"].copy()
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

In [None]:
# Hold out 30% of the rows as a test set (split_data defaults: test_size=0.3, random_state=123).
X_train, X_test, y_train, y_test = split_data(full_train)

In [None]:
# Preview the first rows of the training features (the 'rating' target has been removed).
X_train.head()

Unnamed: 0,game_id,nickname,score,bot_nickname,bot_score,bot_rating,first,time_control_name,game_end_reason,winner,...,bot_std,turn_number,Player_Exchanged,Player_Passed,Player_Six_Rule,Player_Challenged,Bot_Exchanged,Bot_Passed,Bot_Six_Rule,Bot_Challenged
16914,24353,Glilac,476,STEEBot,512,2066,0,regular,STANDARD,0,...,19.941733,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18548,26745,MSHS-21KADMUSLEE,358,STEEBot,424,2087,1,regular,STANDARD,0,...,21.317201,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5729,8243,BB-8,417,HastyBot,381,1664,0,regular,STANDARD,1,...,17.205498,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8267,11877,HivinD,409,STEEBot,495,2162,1,regular,STANDARD,0,...,23.62807,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13101,18842,KongCH,381,HastyBot,414,2166,0,regular,STANDARD,0,...,17.648075,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
def drop_columns(df):
    """Return a new DataFrame without the ``game_id`` and ``nickname`` columns.

    ``df`` itself is left unmodified. A ``KeyError`` is raised by pandas if
    either column is missing.
    """
    unwanted = ['game_id', 'nickname']
    return df.drop(unwanted, axis=1)

# Remove the game_id and nickname columns from the training features.
train_x = drop_columns(X_train)

train_x.head()

Unnamed: 0,score,bot_nickname,bot_score,bot_rating,first,time_control_name,game_end_reason,winner,lexicon,initial_time_seconds,...,bot_std,turn_number,Player_Exchanged,Player_Passed,Player_Six_Rule,Player_Challenged,Bot_Exchanged,Bot_Passed,Bot_Six_Rule,Bot_Challenged
16914,476,STEEBot,512,2066,0,regular,STANDARD,0,CSW21,1200,...,19.941733,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18548,358,STEEBot,424,2087,1,regular,STANDARD,0,CSW21,1200,...,21.317201,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5729,417,HastyBot,381,1664,0,regular,STANDARD,1,ECWL,900,...,17.205498,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8267,409,STEEBot,495,2162,1,regular,STANDARD,0,CSW21,900,...,23.62807,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13101,381,HastyBot,414,2166,0,regular,STANDARD,0,CSW21,1200,...,17.648075,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Inspect dtypes and non-null counts to see which columns are categorical (object dtype).
train_x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13551 entries, 16914 to 15725
Data columns (total 31 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   score                  13551 non-null  int64  
 1   bot_nickname           13551 non-null  object 
 2   bot_score              13551 non-null  int64  
 3   bot_rating             13551 non-null  int64  
 4   first                  13551 non-null  int64  
 5   time_control_name      13551 non-null  object 
 6   game_end_reason        13551 non-null  object 
 7   winner                 13551 non-null  int64  
 8   lexicon                13551 non-null  object 
 9   initial_time_seconds   13551 non-null  int64  
 10  increment_seconds      13551 non-null  int64  
 11  rating_mode            13551 non-null  object 
 12  max_overtime_minutes   13551 non-null  int64  
 13  game_duration_seconds  13551 non-null  float64
 14  game_created_time      13551 non-null  float64
 15

Since we will be using a tree-based ML model, we do not need to standardize our numeric features.

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Object-dtype columns of train_x (positions 1, 5, 6, 8 and 11 in train_x.info()).
# Selecting them by name keeps the encoder correct if the column order changes.
categorical_columns = [
    'bot_nickname',
    'time_control_name',
    'game_end_reason',
    'lexicon',
    'rating_mode',
]

# One-hot encode the categorical columns and pass every other column through
# unchanged. handle_unknown='ignore' encodes a category that was not seen
# during fit as all zeros instead of raising when new data is transformed.
Preprocess = ColumnTransformer([
    ('oneHotEncoding', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
], remainder='passthrough')

# Fit the encoder on the training features only and transform them.
training_data = Preprocess.fit_transform(train_x)

## Using Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

# Base estimator for the hyper-parameter search; the seed is fixed for reproducibility.
rf = RandomForestRegressor(random_state=123, criterion='squared_error')


In [None]:
# Candidate values for the random-forest hyper-parameter search.

# 15 tree counts spread evenly between 100 and 2000 (truncated to int).
n_estimators = [int(i) for i in np.linspace(100, 2000, num = 15)]
# 1.0 means "consider every feature at each split". It replaces 'auto', which
# had the same meaning for regressors but was deprecated in scikit-learn 1.1
# and removed in 1.3, where it raises an error.
max_features = [1.0, 'sqrt']
# Maximum tree depths: 10, 20, ..., 100.
max_depth = [int(i) for i in np.linspace(10,100, num = 10)]
min_sample_split = [2,5,10]
min_sample_leaf = [2,4,6]
bootstrap = [True, False]

# Keys must match RandomForestRegressor constructor argument names.
param_grid = {'n_estimators':n_estimators,
'max_features': max_features,
'max_depth':max_depth,
'min_samples_split': min_sample_split,
'min_samples_leaf': min_sample_leaf,
'bootstrap': bootstrap
}

# Randomized search over param_grid: 100 sampled candidates, each scored with
# 5-fold cross-validation, using all available cores and a fixed seed.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=100,
    cv=5,
    n_jobs=-1,
    random_state=123,
)

In [None]:
# Run the search: 100 candidates x 5 folds = 500 forest fits, so expect this cell to be slow.
rf_random.fit(training_data, y_train)



In [None]:
# Hyper-parameter combination that scored best in the randomized search.
rf_random.best_params_