# Surrogate Model
### Training of Random Forest, Gradient Boosting, and Extra Trees Classifier wrapped in Ordinal Classifier Framework 

In [1]:
import os
# Switch the working directory to the parent folder; presumably so that the
# relative "data/..." paths and the "src" package imports used below resolve.
# NOTE(review): this is not idempotent — re-running the cell moves up one more level.
os.chdir("..")
#import pickle
import pandas as pd
import numpy as np
import random
import copy
#import seaborn as sn
#import matplotlib.pyplot as plt
#import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
#from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split, RandomizedSearchCV
#from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from src.preprocessing.transform_into_model_data_ff import *
from src.models.ordinal_classifier import *
#import matplotlib.pyplot as plt
#from matplotlib import pyplot
from pprint import pprint
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score

### 1. Set seeds

In [2]:
# Seed Python's and NumPy's global random number generators with the same
# value so that repeated runs of the notebook draw identical numbers.
SEED = 73
for seed_rng in (random.seed, np.random.seed):
    seed_rng(SEED)

### 2. Load data

In [3]:
# Read the pre-split train and test sets of the fitness function data.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which looks like
# a saved row index read back as a regular column — confirm whether it is meant
# to be part of the model inputs.
train_path = "data/fitness_function/train_ff.csv"
test_path = "data/fitness_function/test_ff.csv"
train_dataset, test_dataset = pd.read_csv(train_path), pd.read_csv(test_path)
train_dataset.head()

Unnamed: 0,an_vec_0,an_vec_1,an_vec_2,an_vec_3,an_vec_4,an_vec_5,an_vec_6,an_vec_7,an_vec_8,an_vec_9,...,rel_width,rel_x_position,rel_y_position,rel_x_position_to_animations,rel_y_position_to_animations,nr_paths_svg,rating_0,rating_1,rating_2,rating_3
0,0,0,0,1,0,0,-1.0,-1.0,-1.0,-1.0,...,0.054752,0.033838,0.04212,0.039501,0.051404,24.0,1,1,1,0
1,0,0,0,0,0,1,-1.0,-1.0,-1.0,-1.0,...,0.395994,0.501511,0.579289,0.714309,0.706974,24.0,1,1,1,0
2,0,0,0,0,1,0,-1.0,-1.0,-1.0,-1.0,...,0.395994,0.501511,0.63794,0.714309,0.778553,24.0,1,1,1,0
3,1,0,0,0,0,0,0.134364,0.847434,-1.0,-1.0,...,0.054752,0.033838,0.04212,0.039501,0.051404,24.0,0,0,0,0
4,0,0,1,0,0,0,-1.0,-1.0,-1.0,0.763775,...,0.395994,0.501511,0.579289,0.714309,0.706974,24.0,1,1,0,0


We need to decode the rating labels, because the original labels are required here.

In [4]:
# The four trailing columns (rating_0 ... rating_3) hold the encoded rating;
# every column before them is used as a model input. The encoded block is
# passed through decode_classes and flattened into one label per row.
X_train, ratings_train = train_dataset.iloc[:, :-4], train_dataset.iloc[:, -4:]
y_train = pd.Series(decode_classes(ratings_train.to_numpy()).flatten())

X_test, ratings_test = test_dataset.iloc[:, :-4], test_dataset.iloc[:, -4:]
y_test = pd.Series(decode_classes(ratings_test.to_numpy()).flatten())

In [5]:
# Preview the feature matrix (every column except the four trailing rating columns).
X_train.head()

Unnamed: 0,an_vec_0,an_vec_1,an_vec_2,an_vec_3,an_vec_4,an_vec_5,an_vec_6,an_vec_7,an_vec_8,an_vec_9,...,diff_fill_r,diff_fill_g,diff_fill_b,rel_height,rel_width,rel_x_position,rel_y_position,rel_x_position_to_animations,rel_y_position_to_animations,nr_paths_svg
0,0,0,0,1,0,0,-1.0,-1.0,-1.0,-1.0,...,-4.541667,-4.541667,-4.541667,0.084239,0.054752,0.033838,0.04212,0.039501,0.051404,24.0
1,0,0,0,0,0,1,-1.0,-1.0,-1.0,-1.0,...,102.458333,102.458333,102.458333,0.362888,0.395994,0.501511,0.579289,0.714309,0.706974,24.0
2,0,0,0,0,1,0,-1.0,-1.0,-1.0,-1.0,...,102.458333,102.458333,102.458333,0.362904,0.395994,0.501511,0.63794,0.714309,0.778553,24.0
3,1,0,0,0,0,0,0.134364,0.847434,-1.0,-1.0,...,-4.541667,-4.541667,-4.541667,0.084239,0.054752,0.033838,0.04212,0.039501,0.051404,24.0
4,0,0,1,0,0,0,-1.0,-1.0,-1.0,0.763775,...,102.458333,102.458333,102.458333,0.362888,0.395994,0.501511,0.579289,0.714309,0.706974,24.0


In [6]:
# Preview the decoded rating labels.
y_train.head()

0    3
1    3
2    3
3    0
4    2
dtype: int64

### 3. Train models

#### 3.1 Random Forest

In [7]:
# Wrap Random Forest into Ordinal Classifier framework
rf = OrdinalClassifier(n_estimators=1000)

# Fit the model
rf.fit(X_train, y_train)

In [16]:
# Predict labels for the held-out test set first, then for the training set.
y_pred_test, y_pred_train = rf.predict(X_test), rf.predict(X_train)

In [17]:
# Share of exactly matching labels on each split for the untuned model.
train_accuracy = accuracy_score(y_pred_train, y_train)
test_accuracy = accuracy_score(y_pred_test, y_test)
print(f'Accuracy of random forest classifier on train set without hyperparameter optimization: {train_accuracy}')
print(f'Accuracy of random forest classifier on test set without hyperparameter optimization: {test_accuracy}')

Accuracy of random forest classifier on train set without hyperparameter optimization: 0.9417969485778867
Accuracy of random forest classifier on test set without hyperparameter optimization: 0.41276252019386106


In [18]:
# Mean absolute error between true and predicted labels for the untuned model.
# The test-set value must use y_pred_test (the random forest's test predictions),
# not y_pred: y_pred is only assigned further down by the gradient boosting cell,
# so on a fresh kernel it is undefined here.
train_mae = mean_absolute_error(y_train, y_pred_train)
test_mae = mean_absolute_error(y_test, y_pred_test)
print(f'Label MAE of random forest classifier on train set without hyperparameter optimization: {train_mae}')
print(f'Label MAE of random forest classifier on test set without hyperparameter optimization: {test_mae}')

Label MAE of random forest classifier on train set without hyperparameter optimization: 0.09003578828404596
Label MAE of random forest classifier on test set without hyperparameter optimization: 0.8659127625201939


#### 3.2 Gradient Boosting Classifier

In [24]:
# Fit scikit-learn's GradientBoostingClassifier: 1000 depth-1 trees with a
# learning rate of 1.0 and a fixed random_state.
# NOTE(review): despite the variable name this is not XGBoost, and the model is
# not wrapped in OrdinalClassifier.
xgb = GradientBoostingClassifier(
    n_estimators=1000,
    learning_rate=1.0,
    max_depth=1,
    random_state=0,
)
xgb.fit(X_train, y_train)

# Test-set predictions of the boosted model
y_pred = xgb.predict(X_test)

In [25]:
# Test-set accuracy of the gradient boosting model (y_pred holds its predictions).
# The message names the gradient boosting classifier so the printed result is not
# mistaken for the random forest's.
print(f'Accuracy of gradient boosting classifier on test set without hyperparameter optimization: {accuracy_score(y_test, y_pred)}')

Accuracy of random forest classifier on test set without hyperparameter optimization: 0.3667205169628433


In [26]:
# Test-set label MAE of the gradient boosting model (y_pred holds its predictions).
# The message names the gradient boosting classifier so the printed result is not
# mistaken for the random forest's.
print(f'Label MAE of gradient boosting classifier on test set without hyperparameter optimization: {mean_absolute_error(y_test, y_pred)}')

Label MAE of random forest classifier on test set without hyperparameter optimization: 0.9135702746365105


#### 3.3 Random Search

In [8]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=20, stop=2000, num=10)]
# Number of features to consider at every split
max_features = ['auto', 'log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'log2'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [20, 240, 460, 680, 900, 1120, 1340, 1560, 1780, 2000]}


In [9]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = OrdinalClassifier()
# Random search of parameters, using 3 fold cross validation, search across 100 different combinations
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 2, cv = 3, verbose=2, random_state=42, scoring = 'neg_mean_absolute_error')
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] END bootstrap=True, max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=20; total time=   0.5s
[CV] END bootstrap=True, max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=20; total time=   0.5s
[CV] END bootstrap=True, max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=20; total time=   0.5s
[CV] END bootstrap=False, max_depth=90, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=460; total time=  13.4s
[CV] END bootstrap=False, max_depth=90, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=460; total time=  13.1s
[CV] END bootstrap=False, max_depth=90, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=460; total time=  13.4s


RandomizedSearchCV(cv=3,
                   estimator=<src.models.ordinal_classifier.OrdinalClassifier object at 0x7fd88041f3a0>,
                   n_iter=2,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'log2'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [20, 240, 460, 680, 900,
                                                         1120, 1340, 1560, 1780,
                                                         2000]},
                   random_state=42, scoring='neg_mean_absolute_error',
                   verbose=2)

In [10]:
# Parameter combination that achieved the best cross-validated score in the search.
rf_random.best_params_

{'n_estimators': 460,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_features': 'log2',
 'max_depth': 90,
 'bootstrap': False}

In [11]:
# Best cross-validated score of the search. The search was configured with
# scoring='neg_mean_absolute_error', so this is the negated label MAE
# (closer to zero is better).
rf_random.best_score_

-0.8632442600594673