# Surrogate Model
### Training of Random Forest, Gradient Boosting, and Extra Trees Classifier wrapped in Ordinal Classifier Framework 

In [1]:
import os
os.chdir("..")
import pickle
import pandas as pd
import numpy as np
import random
import copy
#import seaborn as sn
#import matplotlib.pyplot as plt
#import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
#from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split, RandomizedSearchCV
#from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from src.preprocessing.transform_into_model_data_ff import *
from src.models.ordinal_classifier import *
#import matplotlib.pyplot as plt
#from matplotlib import pyplot
from pprint import pprint
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

### 1. Set seeds

In [2]:
# Set seeds in order to reproduce results
random.seed(73)
np.random.seed(73)

### 2. Load data

In [3]:
train_dataset = pd.read_csv("data/fitness_function/train_ff.csv")
test_dataset = pd.read_csv("data/fitness_function/test_ff.csv") 
train_dataset.head()

Unnamed: 0,an_vec_0,an_vec_1,an_vec_2,an_vec_3,an_vec_4,an_vec_5,an_vec_6,an_vec_7,an_vec_8,an_vec_9,...,rel_width,rel_x_position,rel_y_position,rel_x_position_to_animations,rel_y_position_to_animations,nr_paths_svg,rating_0,rating_1,rating_2,rating_3
0,0,0,0,1,0,0,-1.0,-1.0,-1.0,-1.0,...,0.054752,0.033838,0.04212,0.039501,0.051404,24.0,1,1,1,0
1,0,0,0,0,0,1,-1.0,-1.0,-1.0,-1.0,...,0.395994,0.501511,0.579289,0.714309,0.706974,24.0,1,1,1,0
2,0,0,0,0,1,0,-1.0,-1.0,-1.0,-1.0,...,0.395994,0.501511,0.63794,0.714309,0.778553,24.0,1,1,1,0
3,1,0,0,0,0,0,0.134364,0.847434,-1.0,-1.0,...,0.054752,0.033838,0.04212,0.039501,0.051404,24.0,0,0,0,0
4,0,0,1,0,0,0,-1.0,-1.0,-1.0,0.763775,...,0.395994,0.501511,0.579289,0.714309,0.706974,24.0,1,1,0,0


We need to decode rating labels as orgininal labels are required here.

In [4]:
X_train = train_dataset.iloc[:,:-4]
y_train = train_dataset.iloc[:,-4:]
y_train = pd.Series(decode_classes(y_train.to_numpy()).flatten())

X_test = test_dataset.iloc[:,:-4]
y_test = test_dataset.iloc[:,-4:]
y_test = pd.Series(decode_classes(y_test.to_numpy()).flatten())

In [5]:
X_train.head()

Unnamed: 0,an_vec_0,an_vec_1,an_vec_2,an_vec_3,an_vec_4,an_vec_5,an_vec_6,an_vec_7,an_vec_8,an_vec_9,...,diff_fill_r,diff_fill_g,diff_fill_b,rel_height,rel_width,rel_x_position,rel_y_position,rel_x_position_to_animations,rel_y_position_to_animations,nr_paths_svg
0,0,0,0,1,0,0,-1.0,-1.0,-1.0,-1.0,...,-4.541667,-4.541667,-4.541667,0.084239,0.054752,0.033838,0.04212,0.039501,0.051404,24.0
1,0,0,0,0,0,1,-1.0,-1.0,-1.0,-1.0,...,102.458333,102.458333,102.458333,0.362888,0.395994,0.501511,0.579289,0.714309,0.706974,24.0
2,0,0,0,0,1,0,-1.0,-1.0,-1.0,-1.0,...,102.458333,102.458333,102.458333,0.362904,0.395994,0.501511,0.63794,0.714309,0.778553,24.0
3,1,0,0,0,0,0,0.134364,0.847434,-1.0,-1.0,...,-4.541667,-4.541667,-4.541667,0.084239,0.054752,0.033838,0.04212,0.039501,0.051404,24.0
4,0,0,1,0,0,0,-1.0,-1.0,-1.0,0.763775,...,102.458333,102.458333,102.458333,0.362888,0.395994,0.501511,0.579289,0.714309,0.706974,24.0


In [6]:
y_train.head()

0    3
1    3
2    3
3    0
4    2
dtype: int64

In [None]:
# TODO: upsampling of class 4 to account for imbalanced classes

### 3. Train models

#### 3.1 Random Forest

##### Define Grid for Random Search

In [7]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=20, stop=2000, num=10)]
# Number of features to consider at every split
max_features = ['auto', 'log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'log2'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [20, 240, 460, 680, 900, 1120, 1340, 1560, 1780, 2000]}


##### Perform Random Search

In [9]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestOC()
# Random search of parameters, using 3 fold cross validation, search across 100 different combinations
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 3, cv = 5, verbose=2, random_state=42, scoring = 'neg_mean_absolute_error')
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] END bootstrap=True, max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=20; total time=   0.6s
[CV] END bootstrap=True, max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=20; total time=   0.6s
[CV] END bootstrap=True, max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=20; total time=   0.6s
[CV] END bootstrap=True, max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=20; total time=   0.6s
[CV] END bootstrap=True, max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=20; total time=   0.6s
[CV] END bootstrap=False, max_depth=90, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=460; total time=  15.9s
[CV] END bootstrap=False, max_depth=90, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=460; tot

RandomizedSearchCV(cv=5,
                   estimator=<src.models.ordinal_classifier.RandomForestOC object at 0x7fe718253fd0>,
                   n_iter=3,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'log2'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [20, 240, 460, 680, 900,
                                                         1120, 1340, 1560, 1780,
                                                         2000]},
                   random_state=42, scoring='neg_mean_absolute_error',
                   verbose=2)

##### Get best parameters and best evaluation score

In [10]:
rf_random.best_params_

{'n_estimators': 460,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_features': 'log2',
 'max_depth': 90,
 'bootstrap': False}

In [11]:
rf_random.best_score_

-0.8690855906466378

##### Train best model on whole training data

In [12]:
rf_best = rf_random.best_estimator_
rf_best.fit(X_train, y_train)

##### Evaluate best model on test data

In [13]:
y_pred_test = rf_best.predict(X_test)
y_pred_train = rf_best.predict(X_train)

In [15]:
print(f'Label MAE of best random forest classifier on train set: {mean_absolute_error(y_pred_train, y_train)}')
print(f'Label MAE of best random forest classifier on test set: {mean_absolute_error(y_pred_test, y_test)}')

Label MAE of best random forest classifier on train set: 0.24618572235825956
Label MAE of best random forest classifier on test set: 0.8368336025848142


In [16]:
print(f'Accuracy of  best random forest classifier on train set: {accuracy_score(y_pred_train, y_train)}')
print(f'Accuracy of best random forest classifier on test set: {accuracy_score(y_pred_test, y_test)}')

Accuracy of  best random forest classifier on train set: 0.8372574872857412
Accuracy of best random forest classifier on test set: 0.41518578352180935


In [17]:
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.55      0.79      0.65       238
           1       0.24      0.15      0.19       205
           2       0.36      0.64      0.46       348
           3       0.49      0.19      0.28       375
           4       0.00      0.00      0.00        72

    accuracy                           0.42      1238
   macro avg       0.33      0.35      0.31      1238
weighted avg       0.39      0.42      0.37      1238



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
print(confusion_matrix(y_test, y_pred_test))

[[187  15  32   4   0]
 [ 53  31  98  23   0]
 [ 51  38 223  36   0]
 [ 43  36 223  73   0]
 [  4  10  44  14   0]]


##### Save best model

In [21]:
filename = 'models/sm_random_forest.sav'
pickle.dump(rf_best, open(filename, 'wb'))

#### 3.2 Gradient Boosting Classifier

##### Define Grid for Random Search

In [24]:
# TODO: define meaningful parameter grid

##### Perform Random Search

In [23]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
gb = GradientBoostingOC()
# Random search of parameters, using 3 fold cross validation, search across 100 different combinations
gb_random = RandomizedSearchCV(estimator = gb, param_distributions = random_grid, n_iter = 3, cv = 5, verbose=2, random_state=42, scoring = 'neg_mean_absolute_error')
# Fit the random search model
gb_random.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] END bootstrap=True, max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=20; total time=   0.7s
[CV] END bootstrap=True, max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=20; total time=   0.6s
[CV] END bootstrap=True, max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=20; total time=   0.6s
[CV] END bootstrap=True, max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=20; total time=   0.6s
[CV] END bootstrap=True, max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=20; total time=   0.6s
[CV] END bootstrap=False, max_depth=90, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=460; total time=  18.3s
[CV] END bootstrap=False, max_depth=90, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=460; tot

RandomizedSearchCV(cv=5,
                   estimator=<src.models.ordinal_classifier.RandomForestOC object at 0x7fe71ffe7f40>,
                   n_iter=3,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'log2'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [20, 240, 460, 680, 900,
                                                         1120, 1340, 1560, 1780,
                                                         2000]},
                   random_state=42, scoring='neg_mean_absolute_error',
                   verbose=2)

##### Get best parameters and best evaluation score

In [10]:
gb_random.best_params_

{'n_estimators': 460,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_features': 'log2',
 'max_depth': 90,
 'bootstrap': False}

In [11]:
gb_random.best_score_

-0.8690855906466378

##### Train best model on whole training data

In [12]:
gb_best = gb_random.best_estimator_
gb_best.fit(X_train, y_train)

##### Evaluate best model on test data

In [13]:
y_pred_test = gb_best.predict(X_test)
y_pred_train = gb_best.predict(X_train)

In [15]:
print(f'Label MAE of best gradient boosting classifier on train set: {mean_absolute_error(y_pred_train, y_train)}')
print(f'Label MAE of best gradient boosting classifier on test set: {mean_absolute_error(y_pred_test, y_test)}')

Label MAE of best random forest classifier on train set: 0.24618572235825956
Label MAE of best random forest classifier on test set: 0.8368336025848142


In [16]:
print(f'Accuracy of  best gradient boosting classifier on train set: {accuracy_score(y_pred_train, y_train)}')
print(f'Accuracy of best gradient boosting classifier on test set: {accuracy_score(y_pred_test, y_test)}')

Accuracy of  best random forest classifier on train set: 0.8372574872857412
Accuracy of best random forest classifier on test set: 0.41518578352180935


In [17]:
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.55      0.79      0.65       238
           1       0.24      0.15      0.19       205
           2       0.36      0.64      0.46       348
           3       0.49      0.19      0.28       375
           4       0.00      0.00      0.00        72

    accuracy                           0.42      1238
   macro avg       0.33      0.35      0.31      1238
weighted avg       0.39      0.42      0.37      1238



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
print(confusion_matrix(y_test, y_pred_test))

[[187  15  32   4   0]
 [ 53  31  98  23   0]
 [ 51  38 223  36   0]
 [ 43  36 223  73   0]
 [  4  10  44  14   0]]


##### Save best model

In [21]:
filename = 'models/sm_gradient_boosting.sav'
pickle.dump(gb_best, open(filename, 'wb'))

#### 3.3 Extra Trees Classifier

##### Define Grid for Random Search

In [None]:
# TODO: define meaningful parameter grid

##### Perform Random Search

In [9]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
et = ExtraTreesOC()
# Random search of parameters, using 3 fold cross validation, search across 100 different combinations
et_random = RandomizedSearchCV(estimator = et, param_distributions = random_grid, n_iter = 3, cv = 5, verbose=2, random_state=42, scoring = 'neg_mean_absolute_error')
# Fit the random search model
et_random.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] END bootstrap=True, max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=20; total time=   0.6s
[CV] END bootstrap=True, max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=20; total time=   0.6s
[CV] END bootstrap=True, max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=20; total time=   0.6s
[CV] END bootstrap=True, max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=20; total time=   0.6s
[CV] END bootstrap=True, max_depth=50, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=20; total time=   0.6s
[CV] END bootstrap=False, max_depth=90, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=460; total time=  15.9s
[CV] END bootstrap=False, max_depth=90, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=460; tot

RandomizedSearchCV(cv=5,
                   estimator=<src.models.ordinal_classifier.RandomForestOC object at 0x7fe718253fd0>,
                   n_iter=3,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'log2'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [20, 240, 460, 680, 900,
                                                         1120, 1340, 1560, 1780,
                                                         2000]},
                   random_state=42, scoring='neg_mean_absolute_error',
                   verbose=2)

##### Get best parameters and best evaluation score

In [10]:
et_random.best_params_

{'n_estimators': 460,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_features': 'log2',
 'max_depth': 90,
 'bootstrap': False}

In [11]:
et_random.best_score_

-0.8690855906466378

##### Train best model on whole training data

In [12]:
et_best = et_random.best_estimator_
et_best.fit(X_train, y_train)

##### Evaluate best model on test data

In [13]:
y_pred_test = et_best.predict(X_test)
y_pred_train = et_best.predict(X_train)

In [15]:
print(f'Label MAE of best extra trees classifier on train set: {mean_absolute_error(y_pred_train, y_train)}')
print(f'Label MAE of best extra trees classifier on test set: {mean_absolute_error(y_pred_test, y_test)}')

Label MAE of best random forest classifier on train set: 0.24618572235825956
Label MAE of best random forest classifier on test set: 0.8368336025848142


In [16]:
print(f'Accuracy of  best extra trees classifier on train set: {accuracy_score(y_pred_train, y_train)}')
print(f'Accuracy of best extra trees classifier on test set: {accuracy_score(y_pred_test, y_test)}')

Accuracy of  best random forest classifier on train set: 0.8372574872857412
Accuracy of best random forest classifier on test set: 0.41518578352180935


In [17]:
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.55      0.79      0.65       238
           1       0.24      0.15      0.19       205
           2       0.36      0.64      0.46       348
           3       0.49      0.19      0.28       375
           4       0.00      0.00      0.00        72

    accuracy                           0.42      1238
   macro avg       0.33      0.35      0.31      1238
weighted avg       0.39      0.42      0.37      1238



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
print(confusion_matrix(y_test, y_pred_test))

[[187  15  32   4   0]
 [ 53  31  98  23   0]
 [ 51  38 223  36   0]
 [ 43  36 223  73   0]
 [  4  10  44  14   0]]


##### Save best model

In [21]:
filename = 'models/sm_extra_trees.sav'
pickle.dump(et_best, open(filename, 'wb'))