# Surrogate Model
### Training of Random Forest, Gradient Boosting, and Extra Trees Classifier wrapped in Ordinal Classifier Framework 

In [1]:
import os
os.chdir("..")
import pickle
import pandas as pd
import numpy as np
import random
import copy
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from src.preprocessing.sm_label_transformer import *
from src.models.ordinal_classifier_scikit import *
from pprint import pprint
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import IsolationForest

### 1. Set seeds

In [2]:
# Set seeds in order to reproduce results
random.seed(73)
np.random.seed(73)

### 2. Load data

In [5]:
train_dataset = pd.read_csv("data/surrogate_model/sm_train_data_augmented.csv") # change if augmentation should not be applied
test_dataset = pd.read_csv("data/surrogate_model/sm_test_data.csv") 
train_dataset.head()

Unnamed: 0,an_vec_0,an_vec_1,an_vec_2,an_vec_3,an_vec_4,an_vec_5,an_vec_6,an_vec_7,an_vec_8,an_vec_9,...,rel_width,rel_x_position,rel_y_position,rel_x_position_to_animations,rel_y_position_to_animations,nr_paths_svg,rating_0,rating_1,rating_2,rating_3
0,0.0,0.0,0.0,1.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,...,0.054752,0.033838,0.04212,0.039501,0.051404,24.0,1.0,1.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,-1.0,-1.0,-1.0,...,0.395994,0.501511,0.579289,0.714309,0.706974,24.0,1.0,1.0,1.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,-1.0,-1.0,-1.0,-1.0,...,0.395994,0.501511,0.63794,0.714309,0.778553,24.0,1.0,1.0,1.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.134364,0.847434,-1.0,-1.0,...,0.054752,0.033838,0.04212,0.039501,0.051404,24.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,0.763775,...,0.395994,0.501511,0.579289,0.714309,0.706974,24.0,1.0,1.0,0.0,0.0


We need to decode rating labels as orgininal labels are required here.

In [6]:
X_train = train_dataset.iloc[:,:-4]
y_train = train_dataset.iloc[:,-4:]
y_train = pd.Series(decode_classes(y_train.to_numpy()).flatten())

X_test = test_dataset.iloc[:,:-4]
y_test = test_dataset.iloc[:,-4:]
y_test = pd.Series(decode_classes(y_test.to_numpy()).flatten())

In [7]:
X_train.head()

Unnamed: 0,an_vec_0,an_vec_1,an_vec_2,an_vec_3,an_vec_4,an_vec_5,an_vec_6,an_vec_7,an_vec_8,an_vec_9,...,diff_fill_r,diff_fill_g,diff_fill_b,rel_height,rel_width,rel_x_position,rel_y_position,rel_x_position_to_animations,rel_y_position_to_animations,nr_paths_svg
0,0.0,0.0,0.0,1.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,...,-4.541667,-4.541667,-4.541667,0.084239,0.054752,0.033838,0.04212,0.039501,0.051404,24.0
1,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,-1.0,-1.0,-1.0,...,102.458333,102.458333,102.458333,0.362888,0.395994,0.501511,0.579289,0.714309,0.706974,24.0
2,0.0,0.0,0.0,0.0,1.0,0.0,-1.0,-1.0,-1.0,-1.0,...,102.458333,102.458333,102.458333,0.362904,0.395994,0.501511,0.63794,0.714309,0.778553,24.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.134364,0.847434,-1.0,-1.0,...,-4.541667,-4.541667,-4.541667,0.084239,0.054752,0.033838,0.04212,0.039501,0.051404,24.0
4,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,0.763775,...,102.458333,102.458333,102.458333,0.362888,0.395994,0.501511,0.579289,0.714309,0.706974,24.0


In [8]:
y_train.head()

0    3
1    3
2    3
3    0
4    2
dtype: int64

### 3. Upscaling of class 4/"Very Good" (optional, was shown to not improve the performance^)

In [7]:
#unique_train, counts_train = np.unique(y_train, return_counts=True)
#label_counts = dict(zip(unique_train, counts_train))
#label_counts

In [8]:
# Get indices where data label equals 4
#i_class4 = np.where(y_train == 4)[0]
# Calculate upsample size (mean of class sizes 0-3 - class size 4)
#upsample_size = round(np.mean([label_counts[i] for i in range(4)])) - label_counts[4]
# Get upsample indices
#i_class4_upsampled = np.random.choice(i_class4, size=upsample_size, replace=True)

In [9]:
# Create upsampled dataframe
#y_train = pd.concat([y_train, y_train[i_class4_upsampled]]).reset_index(drop=True)
#X_train = pd.concat([X_train, X_train.iloc[i_class4_upsampled,:]]).reset_index(drop=True)

In [10]:
#y_train

### 4. Outlier removal (optional)

In [11]:
# Use of Isolation Forest
#ifo = IsolationForest(random_state=0).fit(X_train)
#X_train[['anomaly']] = ifo.predict(X_train)
#X_train.head()

In [12]:
# remove outliers
#out_ind = X_train[X_train['anomaly']==-1].index
#X_train.drop(out_ind, inplace=True, axis=0)
#y_train.drop(out_ind, inplace=True, axis=0)
#X_train.reset_index(drop=True, inplace=True)
#y_train.reset_index(drop=True, inplace=True)
#X_train.drop('anomaly', axis=1, inplace=True)

### 5. Train models

#### 5.1 Random Forest

##### Train Base Classifier

In [9]:
rf_base = RandomForestOC(n_estimators=1000)
rf_base.fit(X_train, y_train)

In [10]:
y_pred_test = rf_base.predict(X_test)
y_pred_train = rf_base.predict(X_train)

In [11]:
print(f'Label MAE of best random forest classifier on train set: {mean_absolute_error(y_pred_train, y_train)}')
print(f'Label MAE of best random forest classifier on test set: {mean_absolute_error(y_pred_test, y_test)}')

Label MAE of best random forest classifier on train set: 0.08916258500166437
Label MAE of best random forest classifier on test set: 0.8796445880452343


In [12]:
print(f'Accuracy of  best random forest classifier on train set: {accuracy_score(y_pred_train, y_train)}')
print(f'Accuracy of best random forest classifier on test set: {accuracy_score(y_pred_test, y_test)}')

Accuracy of  best random forest classifier on train set: 0.9426030719482619
Accuracy of best random forest classifier on test set: 0.41518578352180935


In [13]:
print(confusion_matrix(y_test, y_pred_test))

[[194  19  20   5   0]
 [ 56  52  70  27   0]
 [ 54  59 185  48   2]
 [ 55  58 177  83   2]
 [  7  14  35  16   0]]


##### Define Grid for Random Search

In [13]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=20, stop=2000, num=10)]
# Number of features to consider at every split
max_features = ['auto', 'log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'log2'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [20, 240, 460, 680, 900, 1120, 1340, 1560, 1780, 2000]}


##### Perform Random Search

In [14]:
# Use the random grid to search for best hyperparameters
# Define stratified cross validation
cross_val = StratifiedKFold(n_splits=5)
# First create the base model to tune
rf = RandomForestOC()
# Random search of parameters, using 3 fold cross validation, search across 100 different combinations
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 50, cv = cross_val, verbose=2, random_state=42, scoring = 'neg_mean_absolute_error')
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END bootstrap=True, max_depth=30, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=240; total time=   8.2s
[CV] END bootstrap=True, max_depth=30, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=240; total time=   7.7s
[CV] END bootstrap=True, max_depth=30, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=240; total time=   7.4s
[CV] END bootstrap=True, max_depth=30, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=240; total time=   7.1s
[CV] END bootstrap=True, max_depth=30, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=240; total time=   6.9s
[CV] END bootstrap=True, max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=2000; total time=  45.9s
[CV] END bootstrap=True, max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=2000; tot

RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
                   estimator=<src.models.ordinal_classifier_scikit.RandomForestOC object at 0x7f8af21efc40>,
                   n_iter=50,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'log2'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [20, 240, 460, 680, 900,
                                                         1120, 1340, 1560, 1780,
                                                         2000]},
                   random_state=42, scoring='neg

##### Get best parameters and best evaluation score

In [15]:
rf_random.best_params_

{'n_estimators': 1560,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'log2',
 'max_depth': 10,
 'bootstrap': True}

In [16]:
rf_random.best_score_

-0.8713451226590413

##### Train best model on whole training data

In [17]:
rf_best = rf_random.best_estimator_
rf_best.fit(X_train, y_train)

##### Evaluate best model on test data

In [18]:
y_pred_test = rf_best.predict(X_test)
y_pred_train = rf_best.predict(X_train)

In [19]:
print(f'Label MAE of best random forest classifier on train set: {mean_absolute_error(y_pred_train, y_train)}')
print(f'Label MAE of best random forest classifier on test set: {mean_absolute_error(y_pred_test, y_test)}')

Label MAE of best random forest classifier on train set: 0.5394612921454135
Label MAE of best random forest classifier on test set: 0.8416801292407108


In [20]:
print(f'Accuracy of  best random forest classifier on train set: {accuracy_score(y_pred_train, y_train)}')
print(f'Accuracy of best random forest classifier on test set: {accuracy_score(y_pred_test, y_test)}')

Accuracy of  best random forest classifier on train set: 0.6389150499152383
Accuracy of best random forest classifier on test set: 0.3974151857835218


In [21]:
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.55      0.78      0.64       238
           1       0.11      0.03      0.05       205
           2       0.34      0.73      0.47       348
           3       0.46      0.13      0.20       375
           4       0.00      0.00      0.00        72

    accuracy                           0.40      1238
   macro avg       0.29      0.33      0.27      1238
weighted avg       0.36      0.40      0.32      1238



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
print(confusion_matrix(y_test, y_pred_test))

[[185  13  36   4   0]
 [ 57   7 124  17   0]
 [ 47  19 253  29   0]
 [ 46  14 268  47   0]
 [  4   8  54   6   0]]


##### Save best model

In [23]:
filename = 'models/sm_random_forest.sav'
pickle.dump(rf_best, open(filename, 'wb'))

#### 5.2 Gradient Boosting Classifier

##### Train Base Classifier

In [14]:
gb_base = GradientBoostingOC(n_estimators=1000)
gb_base.fit(X_train, y_train)

In [15]:
y_pred_test = gb_base.predict(X_test)
y_pred_train = gb_base.predict(X_train)

In [16]:
print(f'Label MAE of best random forest classifier on train set: {mean_absolute_error(y_pred_train, y_train)}')
print(f'Label MAE of best random forest classifier on test set: {mean_absolute_error(y_pred_test, y_test)}')

Label MAE of best random forest classifier on train set: 0.3367254743449522
Label MAE of best random forest classifier on test set: 0.8521809369951535


In [17]:
print(f'Accuracy of  best random forest classifier on train set: {accuracy_score(y_pred_train, y_train)}')
print(f'Accuracy of best random forest classifier on test set: {accuracy_score(y_pred_test, y_test)}')

Accuracy of  best random forest classifier on train set: 0.7549574397260925
Accuracy of best random forest classifier on test set: 0.41437802907915994


In [18]:
print(confusion_matrix(y_test, y_pred_test))

[[160  44  24   8   2]
 [ 35  60  66  43   1]
 [ 33  73 157  83   2]
 [ 31  70 136 136   2]
 [  4  15  30  23   0]]


##### Define Grid for Random Search

In [24]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Boosting learning rate
learning_rate = [0.05, 0.10, 0.15, 0.20, 0.25, 0.30]

# Maximum number of levels in tree
max_depth = range(5,16,2)

min_samples_split = range(200,1401,200)

min_samples_leaf = range(30,71,10)

max_features = range(7,20,2)

subsample = [0.6,0.7,0.75,0.8,0.85,0.9]


random_grid = {'n_estimators': n_estimators,
               'learning_rate': learning_rate,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'max_features': max_features,
               'subsample': subsample}
pprint(random_grid)

{'learning_rate': [0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
 'max_depth': range(5, 16, 2),
 'max_features': range(7, 20, 2),
 'min_samples_leaf': range(30, 71, 10),
 'min_samples_split': range(200, 1401, 200),
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
 'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9]}


##### Perform Random Search

In [25]:
# Use the random grid to search for best hyperparameters
# Define stratified cross validation
cross_val = StratifiedKFold(n_splits=5)
# First create the base model to tune
gb = GradientBoostingOC()
# Random search of parameters, using 3 fold cross validation, search across 100 different combinations
gb_random = RandomizedSearchCV(estimator = gb, param_distributions = random_grid, n_iter = 50, cv = cross_val, verbose=2, random_state=42, scoring = 'neg_mean_absolute_error')
# Fit the random search model
gb_random.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END learning_rate=0.1, max_depth=9, max_features=11, min_samples_leaf=30, min_samples_split=600, n_estimators=1400, subsample=0.75; total time=  42.5s
[CV] END learning_rate=0.1, max_depth=9, max_features=11, min_samples_leaf=30, min_samples_split=600, n_estimators=1400, subsample=0.75; total time=  42.9s
[CV] END learning_rate=0.1, max_depth=9, max_features=11, min_samples_leaf=30, min_samples_split=600, n_estimators=1400, subsample=0.75; total time=  42.4s
[CV] END learning_rate=0.1, max_depth=9, max_features=11, min_samples_leaf=30, min_samples_split=600, n_estimators=1400, subsample=0.75; total time=  42.2s
[CV] END learning_rate=0.1, max_depth=9, max_features=11, min_samples_leaf=30, min_samples_split=600, n_estimators=1400, subsample=0.75; total time=  43.4s
[CV] END learning_rate=0.1, max_depth=9, max_features=19, min_samples_leaf=70, min_samples_split=200, n_estimators=1800, subsample=0.85; total time= 1.8min
[C

RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
                   estimator=<src.models.ordinal_classifier_scikit.GradientBoostingOC object at 0x7f8af2c1c0a0>,
                   n_iter=50,
                   param_distributions={'learning_rate': [0.05, 0.1, 0.15, 0.2,
                                                          0.25, 0.3],
                                        'max_depth': range(5, 16, 2),
                                        'max_features': range(7, 20, 2),
                                        'min_samples_leaf': range(30, 71, 10),
                                        'min_samples_split': range(200, 1401, 200),
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000],
                                        'subsample': [0.6, 0.7, 0.75, 0.8, 0.85,
       

##### Get best parameters and best evaluation score

In [26]:
gb_random.best_params_

{'subsample': 0.9,
 'n_estimators': 1200,
 'min_samples_split': 200,
 'min_samples_leaf': 40,
 'max_features': 7,
 'max_depth': 7,
 'learning_rate': 0.1}

In [27]:
gb_random.best_score_

-0.8950723387487554

##### Train best model on whole training data

In [28]:
gb_best = gb_random.best_estimator_
gb_best.fit(X_train, y_train)

##### Evaluate best model on test data

In [29]:
y_pred_test = gb_best.predict(X_test)
y_pred_train = gb_best.predict(X_train)

In [30]:
print(f'Label MAE of best gradient boosting classifier on train set: {mean_absolute_error(y_pred_train, y_train)}')
print(f'Label MAE of best gradient boosting classifier on test set: {mean_absolute_error(y_pred_test, y_test)}')

Label MAE of best gradient boosting classifier on train set: 0.11000188359389716
Label MAE of best gradient boosting classifier on test set: 0.8852988691437803


In [31]:
print(f'Accuracy of  best gradient boosting classifier on train set: {accuracy_score(y_pred_train, y_train)}')
print(f'Accuracy of best gradient boosting classifier on test set: {accuracy_score(y_pred_test, y_test)}')

Accuracy of  best gradient boosting classifier on train set: 0.9248446035034846
Accuracy of best gradient boosting classifier on test set: 0.3764135702746365


In [32]:
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.57      0.49      0.53       238
           1       0.22      0.41      0.29       205
           2       0.37      0.45      0.40       348
           3       0.49      0.29      0.36       375
           4       0.00      0.00      0.00        72

    accuracy                           0.38      1238
   macro avg       0.33      0.33      0.32      1238
weighted avg       0.40      0.38      0.37      1238



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
print(confusion_matrix(y_test, y_pred_test))

[[117  82  29  10   0]
 [ 36  85  60  24   0]
 [ 28 101 155  64   0]
 [ 23  93 150 109   0]
 [  2  24  30  16   0]]


##### Save best model

In [34]:
filename = 'models/sm_gradient_boosting.sav'
pickle.dump(gb_best, open(filename, 'wb'))

#### 5.3 Extra Trees Classifier

##### Train Base Classifier

In [19]:
et_base = ExtraTreesOC(n_estimators=1000)
et_base.fit(X_train, y_train)

In [20]:
y_pred_test = et_base.predict(X_test)
y_pred_train = et_base.predict(X_train)

In [21]:
print(f'Label MAE of best random forest classifier on train set: {mean_absolute_error(y_pred_train, y_train)}')
print(f'Label MAE of best random forest classifier on test set: {mean_absolute_error(y_pred_test, y_test)}')

Label MAE of best random forest classifier on train set: 0.09477388368443578
Label MAE of best random forest classifier on test set: 0.901453957996769


In [22]:
print(f'Accuracy of  best random forest classifier on train set: {accuracy_score(y_pred_train, y_train)}')
print(f'Accuracy of best random forest classifier on test set: {accuracy_score(y_pred_test, y_test)}')

Accuracy of  best random forest classifier on train set: 0.9426030719482619
Accuracy of best random forest classifier on test set: 0.39095315024232635


In [23]:
print(confusion_matrix(y_test, y_pred_test))

[[181  27  26   3   1]
 [ 54  44  87  20   0]
 [ 51  58 188  51   0]
 [ 50  63 190  71   1]
 [  8  16  37  11   0]]


##### Define Grid for Random Search

In [35]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=20, stop=2000, num=10)]
# Number of features to consider at every split
max_features = ['auto', 'log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'log2'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [20, 240, 460, 680, 900, 1120, 1340, 1560, 1780, 2000]}


##### Perform Random Search

In [36]:
# Use the random grid to search for best hyperparameters
# Define stratified cross validation
cross_val = StratifiedKFold(n_splits=5)
# First create the base model to tune
et = ExtraTreesOC()
# Random search of parameters, using 3 fold cross validation, search across 100 different combinations
et_random = RandomizedSearchCV(estimator = et, param_distributions = random_grid, n_iter = 50, cv = cross_val, verbose=2, random_state=42, scoring = 'neg_mean_absolute_error')
# Fit the random search model
et_random.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END bootstrap=True, max_depth=30, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=240; total time=   3.0s
[CV] END bootstrap=True, max_depth=30, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=240; total time=   3.0s
[CV] END bootstrap=True, max_depth=30, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=240; total time=   3.0s
[CV] END bootstrap=True, max_depth=30, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=240; total time=   3.0s
[CV] END bootstrap=True, max_depth=30, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=240; total time=   3.0s
[CV] END bootstrap=True, max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=2000; total time=  18.6s
[CV] END bootstrap=True, max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=2000; tot

RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
                   estimator=<src.models.ordinal_classifier_scikit.ExtraTreesOC object at 0x7f8a9fe01100>,
                   n_iter=50,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'log2'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [20, 240, 460, 680, 900,
                                                         1120, 1340, 1560, 1780,
                                                         2000]},
                   random_state=42, scoring='neg_m

##### Get best parameters and best evaluation score

In [37]:
et_random.best_params_

{'n_estimators': 1120,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'log2',
 'max_depth': 10,
 'bootstrap': False}

In [38]:
et_random.best_score_

-0.8607996932858353

##### Train best model on whole training data

In [39]:
et_best = et_random.best_estimator_
et_best.fit(X_train, y_train)

##### Evaluate best model on test data

In [40]:
y_pred_test = et_best.predict(X_test)
y_pred_train = et_best.predict(X_train)

In [41]:
print(f'Label MAE of best extra trees classifier on train set: {mean_absolute_error(y_pred_train, y_train)}')
print(f'Label MAE of best extra trees classifier on test set: {mean_absolute_error(y_pred_test, y_test)}')

Label MAE of best extra trees classifier on train set: 0.5763797325296666
Label MAE of best extra trees classifier on test set: 0.8416801292407108


In [42]:
print(f'Accuracy of  best extra trees classifier on train set: {accuracy_score(y_pred_train, y_train)}')
print(f'Accuracy of best extra trees classifier on test set: {accuracy_score(y_pred_test, y_test)}')

Accuracy of  best extra trees classifier on train set: 0.6123563759653419
Accuracy of best extra trees classifier on test set: 0.401453957996769


In [43]:
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.53      0.77      0.63       238
           1       0.12      0.02      0.03       205
           2       0.34      0.76      0.47       348
           3       0.50      0.12      0.20       375
           4       0.00      0.00      0.00        72

    accuracy                           0.40      1238
   macro avg       0.30      0.33      0.27      1238
weighted avg       0.37      0.40      0.32      1238



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [44]:
print(confusion_matrix(y_test, y_pred_test))

[[183   7  45   3   0]
 [ 56   4 134  11   0]
 [ 47  11 264  26   0]
 [ 52   6 271  46   0]
 [  6   4  56   6   0]]


##### Save best model

In [45]:
filename = 'models/sm_extra_trees.sav'
pickle.dump(et_best, open(filename, 'wb'))