In [2]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
from random import random
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
import matplotlib
import matplotlib.pyplot as plt

# 1. Load data and split it into training and test sets

In [3]:
# Read the preprocessed dataset from disk
data = pd.read_csv("../../data/preprocessed_data.csv", sep=",")

# Split the full table column-wise: the last two columns are the targets,
# every column before them is an input feature
input_features = data.iloc[:, :-2]
targets = data.iloc[:, -2:]

# Hold out 20% of the rows as a test set (fixed seed keeps the split stable)
X_train, X_test, y_train, y_test = train_test_split(
    input_features,
    targets,
    test_size=0.20,
    random_state=42,
)

# Cast the training targets to integers to avoid dtype errors downstream;
# note that y_test keeps its original dtype
y_train = y_train.astype(int)

# 2. Evaluation functions

In [4]:
 def evaluate_abs_mse(train_rf_predictions, y_train, rf_predictions, y_test):
    """Print the summed per-target mean absolute error for train and test data.

    Despite the function name and the printed label, the metric is a mean
    absolute error (MAE), not a squared error: the MAE is computed for every
    target column separately and the per-column values are added together.

    Parameters
    ----------
    train_rf_predictions : array-like
        Raw model predictions for the training inputs.
    y_train : pandas.DataFrame or pandas.Series
        True training targets (read through ``.values``).
    rf_predictions : array-like
        Raw model predictions for the test inputs.
    y_test : pandas.DataFrame or pandas.Series
        True test targets (read through ``.values``).

    Returns
    -------
    str
        A fixed marker string signalling that the evaluation has finished.
    """

    def _summed_mae(true_targets, predictions):
        # 'raw_values' returns one MAE per target column. Summing them equals
        # the former `uniform_average * 2` when there are exactly two targets,
        # but stays correct for any number of target columns.
        per_target = mean_absolute_error(
            true_targets.values, predictions, multioutput='raw_values'
        )
        return per_target.sum()

    # Arguments are passed as (y_true, y_pred) for both splits.
    train_error = _summed_mae(y_train, train_rf_predictions)
    test_error = _summed_mae(y_test, rf_predictions)

    print(f'Absolute mean square error on training set: {train_error}')
    print(f'Absolute mean square error on test set: {test_error}')
    return '-------------- Evaluated --------------'

# 3. Models

### 3.1. Random forest classifier

In [7]:
# Create model (takes a while)
rfc = RandomForestClassifier(n_estimators=4500, 
                               max_features = 'sqrt',
                               n_jobs=-1, verbose = 0)

# Train model
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=4500,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [8]:
# Model parameter evaluation
n_nodes = []
max_depths = []

for ind_tree in rfc.estimators_:
    n_nodes.append(ind_tree.tree_.node_count)
    max_depths.append(ind_tree.tree_.max_depth)
    
print(f'Average number of nodes {int(np.mean(n_nodes))}')
print(f'Average maximum depth {int(np.mean(max_depths))}')

Average number of nodes 506
Average maximum depth 29


In [9]:
# Testing model
train_rf_predictions = rfc.predict(X_train)
train_rf_probs = rfc.predict_proba(X_train)

rf_predictions = rfc.predict(X_test)
rf_probs = rfc.predict_proba(X_test)

In [10]:
# Report the classifier's error on both splits
evaluate_abs_mse(
    train_rf_predictions=train_rf_predictions,
    y_train=y_train,
    rf_predictions=rf_predictions,
    y_test=y_test,
)

Absolute mean square error on training set: 4.8088235294117645
Absolute mean square error on test set: 359.29166666666663


'-------------- Evaluated --------------'

### 3.2. Random forest regressor

In [11]:
# Create model (takes a while)
rfr = RandomForestRegressor(n_estimators=4500, 
                               max_features = 'sqrt',
                               n_jobs=-1, verbose = 0)

# Train model
rfr.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='sqrt', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=4500, n_jobs=-1, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [12]:
# Model parameter evaluation
n_nodes = []
max_depths = []

for ind_tree in rfr.estimators_:
    n_nodes.append(ind_tree.tree_.node_count)
    max_depths.append(ind_tree.tree_.max_depth)
    
print(f'Average number of nodes {int(np.mean(n_nodes))}')
print(f'Average maximum depth {int(np.mean(max_depths))}')

Average number of nodes 507
Average maximum depth 23


In [13]:
# Test model
rfr_predictions = rfr.predict(X_test)
train_rfr_predictions = rfr.predict(X_train)

In [14]:
# Report the regressor's error on both splits
evaluate_abs_mse(
    train_rf_predictions=train_rfr_predictions,
    y_train=y_train,
    rf_predictions=rfr_predictions,
    y_test=y_test,
)

Absolute mean square error on training set: 105.02823805780729
Absolute mean square error on test set: 349.7649136788049


'-------------- Evaluated --------------'