In [2]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
from random import random
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
import matplotlib
import matplotlib.pyplot as plt

In [3]:
%run "../preprocessing/temporary_preprocessing_pipeline.ipynb"

# 1. Load and divide data into train and test set

In [12]:
# Divide training set into inputs and targets
input_features = final_df_train[final_df_train.columns[:-2]]
targets = final_df_train[final_df_train.columns[-2:]]

# Divide dataset into training and test set
X_train, X_test, y_train, y_test = train_test_split(input_features, targets, test_size=0.1, random_state=42)

# 2. Evaluation functions

In [5]:
 def evaluate_abs_mse(train_rf_predictions,y_train,rf_predictions,y_test): # Pass raw predictions to function
    pred_list_train = train_rf_predictions.tolist()
    target_list_train = y_train.tolist()

    pred_list_test = rf_predictions.tolist()
    target_list_test = y_test.tolist()

    print(f'Absolute mean square error on training set: {mean_absolute_error(target_list_train, pred_list_train)*2}')
    print(f'Absolute mean square error on test set: {mean_absolute_error(pred_list_test, target_list_test)*2}')
    return '-------------- Evaluated --------------'

# 3. Models

### 3.1. Random forest classifier

In [None]:
# Create model (takes a while)
rfc = RandomForestClassifier(n_estimators=4500, 
                               max_features = 'sqrt',
                               n_jobs=-1, verbose = 0)

# Train model
rfc.fit(X_train, y_train)

In [None]:
# Model parameter evaluation
n_nodes = []
max_depths = []

for ind_tree in rfc.estimators_:
    n_nodes.append(ind_tree.tree_.node_count)
    max_depths.append(ind_tree.tree_.max_depth)
    
print(f'Average number of nodes {int(np.mean(n_nodes))}')
print(f'Average maximum depth {int(np.mean(max_depths))}')

In [None]:
# Testing model
train_rf_predictions = rfc.predict(X_train)
train_rf_probs = rfc.predict_proba(X_train)

rf_predictions = rfc.predict(X_test)
rf_probs = rfc.predict_proba(X_test)

In [None]:
# Reverse values to original values in euros
test_rf_predictions = min_max_scaler_targets.inverse_transform(rf_predictions)
train_rf_predictions = min_max_scaler_targets.inverse_transform(train_rf_predictions)

y_train = min_max_scaler_targets.inverse_transform(y_train)
y_test = min_max_scaler_targets.inverse_transform(y_test)

In [None]:
evaluate_abs_mse(train_rf_predictions,y_train,test_rf_predictions,y_test)

### 3.2. Random forest regressor

In [13]:
# Create model (takes a while)
rfr = RandomForestRegressor(n_estimators= 800,
                            min_samples_split= 5,
                            min_samples_leaf= 1,
                            max_features= 'sqrt',
                            max_depth= 90,
                            bootstrap= False,
                            n_jobs=-1,
                            verbose = 0)

# Train model
rfr.fit(X_train,y_train)

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                      max_depth=90, max_features='sqrt', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=5, min_weight_fraction_leaf=0.0,
                      n_estimators=800, n_jobs=-1, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [14]:
# Model parameter evaluation
n_nodes = []
max_depths = []

for ind_tree in rfr.estimators_:
    n_nodes.append(ind_tree.tree_.node_count)
    max_depths.append(ind_tree.tree_.max_depth)
    
print(f'Average number of nodes {int(np.mean(n_nodes))}')
print(f'Average maximum depth {int(np.mean(max_depths))}')

Average number of nodes 409
Average maximum depth 26


In [15]:
# Test model
test_rfr_predictions = rfr.predict(X_test)
train_rfr_predictions = rfr.predict(X_train)

In [16]:
# Reverse values to original values in euros
test_rfr_predictions = min_max_scaler_targets.inverse_transform(test_rfr_predictions)
train_rfr_predictions = min_max_scaler_targets.inverse_transform(train_rfr_predictions)

y_train_reversed = min_max_scaler_targets.inverse_transform(y_train)
y_test_reversed = min_max_scaler_targets.inverse_transform(y_test)

In [17]:
evaluate_abs_mse(train_rfr_predictions,y_train_reversed,test_rfr_predictions,y_test_reversed)

Absolute mean square error on training set: 114.93737472766861
Absolute mean square error on test set: 340.5613725490201


'-------------- Evaluated --------------'

## 4. Test models on real test data

In [21]:
# Test the model on real test data
final_predictions = rfr.predict(final_df_test)
final_predictions_reversed = min_max_scaler_targets.inverse_transform(final_predictions)

In [37]:
submission_prices = pd.DataFrame(final_predictions_reversed, columns = targets.columns)
submission = pd.concat([pd.DataFrame(df_laptops_test['id']), submission_prices], axis=1)

In [38]:
# Write final df to csv
submission.to_csv(r'../../data/intermediate_submission_1.csv', index = False)

# Model experimentation

In [18]:
from pprint import pprint
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rfr.get_params())

Parameters currently in use:

{'bootstrap': False,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': 90,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 800,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


In [8]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [9]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rfr3 = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rfr3, param_distributions = random_grid, n_iter = 130, cv = 4, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train,y_train)

Fitting 4 folds for each of 130 candidates, totalling 520 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   34.8s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 520 out of 520 | elapsed:  9.3min finished


RandomizedSearchCV(cv=4, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                              

In [10]:
rf_random.best_params_

{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 20,
 'bootstrap': True}

In [11]:
test_rfr_predictions = rf_random.predict(X_test)
train_rfr_predictions = rf_random.predict(X_train)

test_rfr_predictions = min_max_scaler_targets.inverse_transform(test_rfr_predictions)
train_rfr_predictions = min_max_scaler_targets.inverse_transform(train_rfr_predictions)

y_train = min_max_scaler_targets.inverse_transform(y_train)
y_test = min_max_scaler_targets.inverse_transform(y_test)

evaluate_abs_mse(train_rfr_predictions,y_train,test_rfr_predictions,y_test)

Absolute mean square error on training set: 118.36272617794435
Absolute mean square error on test set: 354.75540609199754


'-------------- Evaluated --------------'