In [1]:
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn import tree
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.model_selection import learning_curve
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import statistics
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import joblib


#to save models
import pickle

In [2]:
# Def of MME error
def mee(y_true, y_pred):
    return np.mean(np.sqrt(np.sum(np.square(y_true - y_pred), axis=1)))

In [3]:
MEE = make_scorer(mee, greater_is_better=False)

## Dataset

In [4]:
colnames = ['id', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8', 'a9', 'class1', 'class2']
mlcup_tr = pd.read_csv("data/ML-CUP22-INTERNAL-TR.csv", sep = ",", names=colnames)
mlcup_tr.drop("id", axis=1, inplace=True)

x_mlcup_tr = mlcup_tr.iloc[:, 0:9].values
y_mlcup_tr = mlcup_tr.iloc[:, 9:11].values

## Random Forest

In [5]:
param_grid = {
    'n_estimators': [60, 65, 70, 75],
    'ccp_alpha': [0.0, 0.1, 0.05, 0.01, 0.005, 0.001],
    'min_weight_fraction_leaf': [0.0, 0.1, 0.05, 0.01, 0.005, 0.001],
    'min_samples_split': [3, 4, 5],
    'min_samples_leaf': [2, 3],
}

rf = RandomForestRegressor(random_state=42)

grid = GridSearchCV(
    rf,
    param_grid,
    cv=5,
    scoring=MEE,
    return_train_score=False,
    verbose=4,
    refit=True
)

grid.fit(x_mlcup_tr, y_mlcup_tr)



Fitting 5 folds for each of 864 candidates, totalling 4320 fits
[CV 1/5] END ccp_alpha=0.0, min_samples_leaf=2, min_samples_split=3, min_weight_fraction_leaf=0.0, n_estimators=60;, score=-1.019 total time=   0.2s
[CV 2/5] END ccp_alpha=0.0, min_samples_leaf=2, min_samples_split=3, min_weight_fraction_leaf=0.0, n_estimators=60;, score=-0.974 total time=   0.2s
[CV 3/5] END ccp_alpha=0.0, min_samples_leaf=2, min_samples_split=3, min_weight_fraction_leaf=0.0, n_estimators=60;, score=-0.963 total time=   0.2s
[CV 4/5] END ccp_alpha=0.0, min_samples_leaf=2, min_samples_split=3, min_weight_fraction_leaf=0.0, n_estimators=60;, score=-1.002 total time=   0.2s
[CV 5/5] END ccp_alpha=0.0, min_samples_leaf=2, min_samples_split=3, min_weight_fraction_leaf=0.0, n_estimators=60;, score=-1.031 total time=   0.2s
[CV 1/5] END ccp_alpha=0.0, min_samples_leaf=2, min_samples_split=3, min_weight_fraction_leaf=0.0, n_estimators=65;, score=-1.028 total time=   0.2s
[CV 2/5] END ccp_alpha=0.0, min_samples_le

## Validation score

In [6]:
print(
    "The best parameters are %s with a score of %0.5f"
    % (grid.best_params_, grid.best_score_)
)

The best parameters are {'ccp_alpha': 0.001, 'min_samples_leaf': 2, 'min_samples_split': 4, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 75} with a score of -0.98414


In [7]:
rf = grid.best_estimator_
rf

## Check error on TR

In [8]:
# MEE for tr
y_pred = rf.predict(x_mlcup_tr)
meeTR = mee(y_mlcup_tr, y_pred)
print("MEE on the training set: %0.5f" %(meeTR))

# MSE for tr
mseTR = mean_squared_error(y_mlcup_tr, y_pred)
print("MSE on the training set: %0.5f" %(mseTR))

MEE on the training set: 0.56784
MSE on the test set: 0.22817


In [9]:
joblib.dump(rf, 'savedModels/rf_mlcup.z')

['savedModels/rf_mlcup.z']

## RandomSearchCV

In [10]:
param_grid = {
    'n_estimators': np.arange(2,100),
    'ccp_alpha': [0.0, 0.1, 0.05, 0.01, 0.005, 0.001, 0.2, 0.02, 0.002, 0.0001, 0.0005, 0.0002],
    'min_weight_fraction_leaf': [0.0, 0.1, 0.05, 0.01, 0.005, 0.001,0.2, 0.02, 0.002, 0.0001, 0.0005, 0.0002],
    'min_samples_split': np.arange(1,20),
    'min_samples_leaf': np.arange(1,20),
}

rf = RandomForestRegressor(random_state=42)

grid = RandomizedSearchCV(
    rf,
    param_grid,
    cv=5,
    scoring=MEE,
    return_train_score=False,
    verbose=4,
    random_state=42,
    refit=True
)

grid.fit(x_mlcup_tr, y_mlcup_tr)









Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END ccp_alpha=0.05, min_samples_leaf=12, min_samples_split=4, min_weight_fraction_leaf=0.1, n_estimators=83;, score=-1.293 total time=   0.1s
[CV 2/5] END ccp_alpha=0.05, min_samples_leaf=12, min_samples_split=4, min_weight_fraction_leaf=0.1, n_estimators=83;, score=-1.288 total time=   0.1s
[CV 3/5] END ccp_alpha=0.05, min_samples_leaf=12, min_samples_split=4, min_weight_fraction_leaf=0.1, n_estimators=83;, score=-1.231 total time=   0.1s
[CV 4/5] END ccp_alpha=0.05, min_samples_leaf=12, min_samples_split=4, min_weight_fraction_leaf=0.1, n_estimators=83;, score=-1.237 total time=   0.1s
[CV 5/5] END ccp_alpha=0.05, min_samples_leaf=12, min_samples_split=4, min_weight_fraction_leaf=0.1, n_estimators=83;, score=-1.385 total time=   0.1s
[CV 1/5] END ccp_alpha=0.2, min_samples_leaf=5, min_samples_split=15, min_weight_fraction_leaf=0.2, n_estimators=4;, score=-1.672 total time=   0.0s
[CV 2/5] END ccp_alpha=0.2, min_sam

## Validation score

In [11]:
print(
    "The best parameters are %s with a score of %0.5f"
    % (grid.best_params_, grid.best_score_)
)

The best parameters are {'n_estimators': 37, 'min_weight_fraction_leaf': 0.005, 'min_samples_split': 4, 'min_samples_leaf': 9, 'ccp_alpha': 0.0} with a score of -1.02882


In [12]:
rf = grid.best_estimator_
rf

In [13]:
# Check error on TR
# MEE for tr
y_pred = rf.predict(x_mlcup_tr)
meeTR = mee(y_mlcup_tr, y_pred)
print("MEE on the training set: %0.5f" %(meeTR))

# MSE for tr
mseTR = mean_squared_error(y_mlcup_tr, y_pred)
print("MSE on the training set: %0.5f" %(mseTR))

MEE on the training set: 0.80662
MSE on the training set: 0.50617


In [14]:
joblib.dump(rf, 'savedModels/rf_RandomSearchCV_mlcup.z')

['savedModels/rf_RandomSearchCV_mlcup.z']