In [5]:
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn import tree
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.model_selection import learning_curve
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import statistics
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import joblib


#to save models
import pickle

In [6]:
# Def of MME error
def mee(y_true, y_pred):
    return np.mean(np.sqrt(np.sum(np.square(y_true - y_pred), axis=1)))

In [7]:
mee_score = make_scorer(mee, greater_is_better=False)

## Dataset

In [8]:
colnames = ['id', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8', 'a9', 'class1', 'class2']
mlcup_tr = pd.read_csv("../data/ML-CUP22-INTERNAL-TR.csv", sep = ",", names=colnames)
mlcup_tr.drop("id", axis=1, inplace=True)

x_mlcup_tr = mlcup_tr.iloc[:, 0:9].values
y_mlcup_tr = mlcup_tr.iloc[:, 9:11].values

## Random Forest

In [19]:
param_grid = {
    'n_estimators': [10,20,30,40],
    'ccp_alpha': [0.0, 0.1, 0.05, 0.01, 0.005, 0.001],
    'min_weight_fraction_leaf': [0.0, 0.1, 0.05, 0.01, 0.005, 0.001],
    'min_samples_split': [3, 4, 5],
    'min_samples_leaf': [2, 3],
}

rf = RandomForestRegressor(random_state=42)

grid = GridSearchCV(
    rf,
    param_grid,
    cv=5,
    scoring=mee_score,
    return_train_score=False,
    verbose=4,
    refit=True
)

grid.fit(x_mlcup_tr, y_mlcup_tr)



Fitting 5 folds for each of 864 candidates, totalling 4320 fits
[CV 1/5] END ccp_alpha=0.0, min_samples_leaf=2, min_samples_split=3, min_weight_fraction_leaf=0.0, n_estimators=10;, score=-1.669 total time=   0.0s
[CV 2/5] END ccp_alpha=0.0, min_samples_leaf=2, min_samples_split=3, min_weight_fraction_leaf=0.0, n_estimators=10;, score=-1.553 total time=   0.0s
[CV 3/5] END ccp_alpha=0.0, min_samples_leaf=2, min_samples_split=3, min_weight_fraction_leaf=0.0, n_estimators=10;, score=-1.481 total time=   0.0s
[CV 4/5] END ccp_alpha=0.0, min_samples_leaf=2, min_samples_split=3, min_weight_fraction_leaf=0.0, n_estimators=10;, score=-1.676 total time=   0.0s
[CV 5/5] END ccp_alpha=0.0, min_samples_leaf=2, min_samples_split=3, min_weight_fraction_leaf=0.0, n_estimators=10;, score=-1.507 total time=   0.0s
[CV 1/5] END ccp_alpha=0.0, min_samples_leaf=2, min_samples_split=3, min_weight_fraction_leaf=0.0, n_estimators=20;, score=-1.614 total time=   0.0s
[CV 2/5] END ccp_alpha=0.0, min_samples_le

## Validation score

In [20]:
print(
    "The best parameters are %s with a score of %0.5f"
    % (grid.best_params_, grid.best_score_)
)

The best parameters are {'ccp_alpha': 0.0, 'min_samples_leaf': 2, 'min_samples_split': 5, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 40} with a score of -1.49470


In [21]:
rf = grid.best_estimator_
rf

## Check error on TR

In [22]:
# MEE for tr
y_pred = rf.predict(x_mlcup_tr)
meeTR = mee(y_mlcup_tr, y_pred)
print("MEE on the training set: %0.5f" %(meeTR))

# MSE for tr
mseTR = mean_squared_error(y_mlcup_tr, y_pred)
print("MSE on the training set: %0.5f" %(mseTR))

MEE on the training set: 0.72649
MSE on the training set: 0.46880


In [23]:
joblib.dump(rf, './../savedModels/rf_mlcup.z')

['savedModels/rf_mlcup.z']

## RandomSearchCV

In [9]:
#[0.0, 0.1, 0.05, 0.01, 0.005, 0.001, 0.2, 0.02, 0.002, 0.0001, 0.0005, 0.0002]
param_grid = {
    'n_estimators': np.arange(2,100),
    'ccp_alpha': np.arange(0.0001,0.0009),
    'min_weight_fraction_leaf': [0.0, 0.1, 0.05, 0.01, 0.005, 0.001,0.2, 0.02, 0.002, 0.0001, 0.0005, 0.0002],
    'min_samples_split': np.arange(1,20),
    'min_samples_leaf': np.arange(1,20),
}

rf = RandomForestRegressor(random_state=42)

grid = RandomizedSearchCV(
    rf,
    param_grid,
    cv=5,
    scoring=mee_score,
    return_train_score=False,
    verbose=4,
    random_state=42,
    n_iter=50,
    refit=True
)

grid.fit(x_mlcup_tr, y_mlcup_tr)









Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 1/5] END ccp_alpha=0.0001, min_samples_leaf=6, min_samples_split=9, min_weight_fraction_leaf=0.002, n_estimators=48;, score=-1.627 total time=   0.2s
[CV 2/5] END ccp_alpha=0.0001, min_samples_leaf=6, min_samples_split=9, min_weight_fraction_leaf=0.002, n_estimators=48;, score=-1.473 total time=   0.2s
[CV 3/5] END ccp_alpha=0.0001, min_samples_leaf=6, min_samples_split=9, min_weight_fraction_leaf=0.002, n_estimators=48;, score=-1.386 total time=   0.2s
[CV 4/5] END ccp_alpha=0.0001, min_samples_leaf=6, min_samples_split=9, min_weight_fraction_leaf=0.002, n_estimators=48;, score=-1.629 total time=   0.2s
[CV 5/5] END ccp_alpha=0.0001, min_samples_leaf=6, min_samples_split=9, min_weight_fraction_leaf=0.002, n_estimators=48;, score=-1.451 total time=   0.2s
[CV 1/5] END ccp_alpha=0.0001, min_samples_leaf=7, min_samples_split=11, min_weight_fraction_leaf=0.0005, n_estimators=65;, score=-1.631 total time=   0.3s
[CV 2/5] END

[CV 4/5] END ccp_alpha=0.0001, min_samples_leaf=4, min_samples_split=18, min_weight_fraction_leaf=0.005, n_estimators=84;, score=-1.615 total time=   0.4s
[CV 5/5] END ccp_alpha=0.0001, min_samples_leaf=4, min_samples_split=18, min_weight_fraction_leaf=0.005, n_estimators=84;, score=-1.450 total time=   0.4s
[CV 1/5] END ccp_alpha=0.0001, min_samples_leaf=17, min_samples_split=15, min_weight_fraction_leaf=0.0001, n_estimators=23;, score=-1.736 total time=   0.1s
[CV 2/5] END ccp_alpha=0.0001, min_samples_leaf=17, min_samples_split=15, min_weight_fraction_leaf=0.0001, n_estimators=23;, score=-1.534 total time=   0.1s
[CV 3/5] END ccp_alpha=0.0001, min_samples_leaf=17, min_samples_split=15, min_weight_fraction_leaf=0.0001, n_estimators=23;, score=-1.594 total time=   0.1s
[CV 4/5] END ccp_alpha=0.0001, min_samples_leaf=17, min_samples_split=15, min_weight_fraction_leaf=0.0001, n_estimators=23;, score=-1.739 total time=   0.1s
[CV 5/5] END ccp_alpha=0.0001, min_samples_leaf=17, min_sample

[CV 2/5] END ccp_alpha=0.0001, min_samples_leaf=9, min_samples_split=13, min_weight_fraction_leaf=0.0002, n_estimators=87;, score=-1.511 total time=   0.3s
[CV 3/5] END ccp_alpha=0.0001, min_samples_leaf=9, min_samples_split=13, min_weight_fraction_leaf=0.0002, n_estimators=87;, score=-1.452 total time=   0.3s
[CV 4/5] END ccp_alpha=0.0001, min_samples_leaf=9, min_samples_split=13, min_weight_fraction_leaf=0.0002, n_estimators=87;, score=-1.612 total time=   0.3s
[CV 5/5] END ccp_alpha=0.0001, min_samples_leaf=9, min_samples_split=13, min_weight_fraction_leaf=0.0002, n_estimators=87;, score=-1.470 total time=   0.3s
[CV 1/5] END ccp_alpha=0.0001, min_samples_leaf=15, min_samples_split=13, min_weight_fraction_leaf=0.1, n_estimators=45;, score=-2.033 total time=   0.1s
[CV 2/5] END ccp_alpha=0.0001, min_samples_leaf=15, min_samples_split=13, min_weight_fraction_leaf=0.1, n_estimators=45;, score=-1.962 total time=   0.1s
[CV 3/5] END ccp_alpha=0.0001, min_samples_leaf=15, min_samples_spli

[CV 1/5] END ccp_alpha=0.0001, min_samples_leaf=12, min_samples_split=12, min_weight_fraction_leaf=0.0, n_estimators=77;, score=-1.685 total time=   0.3s
[CV 2/5] END ccp_alpha=0.0001, min_samples_leaf=12, min_samples_split=12, min_weight_fraction_leaf=0.0, n_estimators=77;, score=-1.524 total time=   0.3s
[CV 3/5] END ccp_alpha=0.0001, min_samples_leaf=12, min_samples_split=12, min_weight_fraction_leaf=0.0, n_estimators=77;, score=-1.505 total time=   0.3s
[CV 4/5] END ccp_alpha=0.0001, min_samples_leaf=12, min_samples_split=12, min_weight_fraction_leaf=0.0, n_estimators=77;, score=-1.627 total time=   0.3s
[CV 5/5] END ccp_alpha=0.0001, min_samples_leaf=12, min_samples_split=12, min_weight_fraction_leaf=0.0, n_estimators=77;, score=-1.488 total time=   0.3s
[CV 1/5] END ccp_alpha=0.0001, min_samples_leaf=18, min_samples_split=11, min_weight_fraction_leaf=0.0002, n_estimators=68;, score=-1.741 total time=   0.3s
[CV 2/5] END ccp_alpha=0.0001, min_samples_leaf=18, min_samples_split=11,

[CV 1/5] END ccp_alpha=0.0001, min_samples_leaf=4, min_samples_split=15, min_weight_fraction_leaf=0.0002, n_estimators=82;, score=-1.642 total time=   0.3s
[CV 2/5] END ccp_alpha=0.0001, min_samples_leaf=4, min_samples_split=15, min_weight_fraction_leaf=0.0002, n_estimators=82;, score=-1.492 total time=   0.3s
[CV 3/5] END ccp_alpha=0.0001, min_samples_leaf=4, min_samples_split=15, min_weight_fraction_leaf=0.0002, n_estimators=82;, score=-1.412 total time=   0.3s
[CV 4/5] END ccp_alpha=0.0001, min_samples_leaf=4, min_samples_split=15, min_weight_fraction_leaf=0.0002, n_estimators=82;, score=-1.616 total time=   0.4s
[CV 5/5] END ccp_alpha=0.0001, min_samples_leaf=4, min_samples_split=15, min_weight_fraction_leaf=0.0002, n_estimators=82;, score=-1.443 total time=   0.4s
[CV 1/5] END ccp_alpha=0.0001, min_samples_leaf=18, min_samples_split=2, min_weight_fraction_leaf=0.005, n_estimators=76;, score=-1.742 total time=   0.3s
[CV 2/5] END ccp_alpha=0.0001, min_samples_leaf=18, min_samples_s

10 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/home/diego/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/diego/.local/lib/python3.10/site-packages/sklearn/ensemble/_forest.py", line 476, in fit
    trees = Parallel(
  File "/home/diego/.local/lib/python3.10/site-packages/joblib/parallel.py", line 1085, in __call__
    if self.dispatch_one_batch(iterator):
  File "/home/diego/.local/lib/python3.10/site-packages/joblib/parallel.py", line 901, in dispatch_one_batch
    self._dispatch(tasks)
  File "

## Validation score

In [10]:
print(
    "The best parameters are %s with a score of %0.5f"
    % (grid.best_params_, grid.best_score_)
)

The best parameters are {'n_estimators': 95, 'min_weight_fraction_leaf': 0.0001, 'min_samples_split': 8, 'min_samples_leaf': 2, 'ccp_alpha': 0.0001} with a score of -1.49606


In [11]:
rf = grid.best_estimator_
rf

In [12]:
# Check error on TR
# MEE for tr
y_pred = rf.predict(x_mlcup_tr)
meeTR = mee(y_mlcup_tr, y_pred)
print("MEE on the training set: %0.5f" %(meeTR))

# MSE for tr
mseTR = mean_squared_error(y_mlcup_tr, y_pred)
print("MSE on the training set: %0.5f" %(mseTR))

MEE on the training set: 0.82239
MSE on the training set: 0.58778


In [13]:
joblib.dump(rf, './../savedModels/rf_RandomSearchCV_mlcup.z')

['./../savedModels/rf_RandomSearchCV_mlcup.z']