In [42]:
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn import tree
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.model_selection import learning_curve
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import statistics
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import joblib
from numpy import linalg as LA

#to save models
from sklearn.linear_model import LogisticRegression
import pickle

In [43]:
# Def of MME error
def mee(y_true, y_pred):
    return np.mean(np.sqrt(np.sum(np.square(y_true - y_pred), axis=1)))

In [44]:
MEE = make_scorer(mee, greater_is_better=False)

## Dataset

In [45]:
colnames = ['id', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8', 'a9', 'class1', 'class2']
mlcup_tr = pd.read_csv("data/ML-CUP22-INTERNAL-TR.csv", sep = ",", names=colnames)
mlcup_ts = pd.read_csv("data/ML-CUP22-INTERNAL-TS.csv", sep = ",", names=colnames)

x_mlcup_tr = mlcup_tr.iloc[:, 0:9].values
y_mlcup_tr = mlcup_tr.iloc[:, 9:11].values

x_mlcup_ts = mlcup_ts.iloc[:, 0:9].values
y_mlcup_ts = mlcup_ts.iloc[:, 9:11].values

## Decision tree

In [46]:
param_grid = {
    'ccp_alpha': [0.0, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001, 0.5, 0.2, 0.0002],
    'min_weight_fraction_leaf': [0.0, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001, 0.5, 0.2],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3, 4],
    'max_depth': [5,7,10, 20, 30, 40, 50]
}


dt = DecisionTreeRegressor()

grid = GridSearchCV(
    dt,
    param_grid,
    cv=5,
    scoring=MEE,
    verbose=4,
    refit = True
)
grid.fit(x_mlcup_tr, y_mlcup_tr)

Fitting 5 folds for each of 9240 candidates, totalling 46200 fits
[CV 1/5] END ccp_alpha=0.0, max_depth=5, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0;, score=-1.171 total time=   0.0s
[CV 2/5] END ccp_alpha=0.0, max_depth=5, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0;, score=-1.147 total time=   0.0s
[CV 3/5] END ccp_alpha=0.0, max_depth=5, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0;, score=-1.223 total time=   0.0s
[CV 4/5] END ccp_alpha=0.0, max_depth=5, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0;, score=-1.233 total time=   0.0s
[CV 5/5] END ccp_alpha=0.0, max_depth=5, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0;, score=-1.302 total time=   0.0s
[CV 1/5] END ccp_alpha=0.0, max_depth=5, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.1;, score=-1.503 total time=   0.0s
[CV 2/5] END ccp_alpha=0.0, max_depth=5, min_samples_leaf=1, min

## Validation score

In [47]:
print(
    "The best parameters are %s with a score of %0.5f"
    % (grid.best_params_, grid.best_score_)
)

The best parameters are {'ccp_alpha': 0.001, 'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.01} with a score of -1.18724


In [48]:
dt = grid.best_estimator_
dt

## Check error on TR

In [49]:
# MEE for tr
y_pred = dt.predict(x_mlcup_tr)
meeTR = mee(y_mlcup_tr, y_pred)
print("MEE on the training set: %0.5f" %(meeTR))

# MSE for tr
mseTR = mean_squared_error(y_mlcup_tr, y_pred)
print("MSE on the test set: %0.5f" %(mseTR))

MEE on the training set: 0.92258
MSE on the test set: 0.66102


In [50]:
joblib.dump(dt, 'savedModels/dt_mlcup.z')

['savedModels/dt_mlcup.z']

## Randomized GridSearchCV

In [51]:

param_grid = {
    'ccp_alpha': [0.0, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001, 0.5, 0.2, 0.0002],
    'min_weight_fraction_leaf': [0.0, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001, 0.5, 0.2],
    'min_samples_split': np.arange(1,20),
    'min_samples_leaf': np.arange(1,20),
    'max_depth': np.arange(5,90)
}


dt = DecisionTreeRegressor()

grid = RandomizedSearchCV(
    dt,
    param_grid,
    cv=5,
    scoring=MEE,
    verbose=4,
    refit = True
)


grid.fit(x_mlcup_tr, y_mlcup_tr)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END ccp_alpha=0.0005, max_depth=22, min_samples_leaf=10, min_samples_split=18, min_weight_fraction_leaf=0.0001;, score=-1.208 total time=   0.0s
[CV 2/5] END ccp_alpha=0.0005, max_depth=22, min_samples_leaf=10, min_samples_split=18, min_weight_fraction_leaf=0.0001;, score=-1.123 total time=   0.0s
[CV 3/5] END ccp_alpha=0.0005, max_depth=22, min_samples_leaf=10, min_samples_split=18, min_weight_fraction_leaf=0.0001;, score=-1.200 total time=   0.0s
[CV 4/5] END ccp_alpha=0.0005, max_depth=22, min_samples_leaf=10, min_samples_split=18, min_weight_fraction_leaf=0.0001;, score=-1.248 total time=   0.0s
[CV 5/5] END ccp_alpha=0.0005, max_depth=22, min_samples_leaf=10, min_samples_split=18, min_weight_fraction_leaf=0.0001;, score=-1.224 total time=   0.0s
[CV 1/5] END ccp_alpha=0.2, max_depth=66, min_samples_leaf=6, min_samples_split=3, min_weight_fraction_leaf=0.0001;, score=-1.474 total time=   0.0s
[CV 2/5] END ccp_alp

## Validation score

In [52]:

print(
    "The best parameters are %s with a score of %0.5f"
    % (grid.best_params_, grid.best_score_)
)

The best parameters are {'min_weight_fraction_leaf': 0.0001, 'min_samples_split': 17, 'min_samples_leaf': 6, 'max_depth': 14, 'ccp_alpha': 0.0} with a score of -1.19378


In [53]:
dt = grid.best_estimator_
dt

In [54]:
# MEE for tr
y_pred = dt.predict(x_mlcup_tr)
meeTR = mee(y_mlcup_tr, y_pred)
print("MEE on the training set: %0.5f" %(meeTR))

# MSE for tr
mseTR = mean_squared_error(y_mlcup_tr, y_pred)
print("MSE on the training set: %0.5f" %(mseTR))

MEE on the training set: 0.78722
MSE on the training set: 0.49515


In [55]:
joblib.dump(dt, 'savedModels/dt_randomSearchCV_mlcup.z')

['savedModels/dt_randomSearchCV_mlcup.z']