In [5]:
import os
import pickle
import statistics

# to save models
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from numpy import linalg as LA
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import learning_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.utils import shuffle

In [6]:
# Definition of the MEE (Mean Euclidean Error)
def mee(y_true, y_pred):
    """Return the Mean Euclidean Error between targets and predictions.

    For every sample the Euclidean (L2) norm of the residual vector is
    computed across the output dimensions (axis 1); the per-sample norms
    are then averaged.

    Parameters
    ----------
    y_true : array-like of shape (n_samples, n_outputs) or (n_samples,)
        Ground-truth target values.
    y_pred : array-like of shape (n_samples, n_outputs) or (n_samples,)
        Predicted target values.

    Returns
    -------
    float
        Mean of the per-sample Euclidean distances.

    Notes
    -----
    Inputs are converted with ``np.asarray`` so lists and DataFrames are
    accepted; values are compared positionally.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    residuals = y_true - y_pred
    # Scalar or single-output (1-D) targets have no axis 1 to reduce over:
    # treat them as one output column, so the error is the mean absolute residual.
    if residuals.ndim < 2:
        residuals = residuals.reshape(-1, 1)

    return np.mean(np.sqrt(np.sum(np.square(residuals), axis=1)))

In [7]:
# Wrap mee as a scikit-learn scorer so it can be passed as `scoring=` to the searches below.
# greater_is_better=False declares mee a loss to be minimised; per the scikit-learn
# make_scorer documentation the scorer then sign-flips the value, so scores reported by
# a search (e.g. best_score_) are expected to be -MEE.
MEE = make_scorer(mee, greater_is_better=False)

## Dataset

In [9]:
# Column names applied to the CSV: an id, nine inputs (a1..a9) and two targets.
colnames = [
    'id',
    'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8', 'a9',
    'class1', 'class2',
]

# Read the internal training split and discard the id column.
mlcup_tr = pd.read_csv("data/ML-CUP22-INTERNAL-TR.csv", sep=",", names=colnames)
mlcup_tr = mlcup_tr.drop(columns="id")

# After dropping id: columns 0-8 are the inputs, columns 9-10 are the targets.
x_mlcup_tr = mlcup_tr.iloc[:, :9].values
y_mlcup_tr = mlcup_tr.iloc[:, 9:11].values

## Decision tree

In [None]:
# Exhaustive grid over the tree's pruning and size constraints:
# 11 * 10 * 3 * 4 * 7 = 9,240 candidates, each evaluated with 5-fold CV.
param_grid = {
    'ccp_alpha': [0.0, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001, 0.5, 0.2, 0.0002],
    'min_weight_fraction_leaf': [0.0, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001, 0.5, 0.2],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3, 4],
    'max_depth': [5,7,10, 20, 30, 40, 50]
}


# Fix the seed: the tree permutes features at each split, so ties between
# equally good splits would otherwise be broken differently on every run.
dt = DecisionTreeRegressor(random_state=42)

grid = GridSearchCV(
    dt,
    param_grid,
    cv=5,
    scoring=MEE,      # custom Mean Euclidean Error scorer
    verbose=4,
    refit = True,     # refit the best candidate on the full training set
    n_jobs=-1         # 46,200 fits in total: use all available cores
)
grid.fit(x_mlcup_tr, y_mlcup_tr)

## Validation score

In [None]:
# Report the winning hyper-parameters and the score the search assigned to them.
# The score comes from the MEE scorer passed as `scoring=`; since that scorer was
# built with greater_is_better=False, the printed value is expected to be negative.
print(
    "The best parameters are %s with a score of %0.5f" % (
        grid.best_params_,
        grid.best_score_,
    )
)

In [None]:
# Keep the estimator that the search refit with the best parameters
# (the search was created with refit=True).
dt = grid.best_estimator_
# Bare last expression so the notebook displays the selected estimator.
dt

## Check error on the training set (TR)

In [None]:
# Predict on the same data the search was fitted on, to measure the training error.
y_pred = dt.predict(x_mlcup_tr)

# MEE on the training set
meeTR = mee(y_mlcup_tr, y_pred)
print("MEE on the training set: %0.5f" %(meeTR))

# MSE on the training set (the label used to say "test set", but both
# metrics in this cell are computed on the training data)
mseTR = mean_squared_error(y_mlcup_tr, y_pred)
print("MSE on the training set: %0.5f" %(mseTR))

In [None]:
# Persist the tuned tree. Create the target directory first so the dump cannot
# fail (and discard a long search) just because savedModels/ is missing.
os.makedirs('savedModels', exist_ok=True)
joblib.dump(dt, 'savedModels/dt_mlcup.z')

## Randomized search (RandomizedSearchCV)

In [None]:

# Candidate values from which the randomized search samples each hyper-parameter.
param_grid = {
    'ccp_alpha': [0.0, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001, 0.5, 0.2, 0.0002],
    'min_weight_fraction_leaf': [0.0, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001, 0.5, 0.2],
    # Starts at 2: scikit-learn rejects an integer min_samples_split below 2,
    # so sampling 1 would only produce failed fits.
    'min_samples_split': np.arange(2,20),
    'min_samples_leaf': np.arange(1,20),
    'max_depth': np.arange(5,90)
}


# Seed the tree so ties between equally good splits are broken reproducibly.
dt = DecisionTreeRegressor(random_state=42)

grid = RandomizedSearchCV(
    dt,
    param_grid,
    n_iter=10,        # number of sampled candidates (the library default, made explicit)
    cv=5,
    scoring=MEE,      # custom Mean Euclidean Error scorer
    verbose=4,
    refit = True,     # refit the best candidate on the full training set
    random_state=42   # make the sampled candidates reproducible across runs
)


grid.fit(x_mlcup_tr, y_mlcup_tr)

## Validation score

In [None]:

# Report the best sampled hyper-parameters and the score the search assigned to them.
# The score comes from the MEE scorer passed as `scoring=`; since that scorer was
# built with greater_is_better=False, the printed value is expected to be negative.
print(
    "The best parameters are %s with a score of %0.5f" % (
        grid.best_params_,
        grid.best_score_,
    )
)

In [None]:
# Keep the estimator that the randomized search refit with the best parameters
# (the search was created with refit=True).
dt = grid.best_estimator_
# Bare last expression so the notebook displays the selected estimator.
dt

In [None]:
# Training-set predictions from the estimator selected by the randomized search.
y_pred = dt.predict(x_mlcup_tr)

# Mean Euclidean Error on the training set
meeTR = mee(y_mlcup_tr, y_pred)
print("MEE on the training set: %0.5f" % meeTR)

# Mean Squared Error on the training set
mseTR = mean_squared_error(y_mlcup_tr, y_pred)
print("MSE on the training set: %0.5f" % mseTR)

In [None]:
# Persist the tree chosen by the randomized search. Create the target directory
# first so the dump cannot fail just because savedModels/ is missing.
os.makedirs('savedModels', exist_ok=True)
joblib.dump(dt, 'savedModels/dt_randomSearchCV_mlcup.z')