### Imported libreries

In [None]:
# Data manipulation

import pandas as pd
import numpy as np

# Modeling

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

### Selection of the best dataset

In [2]:
X_train_WITH_outliers = pd.read_excel("../data/processed/X_train_WITH_outliers.xlsx")
X_train_WITH_outliers_norm = pd.read_excel("../data/processed/X_train_WITH_outliers_norm.xlsx")
X_train_WITH_outliers_scal = pd.read_excel("../data/processed/X_train_WITH_outliers_scal.xlsx")
X_train_WITHOUT_outliers = pd.read_excel("../data/processed/X_train_WITHOUT_outliers.xlsx")
X_train_WITHOUT_outliers_norm = pd.read_excel("../data/processed/X_train_WITHOUT_outliers_norm.xlsx")
X_train_WITHOUT_outliers_scal = pd.read_excel("../data/processed/X_train_WITHOUT_outliers_scal.xlsx")

y_train = pd.read_excel("../data/processed/y_train.xlsx")
y_test = pd.read_excel("../data/processed/y_test.xlsx")

In [9]:
train_datasets = {
    "X_train_WITH_outliers": X_train_WITH_outliers,
    "X_train_WITH_outliers_norm": X_train_WITH_outliers_norm,
    "X_train_WITH_outliers_scal": X_train_WITH_outliers_scal,
    "X_train_WITHOUT_outliers": X_train_WITHOUT_outliers,
    "X_train_WITHOUT_outliers_norm": X_train_WITHOUT_outliers_norm,
    "X_train_WITHOUT_outliers_scal": X_train_WITHOUT_outliers_scal
}

models = {}
metrics = {}

for name, dataset in train_datasets.items():
  model = DecisionTreeClassifier()
  model.fit(dataset, y_train)
  y_pred = model.predict(dataset)
  

  acc = accuracy_score(y_train, y_pred)
  f1 = f1_score(y_train, y_pred)
  prec = precision_score(y_train, y_pred)
  rec = recall_score(y_train, y_pred)

  models[name] = model
  metrics[name] = {"Accuracy": acc, "F1": f1, "Precision": prec, "Recall": rec}

  print(f"{name}: Accuracy={acc:.4f}, F1={f1:.4f}, Precision={prec:.4f}, Recall={rec:.4f}")

best_dataset_name = max(metrics, key=lambda k: metrics[k]["F1"])
best_X_train = train_datasets[best_dataset_name]

best_test_name = best_dataset_name.replace("X_train", "X_test")
best_X_test = pd.read_excel(f"../data/processed/{best_test_name}.xlsx")

print("\n================ CONCLUSION ================")
print(f"Best dataset: {best_dataset_name}")
print(f"Accuracy: {metrics[best_dataset_name]['Accuracy']:.4f}")
print(f"F1-score: {metrics[best_dataset_name]['F1']:.4f}")
print(f"Precision: {metrics[best_dataset_name]['Precision']:.4f}")
print(f"Recall: {metrics[best_dataset_name]['Recall']:.4f}")

X_train_WITH_outliers: Accuracy=1.0000, F1=1.0000, Precision=1.0000, Recall=1.0000
X_train_WITH_outliers_norm: Accuracy=1.0000, F1=1.0000, Precision=1.0000, Recall=1.0000
X_train_WITH_outliers_scal: Accuracy=1.0000, F1=1.0000, Precision=1.0000, Recall=1.0000
X_train_WITHOUT_outliers: Accuracy=1.0000, F1=1.0000, Precision=1.0000, Recall=1.0000
X_train_WITHOUT_outliers_norm: Accuracy=1.0000, F1=1.0000, Precision=1.0000, Recall=1.0000
X_train_WITHOUT_outliers_scal: Accuracy=1.0000, F1=1.0000, Precision=1.0000, Recall=1.0000

Best dataset: X_train_WITH_outliers
Accuracy: 1.0000
F1-score: 1.0000
Precision: 1.0000
Recall: 1.0000


### Hyperparameter optimization

In [17]:
alpha = [0.001, 0.01, 0.1, 1.0]
max_iter = [1000, 5000, 10000]

params_models = {
    "Ridge": {
        "model": Ridge(random_state=10),
        "params": {
            "alpha": alpha,
            "max_iter": max_iter
        }
    },
    "Lasso": {
        "model": Lasso(random_state=10),
        "params": {
            "alpha": alpha,
            "max_iter": max_iter
        }
    },
    "ElasticNet": {
        "model": ElasticNet(random_state=10),
        "params": {
            "alpha": alpha,
            "max_iter": max_iter,
            "l1_ratio": [0.25, 0.5, 0.75]
        }
    }
}

#### Grid Search

In [18]:
best_models_grid = {}
results_grid = {}

for name, config in params_models.items():
    print(f"\n===== GridSearchCV RESULTS for {name} =====")
    
    grid_search = GridSearchCV(
        estimator=config["model"],
        param_grid=config["params"],
        scoring="r2",
        n_jobs=-1
    )
    
    grid_search.fit(best_X_train, y_train)
    
    best_model = grid_search.best_estimator_
    best_score = grid_search.best_score_
    best_params = grid_search.best_params_
    
    best_models_grid[name] = best_model
    results_grid[name] = {
        "R2": best_score,
        "Params": best_params
    }
    
    print("Best params:", best_params)
    print("The best model is:", best_model)
    print("The R2 score (CV) for this model is:", best_score)

best_model_name = max(results_grid, key=lambda k: results_grid[k]["R2"])
best_model_info = results_grid[best_model_name]

print("\n================ FINAL CONCLUSION ================")
print(f"Best model: {best_model_name}")
print(f"Best R2 (CV): {best_model_info['R2']:.4f}")
print(f"Best parameters: {best_model_info['Params']}")
print(f"Model object: {best_models_grid[best_model_name]}")


===== GridSearchCV RESULTS for Ridge =====
Best params: {'alpha': 1.0, 'max_iter': 1000}
The best model is: Ridge(max_iter=1000, random_state=10)
The R2 score (CV) for this model is: 0.7451463256169447

===== GridSearchCV RESULTS for Lasso =====
Best params: {'alpha': 1.0, 'max_iter': 1000}
The best model is: Lasso(random_state=10)
The R2 score (CV) for this model is: 0.7450750560668469

===== GridSearchCV RESULTS for ElasticNet =====
Best params: {'alpha': 0.01, 'l1_ratio': 0.75, 'max_iter': 1000}
The best model is: ElasticNet(alpha=0.01, l1_ratio=0.75, random_state=10)
The R2 score (CV) for this model is: 0.7451504038500277

Best model: ElasticNet
Best R2 (CV): 0.7452
Best parameters: {'alpha': 0.01, 'l1_ratio': 0.75, 'max_iter': 1000}
Model object: ElasticNet(alpha=0.01, l1_ratio=0.75, random_state=10)


#### Random Search

In [19]:
best_models_random = {}
results_random = {}

for name, config in params_models.items():
    print(f"\n===== RandomizedSearchCV RESULTS for {name} =====")
    
    random_search = RandomizedSearchCV(
        estimator=config["model"],
        param_distributions=config["params"],
        n_iter=20,
        scoring="r2",
        n_jobs=-1,
        random_state=10
    )
    
    random_search.fit(best_X_train, y_train)
    
    best_model = random_search.best_estimator_
    best_score = random_search.best_score_
    best_params = random_search.best_params_
    
    best_models_random[name] = best_model
    results_random[name] = {
        "R2": best_score,
        "Params": best_params
    }
    
    print("Best params:", best_params)
    print("The best model is:", best_model)
    print("The R2 score (CV) for this model is:", best_score)

best_model_name = max(results_random, key=lambda k: results_random[k]["R2"])
best_model_info = results_random[best_model_name]

print("\n================ FINAL CONCLUSION (RANDOMIZED) ================")
print(f"Best model: {best_model_name}")
print(f"Best R2 (CV): {best_model_info['R2']:.4f}")
print(f"Best parameters: {best_model_info['Params']}")
print(f"Model object: {best_models_random[best_model_name]}")



===== RandomizedSearchCV RESULTS for Ridge =====




Best params: {'max_iter': 1000, 'alpha': 1.0}
The best model is: Ridge(max_iter=1000, random_state=10)
The R2 score (CV) for this model is: 0.7451463256169447

===== RandomizedSearchCV RESULTS for Lasso =====




Best params: {'max_iter': 1000, 'alpha': 1.0}
The best model is: Lasso(random_state=10)
The R2 score (CV) for this model is: 0.7450750560668469

===== RandomizedSearchCV RESULTS for ElasticNet =====
Best params: {'max_iter': 10000, 'l1_ratio': 0.25, 'alpha': 0.001}
The best model is: ElasticNet(alpha=0.001, l1_ratio=0.25, max_iter=10000, random_state=10)
The R2 score (CV) for this model is: 0.745126883567952

Best model: Ridge
Best R2 (CV): 0.7451
Best parameters: {'max_iter': 1000, 'alpha': 1.0}
Model object: Ridge(max_iter=1000, random_state=10)


In [20]:
final_model = best_models_grid[best_model_name]

final_model.fit(best_X_train, y_train)

y_pred_train = final_model.predict(best_X_train)
mse_train = mean_squared_error(y_train, y_pred_train)
r2_train = r2_score(y_train, y_pred_train)
mae_train = mean_absolute_error(y_train, y_pred_train)

print("===== TRAIN METRICS =====")
print(f"R2: {r2_train:.4f}")
print(f"MSE: {mse_train:.2f}")
print(f"MAE: {mae_train:.2f}")

y_pred_test = final_model.predict(best_X_test)
mse_test = mean_squared_error(y_test, y_pred_test)
r2_test = r2_score(y_test, y_pred_test)
mae_test = mean_absolute_error(y_test, y_pred_test)

print("\n===== TEST METRICS =====")
print(f"R2: {r2_test:.4f}")
print(f"MSE: {mse_test:.2f}")
print(f"MAE: {mae_test:.2f}")

===== TRAIN METRICS =====
R2: 0.7540
MSE: 36739333.29
MAE: 4190.05

===== TEST METRICS =====
R2: 0.7352
MSE: 35840246.02
MAE: 4057.47
