In [1]:
%load_ext autoreload
%autoreload 2
import sys
import os

# Añadir el directorio raíz al path para poder importar src
sys.path.append(os.path.abspath(os.path.join('..', '..')))

from src import config, data_loader
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# 1. Cargar
df = data_loader.load_raw_data()

Cargando datos desde: C:\Users\PC\Documents\proyecto-final-ml\data\raw\dataset_practica_final.csv
Datos cargados. Shape inicial: (119390, 32)


In [3]:
# 2. Limpiar
df = data_loader.clean_data(df)

In [4]:
# 3. Transformar
X, y = data_loader.encode_data(df)

Transformando variables categóricas...
Encoding completado. Features: 247


In [5]:
# 4. Inspeccionar
print("\nEjemplo de las nuevas columnas:")
print(X.columns[:10]) # Muestra las primeras 10
print(f"\nNúmero total de columnas (Features): {X.shape[1]}")


Ejemplo de las nuevas columnas:
Index(['lead_time', 'arrival_date_year', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies',
       'is_repeated_guest'],
      dtype='object')

Número total de columnas (Features): 247


---------------------

In [6]:

# 1. Pipeline de Datos Completo
df = data_loader.load_raw_data()
df = data_loader.clean_data(df)
X, y = data_loader.encode_data(df)

# 2. Split y Escalado nuevo
X_train, X_test, y_train, y_test = data_loader.split_and_scale(X, y)

# 3. Comprobación visual
print("\nMedia de lead_time en Train (debe ser cercana a 0):")
print(X_train['lead_time'].mean())

print("\nDesviación estándar de lead_time en Train (debe ser cercana a 1):")
print(X_train['lead_time'].std())

Cargando datos desde: C:\Users\PC\Documents\proyecto-final-ml\data\raw\dataset_practica_final.csv
Datos cargados. Shape inicial: (119390, 32)
Transformando variables categóricas...
Encoding completado. Features: 247
Dividiendo datos (Test size: 0.2)...
Escalando datos (StandardScaler)...
Datos listos para entrar al modelo.
   Train shape: (69916, 247)
   Test shape:  (17480, 247)

Media de lead_time en Train (debe ser cercana a 0):
4.837495597886145e-17

Desviación estándar de lead_time en Train (debe ser cercana a 1):
1.000007151515585


In [7]:
from src import model_trainer, evaluator

# 4.1 Árboles
dt = model_trainer.train_decision_tree(X_train, y_train).model
rf = model_trainer.train_random_forest(X_train, y_train).model

# 4.2 Boosting
xgb = model_trainer.train_xgboost(X_train, y_train).model
lgbm = model_trainer.train_lightgbm(X_train, y_train).model

# 4.3 Red neuronal (usa validación interna sencilla: aquí reutilizamos test como val para parar early,
# si queréis hacerlo “bien”, hacemos un split train/val aparte)
nn = model_trainer.train_neural_network(X_train, y_train, X_test, y_test).model

# Comparación (Accuracy)
results = evaluator.evaluate_many(
    {
        "DecisionTree": dt,
        "RandomForest": rf,
        "XGBoost": xgb,
        "LightGBM": lgbm,
        "NeuralNet": nn,   # nn es el dict que devuelve el trainer (model + history)
    },
    X_test,
    y_test,
)

results


Entrenando Decision Tree...
Entrenamiento completado.
Modelo guardado en: C:\Users\PC\Documents\proyecto-final-ml\models\tree_decision.joblib
Entrenando Random Forest...
Entrenamiento completado.
Modelo guardado en: C:\Users\PC\Documents\proyecto-final-ml\models\tree_random_forest.joblib
Entrenando XGBoost...
Entrenamiento completado.
Modelo guardado en: C:\Users\PC\Documents\proyecto-final-ml\models\boost_xgboost.joblib
Entrenando LightGBM...
[LightGBM] [Info] Number of positive: 19220, number of negative: 50696
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010045 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1393
[LightGBM] [Info] Number of data points in the train set: 69916, number of used features: 134
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.274901 -> initscore=-0.969896
[LightGBM] [Info] Start training from score -0.



Entrenando red neuronal (MLP)...
Epoch 1/40
[1m1093/1093[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7915 - loss: 0.4436 - val_accuracy: 0.8101 - val_loss: 0.3960
Epoch 2/40
[1m1093/1093[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8139 - loss: 0.3929 - val_accuracy: 0.8177 - val_loss: 0.3792
Epoch 3/40
[1m1093/1093[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8224 - loss: 0.3765 - val_accuracy: 0.8224 - val_loss: 0.3742
Epoch 4/40
[1m1093/1093[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8254 - loss: 0.3687 - val_accuracy: 0.8249 - val_loss: 0.3653
Epoch 5/40
[1m1093/1093[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8300 - loss: 0.3620 - val_accuracy: 0.8279 - val_loss: 0.3619
Epoch 6/40
[1m1093/1093[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8328 - loss: 0.3557 - val_accuracy: 0.8319 - 

Unnamed: 0,model,accuracy
0,LightGBM,0.854233
1,XGBoost,0.852403
2,RandomForest,0.847998
3,NeuralNet,0.837757
4,DecisionTree,0.797941


In [8]:
# 5. Baseline (Regresión Logística)
from src import model_trainer, evaluator

train_result = model_trainer.train_logistic_regression(X_train, y_train)
baseline_model = train_result.model

# 6. Evaluación (Accuracy)
eval_result = evaluator.evaluate_classifier(baseline_model, X_test, y_test)
print("\nClassification report:\n")
print(eval_result.text_report)


Entrenando baseline (Regresión Logística)...
Entrenamiento completado.
Modelo guardado en: C:\Users\PC\Documents\proyecto-final-ml\models\baseline_logreg.joblib
Evaluando modelo...
Accuracy: 0.7943
Reporte guardado en: C:\Users\PC\Documents\proyecto-final-ml\outputs\baseline_report.txt

Classification report:

              precision    recall  f1-score   support

           0     0.8235    0.9118    0.8654     12675
           1     0.6756    0.4845    0.5643      4805

    accuracy                         0.7943     17480
   macro avg     0.7495    0.6981    0.7148     17480
weighted avg     0.7828    0.7943    0.7826     17480

