In [10]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
import os

folder = '../data/processed'
filename = 'dataset_final.csv'
file_path = os.path.join(folder, filename)

try:
    df = pd.read_csv(
        file_path,
        sep=',',
        decimal='.'
    )
    
    print(f"✅ DataFrame cargado exitosamente desde: {file_path}")
    print(f"Dimensiones: {df.shape}")
    print("\nPrimeras 5 filas:")
    print(df.head())

except FileNotFoundError:
    print(f"❌ Error: No se encontró el archivo en la ruta: {file_path}")
except Exception as e:
    print(f"❌ Error al leer el archivo: {e}")

✅ DataFrame cargado exitosamente desde: ../data/processed\dataset_final.csv
Dimensiones: (1184, 106)

Primeras 5 filas:
          DIA  Frio (Kw)  Hl de Mosto  Sala Maq (Kw)  Servicios (Kw)  \
0  2020-07-01    23954.0          0.0        17080.0         23848.0   
1  2020-07-02    28268.0       2907.0        27216.0         38033.0   
2  2020-07-03    24246.0       4829.0        31386.0         42565.5   
3  2020-07-04    29885.0       7828.0        28070.0         39650.0   
4  2020-07-05    24449.0       6406.0        33463.0         45385.0   

   KW Gral Planta  Planta (Kw)  Agua Planta (Hl)  Planta de agua (Hl)  \
0         59058.0     27637.27           10280.0             11241.40   
1        131184.0     54409.81           13970.0             22107.77   
2        136078.0     65685.59           36300.0             46955.43   
3        139714.0     67098.54           40120.0             51124.18   
4        146862.0     70600.64           38940.0             49146.08   

   KW Tr

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# 1️⃣ Cargar dataset definitivo
ruta_csv = "../data/processed/X_test_preproc.csv"
X_test = pd.read_csv(ruta_csv, sep=',', decimal='.')
ruta_csv = "../data/processed/X_train_preproc.csv"
X_train = pd.read_csv(ruta_csv, sep=',', decimal='.')
ruta_csv = "../data/processed/y_test.csv"
y_test = pd.read_csv(ruta_csv, sep=',', decimal='.')
ruta_csv = "../data/processed/y_train.csv"
y_train = pd.read_csv(ruta_csv, sep=',', decimal='.')


print("Shape dataset:", df.shape)

# 4️⃣ Crear y entrenar Random Forest
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=None,
    n_jobs=-1,
    random_state=42
)
rf.fit(X_train, y_train)

# 5️⃣ Predicciones
y_pred_train = rf.predict(X_train)
y_pred_test  = rf.predict(X_test)

# 6️⃣ Evaluación
mse_train = mean_squared_error(y_train, y_pred_train)
mse_test  = mean_squared_error(y_test, y_pred_test)
mae_train = mean_absolute_error(y_train, y_pred_train)
mae_test  = mean_absolute_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test  = r2_score(y_test, y_pred_test)

print(f"Train MSE: {mse_train:.2f}, MAE: {mae_train:.2f}, R2: {r2_train:.2f}")
print(f"Test  MSE: {mse_test:.2f}, MAE: {mae_test:.2f}, R2: {r2_test:.2f}")

# 7️⃣ Importancia de variables
importances = pd.Series(rf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
print("Top 10 variables más importantes:")
print(importances.head(10))


Shape dataset: (1184, 106)


  return fit_method(estimator, *args, **kwargs)


Train MSE: 21222890427.78, MAE: 14019.85, R2: 0.87
Test  MSE: 172445316949.77, MAE: 50576.92, R2: -0.98
Top 10 variables más importantes:
0     0.577008
17    0.097612
13    0.092980
25    0.029337
26    0.028650
20    0.016748
9     0.013252
3     0.013176
28    0.013160
7     0.011290
dtype: float64
