In [2]:
import pandas as pd
import os

folder = 'dataset_FE'
filename = 'datos_procesados.csv'
file_path = os.path.join(folder, filename)

try:
    df = pd.read_csv(
        file_path,
        sep=',',
        decimal='.'
    )
    
    print(f"✅ DataFrame cargado exitosamente desde: {file_path}")
    print(f"Dimensiones: {df.shape}")
    print("\nPrimeras 5 filas:")
    print(df.head())

except FileNotFoundError:
    print(f"❌ Error: No se encontró el archivo en la ruta: {file_path}")
except Exception as e:
    print(f"❌ Error al leer el archivo: {e}")

✅ DataFrame cargado exitosamente desde: dataset_FE\datos_procesados.csv
Dimensiones: (1184, 38)

Primeras 5 filas:
          DIA  EE Planta / Hl  EE Elaboracion / Hl  EE Bodega / Hl  \
0  2020-07-01      642.727209            47.145349       69.023256   
1  2020-07-02        7.767254             0.769609        0.798838   
2  2020-07-03        8.801205             0.862593        0.835762   
3  2020-07-04        5.175639             0.439225        0.371077   
4  2020-07-05        7.924665             0.802365        0.717787   

   EE Cocina / Hl  EE Agua / Hl  ET Planta / Hl  ET Elab/Hl  ET Bodega/Hl  \
0        0.000000      4.372093     3506.412338  924.646747    146.731163   
1        0.319229     -0.023412       67.023237   17.419777      2.050417   
2        0.260924      0.126352       73.462669   20.504276      1.970632   
3        0.258048      0.077983       49.022234   17.832753      1.275730   
4        0.301592      0.114267       62.150576   25.156634      1.363221   

 

# Preprocesamiento basico
Vimos que hay columnas que tienen valores negativos y nulos que no deberian tenerlos. Para ello los vamos a imputar como NANS

In [3]:
df.columns

Index(['DIA', 'EE Planta / Hl', 'EE Elaboracion / Hl', 'EE Bodega / Hl',
       'EE Cocina / Hl', 'EE Agua / Hl', 'ET Planta / Hl', 'ET Elab/Hl',
       'ET Bodega/Hl', 'ET Cocina/Hl', 'ET Envasado/Hl', 'Hl de Mosto',
       'Cocimientos Diarios', 'Planta (Kw)', 'Bodega (Kw)', 'Calderas (Kw)',
       'Efluentes (Kw)', 'Frio (Kw)', 'Prod Agua (Kw)', 'KW CO2',
       'KW Enfluentes Hidr', 'Kw Compresores Aire', 'Produccion (Hl)',
       'Temp Tq Intermedio', 'Gas Planta (Mj)', 'ET Envasado (Mj)',
       'ET Servicios (Mj)', 'Tot L3. L4 y Planta de CO2',
       'Tot A40/240/50/60/Centec/Filtro', 'Tot  A130/330/430', 'Tot  Trasiego',
       'Anio', 'Mes', 'Dia', 'Dia_semana', 'Temperatura_amb',
       'Tarifa_electrica', 'estacion'],
      dtype='object')

In [7]:
import numpy as np

def reemplazar_ceros_y_negativos_excepto(df, columnas_excluidas):
    df_modificado = df.copy()

    # Filtramos las columnas numéricas que no están en la lista excluida
    columnas_a_modificar = [
        col for col in df_modificado.select_dtypes(include=[np.number]).columns
        if col not in columnas_excluidas
    ]

    for col in columnas_a_modificar:
        df_modificado.loc[df_modificado[col] <= 0, col] = np.nan

    return df_modificado


df_limpiado=reemplazar_ceros_y_negativos_excepto(df, columnas_excluidas=['Tarifa_electrica',"Temperatura_amb"])

df_limpiado.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1184 entries, 0 to 1183
Data columns (total 38 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   DIA                              1184 non-null   object 
 1   EE Planta / Hl                   1157 non-null   float64
 2   EE Elaboracion / Hl              1157 non-null   float64
 3   EE Bodega / Hl                   1152 non-null   float64
 4   EE Cocina / Hl                   1049 non-null   float64
 5   EE Agua / Hl                     1152 non-null   float64
 6   ET Planta / Hl                   1157 non-null   float64
 7   ET Elab/Hl                       1157 non-null   float64
 8   ET Bodega/Hl                     1151 non-null   float64
 9   ET Cocina/Hl                     950 non-null    float64
 10  ET Envasado/Hl                   1157 non-null   float64
 11  Hl de Mosto                      1049 non-null   float64
 12  Cocimientos Diarios 

In [16]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PowerTransformer
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy import stats

# ============================================
# 1️⃣ Custom Transformer: eliminar outliers por Z-score
# ============================================

class ZScoreOutlierRemover(BaseEstimator, TransformerMixin):
    """Reemplaza por NaN los valores cuyo |z-score| > threshold."""
    def __init__(self, threshold=3.0):
        self.threshold = threshold

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_numeric = pd.DataFrame(X).copy()
        z_scores = np.abs(stats.zscore(X_numeric, nan_policy='omit'))
        X_numeric[z_scores > self.threshold] = np.nan
        return X_numeric.values

# ============================================
# 2️⃣ Custom Transformer: codificación seno/coseno de fechas
# ============================================

class DateSinCosEncoder(BaseEstimator, TransformerMixin):
    """
    Convierte una columna de fechas en dos columnas: día del año codificado con seno y coseno.
    Acepta tanto DataFrame como ndarray (como lo envía ColumnTransformer).
    """
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Si X viene como ndarray (una sola columna), convertir a Serie
        if isinstance(X, np.ndarray):
            X = pd.Series(X.ravel())
        elif isinstance(X, pd.DataFrame):
            X = X.iloc[:, 0]  # Tomar la primera columna
        
        X = pd.to_datetime(X, errors='coerce')
        day_of_year = X.dt.dayofyear.fillna(0)
        
        sin_day = np.sin(2 * np.pi * day_of_year / 365)
        cos_day = np.cos(2 * np.pi * day_of_year / 365)
        
        return np.column_stack((sin_day, cos_day))

# ============================================
# 3️⃣ Preparar los datos
# ============================================

target = "Frio (Kw)"
categorical_features = ["Dia_semana", "estacion"]
date_feature = "DIA"

X = df_limpiado.drop(columns=[target])  # mantenemos la fecha
y = df_limpiado[target]

numeric_features = X.select_dtypes(include=[np.number]).columns.difference(categorical_features + [date_feature]).tolist()

# ============================================
# 4️⃣ Pipeline numérico
# ============================================

num_pipeline = Pipeline(steps=[
    ("outlier_remover", ZScoreOutlierRemover(threshold=3.0)),
    ("imputer", KNNImputer(n_neighbors=5)),
    ("yeojohnson", PowerTransformer(method='yeo-johnson')),
    ("scaler", StandardScaler())
])

# ============================================
# 5️⃣ Pipeline categórico
# ============================================

cat_pipeline = Pipeline(steps=[
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

# ============================================
# 6️⃣ Pipeline de fecha (seno/coseno)
# ============================================

date_pipeline = Pipeline(steps=[
    ("date_encoder", DateSinCosEncoder())
])

# ============================================
# 7️⃣ ColumnTransformer combinado
# ============================================

preprocessor = ColumnTransformer(transformers=[
    ("num", num_pipeline, numeric_features),
    ("cat", cat_pipeline, categorical_features),
    ("date", date_pipeline, [date_feature])
])

# ============================================
# 8️⃣ Pipeline completo con modelo
# ============================================

full_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(n_estimators=200, random_state=42))
])

# ============================================
# 9️⃣ Entrenar y evaluar el modelo
# ============================================

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

full_pipeline.fit(X_train, y_train)
y_pred = full_pipeline.predict(X_test)

# --- Métricas ---
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("✅ Transformación y entrenamiento completados.")
print(f"MAE  : {mae:.4f}")
print(f"MSE  : {mse:.4f}")
print(f"RMSE : {rmse:.4f}")
print(f"R²   : {r2:.4f}")


✅ Transformación y entrenamiento completados.
MAE  : 82544.6760
MSE  : 222242518330.8972
RMSE : 471426.0476
R²   : -0.6986


