In [None]:
import pandas as pd
import os

# Location of the processed dataset, relative to the notebook's working directory.
folder = 'dataset_FE'
filename = 'datos_procesados.csv'
file_path = os.path.join(folder, filename)

# Only the read itself is guarded. After printing a readable message the
# exception is re-raised: if the load fails, `df` is never defined, and letting
# the notebook continue would only surface later as an unrelated NameError.
try:
    df = pd.read_csv(
        file_path,
        sep=',',
        decimal='.'
    )
except FileNotFoundError:
    print(f"❌ Error: No se encontró el archivo en la ruta: {file_path}")
    raise
except Exception as e:
    print(f"❌ Error al leer el archivo: {e}")
    raise

# Quick sanity check of what was loaded: source path, shape and first rows.
print(f"✅ DataFrame cargado exitosamente desde: {file_path}")
print(f"Dimensiones: {df.shape}")
print("\nPrimeras 5 filas:")
print(df.head())

✅ DataFrame cargado exitosamente desde: dataset_FE\datos_procesados.csv
Dimensiones: (1184, 38)

Primeras 5 filas:
          DIA  Servicios (Kw)  KW Gral Planta  Planta (Kw)  Agua Planta (Hl)  \
0  2020-07-01         23848.0         59058.0     27637.27           10280.0   
1  2020-07-02         38033.0        131184.0     54409.81           13970.0   
2  2020-07-03         42565.5        136078.0     65685.59           36300.0   
3  2020-07-04         39650.0        139714.0     67098.54           40120.0   
4  2020-07-05         45385.0        146862.0     70600.64           38940.0   

   Planta de agua (Hl)  Frio (Kw)  KW Trafo 10  Produccion (Hl)  \
0             11241.40    23954.0      6046.25           2398.0   
1             22107.77    28268.0     10108.13          25148.5   
2             46955.43    24246.0      9177.75          29365.0   
3             51124.18    29885.0      6717.25          32150.8   
4             49146.08    24449.0      9527.25          30793.9   

 

# Preprocesamiento básico
Vimos que hay columnas con valores negativos o iguales a cero que no deberían tenerlos. Por eso vamos a reemplazar esos valores por NaN, para poder imputarlos más adelante.

In [None]:
# Display the column labels of the loaded DataFrame.
df.columns

Index(['DIA', 'Servicios (Kw)', 'KW Gral Planta', 'Planta (Kw)',
       'Agua Planta (Hl)', 'Planta de agua (Hl)', 'Frio (Kw)', 'KW Trafo 10',
       'Produccion (Hl)', 'Totalizador Sistema Kaeser', 'Aire Producido (M3)',
       'Kw Compresores Aire', 'Hl de Mosto', 'Aire (Kw)', 'Elaboracion (Kw)',
       'Aire Planta (M3)', 'Temp Tq Intermedio', 'Pta Agua / Eflu (Kw)',
       'KW Secador Kaeser', 'Bodega (Kw)', 'KW Trafo 11', 'KW Trafo 9',
       'Efluentes (Kw)', 'KW Trafo 5', 'Agua Cond REC', 'KW Enfluente Efl',
       'KW Obrador Contratistas', 'Hl Producido Bodega', 'KW Cond 5. 6 y 9',
       'KW Planta de Agua', 'Tot L3. L4 y Planta de CO2', 'Anio', 'Mes', 'Dia',
       'Dia_semana', 'Temperatura_amb', 'Tarifa_electrica', 'estacion'],
      dtype='object')

In [None]:
import numpy as np

def reemplazar_ceros_y_negativos_excepto(df, columnas_excluidas):
    """Return a copy of ``df`` where non-positive numeric values become NaN.

    Every numeric column whose name is not in ``columnas_excluidas`` is
    scanned, and each value ``<= 0`` is replaced by NaN. Non-numeric columns,
    excluded columns and existing NaN values are left untouched. The input
    frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.
    columnas_excluidas : collection of str
        Names of numeric columns that must keep their zero/negative values.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy of ``df``. Integer columns that receive at least one NaN
        come back as float64.
    """
    df_modificado = df.copy()

    # Numeric columns that were not explicitly excluded by the caller.
    columnas_a_modificar = [
        col for col in df_modificado.select_dtypes(include=[np.number]).columns
        if col not in columnas_excluidas
    ]

    for col in columnas_a_modificar:
        serie = df_modificado[col]
        # `mask` builds a new Series and upcasts integer columns to float when
        # a NaN has to be stored. Writing NaN in place through `.loc` into an
        # integer column is a silent upcast that pandas has deprecated.
        df_modificado[col] = serie.mask(serie <= 0)

    return df_modificado


# Clean every numeric column except the tariff and ambient-temperature ones
# (presumably zero or negative values are legitimate there; confirm with the
# data owner).
df_limpiado = reemplazar_ceros_y_negativos_excepto(
    df,
    columnas_excluidas=["Tarifa_electrica", "Temperatura_amb"],
)

# Summarise dtypes and non-null counts of the cleaned frame.
df_limpiado.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1184 entries, 0 to 1183
Data columns (total 38 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   DIA                         1184 non-null   object 
 1   Servicios (Kw)              1184 non-null   float64
 2   KW Gral Planta              1184 non-null   float64
 3   Planta (Kw)                 1184 non-null   float64
 4   Agua Planta (Hl)            1184 non-null   float64
 5   Planta de agua (Hl)         1184 non-null   float64
 6   Frio (Kw)                   1184 non-null   float64
 7   KW Trafo 10                 1184 non-null   float64
 8   Produccion (Hl)             1184 non-null   float64
 9   Totalizador Sistema Kaeser  1184 non-null   float64
 10  Aire Producido (M3)         1184 non-null   float64
 11  Kw Compresores Aire         1184 non-null   float64
 12  Hl de Mosto                 1049 non-null   float64
 13  Aire (Kw)                   1184 

In [None]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PowerTransformer
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy import stats

# ============================================
# 1️⃣ Custom Transformer: eliminar outliers por Z-score
# ============================================

class ZScoreOutlierRemover(BaseEstimator, TransformerMixin):
    """Replace values whose |z-score| exceeds ``threshold`` with NaN.

    The per-column mean and standard deviation are learned in ``fit`` and
    reused in ``transform``. Held-out data is therefore scored against the
    training statistics rather than against the statistics of whatever batch
    happens to be passed to ``transform``; a batch with a single row has zero
    spread and cannot be scored on its own. On the data used for fitting, the
    z-scores are the usual population ones (``ddof=0``) with NaN values
    ignored.

    Parameters
    ----------
    threshold : float, default=3.0
        Absolute z-score above which a value is replaced by NaN.

    Attributes
    ----------
    mean_ : ndarray of shape (n_features,)
        Column means seen during ``fit`` (NaN values ignored).
    scale_ : ndarray of shape (n_features,)
        Column standard deviations seen during ``fit``. Constant columns are
        stored as NaN so that they never flag outliers.
    """

    def __init__(self, threshold=3.0):
        self.threshold = threshold

    @staticmethod
    def _to_float_2d(X):
        """Return a fresh 2-D float copy of ``X`` (a 1-D input becomes one column)."""
        values = np.array(X, dtype=float)
        if values.ndim == 1:
            values = values.reshape(-1, 1)
        return values

    def fit(self, X, y=None):
        """Learn the per-column mean and standard deviation, ignoring NaN."""
        values = self._to_float_2d(X)
        self.mean_ = np.nanmean(values, axis=0)
        scale = np.nanstd(values, axis=0)
        # A zero spread would turn any deviation into an infinite z-score;
        # NaN makes the comparison below False for that column instead.
        scale[scale == 0] = np.nan
        self.scale_ = scale
        return self

    def transform(self, X):
        """Return a float array copy of ``X`` with outliers replaced by NaN."""
        values = self._to_float_2d(X)
        with np.errstate(invalid="ignore"):
            z_scores = np.abs((values - self.mean_) / self.scale_)
            # Comparisons against NaN are False, so missing values and
            # constant columns are left as they are.
            values[z_scores > self.threshold] = np.nan
        return values

# ============================================
# 2️⃣ Custom Transformer: codificación seno/coseno de fechas
# ============================================

class DateSinCosEncoder(BaseEstimator, TransformerMixin):
    """Encode a date column as the sine and cosine of its day of the year.

    Accepts a DataFrame (its first column is used), a Series, an ndarray or
    any other sequence of date-like values. For 2-D array input the first
    column is used, mirroring the DataFrame case.

    ``transform`` returns an array of shape (n_samples, 2): column 0 is the
    sine and column 1 the cosine of ``2 * pi * day_of_year / 365``. Values
    that cannot be parsed as dates are treated as day 0, i.e. (sin, cos) =
    (0, 1).
    """

    def fit(self, X, y=None):
        """Nothing is learned; present for scikit-learn API compatibility."""
        return self

    def transform(self, X):
        """Return the (sin, cos) day-of-year encoding of the dates in ``X``."""
        if isinstance(X, pd.DataFrame):
            column = X.iloc[:, 0]  # use the first column
        elif isinstance(X, pd.Series):
            column = X
        else:
            # ndarray, list, Index, ...: wrap in a Series so that the `.dt`
            # accessor is available after parsing.
            values = np.asarray(X)
            if values.ndim > 1:
                values = values[:, 0]
            column = pd.Series(values)

        dates = pd.to_datetime(column, errors='coerce')
        # Unparseable entries become NaT and are mapped to day 0.
        day_of_year = dates.dt.dayofyear.fillna(0)

        # The period is fixed at 365 days, so day 366 of a leap year lands
        # one step past a full turn.
        sin_day = np.sin(2 * np.pi * day_of_year / 365)
        cos_day = np.cos(2 * np.pi * day_of_year / 365)

        return np.column_stack((sin_day, cos_day))

# ============================================
# 3️⃣ Preparar los datos
# ============================================

target = "Frio (Kw)"
categorical_features = ["Dia_semana", "estacion"]
date_feature = "DIA"

# The cleaning step replaces non-positive readings with NaN in every numeric
# column that is not explicitly excluded, and the target is not excluded.
# The regressor cannot be fitted on a NaN target, so rows without a target
# value are dropped here. When the target has no missing values this selects
# every row.
filas_con_target = df_limpiado[target].notna()
n_descartadas = int((~filas_con_target).sum())
if n_descartadas:
    print(f"⚠️ Se descartan {n_descartadas} filas sin valor en '{target}'")

X = df_limpiado.loc[filas_con_target].drop(columns=[target])  # the date column is kept
y = df_limpiado.loc[filas_con_target, target]

# Numeric predictors: every numeric column except the categorical and date ones.
numeric_features = (
    X.select_dtypes(include=[np.number])
    .columns.difference(categorical_features + [date_feature])
    .tolist()
)

# ============================================
# 4️⃣ Pipeline numérico
# ============================================

# Steps run in order: outliers become NaN, missing values are imputed from
# the 50 nearest rows, then Yeo-Johnson and standard scaling are applied.
num_pipeline = Pipeline([
    ("outlier_remover", ZScoreOutlierRemover(threshold=5.0)),
    ("imputer", KNNImputer(n_neighbors=50)),
    ("yeojohnson", PowerTransformer(method='yeo-johnson')),
    ("scaler", StandardScaler()),
])

# ============================================
# 5️⃣ Categorical pipeline
# ============================================

# Dense one-hot encoding; categories unseen during fit are ignored at
# transform time.
cat_pipeline = Pipeline([
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# ============================================
# 6️⃣ Date pipeline (sine/cosine)
# ============================================

date_pipeline = Pipeline([("date_encoder", DateSinCosEncoder())])

# ============================================
# 7️⃣ Combined ColumnTransformer
# ============================================

# Each sub-pipeline receives only its own columns.
preprocessor = ColumnTransformer([
    ("num", num_pipeline, numeric_features),
    ("cat", cat_pipeline, categorical_features),
    ("date", date_pipeline, [date_feature]),
])

# ============================================
# 8️⃣ Full pipeline with the model
# ============================================

full_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(n_estimators=200, random_state=42)),
])

# ============================================
# 9️⃣ Entrenar y evaluar el modelo
# ============================================

# Hold out 20% of the rows for evaluation (random split, fixed seed).
# NOTE(review): the rows look like a daily series; confirm that a shuffled
# split is the intended evaluation scheme.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the full pipeline on the training split and predict the held-out split.
y_pred = full_pipeline.fit(X_train, y_train).predict(X_test)

# --- Metrics ---
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# One report, one line per metric.
print(
    "✅ Transformación y entrenamiento completados.",
    f"MAE  : {mae:.4f}",
    f"MSE  : {mse:.4f}",
    f"RMSE : {rmse:.4f}",
    f"R²   : {r2:.4f}",
    sep="\n",
)


✅ Transformación y entrenamiento completados.
MAE  : 58333.4826
MSE  : 156015156971.0107
RMSE : 394987.5403
R²   : -0.1924


