In [2]:
import pandas as pd
import numpy as np

# 📥 Cargar el dataset local
df = pd.read_csv("datasetwg.csv")

# Asegurarse de que 'periodo' esté en formato de período mensual
df['periodo'] = pd.to_datetime(df['periodo']).dt.to_period('M')

# Ordenar para asegurar consistencia
df = df.sort_values(['product_id', 'periodo']).reset_index(drop=True)

# Agrupar por producto
group = df.groupby('product_id')

def generate_feats(g):
    g = g.sort_values('periodo').copy()
    g['tn'] = g['tn'].fillna(0)

    for lag in [5, 8, 9, 10]:
        g[f'tn_lag_{lag}'] = g['tn'].shift(lag)
        g[f'delta_tn_{lag}'] = g['tn'] - g[f'tn_lag_{lag}']
        g[f'cat_delta_tn_lag_{lag}'] = pd.cut(
            g[f'delta_tn_{lag}'], bins=[-np.inf, -5, 0, 5, np.inf],
            labels=['baja', 'negativo', 'positivo', 'alta']
        )

    for window in [3, 6, 12]:
        g[f'delta1_media_movil_{window}'] = g['tn'].diff().rolling(window).mean()
        g[f'tn_std_movil_{int(window/2)}'] = g['tn'].rolling(int(window/2)).std()
        g[f'tn_min_movil_{window}'] = g['tn'].rolling(window).min()

    g['avg_tn'] = g['tn'].expanding().mean()
    g['total_total_tn_lag_9'] = g['tn'].shift(9).expanding().sum()
    g['total_total_tn_diff_4'] = g['tn'].diff(4).fillna(0)

    return g

# Aplicar ingeniería de features
df_feat = group.apply(generate_feats).reset_index(drop=True)

# Guardar el resultado
df_feat.to_csv("datasetwg_features.csv", index=False)
print("✅ Dataset enriquecido guardado como 'datasetwg_features.csv'")


ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [3]:
df1= pd.read_csv("datasetwg_features.csv")

In [4]:
df1

Unnamed: 0,customer_id,product_id,periodo,plan_precios_cuidados,cust_request_qty,cust_request_tn,tn,stock_final,cat1,cat2,...,tn_min_movil_3,delta1_media_movil_6,tn_std_movil_3,tn_min_movil_6,delta1_media_movil_12,tn_std_movil_6,tn_min_movil_12,avg_tn,total_total_tn_lag_9,total_total_tn_diff_4
0,10004,20001,2017-01,0.0,9.0,184.72927,184.72927,0.00000,HC,ROPA LAVADO,...,,,,,,,,184.729270,,0.00000
1,10003,20001,2017-01,0.0,17.0,143.49426,143.49426,0.00000,HC,ROPA LAVADO,...,,,,,,,,164.111765,,0.00000
2,10002,20001,2017-01,0.0,17.0,38.68301,35.72806,0.00000,HC,ROPA LAVADO,...,35.72806,,76.936381,,,,,121.317197,,0.00000
3,10001,20001,2017-01,0.0,11.0,99.43861,99.43861,0.00000,HC,ROPA LAVADO,...,35.72806,,54.181006,,,,,115.847550,,0.00000
4,10084,20001,2017-01,0.0,1.0,0.22386,0.22386,0.00000,HC,ROPA LAVADO,...,0.22386,,50.271180,,,,,92.722812,,-184.50541
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9460975,10159,21276,2019-12,0.0,0.0,0.00000,0.00000,1.05592,PC,PIEL1,...,0.00000,-1.156482e-18,0.000433,0.0,3.614007e-20,0.000387,0.0,0.000544,0.44555,0.00000
9460976,10101,21276,2019-12,0.0,0.0,0.00000,0.00000,1.05592,PC,PIEL1,...,0.00000,-1.156482e-18,0.000000,0.0,3.614007e-20,0.000387,0.0,0.000544,0.44703,-0.00075
9460977,10154,21276,2019-12,0.0,0.0,0.00000,0.00000,1.05592,PC,PIEL1,...,0.00000,-1.156482e-18,0.000000,0.0,3.614007e-20,0.000387,0.0,0.000543,0.44703,-0.00075
9460978,10052,21276,2019-12,0.0,1.0,0.00594,0.00594,1.05592,PC,PIEL1,...,0.00000,8.650000e-04,0.003429,0.0,4.950000e-04,0.002383,0.0,0.000550,0.44703,0.00594


In [5]:
print(df1.isna().sum())


customer_id                 0
product_id                  0
periodo                     0
plan_precios_cuidados       0
cust_request_qty            0
                         ... 
tn_std_movil_6           3900
tn_min_movil_12          8580
avg_tn                      0
total_total_tn_lag_9     7020
total_total_tn_diff_4       0
Length: 66, dtype: int64


In [7]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9460980 entries, 0 to 9460979
Data columns (total 66 columns):
 #   Column                                        Dtype  
---  ------                                        -----  
 0   customer_id                                   int64  
 1   product_id                                    int64  
 2   periodo                                       object 
 3   plan_precios_cuidados                         float64
 4   cust_request_qty                              float64
 5   cust_request_tn                               float64
 6   tn                                            float64
 7   stock_final                                   float64
 8   cat1                                          object 
 9   cat2                                          object 
 10  cat3                                          object 
 11  brand                                         object 
 12  sku_size                                      int64  
 1

In [9]:
columnas_originales = [
    'customer_id', 'product_id', 'periodo', 'plan_precios_cuidados',
    'cust_request_qty', 'cust_request_tn', 'tn', 'stock_final',
    'cat1', 'cat2', 'cat3', 'brand', 'sku_size'
]

df_reducido = df1[columnas_originales].copy()
df_reducido.to_csv("dataset_base.csv", index=False)


In [10]:
df2= pd.read_csv("dataset_base.csv")

In [11]:
df2

Unnamed: 0,customer_id,product_id,periodo,plan_precios_cuidados,cust_request_qty,cust_request_tn,tn,stock_final,cat1,cat2,cat3,brand,sku_size
0,10004,20001,2017-01,0.0,9.0,184.72927,184.72927,0.00000,HC,ROPA LAVADO,Liquido,ARIEL,3000
1,10003,20001,2017-01,0.0,17.0,143.49426,143.49426,0.00000,HC,ROPA LAVADO,Liquido,ARIEL,3000
2,10002,20001,2017-01,0.0,17.0,38.68301,35.72806,0.00000,HC,ROPA LAVADO,Liquido,ARIEL,3000
3,10001,20001,2017-01,0.0,11.0,99.43861,99.43861,0.00000,HC,ROPA LAVADO,Liquido,ARIEL,3000
4,10084,20001,2017-01,0.0,1.0,0.22386,0.22386,0.00000,HC,ROPA LAVADO,Liquido,ARIEL,3000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9460975,10159,21276,2019-12,0.0,0.0,0.00000,0.00000,1.05592,PC,PIEL1,Cara,NIVEA,140
9460976,10101,21276,2019-12,0.0,0.0,0.00000,0.00000,1.05592,PC,PIEL1,Cara,NIVEA,140
9460977,10154,21276,2019-12,0.0,0.0,0.00000,0.00000,1.05592,PC,PIEL1,Cara,NIVEA,140
9460978,10052,21276,2019-12,0.0,1.0,0.00594,0.00594,1.05592,PC,PIEL1,Cara,NIVEA,140
