# PREPROCESAMIENTO DE DATOS
1. Lectura y Formateo
2. Selección
3. Preparación

In [1]:
# Expose the parent of the current working directory on sys.path so that
# project-local packages (e.g. `helpers`) can be imported by later cells.
import sys
from pathlib import Path

# strict=True makes resolve() raise if the working directory does not exist.
BASE_DIR = Path.cwd().resolve(strict=True).parent
base_dir_str = str(BASE_DIR)
# Append only once so that re-running the cell does not duplicate the entry.
if base_dir_str not in sys.path:
    sys.path.append(base_dir_str)

In [2]:
# Directory that holds the input CSV and receives the exported producto.csv.
# The path is built with pathlib (BASE_DIR is already a Path) because `os` is
# not imported by any preceding cell, so os.path.join would raise NameError on
# a fresh kernel. It is converted to str so that DATA_PATH keeps the same type
# and value that os.path.join would have produced.
DATA_PATH = str(BASE_DIR / 'data' / 'autos')
print(DATA_PATH)

/workspace/Externos/TimeSeries/BenchMark/data/autos


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
from datetime import datetime, timedelta
from sklearn.preprocessing import StandardScaler
import plotly.express as px
from sklearn.impute import SimpleImputer
from matplotlib.pyplot import figure
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
import re 

In [4]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn import linear_model
from sklearn import svm
from sklearn.model_selection import cross_val_score
from statsmodels.tsa.stattools import adfuller

In [5]:
from helpers.general import homogenizar_str, print_linea_de_tiempo_producto
from datetime import datetime, timedelta
import datetime as dt

># 1. Lectura y Formateo de datos

In [6]:
# Load the sales file, reading only the four columns used by this notebook.
# Three of the requested header names start with a space; presumably this
# mirrors the raw CSV header exactly, so do not strip them here.
columnas_csv = [' noPeriodo', ' idArticulo', 'CnSalida 2018', ' caDescProduct']
df = pd.read_csv(
    f"{DATA_PATH}/Data_Repuestos_2018_2021_v2.csv",
    usecols=columnas_csv,
)
df.head()

Unnamed: 0,noPeriodo,idArticulo,CnSalida 2018,caDescProduct
0,20180100,25319,1.0,FOCO H11-12 100-90W CURVO
1,20180100,4923,1.0,FILTRO AIRE HYUNDAI TUCSON
2,20180100,25853,1.0,FILTRO GASOLINA NISSAN METAL GA15-16 INJECTADO
3,20180100,13497,1.0,JGO CABLES BUJIA KIA RIO LARGO
4,20180100,23900,4.0,BUJIA NGK


In [7]:
# Replace the raw CSV headers with the short working names used from here on.
nombres_columnas = {
    ' noPeriodo': 'Periodo',
    ' idArticulo': 'producto',
    'CnSalida 2018': 'ventas',
    ' caDescProduct': 'DescProducto',
}
df = df.rename(columns=nombres_columnas)
df.head()

Unnamed: 0,Periodo,producto,ventas,DescProducto
0,20180100,25319,1.0,FOCO H11-12 100-90W CURVO
1,20180100,4923,1.0,FILTRO AIRE HYUNDAI TUCSON
2,20180100,25853,1.0,FILTRO GASOLINA NISSAN METAL GA15-16 INJECTADO
3,20180100,13497,1.0,JGO CABLES BUJIA KIA RIO LARGO
4,20180100,23900,4.0,BUJIA NGK


### Formatting datetime

In [8]:
# Cast the product ids to str.
df['producto'] = df['producto'].astype(str)


def periodo_a_fecha(periodo):
    """Parse a period code written as YYYYMM00 (e.g. 20180100) into a datetime.date.

    No day is parsed from the code, so strptime falls back to day 1 of the month.
    """
    return datetime.strptime(str(periodo), "%Y%m00").date()


# Convert every period code into a date object.
df['Periodo'] = df['Periodo'].map(periodo_a_fecha)
df.head()

Unnamed: 0,Periodo,producto,ventas,DescProducto
0,2018-01-01,25319,1.0,FOCO H11-12 100-90W CURVO
1,2018-01-01,4923,1.0,FILTRO AIRE HYUNDAI TUCSON
2,2018-01-01,25853,1.0,FILTRO GASOLINA NISSAN METAL GA15-16 INJECTADO
3,2018-01-01,13497,1.0,JGO CABLES BUJIA KIA RIO LARGO
4,2018-01-01,23900,4.0,BUJIA NGK


In [9]:
# Disabled step: would keep the original descriptions in "DescProductoOrig"
# and pass "DescProducto" through homogenizar_str.
# df["DescProductoOrig"] = df.loc[:,"DescProducto"]
# df["DescProducto"] = df["DescProducto"].apply(homogenizar_str)
# df.head()

In [10]:
# Data health check:
### look for null values
### inspect the dtype of every column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118009 entries, 0 to 118008
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Periodo       118009 non-null  object 
 1   producto      118009 non-null  object 
 2   ventas        118009 non-null  float64
 3   DescProducto  118006 non-null  object 
dtypes: float64(1), object(3)
memory usage: 3.6+ MB


In [11]:
# Keep only the rows whose sales are strictly positive.
ventas_positivas = df['ventas'] > 0
df = df.loc[ventas_positivas]
df.describe()

Unnamed: 0,ventas
count,114131.0
mean,5.307321
std,82.695333
min,0.3
25%,1.0
50%,1.0
75%,2.0
max,10000.0


## Group by products and months
The grouping step sums all the sales for each month and product.

In [12]:
# Aggregate per (period, product): sales are summed and the group's 'first'
# description is kept. The result is ordered by product and period, both
# descending; 'producto' holds strings, so that ordering is lexicographic.
dfagg = (
    df.groupby(['Periodo', 'producto'])
    .agg({'ventas': 'sum', 'DescProducto': 'first'})
    .reset_index()
    .sort_values(by=['producto', 'Periodo'], ascending=False)
)
dfagg.head()

Unnamed: 0,Periodo,producto,ventas,DescProducto
71991,2021-10-01,9996,1.0,THERMOSTATO TOYOTA COROLLA 1C/2C/2Y/3Y/4Y/3A/...
60384,2021-03-01,9996,1.0,THERMOSTATO TOYOTA COROLLA 1C/2C/2Y/3Y/4Y/3A/5...
54013,2020-11-01,9996,1.0,THERMOSTATO TOYOTA COROLLA 1C/2C/2Y/3Y/4Y/3A/5...
47088,2020-07-01,9996,4.0,THERMOSTATO TOYOTA COROLLA 1C/2C/2Y/3Y/4Y/3A/5...
42002,2020-02-01,9996,1.0,THERMOSTATO TOYOTA COROLLA 1C/2C/2Y/3Y/4Y/3A/5...


In [13]:
# Collect the products that have at least one record on or after 2021-01-01,
# so that forecasts can later be produced for them.
inicio_2021 = dt.date(year=2021, month=1, day=1)
desde_2021 = dfagg['Periodo'] >= inicio_2021
products21 = dfagg.loc[desde_2021, 'producto'].unique().tolist()

># 2. Selección de datos

## Productos y su presencia en el tiempo

- Cada año tiene 12 meses; el histórico cubre de enero de 2018 a octubre de 2021, es decir, 46 meses.
- Tomaremos aquellos productos con más de 40 meses de presencia en la línea de tiempo y con ventas registradas en 2021.
- Detectamos que tenemos 50 artículos únicos que cumplen todos los requisitos.

In [14]:
# Presence threshold in months: the selection filter keeps the products whose
# number of monthly records is strictly greater than this value.
min_meses = 40

In [15]:
# Count, for every product present in 2021, how many monthly records it has,
# then keep the products whose count is strictly above `min_meses`.
en_2021 = dfagg['producto'].isin(products21)
conteo_meses = (
    dfagg.loc[en_2021, ['producto']]
    .assign(count=1)  # helper column so that groupby().count() has something to count
    .groupby('producto')
    .count()
    .sort_values('count', ascending=False)
    .reset_index()
)
temp = conteo_meses.query(f'count > {min_meses}')
value_count = temp['producto'].nunique()

print(f"Existen {value_count} producto que cuentan con una presencia mayor a {min_meses} meses")

Existen 50 producto que cuentan con una presencia mayor a 40 meses


In [16]:
# Restrict the aggregated sales to the selected products and expose the
# product id under the name 'idArticulo'.
lista_articulos = temp['producto'].unique().tolist()
seleccionados = dfagg['producto'].isin(lista_articulos)
df_clean = (
    dfagg.loc[seleccionados]
    .reset_index(drop=True)
    .sort_values(by=['producto', 'Periodo'], ascending=False)
    .rename(columns={'producto': 'idArticulo'})
)
df_clean.head()

Unnamed: 0,Periodo,idArticulo,ventas,DescProducto
0,2021-10-01,9950,74.0,REFRIGERANTE COOLANT AZUL/NATURAL TRANSP. GAL...
1,2021-09-01,9950,25.0,REFRIGERANTE COOLANT AZUL/NATURAL TRANSP. GAL...
2,2021-08-01,9950,28.0,REFRIGERANTE COOLANT AZUL/NATURAL TRANSP. GAL...
3,2021-07-01,9950,15.0,REFRIGERANTE COOLANT AZUL/NATURAL TRANSP. GAL...
4,2021-06-01,9950,13.0,REFRIGERANTE COOLANT AZUL/NATURAL TRANSP. GAL...


In [17]:
# Write the filtered monthly sales to producto.csv under DATA_PATH, without the row index.
df_clean.to_csv(f'{DATA_PATH}/producto.csv', index=False)

># 3. Preparación

### Crear series de tiempo

In [18]:
# Build one time series per product: rows are periods, columns are product ids,
# values are the summed sales.
df_time_pre = df_clean.pivot_table(
    values='ventas',
    index='Periodo',
    columns='idArticulo',
    aggfunc='sum',
)

# Months in which a product has no record come out of the pivot as NaN;
# the constant imputer replaces them with 0.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
df_time = pd.DataFrame(
    imp_mean.fit_transform(df_time_pre),
    index=df_time_pre.index,
    columns=df_time_pre.columns,
)
df_time.tail()

idArticulo,10702,10927,10960,10964,11395,11397,11509,11523,11807,1208,...,5389,551,5579,594,7798,7799,7801,963,9776,9950
Periodo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-06-01,9.0,3.0,12.0,18.0,216.0,12.0,1.0,3.0,7.0,140.0,...,5.0,4.0,3.0,3.0,13.0,6.0,4.0,6.0,291.0,13.0
2021-07-01,13.0,14.0,21.0,19.0,99.0,12.0,13.0,2.0,4.0,166.0,...,2.0,2.0,13.0,1.0,1.0,1.0,3.0,14.0,571.0,15.0
2021-08-01,5.0,13.0,36.0,9.0,154.0,18.0,5.0,0.0,19.0,520.0,...,6.0,8.0,6.0,4.0,4.0,5.0,1.0,2.0,1701.0,28.0
2021-09-01,2.0,12.0,36.0,34.0,76.0,12.0,4.0,2.0,4.0,280.0,...,0.0,3.0,1.0,10.0,5.0,2.0,6.0,4.0,550.0,25.0
2021-10-01,4.0,36.0,33.0,36.0,123.0,8.0,5.0,1.0,7.0,510.0,...,2.0,3.0,0.0,4.0,14.0,5.0,0.0,3.0,331.0,74.0


### Graficando los productos en la línea de tiempo

In [22]:
# Standardize every product column with StandardScaler and wrap the result
# back into a DataFrame that keeps the original index and column labels.
scaler = StandardScaler()
scaler.fit(df_time)
df_time_scaled = pd.DataFrame(
    scaler.transform(df_time),
    index=df_time.index,
    columns=df_time.columns,
)
df_time_scaled.head()
# Disabled plot of the scaled series:
#print_linea_de_tiempo_producto(df_time_scaled , 'idArticulo', height=3000)

idArticulo,10702,10927,10960,10964,11395,11397,11509,11523,11807,1208,...,5389,551,5579,594,7798,7799,7801,963,9776,9950
Periodo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-01,-0.703284,0.669506,0.130836,-0.055601,0.85066,-0.75641,-0.801352,-0.457587,2.094909,-0.565514,...,0.823642,-0.595422,0.65565,-0.200967,-0.852955,-0.100517,-0.546683,0.456188,-0.153722,0.738177
2018-02-01,2.998212,-0.256302,-0.582464,-0.636884,0.323987,0.240102,-0.801352,-0.203985,-0.372679,-0.44865,...,-0.106113,-0.097433,-0.254347,0.261257,-0.359421,-1.025277,-0.313837,-0.173352,-0.4414,2.241021
2018-03-01,-0.259105,-0.350772,-0.136651,0.176912,2.815005,2.759062,0.801352,-0.457587,-0.502552,-0.404826,...,0.591203,1.023042,-0.124347,0.261257,0.627646,0.824242,2.946014,-0.802892,0.914795,-0.642815
2018-04-01,-0.481195,1.463054,0.665811,0.409425,-0.259623,-0.75641,-0.114479,0.30322,-0.632425,0.272013,...,-0.106113,0.525054,-0.124347,1.185705,-0.112654,0.207736,-0.77953,0.246342,-0.443968,1.063117
2018-05-01,-0.555225,0.008215,-0.760789,-1.218166,-0.387732,0.710676,1.259268,2.332039,-0.632425,-0.656409,...,2.683153,4.010976,3.51564,-1.125415,-0.606188,-1.025277,0.850396,-0.593045,-0.125468,1.063117
