In [10]:
import pandas as pd
import numpy as np
import copy
from datetime import datetime
import pickle 
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, RepeatedStratifiedKFold, cross_val_score
from sklearn.metrics import mean_squared_error, roc_auc_score
import ubjson


from sklearn.model_selection import GridSearchCV
import gzip


import lightgbm as lgb

# Let wide and long previews render in full: show up to 200 columns and 200 rows.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

In [11]:
# Input CSV files, addressed relative to the notebook's working directory.
fp_active_promos = '../data/active_promos.csv'
fp_clients = '../data/clients_attributes.csv'
fp_executed_promos = '../data/executed_promos.csv'
fp_sales = '../data/sales.csv'
fp_test = '../data/test.csv'

# All five files are read with the same encoding and delimiter.
csv_opts = {'encoding': 'ISO-8859-1', 'sep': ','}

df_active_promos = pd.read_csv(fp_active_promos, **csv_opts)
df_clients = pd.read_csv(fp_clients, **csv_opts)
df_executed_promos = pd.read_csv(fp_executed_promos, **csv_opts)
df_sales = pd.read_csv(fp_sales, **csv_opts)
df_test = pd.read_csv(fp_test, **csv_opts)

In [12]:
def extra_head(df, name):
    """Print a short labelled preview of a table.

    Writes `name`, then the number of rows in `df`, then shows the first
    row of `df` through the notebook's `display`, and finally prints an
    80-character '#' separator line.

    Args:
        df: Table to preview; must support `len()` and `.head()`.
        name: Label printed above the preview.
    """
    print(name, len(df), sep="\n")
    display(df.head(1))
    print("#" * 80)
# Preview every loaded table, in the order they were read.
for frame, label in [
    (df_active_promos, "Active promos"),
    (df_clients, "Clients"),
    (df_executed_promos, "Executed promos"),
    (df_sales, "Sales"),
    (df_test, "Test"),
]:
    extra_head(frame, label)

Active promos
422437


Unnamed: 0,CodigoDC,Marca,Cupo,Fecha_Desde,Fecha_Hasta,Cliente
0,297601,29,9,2018-08-06,2018-08-30,3213


################################################################################
Clients
15069


Unnamed: 0,Cliente,FechaAltaCliente,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF
0,1,2009-03-03,4,21,12,1,1,1


################################################################################
Executed promos
27794


Unnamed: 0,CodigoDC,Cliente,Marca,Cupo
0,297601,8410,29,9


################################################################################
Sales
1894252


Unnamed: 0,Cliente,Año,Mes,ClaseEnvase,SegmentoPrecio,Marca,Cupo,Nr,Hl,Dcto
0,1,2018,1,2,2.0,44,10.0,6352.94,0.03,0.0


################################################################################
Test
16870


Unnamed: 0,Cliente,Marca,Cupo,Ejecuto_Promo
0,10,40,16,


################################################################################


In [13]:
# Peek at the first sales row.
df_sales.iloc[:1]

Unnamed: 0,Cliente,Año,Mes,ClaseEnvase,SegmentoPrecio,Marca,Cupo,Nr,Hl,Dcto
0,1,2018,1,2,2.0,44,10.0,6352.94,0.03,0.0


In [14]:
# Number of missing values in each sales column.
df_sales.isnull().sum()

Cliente           0
Año               0
Mes               0
ClaseEnvase       0
SegmentoPrecio    5
Marca             0
Cupo              5
Nr                0
Hl                0
Dcto              0
dtype: int64

In [15]:
# Discard every row that has a missing value in any column (the null summary
# above reports gaps only in SegmentoPrecio and Cupo).
df_sales = df_sales.dropna(axis=0, how='any')

In [16]:
# Convert year and month to strings, left-padding the month to two characters ("1" -> "01").
df_sales = df_sales.assign(**{
    'Año': df_sales['Año'].astype(str),
    'Mes': df_sales['Mes'].astype(str).str.zfill(2),
})
df_sales.head(1)

Unnamed: 0,Cliente,Año,Mes,ClaseEnvase,SegmentoPrecio,Marca,Cupo,Nr,Hl,Dcto
0,1,2018,1,2,2.0,44,10.0,6352.94,0.03,0.0


In [18]:
# Combine year and month into a first-of-month timestamp ('periodo') and a
# monthly Period ('month_year').
# The date strings are built with vectorised concatenation instead of a
# Python-level loop over every row; astype(str) keeps the cell working whether
# or not the year/month columns were already converted to strings.
date_text = df_sales['Año'].astype(str) + '-' + df_sales['Mes'].astype(str) + '-01'
df_sales['periodo'] = pd.to_datetime(date_text)
df_sales['month_year'] = df_sales['periodo'].dt.to_period('M')
df_sales.head(5)

Unnamed: 0,Cliente,Año,Mes,ClaseEnvase,SegmentoPrecio,Marca,Cupo,Nr,Hl,Dcto,periodo,month_year
0,1,2018,1,2,2.0,44,10.0,6352.94,0.03,0.0,2018-01-01,2018-01
1,1,2018,1,2,4.0,18,16.0,99130.5,0.45,0.0,2018-01-01,2018-01
2,1,2018,6,2,1.0,2,6.0,11605.4,0.07,-2880.0,2018-06-01,2018-06
3,1,2018,7,1,2.0,37,34.0,1944.01,0.01,0.0,2018-07-01,2018-07
4,1,2018,8,2,1.0,2,16.0,16250.18,0.06,0.0,2018-08-01,2018-08


In [21]:
# Keep only the product attributes, the three numeric measures and the month key.
mini_columns = ['ClaseEnvase', 'SegmentoPrecio', 'Marca', 'Cupo', 'Nr', 'Hl', 'Dcto', 'month_year']
df_sales_mini = df_sales.loc[:, mini_columns]
df_sales_mini.head(1)

Unnamed: 0,ClaseEnvase,SegmentoPrecio,Marca,Cupo,Nr,Hl,Dcto,month_year
0,2,2.0,44,10.0,6352.94,0.03,0.0,2018-01


In [22]:
# Inspect the column dtypes of the reduced sales frame before aggregating.
df_sales_mini.dtypes

ClaseEnvase           int64
SegmentoPrecio      float64
Marca                 int64
Cupo                float64
Nr                  float64
Hl                  float64
Dcto                float64
month_year        period[M]
dtype: object

In [23]:
# Distinct ClaseEnvase codes present in the sales rows.
df_sales_mini.ClaseEnvase.unique()

array([2, 1])

In [24]:
# Distinct SegmentoPrecio codes present in the sales rows.
df_sales_mini.SegmentoPrecio.unique()

array([2., 4., 1., 3.])

# PRIMER ACUMULADO

In [58]:
# Sum and mean of the Nr, Hl and Dcto columns for every (Marca, Cupo, month_year) group.
# The aggregations are named by string alias rather than np.sum / np.mean:
# recent pandas versions emit a FutureWarning when NumPy callables are passed
# to agg(). The resulting statistic labels ('sum', 'mean') are unchanged.
df_enr_1 = df_sales_mini.groupby(['Marca', 'Cupo', 'month_year']).agg({
    'Nr': ['sum', 'mean'],
    'Hl': ['sum', 'mean'],
    'Dcto': ['sum', 'mean'],
})
# Flatten the two-level (column, statistic) header into names such as 'Nr_MCsum'.
df_enr_1.columns = [f'{column}_MC{stat}' for column, stat in df_enr_1.columns]
df_enr_1 = df_enr_1.reset_index()
df_enr_1.head(1)

Unnamed: 0,Marca,Cupo,month_year,Nr_MCsum,Nr_MCmean,Hl_MCsum,Hl_MCmean,Dcto_MCsum,Dcto_MCmean
0,1,30.0,2019-08,176028.75,3826.711957,1.51,0.032826,-57971.25,-1260.244565


In [60]:
import pickle

def store_pickle(fp, obj):
    """Serialize `obj` with pickle and write it to `fp`.

    Missing parent directories of `fp` are created first, so saving into an
    output folder that does not exist yet works instead of raising
    FileNotFoundError.

    Args:
        fp: Destination path (str, bytes or os.PathLike). An integer file
            descriptor is also accepted, as with the built-in `open()`.
        obj: Object to pickle.

    Raises:
        OSError: If the directory or the file cannot be created or written.
    """
    import os  # local import so the helper does not depend on other cells

    # open() also accepts a raw file descriptor, which has no directory to create.
    if not isinstance(fp, int):
        parent = os.path.dirname(fp)
        if parent:
            os.makedirs(parent, exist_ok=True)

    with open(fp, 'wb') as f:
        pickle.dump(obj, f)
    
# Persist the first aggregate table.
fp_output = '../tablones/ventas_mc_new_1.pkl'
store_pickle(fp=fp_output, obj=df_enr_1)

In [59]:
# Number of (Marca, Cupo, month_year) groups in the first aggregate.
df_enr_1.shape[0]

1808

# SEGUNDO ACUMULADO

In [34]:
# Keep only the two categorical attributes and the grouping keys for the count tables.
count_columns = ['ClaseEnvase', 'SegmentoPrecio', 'Marca', 'Cupo', 'month_year']
df_sales_mini_2 = df_sales_mini.loc[:, count_columns]
df_sales_mini_2.head(1)

Unnamed: 0,ClaseEnvase,SegmentoPrecio,Marca,Cupo,month_year
0,2,2.0,44,10.0,2018-01


In [51]:
# Count the sales rows per (month_year, Marca, Cupo) for each ClaseEnvase value,
# spreading the values into one column each (0 where a combination has no rows).
# NOTE(review): the two output labels are inconsistent ('ClaseEnvase_1_sales' vs
# 'ClaseEnvase2_sales'); kept as-is because the table is written to disk and the
# names may be referenced elsewhere.
df_extra_1 = (
    df_sales_mini_2
    .groupby(['month_year', 'Marca', 'Cupo', 'ClaseEnvase'])
    .size()
    .unstack(fill_value=0)
    .reset_index()  # already yields a fresh 0..n-1 index, so no second reset is needed
    .rename(columns={1: 'ClaseEnvase_1_sales', 2: 'ClaseEnvase2_sales'})
)

# Sanity check: the pivot should not leave any missing values.
print(df_extra_1.isna().sum().sum())
display(df_extra_1.head(1))

0


ClaseEnvase,month_year,Marca,Cupo,ClaseEnvase_1_sales,ClaseEnvase2_sales
0,2018-01,2,5.0,0,1


In [61]:
# Persist the ClaseEnvase count table.
fp_output = '../tablones/ventas_extra_1.pkl'
store_pickle(fp=fp_output, obj=df_extra_1)

In [54]:
# Row count of the ClaseEnvase count table.
df_extra_1.shape[0]

1808

In [53]:
# Count the sales rows per (month_year, Marca, Cupo) for each SegmentoPrecio value,
# spreading the values into one column each (0 where a combination has no rows).
# NOTE(review): segment 4.0 is labelled 'Segmento_5_sales' although the observed
# SegmentoPrecio values are 1-4; this looks like a typo for 'Segmento_4_sales'.
# The label is kept unchanged because the table is written to disk and the name
# may be referenced elsewhere - confirm before renaming.
segment_labels = {
    1.0: 'Segmento_1_sales',
    2.0: 'Segmento_2_sales',
    3.0: 'Segmento_3_sales',
    4.0: 'Segmento_5_sales',
}
df_extra_2 = (
    df_sales_mini_2
    .groupby(['month_year', 'Marca', 'Cupo', 'SegmentoPrecio'])
    .size()
    .unstack(fill_value=0)
    .reset_index()
    .rename(columns=segment_labels)
)

# Sanity check: the pivot should not leave any missing values.
print(df_extra_2.isna().sum().sum())
display(df_extra_2.head(5))

0


SegmentoPrecio,month_year,Marca,Cupo,Segmento_1_sales,Segmento_2_sales,Segmento_3_sales,Segmento_5_sales
0,2018-01,2,5.0,1,0,0,0
1,2018-01,2,10.0,452,0,0,0
2,2018-01,2,16.0,5932,0,0,0
3,2018-01,2,19.0,2119,0,0,0
4,2018-01,2,23.0,160,0,0,0


In [62]:
# Persist the SegmentoPrecio count table.
fp_output = '../tablones/ventas_extra_2.pkl'
store_pickle(fp=fp_output, obj=df_extra_2)

In [55]:
# Row count of the SegmentoPrecio count table.
df_extra_2.shape[0]

1808

In [None]:
# df_extra = df_extra.groupby(['Cliente', 'month_year', 'producto']).size().unstack(fill_value=0)\
#                            .reset_index()
# df_extra['29_9']  = df_extra['29_9'].apply(lambda x: 1 if x>0 else 0)
# df_extra['39_20'] = df_extra['39_20'].apply(lambda x:1 if x>0 else 0)
# df_extra['40_16'] = df_extra['40_16'].apply(lambda x:1 if x>0 else 0)