In [83]:
import pandas as pd
import numpy as np
import copy
from datetime import datetime
import pickle 
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, RepeatedStratifiedKFold, cross_val_score
from sklearn.metrics import mean_squared_error, roc_auc_score
import ubjson


from sklearn.model_selection import GridSearchCV
import gzip


import lightgbm as lgb

pd.options.display.max_columns = 200
pd.options.display.max_rows = 200

In [84]:
fp_clients = '../data/clients_attributes.csv'
fp_sales = '../data/sales.csv'

fp_active_promos = '../data/active_promos.csv'
fp_executed_promos = '../data/executed_promos.csv'
fp_test = '../data/test.csv'

df_active_promos = pd.read_csv(fp_active_promos, encoding='ISO-8859-1', sep=',')
df_clients = pd.read_csv(fp_clients, encoding='ISO-8859-1', sep=',')
df_executed_promos = pd.read_csv(fp_executed_promos, encoding='ISO-8859-1', sep=',')
df_sales = pd.read_csv(fp_sales, encoding='ISO-8859-1', sep=',')
df_test = pd.read_csv(fp_test, encoding='ISO-8859-1', sep=',')

In [85]:
print(df_active_promos.isna().sum())
display(df_active_promos.head(1))

CodigoDC       0
Marca          0
Cupo           0
Fecha_Desde    0
Fecha_Hasta    0
Cliente        0
dtype: int64


Unnamed: 0,CodigoDC,Marca,Cupo,Fecha_Desde,Fecha_Hasta,Cliente
0,297601,29,9,2018-08-06,2018-08-30,3213


In [86]:
df_sales[df_sales['Hl']==0]

Unnamed: 0,Cliente,Año,Mes,ClaseEnvase,SegmentoPrecio,Marca,Cupo,Nr,Hl,Dcto
25,1,2019,7,1,2.0,43,32.0,1259.09,0.0,0.0
38,2,2018,6,1,4.0,31,8.0,2658.16,0.0,0.0
213,9,2019,1,1,1.0,5,23.0,1900.46,0.0,0.0
219,9,2019,3,1,2.0,45,32.0,1008.28,0.0,0.0
263,10,2019,2,1,1.0,5,23.0,1900.46,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
1894061,15048,2019,3,1,1.0,5,23.0,1906.40,0.0,0.0
1894122,15056,2019,8,1,2.0,43,32.0,1259.09,0.0,0.0
1894171,15059,2019,7,1,3.0,46,23.0,2031.80,0.0,0.0
1894176,15060,2019,8,1,2.0,37,32.0,1259.09,0.0,0.0


In [87]:
# Crear datetime para fecha_desde y fecha_hasta
# Obtener fecha promedio entre fecha_desde hasta fecha_hasta
# Crear columna periodo a partir de la fecha_promedio, periodo solo es ANIO+MES

In [88]:
# Crear periodo, como el mes de la mediana entre fecha_desde y fecha_hasta
df_active_promos['fecha_desde_dt'] \
  = pd.to_datetime(df_active_promos['Fecha_Desde'], format = '%Y-%m-%d')
df_active_promos['fecha_hasta_dt'] \
  = pd.to_datetime(df_active_promos['Fecha_Hasta'], format = '%Y-%m-%d')

df_active_promos['mid_date'] \
  = df_active_promos['fecha_desde_dt'] + \
    (df_active_promos['fecha_hasta_dt'] - df_active_promos['fecha_desde_dt']) /2

df_active_promos['periodo'] = df_active_promos['mid_date'].apply(lambda dt: dt.replace(day = 1))

df_active_promos['month_year'] = df_active_promos['periodo'].dt.to_period('M')
# Dropear columnas innecesarias
df_active_promos = df_active_promos.drop(['Fecha_Desde', 'Fecha_Hasta', 'fecha_desde_dt',
                                          'fecha_hasta_dt', 'mid_date', 'periodo'], axis = 1)
df_active_promos.head(1)

Unnamed: 0,CodigoDC,Marca,Cupo,Cliente,month_year
0,297601,29,9,3213,2018-08


In [89]:
fechas_active_promos = list(df_active_promos['month_year'].unique())
fechas_active_promos

[Period('2018-08', 'M'),
 Period('2018-09', 'M'),
 Period('2018-10', 'M'),
 Period('2018-11', 'M'),
 Period('2018-12', 'M'),
 Period('2019-01', 'M'),
 Period('2019-02', 'M'),
 Period('2019-03', 'M'),
 Period('2019-04', 'M'),
 Period('2019-07', 'M'),
 Period('2019-05', 'M'),
 Period('2019-06', 'M'),
 Period('2019-08', 'M'),
 Period('2019-09', 'M')]

In [90]:
df_active_promos.groupby(['month_year']).count()

Unnamed: 0_level_0,CodigoDC,Marca,Cupo,Cliente
month_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-08,2315,2315,2315,2315
2018-09,80050,80050,80050,80050
2018-10,69134,69134,69134,69134
2018-11,89347,89347,89347,89347
2018-12,3238,3238,3238,3238
2019-01,50522,50522,50522,50522
2019-02,1551,1551,1551,1551
2019-03,34723,34723,34723,34723
2019-04,2633,2633,2633,2633
2019-05,2836,2836,2836,2836


In [91]:
df_active_promos[df_active_promos['month_year'] == fechas_active_promos[-1]]

Unnamed: 0,CodigoDC,Marca,Cupo,Cliente,month_year
259546,652143,39,20,365,2019-09
259547,652143,39,20,1557,2019-09
259548,652143,39,20,5028,2019-09
259549,652143,39,20,5677,2019-09
259550,652143,40,16,2326,2019-09
...,...,...,...,...,...
422432,688354,40,16,14435,2019-09
422433,688355,29,9,320,2019-09
422434,688355,29,9,3117,2019-09
422435,688355,29,9,3503,2019-09


In [92]:
df_executed_promos[df_executed_promos['CodigoDC']==652143]

Unnamed: 0,CodigoDC,Cliente,Marca,Cupo
4659,652143,367,40,16
27793,652143,1584,40,16


In [93]:
# Quitare manualmente los active_promos de septiembre 2019 debido a que no hay match con executed_promos
#df_active_promos = df_active_promos[df_active_promos['month_year']!=fechas_active_promos[-1]]
df_active_promos = df_active_promos[df_active_promos['month_year']!= '201909']

In [94]:
df_active_promos.groupby(['month_year']).count()

Unnamed: 0_level_0,CodigoDC,Marca,Cupo,Cliente
month_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-08,2315,2315,2315,2315
2018-09,80050,80050,80050,80050
2018-10,69134,69134,69134,69134
2018-11,89347,89347,89347,89347
2018-12,3238,3238,3238,3238
2019-01,50522,50522,50522,50522
2019-02,1551,1551,1551,1551
2019-03,34723,34723,34723,34723
2019-04,2633,2633,2633,2633
2019-05,2836,2836,2836,2836


In [95]:
df_active_promos['month_year'].unique()

<PeriodArray>
['2018-08', '2018-09', '2018-10', '2018-11', '2018-12', '2019-01', '2019-02',
 '2019-03', '2019-04', '2019-07', '2019-05', '2019-06', '2019-08']
Length: 13, dtype: period[M]

In [96]:
# Cruzar executed promos con active_promos
# Llenar 1/0 dependiendo si se ejecuto la promocion o no

In [97]:
print(df_executed_promos.isna().sum())
# Agregar label a executed_promos
df_executed_promos['label'] = 1
display(df_executed_promos.head(1))

CodigoDC    0
Cliente     0
Marca       0
Cupo        0
dtype: int64


Unnamed: 0,CodigoDC,Cliente,Marca,Cupo,label
0,297601,8410,29,9,1


In [98]:
print(f"Numero de registros en active promos: {len(df_active_promos)}")
df_m1 = pd.merge(
  df_active_promos,
  df_executed_promos,
  on = ['CodigoDC', 'Cliente', 'Marca', 'Cupo'],
  how = 'left')
print(f"Numero de registros en df_m1: {len(df_m1)}")

Numero de registros en active promos: 385802
Numero de registros en df_m1: 385802


In [99]:
df_m1.head(1)

Unnamed: 0,CodigoDC,Marca,Cupo,Cliente,month_year,label
0,297601,29,9,3213,2018-08,


In [100]:
df_m1.isna().sum()

CodigoDC           0
Marca              0
Cupo               0
Cliente            0
month_year         0
label         358011
dtype: int64

In [101]:
# Llenamos NaN con 0, 0 implica que el cliente no acepto la promocion
df_m1 = df_m1.fillna(0)

In [102]:
fechas_m1 = list(df_m1['month_year'].unique())
fechas_m1.sort()
fechas_m1

[Period('2018-08', 'M'),
 Period('2018-09', 'M'),
 Period('2018-10', 'M'),
 Period('2018-11', 'M'),
 Period('2018-12', 'M'),
 Period('2019-01', 'M'),
 Period('2019-02', 'M'),
 Period('2019-03', 'M'),
 Period('2019-04', 'M'),
 Period('2019-05', 'M'),
 Period('2019-06', 'M'),
 Period('2019-07', 'M'),
 Period('2019-08', 'M')]

In [103]:
fechas_m1[-1]

Period('2019-08', 'M')

In [104]:
df_m1.head(1)

Unnamed: 0,CodigoDC,Marca,Cupo,Cliente,month_year,label
0,297601,29,9,3213,2018-08,0.0


In [105]:
# Ahora, a df_m1 le vamos a agregar toda la informacion que nos sirve para la prediccion

In [106]:
# Agregamos informacion de clientes DIRECTA
df_clients.head(1)

Unnamed: 0,Cliente,FechaAltaCliente,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF
0,1,2009-03-03,4,21,12,1,1,1


In [107]:
df_m2 = pd.merge(
  df_m1, 
  df_clients[['Cliente', 'Region', 'Gerencia', 'SubCanal', 'TipoPoblacion', 'Estrato', 'EF']],
  on = 'Cliente',
  how = 'left'
)
print(df_m2.isna().sum())
display(df_m2.head(1))

CodigoDC         0
Marca            0
Cupo             0
Cliente          0
month_year       0
label            0
Region           0
Gerencia         0
SubCanal         0
TipoPoblacion    0
Estrato          0
EF               0
dtype: int64


Unnamed: 0,CodigoDC,Marca,Cupo,Cliente,month_year,label,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF
0,297601,29,9,3213,2018-08,0.0,3,8,15,2,3,1


In [108]:
fechas_m2 = list(df_m2['month_year'].unique())
fechas_m2.sort()
fechas_m2

[Period('2018-08', 'M'),
 Period('2018-09', 'M'),
 Period('2018-10', 'M'),
 Period('2018-11', 'M'),
 Period('2018-12', 'M'),
 Period('2019-01', 'M'),
 Period('2019-02', 'M'),
 Period('2019-03', 'M'),
 Period('2019-04', 'M'),
 Period('2019-05', 'M'),
 Period('2019-06', 'M'),
 Period('2019-07', 'M'),
 Period('2019-08', 'M')]

In [109]:
# Agregamos informacion de ventas Directa e Enriquecida
df_sales.head(1)

Unnamed: 0,Cliente,Año,Mes,ClaseEnvase,SegmentoPrecio,Marca,Cupo,Nr,Hl,Dcto
0,1,2018,1,2,2.0,44,10.0,6352.94,0.03,0.0


In [110]:
# Cargamos pickle
f_ventas = '../tablones/data_grupo_3.pkl'
def load_pickle(fp):
  with open(fp, 'rb') as f:
    df = pickle.load(f)
  return df
df_ventas_enriched = load_pickle(f_ventas)
print(df_ventas_enriched.isna().sum().sum())
# Creamos month_year
df_ventas_enriched['month_year'] = df_ventas_enriched['periodo'].dt.to_period('M')
df_ventas_enriched = df_ventas_enriched.drop('periodo', axis=1)
display(df_ventas_enriched.head(1))

0


Unnamed: 0,Cliente,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,month_year
0,1,591975.69,13,2.31,-6785.54,680049.570526,340024.785263,1032291.12,344097.04,1389348.0,347337.026471,1764272.0,352854.43425,2180481.0,363413.575889,2.491579,1.245789,3.772778,1.257593,5.072353,1.268088,6.429375,1.285875,7.932,1.322,-47691.214211,-23845.607105,-72644.81,-24214.936667,-98924.870588,-24731.217647,-127734.36625,-25546.87325,-160123.783333,-26687.297222,13,17.736842,8.868421,26.722222,8.907407,35.882353,8.970588,45.0625,9.0125,54.933333,9.155556,2018-01


In [111]:
len(df_ventas_enriched)

301380

In [112]:
df_m3 = pd.merge(
  df_m2,
  df_ventas_enriched,
  on = ['month_year', 'Cliente'],
  how = 'left'
)
print(df_m3.isna().sum()/len(df_m3)*100)
display(df_m3.head(1))

CodigoDC               0.0
Marca                  0.0
Cupo                   0.0
Cliente                0.0
month_year             0.0
label                  0.0
Region                 0.0
Gerencia               0.0
SubCanal               0.0
TipoPoblacion          0.0
Estrato                0.0
EF                     0.0
Nr_sum                 0.0
numero_ventas          0.0
Hl_sum                 0.0
Dcto_sum               0.0
sum_2Nr                0.0
mean_2Nr               0.0
sum_3Nr                0.0
mean_3Nr               0.0
sum_4Nr                0.0
mean_4Nr               0.0
sum_5Nr                0.0
mean_5Nr               0.0
sum_6Nr                0.0
mean_6Nr               0.0
sum_2Hl                0.0
mean_2Hl               0.0
sum_3Hl                0.0
mean_3Hl               0.0
sum_4Hl                0.0
mean_4Hl               0.0
sum_5Hl                0.0
mean_5Hl               0.0
sum_6Hl                0.0
mean_6Hl               0.0
sum_2Dcto              0.0
m

Unnamed: 0,CodigoDC,Marca,Cupo,Cliente,month_year,label,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas
0,297601,29,9,3213,2018-08,0.0,3,8,15,2,3,1,398013.65,16,1.36,-32036.23,839393.42,419696.71,1248479.17,416159.723333,1785985.13,446496.2825,2127202.34,425440.468,2709897.62,451649.603333,2.75,1.375,4.05,1.35,5.79,1.4475,6.91,1.382,8.9,1.483333,-44248.91,-22124.455,-61301.06,-20433.686667,-92151.35,-23037.8375,-92151.35,-18430.27,-128334.72,-21389.12,16,32.0,16.0,47.0,15.666667,67.0,16.75,81.0,16.2,97.0,16.166667


In [113]:
a = list(df_ventas_enriched['month_year'].unique())
a.sort()
a

[Period('2018-01', 'M'),
 Period('2018-02', 'M'),
 Period('2018-03', 'M'),
 Period('2018-04', 'M'),
 Period('2018-05', 'M'),
 Period('2018-06', 'M'),
 Period('2018-07', 'M'),
 Period('2018-08', 'M'),
 Period('2018-09', 'M'),
 Period('2018-10', 'M'),
 Period('2018-11', 'M'),
 Period('2018-12', 'M'),
 Period('2019-01', 'M'),
 Period('2019-02', 'M'),
 Period('2019-03', 'M'),
 Period('2019-04', 'M'),
 Period('2019-05', 'M'),
 Period('2019-06', 'M'),
 Period('2019-07', 'M'),
 Period('2019-08', 'M')]

In [114]:
df_m3['month_year'].unique()

<PeriodArray>
['2018-08', '2018-09', '2018-10', '2018-11', '2018-12', '2019-01', '2019-02',
 '2019-03', '2019-04', '2019-07', '2019-05', '2019-06', '2019-08']
Length: 13, dtype: period[M]

In [115]:
df_m3[df_m3['Nr_sum'].isna()].head(1)

Unnamed: 0,CodigoDC,Marca,Cupo,Cliente,month_year,label,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas


In [116]:
cod_cliente = 365
df_sales[(df_sales['Cliente'] == cod_cliente)&(df_sales['Mes']==9)&(df_sales['Año']==2019)]

Unnamed: 0,Cliente,Año,Mes,ClaseEnvase,SegmentoPrecio,Marca,Cupo,Nr,Hl,Dcto


In [117]:
df_m3.head(1)

Unnamed: 0,CodigoDC,Marca,Cupo,Cliente,month_year,label,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas
0,297601,29,9,3213,2018-08,0.0,3,8,15,2,3,1,398013.65,16,1.36,-32036.23,839393.42,419696.71,1248479.17,416159.723333,1785985.13,446496.2825,2127202.34,425440.468,2709897.62,451649.603333,2.75,1.375,4.05,1.35,5.79,1.4475,6.91,1.382,8.9,1.483333,-44248.91,-22124.455,-61301.06,-20433.686667,-92151.35,-23037.8375,-92151.35,-18430.27,-128334.72,-21389.12,16,32.0,16.0,47.0,15.666667,67.0,16.75,81.0,16.2,97.0,16.166667


In [118]:
# Agregar informacion de cliente enriquecida
df_clients.head(1)

Unnamed: 0,Cliente,FechaAltaCliente,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF
0,1,2009-03-03,4,21,12,1,1,1


In [119]:
# ESTO SE DEBERIA IMPORTAR DE UN UTILITARIO
def obtener_prop_num(df, col_variable):
  """
  df: pd.Dataframe
  col_variable: str
  """
  # Copia: para asegurarnos que no modificamos df -> se debe hacer refactoring
  c_df = copy.deepcopy(df)
  # Cuenta de registros en tabla de clientes
  numero_clientes = len(c_df)
  
  # Contamos registros por valor de col_variable
  df_g1 = c_df.groupby([col_variable]).agg({
    'FechaAltaCliente': 'count',
  }).reset_index()
  
  # Creamos nombre de la columna donde se almacena el numero de clientes
  col_num_clientes = 'num_clientes_' + col_variable
  # Renombrar columna donde se almacena el numero de clientes
  df_g1 = df_g1.rename(columns={'FechaAltaCliente': col_num_clientes})
  # Creamos nombre de la columna donde se almacena la proporcion de clientes
  col_proporcion_clientes = 'prop_clientes_' + col_variable
  # Agregamos columna que almacena informacion de propocion de clientes 
  df_g1[col_proporcion_clientes] = df_g1[col_num_clientes] / numero_clientes
  
  return df_g1

cols_cat_clientes = ['Region', 'Gerencia', 'SubCanal', 'TipoPoblacion', 'Estrato', 'EF']
df_m4 = copy.deepcopy(df_m3)
for col_name in cols_cat_clientes:
  curr_df = obtener_prop_num(df_clients, col_name) 
  df_m4 = pd.merge(df_m4, curr_df, how = 'left', on = col_name)
  print(df_m4.isna().sum().sum())


0
0
0
0
0
0


In [120]:
df_m4.head(1)

Unnamed: 0,CodigoDC,Marca,Cupo,Cliente,month_year,label,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,num_clientes_Region,prop_clientes_Region,num_clientes_Gerencia,prop_clientes_Gerencia,num_clientes_SubCanal,prop_clientes_SubCanal,num_clientes_TipoPoblacion,prop_clientes_TipoPoblacion,num_clientes_Estrato,prop_clientes_Estrato,num_clientes_EF,prop_clientes_EF
0,297601,29,9,3213,2018-08,0.0,3,8,15,2,3,1,398013.65,16,1.36,-32036.23,839393.42,419696.71,1248479.17,416159.723333,1785985.13,446496.2825,2127202.34,425440.468,2709897.62,451649.603333,2.75,1.375,4.05,1.35,5.79,1.4475,6.91,1.382,8.9,1.483333,-44248.91,-22124.455,-61301.06,-20433.686667,-92151.35,-23037.8375,-92151.35,-18430.27,-128334.72,-21389.12,16,32.0,16.0,47.0,15.666667,67.0,16.75,81.0,16.2,97.0,16.166667,4981,0.330546,867,0.057535,2293,0.152167,11718,0.777623,4009,0.266043,5405,0.358683


In [121]:
# Agregar informacion de descuentos ofrecidos enriquecida

In [122]:
# Cargar data de prom ofrecidas
# Arreglar fecha para poder cruzar sin problemas
# Quitar registros correspondientes a septiembre 2019
# Conversion de tipos para poder cruzar
f_promo_ofrecidas = '../tablones/num_prom_ofr.pkl'
def load_pickle(fp):
  with open(fp, 'rb') as f:
    df = pickle.load(f)
  return df
df_promo_ofrecidas = load_pickle(f_promo_ofrecidas)
df_promo_ofrecidas['Date_Desde'] \
  = pd.to_datetime(df_promo_ofrecidas['Date_Desde'], format= "%Y-%m")
# Convertir Date_Desde a periodo -> month_year
df_promo_ofrecidas['month_year'] = df_promo_ofrecidas['Date_Desde'].dt.to_period('M')
# Dropear columnas innecesarias
df_promo_ofrecidas = df_promo_ofrecidas.drop('Date_Desde', axis = 1)
# QUITAR DATA DE SEPTIEMBRE PORQUE ESTO NO SE USA PARA ENTRENAR POR VARIAS RAZONES
df_promo_ofrecidas = df_promo_ofrecidas[df_promo_ofrecidas['month_year'] != '201909']
# Convertir columnas a int para poder cruzar
df_promo_ofrecidas['Marca'] = df_promo_ofrecidas['Marca'].astype(int)
df_promo_ofrecidas['Cupo'] = df_promo_ofrecidas['Cupo'].astype(int)
df_promo_ofrecidas['Cliente'] = df_promo_ofrecidas['Cliente'].astype(int)


df_promo_ofrecidas.head(1)

Unnamed: 0,Marca,Cupo,Cliente,Num_prom_ofr,month_year
0,29,9,10358,1,2018-08


In [123]:
df_promo_ofrecidas['month_year'].unique()

<PeriodArray>
['2018-08', '2018-09', '2018-10', '2018-11', '2018-12', '2019-01', '2019-02',
 '2019-03', '2019-04', '2019-05', '2019-06', '2019-07', '2019-08']
Length: 13, dtype: period[M]

In [124]:
df_promo_ofrecidas.dtypes

Marca               int64
Cupo                int64
Cliente             int64
Num_prom_ofr        int64
month_year      period[M]
dtype: object

In [125]:
df_promo_ofrecidas.isna().sum()

Marca           0
Cupo            0
Cliente         0
Num_prom_ofr    0
month_year      0
dtype: int64

In [126]:
# Cruzamos df_promo_ofrecidas(enriched) con df_m4
df_m5 = pd.merge(
  df_m4,
  df_promo_ofrecidas,
  on = ['month_year', 'Cliente', 'Marca', 'Cupo'],
  how = 'left'
)
print(df_m5.isna().sum()/len(df_m5)*100)
print(df_m5.isna().sum().sum())
display(df_m5.head(1))

CodigoDC                       0.000000
Marca                          0.000000
Cupo                           0.000000
Cliente                        0.000000
month_year                     0.000000
label                          0.000000
Region                         0.000000
Gerencia                       0.000000
SubCanal                       0.000000
TipoPoblacion                  0.000000
Estrato                        0.000000
EF                             0.000000
Nr_sum                         0.000000
numero_ventas                  0.000000
Hl_sum                         0.000000
Dcto_sum                       0.000000
sum_2Nr                        0.000000
mean_2Nr                       0.000000
sum_3Nr                        0.000000
mean_3Nr                       0.000000
sum_4Nr                        0.000000
mean_4Nr                       0.000000
sum_5Nr                        0.000000
mean_5Nr                       0.000000
sum_6Nr                        0.000000


Unnamed: 0,CodigoDC,Marca,Cupo,Cliente,month_year,label,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,num_clientes_Region,prop_clientes_Region,num_clientes_Gerencia,prop_clientes_Gerencia,num_clientes_SubCanal,prop_clientes_SubCanal,num_clientes_TipoPoblacion,prop_clientes_TipoPoblacion,num_clientes_Estrato,prop_clientes_Estrato,num_clientes_EF,prop_clientes_EF,Num_prom_ofr
0,297601,29,9,3213,2018-08,0.0,3,8,15,2,3,1,398013.65,16,1.36,-32036.23,839393.42,419696.71,1248479.17,416159.723333,1785985.13,446496.2825,2127202.34,425440.468,2709897.62,451649.603333,2.75,1.375,4.05,1.35,5.79,1.4475,6.91,1.382,8.9,1.483333,-44248.91,-22124.455,-61301.06,-20433.686667,-92151.35,-23037.8375,-92151.35,-18430.27,-128334.72,-21389.12,16,32.0,16.0,47.0,15.666667,67.0,16.75,81.0,16.2,97.0,16.166667,4981,0.330546,867,0.057535,2293,0.152167,11718,0.777623,4009,0.266043,5405,0.358683,1.0


In [127]:
print(df_m5.isna().sum()[df_m5.isna().sum() > 0])

Num_prom_ofr    1602
dtype: int64


In [128]:
# Llenamos los nan de esta columna con 0 porque significa que no se les ha ofrecido productos para esa
# combinacion de marca, cupo, fecha, cliente
df_m5['Num_prom_ofr'] = df_m5['Num_prom_ofr'].fillna(0)
print(df_m5.isna().sum()[df_m5.isna().sum() > 0])
display(df_m5.head(1))

Series([], dtype: int64)


Unnamed: 0,CodigoDC,Marca,Cupo,Cliente,month_year,label,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,num_clientes_Region,prop_clientes_Region,num_clientes_Gerencia,prop_clientes_Gerencia,num_clientes_SubCanal,prop_clientes_SubCanal,num_clientes_TipoPoblacion,prop_clientes_TipoPoblacion,num_clientes_Estrato,prop_clientes_Estrato,num_clientes_EF,prop_clientes_EF,Num_prom_ofr
0,297601,29,9,3213,2018-08,0.0,3,8,15,2,3,1,398013.65,16,1.36,-32036.23,839393.42,419696.71,1248479.17,416159.723333,1785985.13,446496.2825,2127202.34,425440.468,2709897.62,451649.603333,2.75,1.375,4.05,1.35,5.79,1.4475,6.91,1.382,8.9,1.483333,-44248.91,-22124.455,-61301.06,-20433.686667,-92151.35,-23037.8375,-92151.35,-18430.27,-128334.72,-21389.12,16,32.0,16.0,47.0,15.666667,67.0,16.75,81.0,16.2,97.0,16.166667,4981,0.330546,867,0.057535,2293,0.152167,11718,0.777623,4009,0.266043,5405,0.358683,1.0


In [129]:
df_m5.shape

(385802, 70)

In [130]:
# Ahora agregaremos data enriquecida de promociones activas y promociones ejecutadas

In [131]:
f_prop = '../tablones/data_grupo2_proporcion.pkl' 
df_prop_enriched = load_pickle(f_prop)
df_prop_enriched.head(1)

Unnamed: 0,month_year,Cliente,ofertas_dadas,ofertas_tomadas,proporcion_tomadas,sum_2ofertas_dadas,mean_2ofertas_dadas,sum_3ofertas_dadas,mean_3ofertas_dadas,sum_4ofertas_dadas,mean_4ofertas_dadas,sum_5ofertas_dadas,mean_5ofertas_dadas,sum_6ofertas_dadas,mean_6ofertas_dadas,sum_2ofertas_tomadas,mean_2ofertas_tomadas,sum_3ofertas_tomadas,mean_3ofertas_tomadas,sum_4ofertas_tomadas,mean_4ofertas_tomadas,sum_5ofertas_tomadas,mean_5ofertas_tomadas,sum_6ofertas_tomadas,mean_6ofertas_tomadas,sum_2proporcion_tomadas,mean_2proporcion_tomadas,sum_3proporcion_tomadas,mean_3proporcion_tomadas,sum_4proporcion_tomadas,mean_4proporcion_tomadas,sum_5proporcion_tomadas,mean_5proporcion_tomadas,sum_6proporcion_tomadas,mean_6proporcion_tomadas
0,2018-08,1,0,0.0,0.0,5.083333,2.541667,7.181818,2.393939,9.1,2.275,10.555556,2.111111,12.375,2.0625,0.166667,0.083333,0.272727,0.090909,0.4,0.1,0.444444,0.088889,0.5,0.083333,0.020833,0.010417,0.034091,0.011364,0.05,0.0125,0.055556,0.011111,0.0625,0.010417


In [132]:
# Cruzamos df_prop_enriched(enriched) 
df_m6 = pd.merge(
  df_m5,
  df_prop_enriched,
  on = ['month_year', 'Cliente'],
  how = 'left'
)
print(df_m6.isna().sum()/len(df_m6)*100)
print(df_m6.isna().sum().sum())
display(df_m6.head(1))

CodigoDC                       0.0
Marca                          0.0
Cupo                           0.0
Cliente                        0.0
month_year                     0.0
label                          0.0
Region                         0.0
Gerencia                       0.0
SubCanal                       0.0
TipoPoblacion                  0.0
Estrato                        0.0
EF                             0.0
Nr_sum                         0.0
numero_ventas                  0.0
Hl_sum                         0.0
Dcto_sum                       0.0
sum_2Nr                        0.0
mean_2Nr                       0.0
sum_3Nr                        0.0
mean_3Nr                       0.0
sum_4Nr                        0.0
mean_4Nr                       0.0
sum_5Nr                        0.0
mean_5Nr                       0.0
sum_6Nr                        0.0
mean_6Nr                       0.0
sum_2Hl                        0.0
mean_2Hl                       0.0
sum_3Hl             

Unnamed: 0,CodigoDC,Marca,Cupo,Cliente,month_year,label,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,num_clientes_Region,prop_clientes_Region,num_clientes_Gerencia,prop_clientes_Gerencia,num_clientes_SubCanal,prop_clientes_SubCanal,num_clientes_TipoPoblacion,prop_clientes_TipoPoblacion,num_clientes_Estrato,prop_clientes_Estrato,num_clientes_EF,prop_clientes_EF,Num_prom_ofr,ofertas_dadas,ofertas_tomadas,proporcion_tomadas,sum_2ofertas_dadas,mean_2ofertas_dadas,sum_3ofertas_dadas,mean_3ofertas_dadas,sum_4ofertas_dadas,mean_4ofertas_dadas,sum_5ofertas_dadas,mean_5ofertas_dadas,sum_6ofertas_dadas,mean_6ofertas_dadas,sum_2ofertas_tomadas,mean_2ofertas_tomadas,sum_3ofertas_tomadas,mean_3ofertas_tomadas,sum_4ofertas_tomadas,mean_4ofertas_tomadas,sum_5ofertas_tomadas,mean_5ofertas_tomadas,sum_6ofertas_tomadas,mean_6ofertas_tomadas,sum_2proporcion_tomadas,mean_2proporcion_tomadas,sum_3proporcion_tomadas,mean_3proporcion_tomadas,sum_4proporcion_tomadas,mean_4proporcion_tomadas,sum_5proporcion_tomadas,mean_5proporcion_tomadas,sum_6proporcion_tomadas,mean_6proporcion_tomadas
0,297601,29,9,3213,2018-08,0.0,3,8,15,2,3,1,398013.65,16,1.36,-32036.23,839393.42,419696.71,1248479.17,416159.723333,1785985.13,446496.2825,2127202.34,425440.468,2709897.62,451649.603333,2.75,1.375,4.05,1.35,5.79,1.4475,6.91,1.382,8.9,1.483333,-44248.91,-22124.455,-61301.06,-20433.686667,-92151.35,-23037.8375,-92151.35,-18430.27,-128334.72,-21389.12,16,32.0,16.0,47.0,15.666667,67.0,16.75,81.0,16.2,97.0,16.166667,4981,0.330546,867,0.057535,2293,0.152167,11718,0.777623,4009,0.266043,5405,0.358683,1.0,1,0.0,0.0,6.75,3.375,9.818182,3.272727,12.9,3.225,15.444444,3.088889,18.625,3.104167,0.5,0.25,0.727273,0.242424,0.9,0.225,1.0,0.2,1.125,0.1875,0.062963,0.031481,0.093939,0.031313,0.114444,0.028611,0.12716,0.025432,0.143056,0.023843


In [133]:
df_m6.head(1)

Unnamed: 0,CodigoDC,Marca,Cupo,Cliente,month_year,label,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,num_clientes_Region,prop_clientes_Region,num_clientes_Gerencia,prop_clientes_Gerencia,num_clientes_SubCanal,prop_clientes_SubCanal,num_clientes_TipoPoblacion,prop_clientes_TipoPoblacion,num_clientes_Estrato,prop_clientes_Estrato,num_clientes_EF,prop_clientes_EF,Num_prom_ofr,ofertas_dadas,ofertas_tomadas,proporcion_tomadas,sum_2ofertas_dadas,mean_2ofertas_dadas,sum_3ofertas_dadas,mean_3ofertas_dadas,sum_4ofertas_dadas,mean_4ofertas_dadas,sum_5ofertas_dadas,mean_5ofertas_dadas,sum_6ofertas_dadas,mean_6ofertas_dadas,sum_2ofertas_tomadas,mean_2ofertas_tomadas,sum_3ofertas_tomadas,mean_3ofertas_tomadas,sum_4ofertas_tomadas,mean_4ofertas_tomadas,sum_5ofertas_tomadas,mean_5ofertas_tomadas,sum_6ofertas_tomadas,mean_6ofertas_tomadas,sum_2proporcion_tomadas,mean_2proporcion_tomadas,sum_3proporcion_tomadas,mean_3proporcion_tomadas,sum_4proporcion_tomadas,mean_4proporcion_tomadas,sum_5proporcion_tomadas,mean_5proporcion_tomadas,sum_6proporcion_tomadas,mean_6proporcion_tomadas
0,297601,29,9,3213,2018-08,0.0,3,8,15,2,3,1,398013.65,16,1.36,-32036.23,839393.42,419696.71,1248479.17,416159.723333,1785985.13,446496.2825,2127202.34,425440.468,2709897.62,451649.603333,2.75,1.375,4.05,1.35,5.79,1.4475,6.91,1.382,8.9,1.483333,-44248.91,-22124.455,-61301.06,-20433.686667,-92151.35,-23037.8375,-92151.35,-18430.27,-128334.72,-21389.12,16,32.0,16.0,47.0,15.666667,67.0,16.75,81.0,16.2,97.0,16.166667,4981,0.330546,867,0.057535,2293,0.152167,11718,0.777623,4009,0.266043,5405,0.358683,1.0,1,0.0,0.0,6.75,3.375,9.818182,3.272727,12.9,3.225,15.444444,3.088889,18.625,3.104167,0.5,0.25,0.727273,0.242424,0.9,0.225,1.0,0.2,1.125,0.1875,0.062963,0.031481,0.093939,0.031313,0.114444,0.028611,0.12716,0.025432,0.143056,0.023843


In [134]:
df_m6.shape

(385802, 103)

In [135]:
def store_pickle(fp, obj):
  with open(fp, 'wb') as f:
    pickle.dump(obj, f)

In [136]:
# Almacenar para continuar entrenando en "la bestia"
fp_output = '../tablones/df_m6.pkl'
store_pickle(fp_output, df_m6)

In [137]:
df_m6['Marca'].unique()

array([29, 40, 39])

In [138]:
df_m6['TipoPoblacion'].unique()

array([2, 1])

# Inicio de preprocesamiento antes de entrenar
Se manejaran 2 escenarios : nominales con dummies, nominales con el valor numerico con que llegaron

In [139]:
# Quitar columnas que no son parte del entrenamiento
COLS_QUITAR =  ['CodigoDC', 'month_year', 'Cliente']
df_todo = df_m6.drop(COLS_QUITAR, axis = 1)
df_todo.head(1)

Unnamed: 0,Marca,Cupo,label,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,num_clientes_Region,prop_clientes_Region,num_clientes_Gerencia,prop_clientes_Gerencia,num_clientes_SubCanal,prop_clientes_SubCanal,num_clientes_TipoPoblacion,prop_clientes_TipoPoblacion,num_clientes_Estrato,prop_clientes_Estrato,num_clientes_EF,prop_clientes_EF,Num_prom_ofr,ofertas_dadas,ofertas_tomadas,proporcion_tomadas,sum_2ofertas_dadas,mean_2ofertas_dadas,sum_3ofertas_dadas,mean_3ofertas_dadas,sum_4ofertas_dadas,mean_4ofertas_dadas,sum_5ofertas_dadas,mean_5ofertas_dadas,sum_6ofertas_dadas,mean_6ofertas_dadas,sum_2ofertas_tomadas,mean_2ofertas_tomadas,sum_3ofertas_tomadas,mean_3ofertas_tomadas,sum_4ofertas_tomadas,mean_4ofertas_tomadas,sum_5ofertas_tomadas,mean_5ofertas_tomadas,sum_6ofertas_tomadas,mean_6ofertas_tomadas,sum_2proporcion_tomadas,mean_2proporcion_tomadas,sum_3proporcion_tomadas,mean_3proporcion_tomadas,sum_4proporcion_tomadas,mean_4proporcion_tomadas,sum_5proporcion_tomadas,mean_5proporcion_tomadas,sum_6proporcion_tomadas,mean_6proporcion_tomadas
0,29,9,0.0,3,8,15,2,3,1,398013.65,16,1.36,-32036.23,839393.42,419696.71,1248479.17,416159.723333,1785985.13,446496.2825,2127202.34,425440.468,2709897.62,451649.603333,2.75,1.375,4.05,1.35,5.79,1.4475,6.91,1.382,8.9,1.483333,-44248.91,-22124.455,-61301.06,-20433.686667,-92151.35,-23037.8375,-92151.35,-18430.27,-128334.72,-21389.12,16,32.0,16.0,47.0,15.666667,67.0,16.75,81.0,16.2,97.0,16.166667,4981,0.330546,867,0.057535,2293,0.152167,11718,0.777623,4009,0.266043,5405,0.358683,1.0,1,0.0,0.0,6.75,3.375,9.818182,3.272727,12.9,3.225,15.444444,3.088889,18.625,3.104167,0.5,0.25,0.727273,0.242424,0.9,0.225,1.0,0.2,1.125,0.1875,0.062963,0.031481,0.093939,0.031313,0.114444,0.028611,0.12716,0.025432,0.143056,0.023843


In [140]:
df_todo['Region'].unique()

array([3, 5, 1, 4, 2])

In [141]:
df_todo['TipoPoblacion'].unique()

array([2, 1])

In [142]:
df_todo['Region'] = df_todo['Region'].astype(int)

In [143]:
COLS_NOMINAL = [
  'Region',
  'Gerencia',
  'SubCanal',
  'TipoPoblacion',
  'Marca',
  'Cupo',
  #ClaseEnvase
  #SegmentoPrecio
]
COLS_ORDINAL = [
  'Estrato'

]
COLS_FLAG = [
  'EF'
]
COLS_ONEHOT = [
  'Region',
  'Gerencia',
  'SubCanal',
  'TipoPoblacion',
  'Marca',
  'Cupo',
  'Estrato',
  'EF'
]

df_todo_dummies = copy.deepcopy(df_todo)
#Convertimos en string nuestras proto-dummies
for col_name in COLS_ONEHOT:
  df_todo[col_name] = df_todo[col_name].astype(int)
  df_todo_dummies[col_name] = df_todo_dummies[col_name].astype(str)


In [144]:
df_todo['TipoPoblacion'].unique()

array([2, 1])

In [145]:
df_todo_dummies['TipoPoblacion'].unique()

array(['2', '1'], dtype=object)

In [147]:
dummies_df_temp = pd.get_dummies(df_todo_dummies[COLS_ONEHOT], prefix = COLS_ONEHOT)
dummies_df_temp.head(1)

Unnamed: 0,Region_1,Region_2,Region_3,Region_4,Region_5,Gerencia_10,Gerencia_11,Gerencia_12,Gerencia_13,Gerencia_14,Gerencia_15,Gerencia_16,Gerencia_17,Gerencia_18,Gerencia_19,Gerencia_2,Gerencia_20,Gerencia_21,Gerencia_22,Gerencia_23,Gerencia_24,Gerencia_25,Gerencia_26,Gerencia_27,Gerencia_28,Gerencia_29,Gerencia_3,Gerencia_30,Gerencia_31,Gerencia_32,Gerencia_33,Gerencia_34,Gerencia_35,Gerencia_36,Gerencia_37,Gerencia_4,Gerencia_5,Gerencia_6,Gerencia_7,Gerencia_8,Gerencia_9,SubCanal_1,SubCanal_10,SubCanal_11,SubCanal_12,SubCanal_13,SubCanal_14,SubCanal_15,SubCanal_16,SubCanal_17,SubCanal_18,SubCanal_19,SubCanal_2,SubCanal_20,SubCanal_22,SubCanal_23,SubCanal_24,SubCanal_25,SubCanal_26,SubCanal_27,SubCanal_3,SubCanal_4,SubCanal_5,SubCanal_6,SubCanal_7,SubCanal_8,SubCanal_9,TipoPoblacion_1,TipoPoblacion_2,Marca_29,Marca_39,Marca_40,Cupo_16,Cupo_20,Cupo_9,Estrato_1,Estrato_2,Estrato_3,Estrato_4,Estrato_5,Estrato_6,EF_0,EF_1
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,1,0,0,0,0,1


In [148]:
# df_t4_dummies = pd.get_dummies(df_t4_dummies, prefix = COLS_ONEHOT)
# concnat horizontally
df_todo_dummies = pd.concat([df_todo_dummies, dummies_df_temp], axis=1)

print(df_todo_dummies.isna().sum().sum())
display(df_todo_dummies.head(1))

0


Unnamed: 0,Marca,Cupo,label,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,num_clientes_Region,prop_clientes_Region,num_clientes_Gerencia,prop_clientes_Gerencia,num_clientes_SubCanal,prop_clientes_SubCanal,num_clientes_TipoPoblacion,prop_clientes_TipoPoblacion,num_clientes_Estrato,prop_clientes_Estrato,num_clientes_EF,prop_clientes_EF,Num_prom_ofr,ofertas_dadas,ofertas_tomadas,proporcion_tomadas,sum_2ofertas_dadas,mean_2ofertas_dadas,sum_3ofertas_dadas,mean_3ofertas_dadas,sum_4ofertas_dadas,mean_4ofertas_dadas,sum_5ofertas_dadas,mean_5ofertas_dadas,sum_6ofertas_dadas,mean_6ofertas_dadas,sum_2ofertas_tomadas,mean_2ofertas_tomadas,sum_3ofertas_tomadas,mean_3ofertas_tomadas,sum_4ofertas_tomadas,mean_4ofertas_tomadas,sum_5ofertas_tomadas,mean_5ofertas_tomadas,sum_6ofertas_tomadas,mean_6ofertas_tomadas,sum_2proporcion_tomadas,mean_2proporcion_tomadas,sum_3proporcion_tomadas,mean_3proporcion_tomadas,sum_4proporcion_tomadas,mean_4proporcion_tomadas,sum_5proporcion_tomadas,mean_5proporcion_tomadas,sum_6proporcion_tomadas,mean_6proporcion_tomadas,Region_1,Region_2,Region_3,Region_4,Region_5,Gerencia_10,Gerencia_11,Gerencia_12,Gerencia_13,Gerencia_14,Gerencia_15,Gerencia_16,Gerencia_17,Gerencia_18,Gerencia_19,Gerencia_2,Gerencia_20,Gerencia_21,Gerencia_22,Gerencia_23,Gerencia_24,Gerencia_25,Gerencia_26,Gerencia_27,Gerencia_28,Gerencia_29,Gerencia_3,Gerencia_30,Gerencia_31,Gerencia_32,Gerencia_33,Gerencia_34,Gerencia_35,Gerencia_36,Gerencia_37,Gerencia_4,Gerencia_5,Gerencia_6,Gerencia_7,Gerencia_8,Gerencia_9,SubCanal_1,SubCanal_10,SubCanal_11,SubCanal_12,SubCanal_13,SubCanal_14,SubCanal_15,SubCanal_16,SubCanal_17,SubCanal_18,SubCanal_19,SubCanal_2,SubCanal_20,SubCanal_22,SubCanal_23,SubCanal_24,SubCanal_25,SubCanal_26,SubCanal_27,SubCanal_3,SubCanal_4,SubCanal_5,SubCanal_6,SubCanal_7,SubCanal_8,SubCanal_9,TipoPoblacion_1,TipoPoblacion_2,Marca_29,Marca_39,Marca_40,Cupo_16,Cupo_20,Cupo_9,Estrato_1,Estrato_2,Estrato_3,Estrato_4,Estrato_5,Estrato_6,EF_0,EF_1
0,29,9,0.0,3,8,15,2,3,1,398013.65,16,1.36,-32036.23,839393.42,419696.71,1248479.17,416159.723333,1785985.13,446496.2825,2127202.34,425440.468,2709897.62,451649.603333,2.75,1.375,4.05,1.35,5.79,1.4475,6.91,1.382,8.9,1.483333,-44248.91,-22124.455,-61301.06,-20433.686667,-92151.35,-23037.8375,-92151.35,-18430.27,-128334.72,-21389.12,16,32.0,16.0,47.0,15.666667,67.0,16.75,81.0,16.2,97.0,16.166667,4981,0.330546,867,0.057535,2293,0.152167,11718,0.777623,4009,0.266043,5405,0.358683,1.0,1,0.0,0.0,6.75,3.375,9.818182,3.272727,12.9,3.225,15.444444,3.088889,18.625,3.104167,0.5,0.25,0.727273,0.242424,0.9,0.225,1.0,0.2,1.125,0.1875,0.062963,0.031481,0.093939,0.031313,0.114444,0.028611,0.12716,0.025432,0.143056,0.023843,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,1,0,0,0,0,1


In [149]:
# Dropear columnas que se volvieron dummies
df_todo_dummies = df_todo_dummies.drop(COLS_ONEHOT, axis=1)
df_todo_dummies.shape

(385802, 175)

In [150]:
print("Data set sin dummies")
display(df_todo.head(1))
print(df_todo.shape)
print("#"*90)
print("Data set con dummies")
display(df_todo_dummies.head(1))
print(df_todo_dummies.shape)

Data set sin dummies


Unnamed: 0,Marca,Cupo,label,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,num_clientes_Region,prop_clientes_Region,num_clientes_Gerencia,prop_clientes_Gerencia,num_clientes_SubCanal,prop_clientes_SubCanal,num_clientes_TipoPoblacion,prop_clientes_TipoPoblacion,num_clientes_Estrato,prop_clientes_Estrato,num_clientes_EF,prop_clientes_EF,Num_prom_ofr,ofertas_dadas,ofertas_tomadas,proporcion_tomadas,sum_2ofertas_dadas,mean_2ofertas_dadas,sum_3ofertas_dadas,mean_3ofertas_dadas,sum_4ofertas_dadas,mean_4ofertas_dadas,sum_5ofertas_dadas,mean_5ofertas_dadas,sum_6ofertas_dadas,mean_6ofertas_dadas,sum_2ofertas_tomadas,mean_2ofertas_tomadas,sum_3ofertas_tomadas,mean_3ofertas_tomadas,sum_4ofertas_tomadas,mean_4ofertas_tomadas,sum_5ofertas_tomadas,mean_5ofertas_tomadas,sum_6ofertas_tomadas,mean_6ofertas_tomadas,sum_2proporcion_tomadas,mean_2proporcion_tomadas,sum_3proporcion_tomadas,mean_3proporcion_tomadas,sum_4proporcion_tomadas,mean_4proporcion_tomadas,sum_5proporcion_tomadas,mean_5proporcion_tomadas,sum_6proporcion_tomadas,mean_6proporcion_tomadas
0,29,9,0.0,3,8,15,2,3,1,398013.65,16,1.36,-32036.23,839393.42,419696.71,1248479.17,416159.723333,1785985.13,446496.2825,2127202.34,425440.468,2709897.62,451649.603333,2.75,1.375,4.05,1.35,5.79,1.4475,6.91,1.382,8.9,1.483333,-44248.91,-22124.455,-61301.06,-20433.686667,-92151.35,-23037.8375,-92151.35,-18430.27,-128334.72,-21389.12,16,32.0,16.0,47.0,15.666667,67.0,16.75,81.0,16.2,97.0,16.166667,4981,0.330546,867,0.057535,2293,0.152167,11718,0.777623,4009,0.266043,5405,0.358683,1.0,1,0.0,0.0,6.75,3.375,9.818182,3.272727,12.9,3.225,15.444444,3.088889,18.625,3.104167,0.5,0.25,0.727273,0.242424,0.9,0.225,1.0,0.2,1.125,0.1875,0.062963,0.031481,0.093939,0.031313,0.114444,0.028611,0.12716,0.025432,0.143056,0.023843


(385802, 100)
##########################################################################################
Data set con dummies


Unnamed: 0,label,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,num_clientes_Region,prop_clientes_Region,num_clientes_Gerencia,prop_clientes_Gerencia,num_clientes_SubCanal,prop_clientes_SubCanal,num_clientes_TipoPoblacion,prop_clientes_TipoPoblacion,num_clientes_Estrato,prop_clientes_Estrato,num_clientes_EF,prop_clientes_EF,Num_prom_ofr,ofertas_dadas,ofertas_tomadas,proporcion_tomadas,sum_2ofertas_dadas,mean_2ofertas_dadas,sum_3ofertas_dadas,mean_3ofertas_dadas,sum_4ofertas_dadas,mean_4ofertas_dadas,sum_5ofertas_dadas,mean_5ofertas_dadas,sum_6ofertas_dadas,mean_6ofertas_dadas,sum_2ofertas_tomadas,mean_2ofertas_tomadas,sum_3ofertas_tomadas,mean_3ofertas_tomadas,sum_4ofertas_tomadas,mean_4ofertas_tomadas,sum_5ofertas_tomadas,mean_5ofertas_tomadas,sum_6ofertas_tomadas,mean_6ofertas_tomadas,sum_2proporcion_tomadas,mean_2proporcion_tomadas,sum_3proporcion_tomadas,mean_3proporcion_tomadas,sum_4proporcion_tomadas,mean_4proporcion_tomadas,sum_5proporcion_tomadas,mean_5proporcion_tomadas,sum_6proporcion_tomadas,mean_6proporcion_tomadas,Region_1,Region_2,Region_3,Region_4,Region_5,Gerencia_10,Gerencia_11,Gerencia_12,Gerencia_13,Gerencia_14,Gerencia_15,Gerencia_16,Gerencia_17,Gerencia_18,Gerencia_19,Gerencia_2,Gerencia_20,Gerencia_21,Gerencia_22,Gerencia_23,Gerencia_24,Gerencia_25,Gerencia_26,Gerencia_27,Gerencia_28,Gerencia_29,Gerencia_3,Gerencia_30,Gerencia_31,Gerencia_32,Gerencia_33,Gerencia_34,Gerencia_35,Gerencia_36,Gerencia_37,Gerencia_4,Gerencia_5,Gerencia_6,Gerencia_7,Gerencia_8,Gerencia_9,SubCanal_1,SubCanal_10,SubCanal_11,SubCanal_12,SubCanal_13,SubCanal_14,SubCanal_15,SubCanal_16,SubCanal_17,SubCanal_18,SubCanal_19,SubCanal_2,SubCanal_20,SubCanal_22,SubCanal_23,SubCanal_24,SubCanal_25,SubCanal_26,SubCanal_27,SubCanal_3,SubCanal_4,SubCanal_5,SubCanal_6,SubCanal_7,SubCanal_8,SubCanal_9,TipoPoblacion_1,TipoPoblacion_2,Marca_29,Marca_39,Marca_40,Cupo_16,Cupo_20,Cupo_9,Estrato_1,Estrato_2,Estrato_3,Estrato_4,Estrato_5,Estrato_6,EF_0,EF_1
0,0.0,398013.65,16,1.36,-32036.23,839393.42,419696.71,1248479.17,416159.723333,1785985.13,446496.2825,2127202.34,425440.468,2709897.62,451649.603333,2.75,1.375,4.05,1.35,5.79,1.4475,6.91,1.382,8.9,1.483333,-44248.91,-22124.455,-61301.06,-20433.686667,-92151.35,-23037.8375,-92151.35,-18430.27,-128334.72,-21389.12,16,32.0,16.0,47.0,15.666667,67.0,16.75,81.0,16.2,97.0,16.166667,4981,0.330546,867,0.057535,2293,0.152167,11718,0.777623,4009,0.266043,5405,0.358683,1.0,1,0.0,0.0,6.75,3.375,9.818182,3.272727,12.9,3.225,15.444444,3.088889,18.625,3.104167,0.5,0.25,0.727273,0.242424,0.9,0.225,1.0,0.2,1.125,0.1875,0.062963,0.031481,0.093939,0.031313,0.114444,0.028611,0.12716,0.025432,0.143056,0.023843,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,1,0,0,0,0,1


(385802, 175)


In [63]:
df_todo['label'].value_counts()

0.0    358011
1.0     27791
Name: label, dtype: int64

In [77]:
import pyarrow as pa

In [151]:
df_todo.to_parquet('../tablones/train_data_todo.parquet', engine='pyarrow')

In [152]:
df_todo_dummies.to_parquet('../tablones/train_data_todo_dummi.parquet', engine='pyarrow')

In [156]:
df_todo_dummies.head(1)

Unnamed: 0,label,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,num_clientes_Region,prop_clientes_Region,num_clientes_Gerencia,prop_clientes_Gerencia,num_clientes_SubCanal,prop_clientes_SubCanal,num_clientes_TipoPoblacion,prop_clientes_TipoPoblacion,num_clientes_Estrato,prop_clientes_Estrato,num_clientes_EF,prop_clientes_EF,Num_prom_ofr,ofertas_dadas,ofertas_tomadas,proporcion_tomadas,sum_2ofertas_dadas,mean_2ofertas_dadas,sum_3ofertas_dadas,mean_3ofertas_dadas,sum_4ofertas_dadas,mean_4ofertas_dadas,sum_5ofertas_dadas,mean_5ofertas_dadas,sum_6ofertas_dadas,mean_6ofertas_dadas,sum_2ofertas_tomadas,mean_2ofertas_tomadas,sum_3ofertas_tomadas,mean_3ofertas_tomadas,sum_4ofertas_tomadas,mean_4ofertas_tomadas,sum_5ofertas_tomadas,mean_5ofertas_tomadas,sum_6ofertas_tomadas,mean_6ofertas_tomadas,sum_2proporcion_tomadas,mean_2proporcion_tomadas,sum_3proporcion_tomadas,mean_3proporcion_tomadas,sum_4proporcion_tomadas,mean_4proporcion_tomadas,sum_5proporcion_tomadas,mean_5proporcion_tomadas,sum_6proporcion_tomadas,mean_6proporcion_tomadas,Region_1,Region_2,Region_3,Region_4,Region_5,Gerencia_10,Gerencia_11,Gerencia_12,Gerencia_13,Gerencia_14,Gerencia_15,Gerencia_16,Gerencia_17,Gerencia_18,Gerencia_19,Gerencia_2,Gerencia_20,Gerencia_21,Gerencia_22,Gerencia_23,Gerencia_24,Gerencia_25,Gerencia_26,Gerencia_27,Gerencia_28,Gerencia_29,Gerencia_3,Gerencia_30,Gerencia_31,Gerencia_32,Gerencia_33,Gerencia_34,Gerencia_35,Gerencia_36,Gerencia_37,Gerencia_4,Gerencia_5,Gerencia_6,Gerencia_7,Gerencia_8,Gerencia_9,SubCanal_1,SubCanal_10,SubCanal_11,SubCanal_12,SubCanal_13,SubCanal_14,SubCanal_15,SubCanal_16,SubCanal_17,SubCanal_18,SubCanal_19,SubCanal_2,SubCanal_20,SubCanal_22,SubCanal_23,SubCanal_24,SubCanal_25,SubCanal_26,SubCanal_27,SubCanal_3,SubCanal_4,SubCanal_5,SubCanal_6,SubCanal_7,SubCanal_8,SubCanal_9,TipoPoblacion_1,TipoPoblacion_2,Marca_29,Marca_39,Marca_40,Cupo_16,Cupo_20,Cupo_9,Estrato_1,Estrato_2,Estrato_3,Estrato_4,Estrato_5,Estrato_6,EF_0,EF_1
0,0.0,398013.65,16,1.36,-32036.23,839393.42,419696.71,1248479.17,416159.723333,1785985.13,446496.2825,2127202.34,425440.468,2709897.62,451649.603333,2.75,1.375,4.05,1.35,5.79,1.4475,6.91,1.382,8.9,1.483333,-44248.91,-22124.455,-61301.06,-20433.686667,-92151.35,-23037.8375,-92151.35,-18430.27,-128334.72,-21389.12,16,32.0,16.0,47.0,15.666667,67.0,16.75,81.0,16.2,97.0,16.166667,4981,0.330546,867,0.057535,2293,0.152167,11718,0.777623,4009,0.266043,5405,0.358683,1.0,1,0.0,0.0,6.75,3.375,9.818182,3.272727,12.9,3.225,15.444444,3.088889,18.625,3.104167,0.5,0.25,0.727273,0.242424,0.9,0.225,1.0,0.2,1.125,0.1875,0.062963,0.031481,0.093939,0.031313,0.114444,0.028611,0.12716,0.025432,0.143056,0.023843,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,1,0,0,0,0,1


# Xgboost

In [160]:
#train_data, test_data = train_test_split(df_todo_dummies, test_size=0.2)
X_train_dummie, X_test_dummie, y_train_dummie, y_test_dummie \
  = train_test_split( df_todo_dummies.drop(["label"], axis=1), df_todo_dummies["label"], test_size = 0.2)

print(X_train_dummie.shape)
print(X_test_dummie.shape)

(308641, 174)
(77161, 174)


In [161]:
classifier_xgb_1 = xgb.XGBClassifier(
  n_estimators=100,
  reg_lambda=1,
  gamma=0,
  max_depth=3,
  objective='binary:logistic',
  scale_pos_weight=12.882264042315859,
)
classifier_xgb_1.fit(X_train_dummie, y_train_dummie)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=12.882264042315859,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [163]:
# make predictions for test data    
print("Con scale_pos_weight=12.882264042315859")
predictions = classifier_xgb_1.predict_proba(X_test_dummie)[:, 1]
# evaluate predictions
print('ROC AUC Score',roc_auc_score(y_test_dummie, predictions))

Con scale_pos_weight=12.882264042315859
ROC AUC Score 0.9641562300652459


In [164]:
classifier_xgb_2 = xgb.XGBClassifier(
  n_estimators=100,
  reg_lambda=1,
  gamma=0,
  max_depth=3,
  objective='binary:logistic',
  #scale_pos_weight=12.882264042315859,
)
classifier_xgb_2.fit(X_train_dummie, y_train_dummie)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [165]:
# make predictions for test data    
predictions = classifier_xgb_2.predict_proba(X_test_dummie)[:, 1]
# evaluate predictions
print('ROC AUC Score',roc_auc_score(y_test_dummie, predictions))
# old  ROC AUC Score 0.8908498062214215

ROC AUC Score 0.963606739864431


## tunear xgboost

In [70]:
# model empty?
# classifier_xgb_cv = xgb.XGBClassifier()
  
# # define grid
# weights = [1, 10, 25, 50, 75, 99, 100, 1000]
# param_grid = dict(scale_pos_weight = weights)

# # define evaluation procedure
# cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)

# # define grid search
# grid = GridSearchCV(estimator = classifier_xgb_cv, 
#                     param_grid = param_grid,
#                     n_jobs = -1, 
#                     cv = cv, 
#                     scoring='roc_auc')

# # execute the grid search
# grid_result = grid.fit(X_train, y_train)

## LGBM

In [166]:
# LGBM COn dummies
classifier_lgbm_dummies = lgb.LGBMClassifier(
  n_estimators=100,
  reg_lambda=1,
  max_depth=-1,
  objective='binary',
  scale_pos_weight = 12.882264042315859,
)
classifier_lgbm_dummies.fit(X_train_dummie, y_train_dummie)
# make predictions for test data    
predictions = classifier_lgbm_dummies.predict_proba(X_test_dummie)[:, 1]
# evaluate predictions
print('ROC AUC Score',roc_auc_score(y_test_dummie, predictions))
# old ROC AUC Score 0.8972606476313741

ROC AUC Score 0.963284215190083


In [167]:
#train_data, test_data = train_test_split(df_todo, test_size=0.2)
X_train_todo, X_test_todo, y_train_todo, y_test_todo \
  = train_test_split( df_todo.drop(["label"], axis=1), df_todo["label"], test_size = 0.2)
print(X_train_todo.shape)
print(X_test_todo.shape)

(308641, 99)
(77161, 99)


In [168]:
classifier_lgbm_todo = lgb.LGBMClassifier(
  n_estimators=100,
  learning_rate = 0.05,
  reg_lambda=1,
  max_depth=-1,
  objective='binary',
  scale_pos_weight = 12.882264042315859,
)
classifier_lgbm_todo.fit(X_train_todo, y_train_todo, eval_metric='auc')

LGBMClassifier(learning_rate=0.05, objective='binary', reg_lambda=1,
               scale_pos_weight=12.882264042315859)

In [170]:
# make predictions for test data    
predictions = classifier_lgbm_todo.predict_proba(X_test_todo)[:, 1]
# evaluate predictions
print('ROC AUC Score',roc_auc_score(y_test_todo, predictions))
#ROC AUC Score 0.8932139912379119

ROC AUC Score 0.9658410281486521


## Tunear LightGBM

In [78]:
#model empty?
# classifier_lgbm_tuned =  lgb.LGBMClassifier()
  
# # define grid
# param_grid = {
#     'scale_pos_weight': [5,10,12],
#     'n_estimators': [100, 200],
#     'reg_lambda': [1],
#     'objective': ['binary'],
#     #'num_leaves': [15, 31, 63, 127, 255, 511, 1023, 2047, 4095],
#     'max_depth': [2,8,10,12,14], # make it not so deep
# }

# # define evaluation procedure
# cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)

# # define grid search
# grid = GridSearchCV(estimator = classifier_lgbm_tuned, 
#                     param_grid = param_grid,
#                     n_jobs = -1, 
#                     cv = cv, 
#                     scoring='roc_auc')

# # execute the grid search
# grid_result = grid.fit(X_train, y_train)
# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

## Entrenando al mejor modelo sobre toda la data

In [171]:
#X_all = df_todo_dummies.drop(["label"], axis=1)
#y_all =  df_todo_dummies["label"]
X_all = df_todo.drop(["label"], axis=1)
y_all = df_todo["label"]

X_all.shape

(385802, 99)

In [172]:
X_train_todo.shape

(308641, 99)

In [173]:
X_all.head(1)

Unnamed: 0,Marca,Cupo,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,num_clientes_Region,prop_clientes_Region,num_clientes_Gerencia,prop_clientes_Gerencia,num_clientes_SubCanal,prop_clientes_SubCanal,num_clientes_TipoPoblacion,prop_clientes_TipoPoblacion,num_clientes_Estrato,prop_clientes_Estrato,num_clientes_EF,prop_clientes_EF,Num_prom_ofr,ofertas_dadas,ofertas_tomadas,proporcion_tomadas,sum_2ofertas_dadas,mean_2ofertas_dadas,sum_3ofertas_dadas,mean_3ofertas_dadas,sum_4ofertas_dadas,mean_4ofertas_dadas,sum_5ofertas_dadas,mean_5ofertas_dadas,sum_6ofertas_dadas,mean_6ofertas_dadas,sum_2ofertas_tomadas,mean_2ofertas_tomadas,sum_3ofertas_tomadas,mean_3ofertas_tomadas,sum_4ofertas_tomadas,mean_4ofertas_tomadas,sum_5ofertas_tomadas,mean_5ofertas_tomadas,sum_6ofertas_tomadas,mean_6ofertas_tomadas,sum_2proporcion_tomadas,mean_2proporcion_tomadas,sum_3proporcion_tomadas,mean_3proporcion_tomadas,sum_4proporcion_tomadas,mean_4proporcion_tomadas,sum_5proporcion_tomadas,mean_5proporcion_tomadas,sum_6proporcion_tomadas,mean_6proporcion_tomadas
0,29,9,3,8,15,2,3,1,398013.65,16,1.36,-32036.23,839393.42,419696.71,1248479.17,416159.723333,1785985.13,446496.2825,2127202.34,425440.468,2709897.62,451649.603333,2.75,1.375,4.05,1.35,5.79,1.4475,6.91,1.382,8.9,1.483333,-44248.91,-22124.455,-61301.06,-20433.686667,-92151.35,-23037.8375,-92151.35,-18430.27,-128334.72,-21389.12,16,32.0,16.0,47.0,15.666667,67.0,16.75,81.0,16.2,97.0,16.166667,4981,0.330546,867,0.057535,2293,0.152167,11718,0.777623,4009,0.266043,5405,0.358683,1.0,1,0.0,0.0,6.75,3.375,9.818182,3.272727,12.9,3.225,15.444444,3.088889,18.625,3.104167,0.5,0.25,0.727273,0.242424,0.9,0.225,1.0,0.2,1.125,0.1875,0.062963,0.031481,0.093939,0.031313,0.114444,0.028611,0.12716,0.025432,0.143056,0.023843


In [174]:
X_train_todo.head(1)

Unnamed: 0,Marca,Cupo,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,num_clientes_Region,prop_clientes_Region,num_clientes_Gerencia,prop_clientes_Gerencia,num_clientes_SubCanal,prop_clientes_SubCanal,num_clientes_TipoPoblacion,prop_clientes_TipoPoblacion,num_clientes_Estrato,prop_clientes_Estrato,num_clientes_EF,prop_clientes_EF,Num_prom_ofr,ofertas_dadas,ofertas_tomadas,proporcion_tomadas,sum_2ofertas_dadas,mean_2ofertas_dadas,sum_3ofertas_dadas,mean_3ofertas_dadas,sum_4ofertas_dadas,mean_4ofertas_dadas,sum_5ofertas_dadas,mean_5ofertas_dadas,sum_6ofertas_dadas,mean_6ofertas_dadas,sum_2ofertas_tomadas,mean_2ofertas_tomadas,sum_3ofertas_tomadas,mean_3ofertas_tomadas,sum_4ofertas_tomadas,mean_4ofertas_tomadas,sum_5ofertas_tomadas,mean_5ofertas_tomadas,sum_6ofertas_tomadas,mean_6ofertas_tomadas,sum_2proporcion_tomadas,mean_2proporcion_tomadas,sum_3proporcion_tomadas,mean_3proporcion_tomadas,sum_4proporcion_tomadas,mean_4proporcion_tomadas,sum_5proporcion_tomadas,mean_5proporcion_tomadas,sum_6proporcion_tomadas,mean_6proporcion_tomadas
273707,40,16,3,10,7,2,2,0,47734.16,3,0.17,-780.67,80206.69,40103.345,145422.23,48474.076667,203877.6,50969.4,237385.69,47477.138,274306.67,45717.778333,0.27,0.135,0.5,0.166667,0.7,0.175,0.8,0.16,0.92,0.153333,-780.67,-390.335,-2192.83,-730.943333,-2192.83,-548.2075,-2192.83,-438.566,-2192.83,-365.471667,3,5.0,2.5,9.0,3.0,13.0,3.25,15.0,3.0,17.0,2.833333,4981,0.330546,750,0.049771,466,0.030924,11718,0.777623,6598,0.437853,9664,0.641317,1.0,2,0.0,0.0,2.0,1.0,2.0,0.666667,2.0,0.5,2.0,0.4,6.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.775558e-17,1.387779e-17,2.775558e-17,9.251859000000001e-18,2.775558e-17,6.938894e-18,2.775558e-17,5.551115e-18,2.775558e-17,4.625929e-18


In [176]:
classifier_prod = lgb.LGBMClassifier(
#   max_depth = 2, 
#   n_estimators = 200,
#   objective = 'binary',
#   reg_lambda = 1,
#   scale_pos_weight = 12
  n_estimators=100,
  learning_rate = 0.05,
  reg_lambda=1,
  max_depth=-1,
  objective='binary',
  scale_pos_weight = 12.882264042315859,  
)

classifier_prod.fit(X_all, y_all)

LGBMClassifier(learning_rate=0.05, objective='binary', reg_lambda=1,
               scale_pos_weight=12.882264042315859)

# Predecir en el test set

In [177]:
classifier_prod

LGBMClassifier(learning_rate=0.05, objective='binary', reg_lambda=1,
               scale_pos_weight=12.882264042315859)

In [178]:
df_test = pd.read_csv(fp_test, encoding='ISO-8859-1', sep=',')
df_test.head(1)

Unnamed: 0,Cliente,Marca,Cupo,Ejecuto_Promo
0,10,40,16,


In [179]:
# Agregar month_year = '201908', se asume
df_test['month_year'] = pd.Period('2019-08', 'M')
# Dropear Ejecuto_Promo, al final lo agrego otra vez
df_test = df_test.drop('Ejecuto_Promo', axis = 1)
print(df_test.isna().sum().sum())
display(df_test.head(1))

0


Unnamed: 0,Cliente,Marca,Cupo,month_year
0,10,40,16,2019-08


In [180]:
# Agregamos informacion de clientes sin enriquecer
df_t1 = pd.merge(
  df_test,
  df_clients[['Cliente', 'Region', 'Gerencia', 'SubCanal', 'TipoPoblacion', 'Estrato', 'EF']],
  on = 'Cliente',
  how = 'left'
)
print(df_t1.isna().sum().sum())
display(df_t1.head(1))

0


Unnamed: 0,Cliente,Marca,Cupo,month_year,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF
0,10,40,16,2019-08,4,21,15,2,2,0


In [181]:
# Agregamos informacion de ventas enriquecida
df_t2 = pd.merge(
  df_t1,
  df_ventas_enriched,
  on = ['month_year', 'Cliente'],
  how = 'left'  
)
print(df_t2.isna().sum().sum())
display(df_t2.head(1))

0


Unnamed: 0,Cliente,Marca,Cupo,month_year,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas
0,10,40,16,2019-08,4,21,15,2,2,0,818199.93,14,3.58,-43358.46,1521431.02,760715.51,2897990.58,965996.86,3580176.93,895044.2325,4375014.64,875002.928,5271955.73,878659.288333,6.66,3.33,12.19,4.063333,14.78,3.695,18.0,3.6,21.67,3.611667,-62775.26,-31387.63,-133840.46,-44613.486667,-161979.51,-40494.8775,-199488.19,-39897.638,-262790.86,-43798.476667,14,29.0,14.5,41.0,13.666667,50.0,12.5,63.0,12.6,81.0,13.5


In [182]:
# Agregamos informacion de prop_num 
cols_cat_clientes = ['Region', 'Gerencia', 'SubCanal', 'TipoPoblacion', 'Estrato', 'EF']
df_t3 = copy.deepcopy(df_t2)
for col_name in cols_cat_clientes:
  curr_df = obtener_prop_num(df_clients, col_name) 
  df_t3 = pd.merge(df_t3, curr_df, how = 'left', on = col_name)
  #print(df_t3.isna().sum().sum())
print(df_t3.isna().sum().sum())
display(df_t3.head(1))  

0


Unnamed: 0,Cliente,Marca,Cupo,month_year,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,num_clientes_Region,prop_clientes_Region,num_clientes_Gerencia,prop_clientes_Gerencia,num_clientes_SubCanal,prop_clientes_SubCanal,num_clientes_TipoPoblacion,prop_clientes_TipoPoblacion,num_clientes_Estrato,prop_clientes_Estrato,num_clientes_EF,prop_clientes_EF
0,10,40,16,2019-08,4,21,15,2,2,0,818199.93,14,3.58,-43358.46,1521431.02,760715.51,2897990.58,965996.86,3580176.93,895044.2325,4375014.64,875002.928,5271955.73,878659.288333,6.66,3.33,12.19,4.063333,14.78,3.695,18.0,3.6,21.67,3.611667,-62775.26,-31387.63,-133840.46,-44613.486667,-161979.51,-40494.8775,-199488.19,-39897.638,-262790.86,-43798.476667,14,29.0,14.5,41.0,13.666667,50.0,12.5,63.0,12.6,81.0,13.5,3531,0.234322,434,0.028801,2293,0.152167,11718,0.777623,6598,0.437853,9664,0.641317


In [183]:
# Agregar info de promo ofrecidas
df_t4 = pd.merge(
  df_t3,
  df_promo_ofrecidas,
  on = ['month_year', 'Cliente', 'Marca', 'Cupo'],
  how = 'left'
)
print(df_t4.isna().sum().sum())
display(df_t4.head(1))  

1025


Unnamed: 0,Cliente,Marca,Cupo,month_year,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,num_clientes_Region,prop_clientes_Region,num_clientes_Gerencia,prop_clientes_Gerencia,num_clientes_SubCanal,prop_clientes_SubCanal,num_clientes_TipoPoblacion,prop_clientes_TipoPoblacion,num_clientes_Estrato,prop_clientes_Estrato,num_clientes_EF,prop_clientes_EF,Num_prom_ofr
0,10,40,16,2019-08,4,21,15,2,2,0,818199.93,14,3.58,-43358.46,1521431.02,760715.51,2897990.58,965996.86,3580176.93,895044.2325,4375014.64,875002.928,5271955.73,878659.288333,6.66,3.33,12.19,4.063333,14.78,3.695,18.0,3.6,21.67,3.611667,-62775.26,-31387.63,-133840.46,-44613.486667,-161979.51,-40494.8775,-199488.19,-39897.638,-262790.86,-43798.476667,14,29.0,14.5,41.0,13.666667,50.0,12.5,63.0,12.6,81.0,13.5,3531,0.234322,434,0.028801,2293,0.152167,11718,0.777623,6598,0.437853,9664,0.641317,2.0


In [184]:
# Llena con 0 los NaN en Num_prom_ofr porque significa que no se ofrecieron descuentos
df_t4['Num_prom_ofr'] = df_t4['Num_prom_ofr'].fillna(0)
print(df_t4.isna().sum().sum())
display(df_t4.head(1))  

0


Unnamed: 0,Cliente,Marca,Cupo,month_year,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,num_clientes_Region,prop_clientes_Region,num_clientes_Gerencia,prop_clientes_Gerencia,num_clientes_SubCanal,prop_clientes_SubCanal,num_clientes_TipoPoblacion,prop_clientes_TipoPoblacion,num_clientes_Estrato,prop_clientes_Estrato,num_clientes_EF,prop_clientes_EF,Num_prom_ofr
0,10,40,16,2019-08,4,21,15,2,2,0,818199.93,14,3.58,-43358.46,1521431.02,760715.51,2897990.58,965996.86,3580176.93,895044.2325,4375014.64,875002.928,5271955.73,878659.288333,6.66,3.33,12.19,4.063333,14.78,3.695,18.0,3.6,21.67,3.611667,-62775.26,-31387.63,-133840.46,-44613.486667,-161979.51,-40494.8775,-199488.19,-39897.638,-262790.86,-43798.476667,14,29.0,14.5,41.0,13.666667,50.0,12.5,63.0,12.6,81.0,13.5,3531,0.234322,434,0.028801,2293,0.152167,11718,0.777623,6598,0.437853,9664,0.641317,2.0


In [185]:
f_prop = '../tablones/data_grupo2_proporcion.pkl' 
df_prop_enriched = load_pickle(f_prop)
df_prop_enriched.head(1)

# Cruzamos df_prop_enriched(enriched) 
df_t5 = pd.merge(
  df_t4,
  df_prop_enriched,
  on = ['month_year', 'Cliente'],
  how = 'left'
)
print(df_t5.isna().sum()/len(df_t5)*100)
print(df_t5.isna().sum().sum())
display(df_t5.head(1))

Cliente                        0.0
Marca                          0.0
Cupo                           0.0
month_year                     0.0
Region                         0.0
Gerencia                       0.0
SubCanal                       0.0
TipoPoblacion                  0.0
Estrato                        0.0
EF                             0.0
Nr_sum                         0.0
numero_ventas                  0.0
Hl_sum                         0.0
Dcto_sum                       0.0
sum_2Nr                        0.0
mean_2Nr                       0.0
sum_3Nr                        0.0
mean_3Nr                       0.0
sum_4Nr                        0.0
mean_4Nr                       0.0
sum_5Nr                        0.0
mean_5Nr                       0.0
sum_6Nr                        0.0
mean_6Nr                       0.0
sum_2Hl                        0.0
mean_2Hl                       0.0
sum_3Hl                        0.0
mean_3Hl                       0.0
sum_4Hl             

Unnamed: 0,Cliente,Marca,Cupo,month_year,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,num_clientes_Region,prop_clientes_Region,num_clientes_Gerencia,prop_clientes_Gerencia,num_clientes_SubCanal,prop_clientes_SubCanal,num_clientes_TipoPoblacion,prop_clientes_TipoPoblacion,num_clientes_Estrato,prop_clientes_Estrato,num_clientes_EF,prop_clientes_EF,Num_prom_ofr,ofertas_dadas,ofertas_tomadas,proporcion_tomadas,sum_2ofertas_dadas,mean_2ofertas_dadas,sum_3ofertas_dadas,mean_3ofertas_dadas,sum_4ofertas_dadas,mean_4ofertas_dadas,sum_5ofertas_dadas,mean_5ofertas_dadas,sum_6ofertas_dadas,mean_6ofertas_dadas,sum_2ofertas_tomadas,mean_2ofertas_tomadas,sum_3ofertas_tomadas,mean_3ofertas_tomadas,sum_4ofertas_tomadas,mean_4ofertas_tomadas,sum_5ofertas_tomadas,mean_5ofertas_tomadas,sum_6ofertas_tomadas,mean_6ofertas_tomadas,sum_2proporcion_tomadas,mean_2proporcion_tomadas,sum_3proporcion_tomadas,mean_3proporcion_tomadas,sum_4proporcion_tomadas,mean_4proporcion_tomadas,sum_5proporcion_tomadas,mean_5proporcion_tomadas,sum_6proporcion_tomadas,mean_6proporcion_tomadas
0,10,40,16,2019-08,4,21,15,2,2,0,818199.93,14,3.58,-43358.46,1521431.02,760715.51,2897990.58,965996.86,3580176.93,895044.2325,4375014.64,875002.928,5271955.73,878659.288333,6.66,3.33,12.19,4.063333,14.78,3.695,18.0,3.6,21.67,3.611667,-62775.26,-31387.63,-133840.46,-44613.486667,-161979.51,-40494.8775,-199488.19,-39897.638,-262790.86,-43798.476667,14,29.0,14.5,41.0,13.666667,50.0,12.5,63.0,12.6,81.0,13.5,3531,0.234322,434,0.028801,2293,0.152167,11718,0.777623,6598,0.437853,9664,0.641317,2.0,5,1.0,0.2,8.0,4.0,8.0,2.666667,10.0,2.5,12.0,2.4,16.0,2.666667,2.0,1.0,2.0,0.666667,2.0,0.5,2.0,0.4,2.0,0.333333,0.533333,0.266667,0.533333,0.177778,0.533333,0.133333,0.533333,0.106667,0.533333,0.088889


In [186]:
df_t5.shape

(16870, 101)

In [187]:
# Crear dummies
# COLS_ONEHOT = [
#   'Region',
#   'Gerencia',
#   'SubCanal',
#   'TipoPoblacion',
#   'Marca',
#   'Cupo',
#   'Estrato',
#   'EF'
# ]

# df_t5_dummies = copy.deepcopy(df_t5)
# for col_name in COLS_ONEHOT:
#   df_t5_dummies[col_name] = df_t5_dummies[col_name].astype(str)

# df_t5_dummies = pd.get_dummies(df_t5_dummies, prefix=COLS_ONEHOT)
# print(df_t5_dummies.isna().sum().sum())
# display(df_t5_dummies.head(1))

In [188]:
#df_t5_dummies.shape

## Correcciones manuales para que coincidan las columnas con lo que se entreno

In [189]:
#dif_set = set(X_train.columns).difference(set(df_t5_dummies.columns))
dif_set = set(X_train_todo.columns).difference(set(df_t5.columns))
print(dif_set)

set()


In [190]:
# Agregar columnas manualmente
for elem in dif_set:
  #df_t5_dummies[elem] = 0
    continue

In [191]:
#set(X_train.columns).difference(set(df_t5_dummies.columns))
set(X_train_todo.columns).difference(set(df_t5.columns))

set()

In [192]:
set(df_t5.columns).difference(set(X_train_todo.columns))

{'Cliente', 'month_year'}

In [193]:
# df_t5_dummies = df_t5_dummies.drop('month_year', axis = 1)
# print(set(X_train.columns).difference(set(df_t5_dummies.columns)))
# print(set(df_t5_dummies.columns).difference(set(X_train.columns)))
df_t5 = df_t5.drop('month_year', axis = 1)
print(set(X_train_todo.columns).difference(set(df_t5.columns)))
print(set(df_t5.columns).difference(set(X_train_todo.columns)))

set()
{'Cliente'}


In [194]:
#df_t5_dummies = df_t5_dummies.drop('Cliente', axis=1)
df_t5 = df_t5.drop('Cliente', axis=1)

## Prediccion

In [197]:
#pred = classifier_lgbm_dummies.predict_proba(df_t4_dummies)
#pred = classifier_prod.predict_proba(df_t5_dummies)
pred = classifier_prod.predict_proba(df_t5)

In [198]:
print(pred)

[[1.25689529e-01 8.74310471e-01]
 [3.89162365e-01 6.10837635e-01]
 [9.99510791e-01 4.89208833e-04]
 ...
 [9.99510791e-01 4.89208833e-04]
 [9.99510791e-01 4.89208833e-04]
 [9.99510791e-01 4.89208833e-04]]


In [199]:
lista_pred_compro = [x for _, x in pred]
len(lista_pred_compro)

16870

In [200]:
df_output = pd.read_csv(fp_test, encoding='ISO-8859-1', sep=',')
df_output.tail(1)

Unnamed: 0,Cliente,Marca,Cupo,Ejecuto_Promo
16869,15064,29,9,


In [201]:
df_output['Ejecuto_Promo'] = lista_pred_compro
print(df_output.isna().sum().sum())
display(df_output.head(1))

0


Unnamed: 0,Cliente,Marca,Cupo,Ejecuto_Promo
0,10,40,16,0.87431


In [202]:
import time
fp = '../output/' +  str(time.asctime()) + '.csv'
fp = '../output/' + 'asddd.csv'
fp

'../output/asddd.csv'

In [203]:
# Almacenar 
df_output = df_output.reset_index(drop=True)
display(df_output.head(1))


Unnamed: 0,Cliente,Marca,Cupo,Ejecuto_Promo
0,10,40,16,0.87431


In [204]:
df_output.to_csv(fp, sep = ',', columns = ['Cliente', 'Marca', 'Cupo', 'Ejecuto_Promo'], index=False)