In [279]:
import pandas as pd
import numpy as np
import copy
from datetime import datetime
import pickle 
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, RepeatedStratifiedKFold, cross_val_score
from sklearn.metrics import mean_squared_error, roc_auc_score
import ubjson


from sklearn.model_selection import GridSearchCV
import gzip


import lightgbm as lgb

pd.options.display.max_columns = 200
pd.options.display.max_rows = 200

In [280]:
fp_clients = '../data/clients_attributes.csv'
fp_sales = '../data/sales.csv'

fp_active_promos = '../data/active_promos.csv'
fp_executed_promos = '../data/executed_promos.csv'
fp_test = '../data/test.csv'

df_active_promos = pd.read_csv(fp_active_promos, encoding='ISO-8859-1', sep=',')
df_clients = pd.read_csv(fp_clients, encoding='ISO-8859-1', sep=',')
df_executed_promos = pd.read_csv(fp_executed_promos, encoding='ISO-8859-1', sep=',')
df_sales = pd.read_csv(fp_sales, encoding='ISO-8859-1', sep=',')
df_test = pd.read_csv(fp_test, encoding='ISO-8859-1', sep=',')

In [281]:
# Crear periodo, como el mes de la mediana entre fecha_desde y fecha_hasta
df_active_promos['fecha_desde_dt'] \
  = pd.to_datetime(df_active_promos['Fecha_Desde'], format = '%Y-%m-%d')
df_active_promos['fecha_hasta_dt'] \
  = pd.to_datetime(df_active_promos['Fecha_Hasta'], format = '%Y-%m-%d')

df_active_promos['mid_date'] \
  = df_active_promos['fecha_desde_dt'] + \
    (df_active_promos['fecha_hasta_dt'] - df_active_promos['fecha_desde_dt']) /2

df_active_promos['periodo'] = df_active_promos['mid_date'].apply(lambda dt: dt.replace(day = 1))

df_active_promos['month_year'] = df_active_promos['periodo'].dt.to_period('M')
# Dropear columnas innecesarias
df_active_promos = df_active_promos.drop(['Fecha_Desde', 'Fecha_Hasta', 'fecha_desde_dt',
                                          'fecha_hasta_dt', 'mid_date', 'periodo'], axis = 1)
df_active_promos.head(1)

Unnamed: 0,CodigoDC,Marca,Cupo,Cliente,month_year
0,297601,29,9,3213,2018-08


In [282]:
fechas_active_promos = list(df_active_promos['month_year'].unique())
fechas_active_promos

[Period('2018-08', 'M'),
 Period('2018-09', 'M'),
 Period('2018-10', 'M'),
 Period('2018-11', 'M'),
 Period('2018-12', 'M'),
 Period('2019-01', 'M'),
 Period('2019-02', 'M'),
 Period('2019-03', 'M'),
 Period('2019-04', 'M'),
 Period('2019-07', 'M'),
 Period('2019-05', 'M'),
 Period('2019-06', 'M'),
 Period('2019-08', 'M'),
 Period('2019-09', 'M')]

In [283]:
# Quitare manualmente los active_promos de septiembre 2019 debido a que no hay match con executed_promos
#df_active_promos = df_active_promos[df_active_promos['month_year']!=fechas_active_promos[-1]]
df_active_promos = df_active_promos[df_active_promos['month_year']!= '201909']

In [284]:
df_active_promos['month_year'].unique()

<PeriodArray>
['2018-08', '2018-09', '2018-10', '2018-11', '2018-12', '2019-01', '2019-02',
 '2019-03', '2019-04', '2019-07', '2019-05', '2019-06', '2019-08']
Length: 13, dtype: period[M]

In [285]:
print(df_executed_promos.isna().sum())
# Agregar label a executed_promos
df_executed_promos['label'] = 1
display(df_executed_promos.head(1))

CodigoDC    0
Cliente     0
Marca       0
Cupo        0
dtype: int64


Unnamed: 0,CodigoDC,Cliente,Marca,Cupo,label
0,297601,8410,29,9,1


In [286]:
print(f"Numero de registros en active promos: {len(df_active_promos)}")
df_m1 = pd.merge(
  df_active_promos,
  df_executed_promos,
  on = ['CodigoDC', 'Cliente', 'Marca', 'Cupo'],
  how = 'left')
print(f"Numero de registros en df_m1: {len(df_m1)}")

Numero de registros en active promos: 385802
Numero de registros en df_m1: 385802


In [287]:
# Llenamos NaN con 0, 0 implica que el cliente no acepto la promocion
df_m1 = df_m1.fillna(0)

In [288]:
fechas_m1 = list(df_m1['month_year'].unique())
fechas_m1.sort()
fechas_m1

[Period('2018-08', 'M'),
 Period('2018-09', 'M'),
 Period('2018-10', 'M'),
 Period('2018-11', 'M'),
 Period('2018-12', 'M'),
 Period('2019-01', 'M'),
 Period('2019-02', 'M'),
 Period('2019-03', 'M'),
 Period('2019-04', 'M'),
 Period('2019-05', 'M'),
 Period('2019-06', 'M'),
 Period('2019-07', 'M'),
 Period('2019-08', 'M')]

In [289]:
# Agregamos informacion de clientes DIRECTA
df_clients.head(1)

Unnamed: 0,Cliente,FechaAltaCliente,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF
0,1,2009-03-03,4,21,12,1,1,1


In [290]:
df_m2 = pd.merge(
  df_m1, 
  df_clients[['Cliente', 'Region', 'Gerencia', 'SubCanal', 'TipoPoblacion', 'Estrato', 'EF']],
  on = 'Cliente',
  how = 'left'
)
print(df_m2.isna().sum())
display(df_m2.head(1))

CodigoDC         0
Marca            0
Cupo             0
Cliente          0
month_year       0
label            0
Region           0
Gerencia         0
SubCanal         0
TipoPoblacion    0
Estrato          0
EF               0
dtype: int64


Unnamed: 0,CodigoDC,Marca,Cupo,Cliente,month_year,label,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF
0,297601,29,9,3213,2018-08,0.0,3,8,15,2,3,1


In [291]:
# Cargamos pickle
f_ventas = '../tablones/data_grupo_3.pkl'
def load_pickle(fp):
  with open(fp, 'rb') as f:
    df = pickle.load(f)
  return df
df_ventas_enriched = load_pickle(f_ventas)
print(df_ventas_enriched.isna().sum().sum())
# Creamos month_year
df_ventas_enriched['month_year'] = df_ventas_enriched['periodo'].dt.to_period('M')
df_ventas_enriched = df_ventas_enriched.drop('periodo', axis=1)
display(df_ventas_enriched.head(1))

0


Unnamed: 0,Cliente,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,month_year
0,1,591975.69,13,2.31,-6785.54,680049.570526,340024.785263,1032291.12,344097.04,1389348.0,347337.026471,1764272.0,352854.43425,2180481.0,363413.575889,2.491579,1.245789,3.772778,1.257593,5.072353,1.268088,6.429375,1.285875,7.932,1.322,-47691.214211,-23845.607105,-72644.81,-24214.936667,-98924.870588,-24731.217647,-127734.36625,-25546.87325,-160123.783333,-26687.297222,13,17.736842,8.868421,26.722222,8.907407,35.882353,8.970588,45.0625,9.0125,54.933333,9.155556,2018-01


In [292]:
# Agregamos data de ventas enriquecida
df_m3 = pd.merge(
  df_m2,
  df_ventas_enriched,
  on = ['month_year', 'Cliente'],
  how = 'left'
)
print(df_m3.isna().sum()/len(df_m3)*100)
display(df_m3.head(1))

CodigoDC               0.0
Marca                  0.0
Cupo                   0.0
Cliente                0.0
month_year             0.0
label                  0.0
Region                 0.0
Gerencia               0.0
SubCanal               0.0
TipoPoblacion          0.0
Estrato                0.0
EF                     0.0
Nr_sum                 0.0
numero_ventas          0.0
Hl_sum                 0.0
Dcto_sum               0.0
sum_2Nr                0.0
mean_2Nr               0.0
sum_3Nr                0.0
mean_3Nr               0.0
sum_4Nr                0.0
mean_4Nr               0.0
sum_5Nr                0.0
mean_5Nr               0.0
sum_6Nr                0.0
mean_6Nr               0.0
sum_2Hl                0.0
mean_2Hl               0.0
sum_3Hl                0.0
mean_3Hl               0.0
sum_4Hl                0.0
mean_4Hl               0.0
sum_5Hl                0.0
mean_5Hl               0.0
sum_6Hl                0.0
mean_6Hl               0.0
sum_2Dcto              0.0
m

Unnamed: 0,CodigoDC,Marca,Cupo,Cliente,month_year,label,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas
0,297601,29,9,3213,2018-08,0.0,3,8,15,2,3,1,398013.65,16,1.36,-32036.23,839393.42,419696.71,1248479.17,416159.723333,1785985.13,446496.2825,2127202.34,425440.468,2709897.62,451649.603333,2.75,1.375,4.05,1.35,5.79,1.4475,6.91,1.382,8.9,1.483333,-44248.91,-22124.455,-61301.06,-20433.686667,-92151.35,-23037.8375,-92151.35,-18430.27,-128334.72,-21389.12,16,32.0,16.0,47.0,15.666667,67.0,16.75,81.0,16.2,97.0,16.166667


In [293]:
# Cargar data de prom ofrecidas
# Arreglar fecha para poder cruzar sin problemas
# Quitar registros correspondientes a septiembre 2019
# Conversion de tipos para poder cruzar
f_promo_ofrecidas = '../tablones/num_prom_ofr.pkl'
def load_pickle(fp):
  with open(fp, 'rb') as f:
    df = pickle.load(f)
  return df
df_promo_ofrecidas = load_pickle(f_promo_ofrecidas)
df_promo_ofrecidas['Date_Desde'] \
  = pd.to_datetime(df_promo_ofrecidas['Date_Desde'], format= "%Y-%m")
# Convertir Date_Desde a periodo -> month_year
df_promo_ofrecidas['month_year'] = df_promo_ofrecidas['Date_Desde'].dt.to_period('M')
# Dropear columnas innecesarias
df_promo_ofrecidas = df_promo_ofrecidas.drop('Date_Desde', axis = 1)
# QUITAR DATA DE SEPTIEMBRE PORQUE ESTO NO SE USA PARA ENTRENAR POR VARIAS RAZONES
df_promo_ofrecidas = df_promo_ofrecidas[df_promo_ofrecidas['month_year'] != '201909']
# Convertir columnas a int para poder cruzar
df_promo_ofrecidas['Marca'] = df_promo_ofrecidas['Marca'].astype(int)
df_promo_ofrecidas['Cupo'] = df_promo_ofrecidas['Cupo'].astype(int)
df_promo_ofrecidas['Cliente'] = df_promo_ofrecidas['Cliente'].astype(int)

df_promo_ofrecidas.head(1)

Unnamed: 0,Marca,Cupo,Cliente,Num_prom_ofr,month_year
0,29,9,10358,1,2018-08


In [294]:
# Cruzamos df_promo_ofrecidas(enriched) con df_m4
df_m4 = pd.merge(
  df_m3,
  df_promo_ofrecidas,
  on = ['month_year', 'Cliente', 'Marca', 'Cupo'],
  how = 'left'
)
print(df_m4.isna().sum()/len(df_m4)*100)
print(df_m4.isna().sum().sum())
display(df_m4.head(1))

CodigoDC               0.000000
Marca                  0.000000
Cupo                   0.000000
Cliente                0.000000
month_year             0.000000
label                  0.000000
Region                 0.000000
Gerencia               0.000000
SubCanal               0.000000
TipoPoblacion          0.000000
Estrato                0.000000
EF                     0.000000
Nr_sum                 0.000000
numero_ventas          0.000000
Hl_sum                 0.000000
Dcto_sum               0.000000
sum_2Nr                0.000000
mean_2Nr               0.000000
sum_3Nr                0.000000
mean_3Nr               0.000000
sum_4Nr                0.000000
mean_4Nr               0.000000
sum_5Nr                0.000000
mean_5Nr               0.000000
sum_6Nr                0.000000
mean_6Nr               0.000000
sum_2Hl                0.000000
mean_2Hl               0.000000
sum_3Hl                0.000000
mean_3Hl               0.000000
sum_4Hl                0.000000
mean_4Hl

Unnamed: 0,CodigoDC,Marca,Cupo,Cliente,month_year,label,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,Num_prom_ofr
0,297601,29,9,3213,2018-08,0.0,3,8,15,2,3,1,398013.65,16,1.36,-32036.23,839393.42,419696.71,1248479.17,416159.723333,1785985.13,446496.2825,2127202.34,425440.468,2709897.62,451649.603333,2.75,1.375,4.05,1.35,5.79,1.4475,6.91,1.382,8.9,1.483333,-44248.91,-22124.455,-61301.06,-20433.686667,-92151.35,-23037.8375,-92151.35,-18430.27,-128334.72,-21389.12,16,32.0,16.0,47.0,15.666667,67.0,16.75,81.0,16.2,97.0,16.166667,1.0


In [295]:
# Llenamos los nan de esta columna con 0 porque significa que no se les ha ofrecido productos para esa
# combinacion de marca, cupo, fecha, cliente
df_m4['Num_prom_ofr'] = df_m4['Num_prom_ofr'].fillna(0)
print(df_m4.isna().sum()[df_m4.isna().sum() > 0])
display(df_m4.head(1))

Series([], dtype: int64)


Unnamed: 0,CodigoDC,Marca,Cupo,Cliente,month_year,label,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,Num_prom_ofr
0,297601,29,9,3213,2018-08,0.0,3,8,15,2,3,1,398013.65,16,1.36,-32036.23,839393.42,419696.71,1248479.17,416159.723333,1785985.13,446496.2825,2127202.34,425440.468,2709897.62,451649.603333,2.75,1.375,4.05,1.35,5.79,1.4475,6.91,1.382,8.9,1.483333,-44248.91,-22124.455,-61301.06,-20433.686667,-92151.35,-23037.8375,-92151.35,-18430.27,-128334.72,-21389.12,16,32.0,16.0,47.0,15.666667,67.0,16.75,81.0,16.2,97.0,16.166667,1.0


In [296]:
df_m4.shape

(385802, 58)

In [297]:
f_prop = '../tablones/data_grupo2_proporcion.pkl' 
df_prop_enriched = load_pickle(f_prop)
df_prop_enriched.head(1)

Unnamed: 0,month_year,Cliente,ofertas_dadas
0,2018-08,1,0


In [298]:
# Cruzamos df_prop_enriched(enriched) 
df_m5 = pd.merge(
  df_m4,
  df_prop_enriched,
  on = ['month_year', 'Cliente'],
  how = 'left'
)
print(df_m5.isna().sum()/len(df_m5)*100)
print(df_m5.isna().sum().sum())
display(df_m5.head(1))

CodigoDC               0.0
Marca                  0.0
Cupo                   0.0
Cliente                0.0
month_year             0.0
label                  0.0
Region                 0.0
Gerencia               0.0
SubCanal               0.0
TipoPoblacion          0.0
Estrato                0.0
EF                     0.0
Nr_sum                 0.0
numero_ventas          0.0
Hl_sum                 0.0
Dcto_sum               0.0
sum_2Nr                0.0
mean_2Nr               0.0
sum_3Nr                0.0
mean_3Nr               0.0
sum_4Nr                0.0
mean_4Nr               0.0
sum_5Nr                0.0
mean_5Nr               0.0
sum_6Nr                0.0
mean_6Nr               0.0
sum_2Hl                0.0
mean_2Hl               0.0
sum_3Hl                0.0
mean_3Hl               0.0
sum_4Hl                0.0
mean_4Hl               0.0
sum_5Hl                0.0
mean_5Hl               0.0
sum_6Hl                0.0
mean_6Hl               0.0
sum_2Dcto              0.0
m

Unnamed: 0,CodigoDC,Marca,Cupo,Cliente,month_year,label,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,Num_prom_ofr,ofertas_dadas
0,297601,29,9,3213,2018-08,0.0,3,8,15,2,3,1,398013.65,16,1.36,-32036.23,839393.42,419696.71,1248479.17,416159.723333,1785985.13,446496.2825,2127202.34,425440.468,2709897.62,451649.603333,2.75,1.375,4.05,1.35,5.79,1.4475,6.91,1.382,8.9,1.483333,-44248.91,-22124.455,-61301.06,-20433.686667,-92151.35,-23037.8375,-92151.35,-18430.27,-128334.72,-21389.12,16,32.0,16.0,47.0,15.666667,67.0,16.75,81.0,16.2,97.0,16.166667,1.0,1


In [299]:
df_m5['Cliente'].value_counts()

5226     61
581      61
553      59
4138     59
9263     59
         ..
13724     1
14036     1
7650      1
11596     1
6131      1
Name: Cliente, Length: 13549, dtype: int64

In [300]:
cod_cliente = 9263
month_year = pd.Period('2019-08', 'M')
cond_1  = df_m5['Cliente'] == cod_cliente
cond_2 = df_m5['month_year'] == month_year
df_m5[cond_1 & cond_2]

Unnamed: 0,CodigoDC,Marca,Cupo,Cliente,month_year,label,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,Num_prom_ofr,ofertas_dadas
255518,639304,39,20,9263,2019-08,0.0,1,20,16,1,2,1,4331482.58,22,16.08,-409918.4,10509738.23,5254869.115,18500670.13,6166890.0,27340901.43,6835225.0,33352599.22,6670519.844,38266193.57,6377699.0,39.61,19.805,66.38,22.126667,96.73,24.1825,118.95,23.79,137.31,22.885,-857317.72,-428658.86,-1277090.92,-425696.973333,-1483938.71,-370984.6775,-1714718.0,-342943.6,-1975896.41,-329316.068333,22,45.0,22.5,69.0,23.0,92.0,23.0,117.0,23.4,142.0,23.666667,4.0,9
255519,639304,40,16,9263,2019-08,0.0,1,20,16,1,2,1,4331482.58,22,16.08,-409918.4,10509738.23,5254869.115,18500670.13,6166890.0,27340901.43,6835225.0,33352599.22,6670519.844,38266193.57,6377699.0,39.61,19.805,66.38,22.126667,96.73,24.1825,118.95,23.79,137.31,22.885,-857317.72,-428658.86,-1277090.92,-425696.973333,-1483938.71,-370984.6775,-1714718.0,-342943.6,-1975896.41,-329316.068333,22,45.0,22.5,69.0,23.0,92.0,23.0,117.0,23.4,142.0,23.666667,4.0,9
258749,650750,39,20,9263,2019-08,1.0,1,20,16,1,2,1,4331482.58,22,16.08,-409918.4,10509738.23,5254869.115,18500670.13,6166890.0,27340901.43,6835225.0,33352599.22,6670519.844,38266193.57,6377699.0,39.61,19.805,66.38,22.126667,96.73,24.1825,118.95,23.79,137.31,22.885,-857317.72,-428658.86,-1277090.92,-425696.973333,-1483938.71,-370984.6775,-1714718.0,-342943.6,-1975896.41,-329316.068333,22,45.0,22.5,69.0,23.0,92.0,23.0,117.0,23.4,142.0,23.666667,4.0,9
258750,650750,40,16,9263,2019-08,1.0,1,20,16,1,2,1,4331482.58,22,16.08,-409918.4,10509738.23,5254869.115,18500670.13,6166890.0,27340901.43,6835225.0,33352599.22,6670519.844,38266193.57,6377699.0,39.61,19.805,66.38,22.126667,96.73,24.1825,118.95,23.79,137.31,22.885,-857317.72,-428658.86,-1277090.92,-425696.973333,-1483938.71,-370984.6775,-1714718.0,-342943.6,-1975896.41,-329316.068333,22,45.0,22.5,69.0,23.0,92.0,23.0,117.0,23.4,142.0,23.666667,4.0,9
281901,647587,29,9,9263,2019-08,0.0,1,20,16,1,2,1,4331482.58,22,16.08,-409918.4,10509738.23,5254869.115,18500670.13,6166890.0,27340901.43,6835225.0,33352599.22,6670519.844,38266193.57,6377699.0,39.61,19.805,66.38,22.126667,96.73,24.1825,118.95,23.79,137.31,22.885,-857317.72,-428658.86,-1277090.92,-425696.973333,-1483938.71,-370984.6775,-1714718.0,-342943.6,-1975896.41,-329316.068333,22,45.0,22.5,69.0,23.0,92.0,23.0,117.0,23.4,142.0,23.666667,1.0,9
329544,637396,40,16,9263,2019-08,0.0,1,20,16,1,2,1,4331482.58,22,16.08,-409918.4,10509738.23,5254869.115,18500670.13,6166890.0,27340901.43,6835225.0,33352599.22,6670519.844,38266193.57,6377699.0,39.61,19.805,66.38,22.126667,96.73,24.1825,118.95,23.79,137.31,22.885,-857317.72,-428658.86,-1277090.92,-425696.973333,-1483938.71,-370984.6775,-1714718.0,-342943.6,-1975896.41,-329316.068333,22,45.0,22.5,69.0,23.0,92.0,23.0,117.0,23.4,142.0,23.666667,4.0,9
332530,642745,39,20,9263,2019-08,0.0,1,20,16,1,2,1,4331482.58,22,16.08,-409918.4,10509738.23,5254869.115,18500670.13,6166890.0,27340901.43,6835225.0,33352599.22,6670519.844,38266193.57,6377699.0,39.61,19.805,66.38,22.126667,96.73,24.1825,118.95,23.79,137.31,22.885,-857317.72,-428658.86,-1277090.92,-425696.973333,-1483938.71,-370984.6775,-1714718.0,-342943.6,-1975896.41,-329316.068333,22,45.0,22.5,69.0,23.0,92.0,23.0,117.0,23.4,142.0,23.666667,4.0,9
332531,642745,40,16,9263,2019-08,0.0,1,20,16,1,2,1,4331482.58,22,16.08,-409918.4,10509738.23,5254869.115,18500670.13,6166890.0,27340901.43,6835225.0,33352599.22,6670519.844,38266193.57,6377699.0,39.61,19.805,66.38,22.126667,96.73,24.1825,118.95,23.79,137.31,22.885,-857317.72,-428658.86,-1277090.92,-425696.973333,-1483938.71,-370984.6775,-1714718.0,-342943.6,-1975896.41,-329316.068333,22,45.0,22.5,69.0,23.0,92.0,23.0,117.0,23.4,142.0,23.666667,4.0,9
338258,637396,39,20,9263,2019-08,0.0,1,20,16,1,2,1,4331482.58,22,16.08,-409918.4,10509738.23,5254869.115,18500670.13,6166890.0,27340901.43,6835225.0,33352599.22,6670519.844,38266193.57,6377699.0,39.61,19.805,66.38,22.126667,96.73,24.1825,118.95,23.79,137.31,22.885,-857317.72,-428658.86,-1277090.92,-425696.973333,-1483938.71,-370984.6775,-1714718.0,-342943.6,-1975896.41,-329316.068333,22,45.0,22.5,69.0,23.0,92.0,23.0,117.0,23.4,142.0,23.666667,4.0,9


In [301]:
df_extra = copy.deepcopy(df_m5)
df_extra = df_extra[['Marca', 'Cupo', 'Cliente', 'month_year']]
df_extra['producto'] = df_extra['Marca'].astype(str) + '_' + df_extra['Cupo'].astype(str)
df_extra = df_extra.drop(['Marca', 'Cupo'], axis = 1)
#df_extra['cuenta'] = 1
df_extra.head(5)

Unnamed: 0,Cliente,month_year,producto
0,3213,2018-08,29_9
1,3795,2018-08,29_9
2,11816,2018-08,29_9
3,8444,2018-08,40_16
4,8488,2018-08,40_16


In [302]:
# df_extra.groupby(['Cliente', 'month_year']).agg({
  
# })
#df_extra.pivot(index=['Cliente', 'month_year'], columns='producto', values='cuenta')
df_extra = df_extra.groupby(['Cliente', 'month_year', 'producto']).size().unstack(fill_value=0)\
                           .reset_index()
df_extra['29_9']  = df_extra['29_9'].apply(lambda x: 1 if x>0 else 0)
df_extra['39_20'] = df_extra['39_20'].apply(lambda x:1 if x>0 else 0)
df_extra['40_16'] = df_extra['40_16'].apply(lambda x:1 if x>0 else 0)

In [303]:
df_extra[df_extra['Cliente'] == cod_cliente]

producto,Cliente,month_year,29_9,39_20,40_16
52620,9263,2018-08,0,0,1
52621,9263,2018-09,0,1,1
52622,9263,2018-10,0,1,1
52623,9263,2018-11,0,1,1
52624,9263,2018-12,0,0,1
52625,9263,2019-01,0,1,1
52626,9263,2019-03,0,1,1
52627,9263,2019-04,1,0,0
52628,9263,2019-07,0,1,1
52629,9263,2019-08,1,1,1


In [304]:
df_m5.head(1)

Unnamed: 0,CodigoDC,Marca,Cupo,Cliente,month_year,label,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,Num_prom_ofr,ofertas_dadas
0,297601,29,9,3213,2018-08,0.0,3,8,15,2,3,1,398013.65,16,1.36,-32036.23,839393.42,419696.71,1248479.17,416159.723333,1785985.13,446496.2825,2127202.34,425440.468,2709897.62,451649.603333,2.75,1.375,4.05,1.35,5.79,1.4475,6.91,1.382,8.9,1.483333,-44248.91,-22124.455,-61301.06,-20433.686667,-92151.35,-23037.8375,-92151.35,-18430.27,-128334.72,-21389.12,16,32.0,16.0,47.0,15.666667,67.0,16.75,81.0,16.2,97.0,16.166667,1.0,1


In [305]:
# Dropear duplicates por Cliente / month_year
df_m5 = df_m5.drop_duplicates(subset=['Cliente', 'month_year'], keep='last')
df_m5.head(1)

Unnamed: 0,CodigoDC,Marca,Cupo,Cliente,month_year,label,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,Num_prom_ofr,ofertas_dadas
0,297601,29,9,3213,2018-08,0.0,3,8,15,2,3,1,398013.65,16,1.36,-32036.23,839393.42,419696.71,1248479.17,416159.723333,1785985.13,446496.2825,2127202.34,425440.468,2709897.62,451649.603333,2.75,1.375,4.05,1.35,5.79,1.4475,6.91,1.382,8.9,1.483333,-44248.91,-22124.455,-61301.06,-20433.686667,-92151.35,-23037.8375,-92151.35,-18430.27,-128334.72,-21389.12,16,32.0,16.0,47.0,15.666667,67.0,16.75,81.0,16.2,97.0,16.166667,1.0,1


In [306]:
df_extra.head(1)

producto,Cliente,month_year,29_9,39_20,40_16
0,1,2018-09,0,1,1


In [307]:
df_m5.shape

(78141, 59)

In [308]:
df_m5.isna().sum().sum()

0

In [309]:
# Agreamos la tabla extra
df_m6 = pd.merge(
  df_m5, 
  df_extra[['Cliente', 'month_year', '29_9', '39_20', '40_16']],
  how = 'left',
  on = ['Cliente', 'month_year']
)
df_m6.head(1)

Unnamed: 0,CodigoDC,Marca,Cupo,Cliente,month_year,label,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,Num_prom_ofr,ofertas_dadas,29_9,39_20,40_16
0,297601,29,9,3213,2018-08,0.0,3,8,15,2,3,1,398013.65,16,1.36,-32036.23,839393.42,419696.71,1248479.17,416159.723333,1785985.13,446496.2825,2127202.34,425440.468,2709897.62,451649.603333,2.75,1.375,4.05,1.35,5.79,1.4475,6.91,1.382,8.9,1.483333,-44248.91,-22124.455,-61301.06,-20433.686667,-92151.35,-23037.8375,-92151.35,-18430.27,-128334.72,-21389.12,16,32.0,16.0,47.0,15.666667,67.0,16.75,81.0,16.2,97.0,16.166667,1.0,1,1,0,0


In [310]:
df_m6.isna().sum().sum()

0

In [311]:
# Juntar Marca y Cupo: van a ser 3 categorias 39_20, 40_16, 39_20
# Por (cliente, month_year) hago flags por los 3 productos, si es que se oferto alguno de esos productos
# Drpoear duplicates en df_m5 por cliente/month_year
# Las vuelvo a agregar a M5
# LO QUE SE HIZO CELDAS ARRIBA

In [312]:
# Num_prom_ofr, para probar
df_m6 = df_m6.drop(['month_year', 'CodigoDC', 'Cliente', 'ofertas_dadas', 'Marca', 'Cupo'], axis=1)
df_m6.head(1)

Unnamed: 0,label,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,Num_prom_ofr,29_9,39_20,40_16
0,0.0,3,8,15,2,3,1,398013.65,16,1.36,-32036.23,839393.42,419696.71,1248479.17,416159.723333,1785985.13,446496.2825,2127202.34,425440.468,2709897.62,451649.603333,2.75,1.375,4.05,1.35,5.79,1.4475,6.91,1.382,8.9,1.483333,-44248.91,-22124.455,-61301.06,-20433.686667,-92151.35,-23037.8375,-92151.35,-18430.27,-128334.72,-21389.12,16,32.0,16.0,47.0,15.666667,67.0,16.75,81.0,16.2,97.0,16.166667,1.0,1,0,0


In [313]:
df_m6.to_parquet('../tablones/train_data_num_mini_v3mes_prop.parquet', engine='pyarrow')

# DESDE ACA SE TRABAJA ASUMIENDO VARIABLES NUMERICAS

In [314]:
df_m6.head(1)

Unnamed: 0,label,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,Num_prom_ofr,29_9,39_20,40_16
0,0.0,3,8,15,2,3,1,398013.65,16,1.36,-32036.23,839393.42,419696.71,1248479.17,416159.723333,1785985.13,446496.2825,2127202.34,425440.468,2709897.62,451649.603333,2.75,1.375,4.05,1.35,5.79,1.4475,6.91,1.382,8.9,1.483333,-44248.91,-22124.455,-61301.06,-20433.686667,-92151.35,-23037.8375,-92151.35,-18430.27,-128334.72,-21389.12,16,32.0,16.0,47.0,15.666667,67.0,16.75,81.0,16.2,97.0,16.166667,1.0,1,0,0


In [316]:
df_m6.shape

(78141, 56)

In [352]:
X_train_todo, X_test_todo, y_train_todo, y_test_todo \
  = train_test_split( df_m6.drop(["label"], axis=1), df_m6["label"], test_size = 0.2)

# LGBM

In [383]:
classifier_lgbm_todo = lgb.LGBMClassifier(
  n_estimators=100,
  reg_lambda=10,
  max_depth=-1,
  objective='binary',
  scale_pos_weight = 12.882264042315859,
)
classifier_lgbm_todo.fit(X_train_todo, y_train_todo)
# make predictions for test data    
predictions = classifier_lgbm_todo.predict_proba(X_test_todo)[:, 1]
# evaluate predictions
print('ROC AUC Score',roc_auc_score(y_test_todo, predictions))
# old ROC AUC Score 0.8972606476313741

ROC AUC Score 0.8273550489953827


# ARMAR EL TEST SET

In [355]:
df_test = pd.read_csv(fp_test, encoding='ISO-8859-1', sep=',')
df_test.head(1)

Unnamed: 0,Cliente,Marca,Cupo,Ejecuto_Promo
0,10,40,16,


In [356]:
# Agregar month_year = '201908', se asume
df_test['month_year'] = pd.Period('2019-08', 'M')
# Dropear Ejecuto_Promo, al final lo agrego otra vez
df_test = df_test.drop('Ejecuto_Promo', axis = 1)
print(df_test.isna().sum().sum())
display(df_test.head(1))

0


Unnamed: 0,Cliente,Marca,Cupo,month_year
0,10,40,16,2019-08


In [357]:
# Agregamos informacion de clientes sin enriquecer
df_t1 = pd.merge(
  df_test,
  df_clients[['Cliente', 'Region', 'Gerencia', 'SubCanal', 'TipoPoblacion', 'Estrato', 'EF']],
  on = 'Cliente',
  how = 'left'
)
print(df_t1.isna().sum().sum())
display(df_t1.head(1))

0


Unnamed: 0,Cliente,Marca,Cupo,month_year,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF
0,10,40,16,2019-08,4,21,15,2,2,0


In [358]:
# Agregamos informacion de ventas enriquecida
df_t2 = pd.merge(
  df_t1,
  df_ventas_enriched,
  on = ['month_year', 'Cliente'],
  how = 'left'  
)
print(df_t2.isna().sum().sum())
display(df_t2.head(1))

0


Unnamed: 0,Cliente,Marca,Cupo,month_year,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas
0,10,40,16,2019-08,4,21,15,2,2,0,818199.93,14,3.58,-43358.46,1521431.02,760715.51,2897990.58,965996.86,3580176.93,895044.2325,4375014.64,875002.928,5271955.73,878659.288333,6.66,3.33,12.19,4.063333,14.78,3.695,18.0,3.6,21.67,3.611667,-62775.26,-31387.63,-133840.46,-44613.486667,-161979.51,-40494.8775,-199488.19,-39897.638,-262790.86,-43798.476667,14,29.0,14.5,41.0,13.666667,50.0,12.5,63.0,12.6,81.0,13.5


In [359]:
# ESTO SE DEBERIA IMPORTAR DE UN UTILITARIO
def obtener_prop_num(df, col_variable):
  """
  df: pd.Dataframe
  col_variable: str
  """
  # Copia: para asegurarnos que no modificamos df -> se debe hacer refactoring
  c_df = copy.deepcopy(df)
  # Cuenta de registros en tabla de clientes
  numero_clientes = len(c_df)
  
  # Contamos registros por valor de col_variable
  df_g1 = c_df.groupby([col_variable]).agg({
    'FechaAltaCliente': 'count',
  }).reset_index()
  
  # Creamos nombre de la columna donde se almacena el numero de clientes
  col_num_clientes = 'num_clientes_' + col_variable
  # Renombrar columna donde se almacena el numero de clientes
  df_g1 = df_g1.rename(columns={'FechaAltaCliente': col_num_clientes})
  # Creamos nombre de la columna donde se almacena la proporcion de clientes
  col_proporcion_clientes = 'prop_clientes_' + col_variable
  # Agregamos columna que almacena informacion de propocion de clientes 
  df_g1[col_proporcion_clientes] = df_g1[col_num_clientes] / numero_clientes
  
  return df_g1


# Agregamos informacion de prop_num 
cols_cat_clientes = ['Region', 'Gerencia', 'SubCanal', 'TipoPoblacion', 'Estrato', 'EF']
df_t3 = copy.deepcopy(df_t2)
for col_name in cols_cat_clientes:
  curr_df = obtener_prop_num(df_clients, col_name) 
  df_t3 = pd.merge(df_t3, curr_df, how = 'left', on = col_name)
  #print(df_t3.isna().sum().sum())
print(df_t3.isna().sum().sum())
display(df_t3.head(1))  

0


Unnamed: 0,Cliente,Marca,Cupo,month_year,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,num_clientes_Region,prop_clientes_Region,num_clientes_Gerencia,prop_clientes_Gerencia,num_clientes_SubCanal,prop_clientes_SubCanal,num_clientes_TipoPoblacion,prop_clientes_TipoPoblacion,num_clientes_Estrato,prop_clientes_Estrato,num_clientes_EF,prop_clientes_EF
0,10,40,16,2019-08,4,21,15,2,2,0,818199.93,14,3.58,-43358.46,1521431.02,760715.51,2897990.58,965996.86,3580176.93,895044.2325,4375014.64,875002.928,5271955.73,878659.288333,6.66,3.33,12.19,4.063333,14.78,3.695,18.0,3.6,21.67,3.611667,-62775.26,-31387.63,-133840.46,-44613.486667,-161979.51,-40494.8775,-199488.19,-39897.638,-262790.86,-43798.476667,14,29.0,14.5,41.0,13.666667,50.0,12.5,63.0,12.6,81.0,13.5,3531,0.234322,434,0.028801,2293,0.152167,11718,0.777623,6598,0.437853,9664,0.641317


In [360]:
# Agregar info de promo ofrecidas
df_t4 = pd.merge(
  df_t3,
  df_promo_ofrecidas,
  on = ['month_year', 'Cliente', 'Marca', 'Cupo'],
  how = 'left'
)
print(df_t4.isna().sum().sum())
display(df_t4.head(1))  

1025


Unnamed: 0,Cliente,Marca,Cupo,month_year,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,num_clientes_Region,prop_clientes_Region,num_clientes_Gerencia,prop_clientes_Gerencia,num_clientes_SubCanal,prop_clientes_SubCanal,num_clientes_TipoPoblacion,prop_clientes_TipoPoblacion,num_clientes_Estrato,prop_clientes_Estrato,num_clientes_EF,prop_clientes_EF,Num_prom_ofr
0,10,40,16,2019-08,4,21,15,2,2,0,818199.93,14,3.58,-43358.46,1521431.02,760715.51,2897990.58,965996.86,3580176.93,895044.2325,4375014.64,875002.928,5271955.73,878659.288333,6.66,3.33,12.19,4.063333,14.78,3.695,18.0,3.6,21.67,3.611667,-62775.26,-31387.63,-133840.46,-44613.486667,-161979.51,-40494.8775,-199488.19,-39897.638,-262790.86,-43798.476667,14,29.0,14.5,41.0,13.666667,50.0,12.5,63.0,12.6,81.0,13.5,3531,0.234322,434,0.028801,2293,0.152167,11718,0.777623,6598,0.437853,9664,0.641317,2.0


In [361]:
# Llena con 0 los NaN en Num_prom_ofr porque significa que no se ofrecieron descuentos
df_t4['Num_prom_ofr'] = df_t4['Num_prom_ofr'].fillna(0)
print(df_t4.isna().sum().sum())
display(df_t4.head(1))  

0


Unnamed: 0,Cliente,Marca,Cupo,month_year,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,num_clientes_Region,prop_clientes_Region,num_clientes_Gerencia,prop_clientes_Gerencia,num_clientes_SubCanal,prop_clientes_SubCanal,num_clientes_TipoPoblacion,prop_clientes_TipoPoblacion,num_clientes_Estrato,prop_clientes_Estrato,num_clientes_EF,prop_clientes_EF,Num_prom_ofr
0,10,40,16,2019-08,4,21,15,2,2,0,818199.93,14,3.58,-43358.46,1521431.02,760715.51,2897990.58,965996.86,3580176.93,895044.2325,4375014.64,875002.928,5271955.73,878659.288333,6.66,3.33,12.19,4.063333,14.78,3.695,18.0,3.6,21.67,3.611667,-62775.26,-31387.63,-133840.46,-44613.486667,-161979.51,-40494.8775,-199488.19,-39897.638,-262790.86,-43798.476667,14,29.0,14.5,41.0,13.666667,50.0,12.5,63.0,12.6,81.0,13.5,3531,0.234322,434,0.028801,2293,0.152167,11718,0.777623,6598,0.437853,9664,0.641317,2.0


In [362]:
f_prop = '../tablones/data_grupo2_proporcion.pkl' 
df_prop_enriched = load_pickle(f_prop)
df_prop_enriched.head(1)

# Cruzamos df_prop_enriched(enriched) 
df_t5 = pd.merge(
  df_t4,
  df_prop_enriched,
  on = ['month_year', 'Cliente'],
  how = 'left'
)
print(df_t5.isna().sum()/len(df_t5)*100)
print(df_t5.isna().sum().sum())
display(df_t5.head(1))

Cliente                        0.0
Marca                          0.0
Cupo                           0.0
month_year                     0.0
Region                         0.0
Gerencia                       0.0
SubCanal                       0.0
TipoPoblacion                  0.0
Estrato                        0.0
EF                             0.0
Nr_sum                         0.0
numero_ventas                  0.0
Hl_sum                         0.0
Dcto_sum                       0.0
sum_2Nr                        0.0
mean_2Nr                       0.0
sum_3Nr                        0.0
mean_3Nr                       0.0
sum_4Nr                        0.0
mean_4Nr                       0.0
sum_5Nr                        0.0
mean_5Nr                       0.0
sum_6Nr                        0.0
mean_6Nr                       0.0
sum_2Hl                        0.0
mean_2Hl                       0.0
sum_3Hl                        0.0
mean_3Hl                       0.0
sum_4Hl             

Unnamed: 0,Cliente,Marca,Cupo,month_year,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,num_clientes_Region,prop_clientes_Region,num_clientes_Gerencia,prop_clientes_Gerencia,num_clientes_SubCanal,prop_clientes_SubCanal,num_clientes_TipoPoblacion,prop_clientes_TipoPoblacion,num_clientes_Estrato,prop_clientes_Estrato,num_clientes_EF,prop_clientes_EF,Num_prom_ofr,ofertas_dadas
0,10,40,16,2019-08,4,21,15,2,2,0,818199.93,14,3.58,-43358.46,1521431.02,760715.51,2897990.58,965996.86,3580176.93,895044.2325,4375014.64,875002.928,5271955.73,878659.288333,6.66,3.33,12.19,4.063333,14.78,3.695,18.0,3.6,21.67,3.611667,-62775.26,-31387.63,-133840.46,-44613.486667,-161979.51,-40494.8775,-199488.19,-39897.638,-262790.86,-43798.476667,14,29.0,14.5,41.0,13.666667,50.0,12.5,63.0,12.6,81.0,13.5,3531,0.234322,434,0.028801,2293,0.152167,11718,0.777623,6598,0.437853,9664,0.641317,2.0,5


In [363]:
df_t5.shape

(16870, 69)

In [364]:
df_t5.head(1)

Unnamed: 0,Cliente,Marca,Cupo,month_year,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,num_clientes_Region,prop_clientes_Region,num_clientes_Gerencia,prop_clientes_Gerencia,num_clientes_SubCanal,prop_clientes_SubCanal,num_clientes_TipoPoblacion,prop_clientes_TipoPoblacion,num_clientes_Estrato,prop_clientes_Estrato,num_clientes_EF,prop_clientes_EF,Num_prom_ofr,ofertas_dadas
0,10,40,16,2019-08,4,21,15,2,2,0,818199.93,14,3.58,-43358.46,1521431.02,760715.51,2897990.58,965996.86,3580176.93,895044.2325,4375014.64,875002.928,5271955.73,878659.288333,6.66,3.33,12.19,4.063333,14.78,3.695,18.0,3.6,21.67,3.611667,-62775.26,-31387.63,-133840.46,-44613.486667,-161979.51,-40494.8775,-199488.19,-39897.638,-262790.86,-43798.476667,14,29.0,14.5,41.0,13.666667,50.0,12.5,63.0,12.6,81.0,13.5,3531,0.234322,434,0.028801,2293,0.152167,11718,0.777623,6598,0.437853,9664,0.641317,2.0,5


In [365]:
# Creamos la columna de los 3 productos

In [366]:
# df_extra['29_9']  = df_extra['29_9'].apply(lambda x: 1 if x>0 else 0)
# df_extra['39_20'] = df_extra['39_20'].apply(lambda x:1 if x>0 else 0)
# df_extra['40_16'] = df_extra['40_16'].apply(lambda x:1 if x>0 else 0)

#Crear columna producto
df_t5['producto'] = df_t5['Marca'].astype(str) + '_' + df_t5['Cupo'].astype(str)
df_t5 = df_t5.drop(['Marca', 'Cupo'], axis = 1)

df_t5.loc[df_t5['producto']  =='29_9'   ,'29_9']   = 1
df_t5.loc[df_t5['producto'] =='40_16'  ,  '40_16'] = 1
df_t5.loc[df_t5['producto'] =='39_20'   , '39_20'] = 1

df_t5.head(5)

Unnamed: 0,Cliente,month_year,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,num_clientes_Region,prop_clientes_Region,num_clientes_Gerencia,prop_clientes_Gerencia,num_clientes_SubCanal,prop_clientes_SubCanal,num_clientes_TipoPoblacion,prop_clientes_TipoPoblacion,num_clientes_Estrato,prop_clientes_Estrato,num_clientes_EF,prop_clientes_EF,Num_prom_ofr,ofertas_dadas,producto,29_9,40_16,39_20
0,10,2019-08,4,21,15,2,2,0,818199.93,14,3.58,-43358.46,1521431.02,760715.51,2897991.0,965996.9,3580176.93,895044.2,4375014.64,875002.928,5271955.73,878659.288333,6.66,3.33,12.19,4.063333,14.78,3.695,18.0,3.6,21.67,3.611667,-62775.26,-31387.63,-133840.5,-44613.486667,-161979.51,-40494.8775,-199488.19,-39897.638,-262790.86,-43798.476667,14,29.0,14.5,41.0,13.666667,50.0,12.5,63.0,12.6,81.0,13.5,3531,0.234322,434,0.028801,2293,0.152167,11718,0.777623,6598,0.437853,9664,0.641317,2.0,5,40_16,,1.0,
1,17,2019-08,4,21,15,2,5,1,899711.35,25,3.25,-124786.07,1879132.53,939566.265,3244108.0,1081369.0,4452269.88,1113067.0,4714395.29,942879.058,5849760.56,974960.093333,7.07,3.535,11.89,3.963333,16.25,4.0625,17.45,3.49,21.51,3.585,-245853.8,-122926.905,-287237.5,-95745.816667,-345125.26,-86281.315,-373725.73,-74745.146,-490282.74,-81713.79,25,46.0,23.0,66.0,22.0,85.0,21.25,94.0,18.8,116.0,19.333333,3531,0.234322,434,0.028801,2293,0.152167,11718,0.777623,205,0.013604,5405,0.358683,1.0,5,29_9,1.0,,
2,37,2019-08,4,21,16,2,3,1,0.0,0,0.0,0.0,0.0,0.0,6.984919e-10,2.328306e-10,446655.39,111663.8,567594.54,113518.908,979507.51,163251.251667,-2.220446e-15,0.0,4.440892e-16,1.480297e-16,1.89,0.4725,2.43,0.486,4.32,0.72,-2.000888e-11,0.0,-1.273293e-11,0.0,-4208.81,-1052.2025,-6313.22,-1262.644,-12626.44,-2104.406667,0,0.0,0.0,0.0,0.0,7.0,1.75,10.0,2.0,16.0,2.666667,3531,0.234322,434,0.028801,942,0.062512,11718,0.777623,4009,0.266043,5405,0.358683,1.0,1,29_9,1.0,,
3,46,2019-08,4,21,9,2,3,1,874421.59,18,3.23,-11144.12,1692225.07,846112.535,2778581.0,926193.6,3912809.63,978202.4,4666520.81,933304.162,5366327.81,894387.968333,6.37,3.185,10.33,3.443333,14.55,3.6375,17.36,3.472,20.28,3.38,-21076.55,-10538.275,-43234.25,-14411.416667,-63750.48,-15937.62,-74021.66,-14804.332,-118167.33,-19694.555,18,40.0,20.0,61.0,20.333333,80.0,20.0,97.0,19.4,115.0,19.166667,3531,0.234322,434,0.028801,5713,0.379123,11718,0.777623,4009,0.266043,5405,0.358683,2.0,4,29_9,1.0,,
4,48,2019-08,4,21,10,2,2,0,98109.7,3,0.38,-12583.97,258380.25,129190.125,322470.1,107490.0,527899.38,131974.8,648692.64,129738.528,810008.54,135001.423333,1.02,0.51,1.28,0.4266667,2.07,0.5175,2.49,0.498,3.08,0.513333,-24988.6,-12494.3,-29661.97,-9887.323333,-45031.52,-11257.88,-57436.15,-11487.23,-79472.2,-13245.366667,3,6.0,3.0,9.0,3.0,14.0,3.5,16.0,3.2,21.0,3.5,3531,0.234322,434,0.028801,1096,0.072732,11718,0.777623,6598,0.437853,9664,0.641317,2.0,6,29_9,1.0,,


In [367]:
df_t5 = df_t5.fillna(0)
df_t5.head(4)

Unnamed: 0,Cliente,month_year,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,num_clientes_Region,prop_clientes_Region,num_clientes_Gerencia,prop_clientes_Gerencia,num_clientes_SubCanal,prop_clientes_SubCanal,num_clientes_TipoPoblacion,prop_clientes_TipoPoblacion,num_clientes_Estrato,prop_clientes_Estrato,num_clientes_EF,prop_clientes_EF,Num_prom_ofr,ofertas_dadas,producto,29_9,40_16,39_20
0,10,2019-08,4,21,15,2,2,0,818199.93,14,3.58,-43358.46,1521431.02,760715.51,2897991.0,965996.9,3580176.93,895044.2,4375014.64,875002.928,5271955.73,878659.288333,6.66,3.33,12.19,4.063333,14.78,3.695,18.0,3.6,21.67,3.611667,-62775.26,-31387.63,-133840.5,-44613.486667,-161979.51,-40494.8775,-199488.19,-39897.638,-262790.86,-43798.476667,14,29.0,14.5,41.0,13.666667,50.0,12.5,63.0,12.6,81.0,13.5,3531,0.234322,434,0.028801,2293,0.152167,11718,0.777623,6598,0.437853,9664,0.641317,2.0,5,40_16,0.0,1.0,0.0
1,17,2019-08,4,21,15,2,5,1,899711.35,25,3.25,-124786.07,1879132.53,939566.265,3244108.0,1081369.0,4452269.88,1113067.0,4714395.29,942879.058,5849760.56,974960.093333,7.07,3.535,11.89,3.963333,16.25,4.0625,17.45,3.49,21.51,3.585,-245853.8,-122926.905,-287237.5,-95745.816667,-345125.26,-86281.315,-373725.73,-74745.146,-490282.74,-81713.79,25,46.0,23.0,66.0,22.0,85.0,21.25,94.0,18.8,116.0,19.333333,3531,0.234322,434,0.028801,2293,0.152167,11718,0.777623,205,0.013604,5405,0.358683,1.0,5,29_9,1.0,0.0,0.0
2,37,2019-08,4,21,16,2,3,1,0.0,0,0.0,0.0,0.0,0.0,6.984919e-10,2.328306e-10,446655.39,111663.8,567594.54,113518.908,979507.51,163251.251667,-2.220446e-15,0.0,4.440892e-16,1.480297e-16,1.89,0.4725,2.43,0.486,4.32,0.72,-2.000888e-11,0.0,-1.273293e-11,0.0,-4208.81,-1052.2025,-6313.22,-1262.644,-12626.44,-2104.406667,0,0.0,0.0,0.0,0.0,7.0,1.75,10.0,2.0,16.0,2.666667,3531,0.234322,434,0.028801,942,0.062512,11718,0.777623,4009,0.266043,5405,0.358683,1.0,1,29_9,1.0,0.0,0.0
3,46,2019-08,4,21,9,2,3,1,874421.59,18,3.23,-11144.12,1692225.07,846112.535,2778581.0,926193.6,3912809.63,978202.4,4666520.81,933304.162,5366327.81,894387.968333,6.37,3.185,10.33,3.443333,14.55,3.6375,17.36,3.472,20.28,3.38,-21076.55,-10538.275,-43234.25,-14411.416667,-63750.48,-15937.62,-74021.66,-14804.332,-118167.33,-19694.555,18,40.0,20.0,61.0,20.333333,80.0,20.0,97.0,19.4,115.0,19.166667,3531,0.234322,434,0.028801,5713,0.379123,11718,0.777623,4009,0.266043,5405,0.358683,2.0,4,29_9,1.0,0.0,0.0


In [368]:
X_train_todo.head(1)

Unnamed: 0,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,Num_prom_ofr,29_9,39_20,40_16
19185,1,18,9,2,3,0,268840.51,16,1.09,-9344.97,504560.08,252280.04,737545.43,245848.476667,986490.46,246622.615,1164336.86,232867.372,1297518.68,216253.113333,2.05,1.025,3.04,1.013333,4.18,1.045,5.0,1.0,5.68,0.946667,-23003.58,-11501.79,-32842.93,-10947.643333,-46173.28,-11543.32,-65564.54,-13112.908,-85649.54,-14274.923333,16,30.0,15.0,41.0,13.666667,52.0,13.0,60.0,12.0,68.0,11.333333,3.0,0,1,1


In [369]:
df_t5.head(1)

Unnamed: 0,Cliente,month_year,Region,Gerencia,SubCanal,TipoPoblacion,Estrato,EF,Nr_sum,numero_ventas,Hl_sum,Dcto_sum,sum_2Nr,mean_2Nr,sum_3Nr,mean_3Nr,sum_4Nr,mean_4Nr,sum_5Nr,mean_5Nr,sum_6Nr,mean_6Nr,sum_2Hl,mean_2Hl,sum_3Hl,mean_3Hl,sum_4Hl,mean_4Hl,sum_5Hl,mean_5Hl,sum_6Hl,mean_6Hl,sum_2Dcto,mean_2Dcto,sum_3Dcto,mean_3Dcto,sum_4Dcto,mean_4Dcto,sum_5Dcto,mean_5Dcto,sum_6Dcto,mean_6Dcto,numero_ventas_sum,sum_2numero_ventas,mean_2numero_ventas,sum_3numero_ventas,mean_3numero_ventas,sum_4numero_ventas,mean_4numero_ventas,sum_5numero_ventas,mean_5numero_ventas,sum_6numero_ventas,mean_6numero_ventas,num_clientes_Region,prop_clientes_Region,num_clientes_Gerencia,prop_clientes_Gerencia,num_clientes_SubCanal,prop_clientes_SubCanal,num_clientes_TipoPoblacion,prop_clientes_TipoPoblacion,num_clientes_Estrato,prop_clientes_Estrato,num_clientes_EF,prop_clientes_EF,Num_prom_ofr,ofertas_dadas,producto,29_9,40_16,39_20
0,10,2019-08,4,21,15,2,2,0,818199.93,14,3.58,-43358.46,1521431.02,760715.51,2897990.58,965996.86,3580176.93,895044.2325,4375014.64,875002.928,5271955.73,878659.288333,6.66,3.33,12.19,4.063333,14.78,3.695,18.0,3.6,21.67,3.611667,-62775.26,-31387.63,-133840.46,-44613.486667,-161979.51,-40494.8775,-199488.19,-39897.638,-262790.86,-43798.476667,14,29.0,14.5,41.0,13.666667,50.0,12.5,63.0,12.6,81.0,13.5,3531,0.234322,434,0.028801,2293,0.152167,11718,0.777623,6598,0.437853,9664,0.641317,2.0,5,40_16,0.0,1.0,0.0


In [370]:
X_train_todo.columns

Index(['Region', 'Gerencia', 'SubCanal', 'TipoPoblacion', 'Estrato', 'EF',
       'Nr_sum', 'numero_ventas', 'Hl_sum', 'Dcto_sum', 'sum_2Nr', 'mean_2Nr',
       'sum_3Nr', 'mean_3Nr', 'sum_4Nr', 'mean_4Nr', 'sum_5Nr', 'mean_5Nr',
       'sum_6Nr', 'mean_6Nr', 'sum_2Hl', 'mean_2Hl', 'sum_3Hl', 'mean_3Hl',
       'sum_4Hl', 'mean_4Hl', 'sum_5Hl', 'mean_5Hl', 'sum_6Hl', 'mean_6Hl',
       'sum_2Dcto', 'mean_2Dcto', 'sum_3Dcto', 'mean_3Dcto', 'sum_4Dcto',
       'mean_4Dcto', 'sum_5Dcto', 'mean_5Dcto', 'sum_6Dcto', 'mean_6Dcto',
       'numero_ventas_sum', 'sum_2numero_ventas', 'mean_2numero_ventas',
       'sum_3numero_ventas', 'mean_3numero_ventas', 'sum_4numero_ventas',
       'mean_4numero_ventas', 'sum_5numero_ventas', 'mean_5numero_ventas',
       'sum_6numero_ventas', 'mean_6numero_ventas', 'Num_prom_ofr', '29_9',
       '39_20', '40_16'],
      dtype='object')

# Hacemos que df_t5 tenga las mismas columnas que el X_train_todo

In [371]:
df_t5 = df_t5[list(X_train_todo.columns)]

In [372]:
df_t5.isna().sum().sum()

0

## Prediccion

In [373]:
#pred = classifier_lgbm_dummies.predict_proba(df_t4_dummies)
#pred = classifier_prod.predict_proba(df_t5_dummies)
pred = classifier_lgbm_todo.predict_proba(df_t5)

In [374]:
print(pred)

[[0.23986607 0.76013393]
 [0.39332716 0.60667284]
 [0.99632019 0.00367981]
 ...
 [0.28924914 0.71075086]
 [0.93594271 0.06405729]
 [0.65518012 0.34481988]]


In [375]:
lista_pred_compro = [x for _, x in pred]
len(lista_pred_compro)

16870

In [376]:
df_output = pd.read_csv(fp_test, encoding='ISO-8859-1', sep=',')
df_output.tail(1)

Unnamed: 0,Cliente,Marca,Cupo,Ejecuto_Promo
16869,15064,29,9,


In [377]:
df_output['Ejecuto_Promo'] = lista_pred_compro
print(df_output.isna().sum().sum())
display(df_output.head(1))

0


Unnamed: 0,Cliente,Marca,Cupo,Ejecuto_Promo
0,10,40,16,0.760134


In [378]:
import time
fp = '../output/' +  str(time.asctime()) + '.csv'
fp = '../output/' + 'asddd.csv'
fp

'../output/asddd.csv'

In [379]:
# Almacenar 
df_output = df_output.reset_index(drop=True)
display(df_output.head(1))


Unnamed: 0,Cliente,Marca,Cupo,Ejecuto_Promo
0,10,40,16,0.760134


In [380]:
df_output.to_csv(fp, sep = ',', columns = ['Cliente', 'Marca', 'Cupo', 'Ejecuto_Promo'], index=False)