In [14]:
# Librerias generales
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import datetime
# Librerias para listar archivos
from os import listdir
from os.path import isfile, join

In [67]:
# Importo datasets

active_users = pd.read_csv('../data/ACTIVE_USER.csv') # Datos de visitas a la app
demograficos = pd.read_csv('../data/DEMOGRAFICOS.csv')
dinero_cuenta = pd.read_csv('../data/DINERO_CUENTA.csv') # Datos de la cantidad de dinero en la cuenta y si invierta
evaluate = pd.read_csv('../data/EVALUATE.csv')
marketplace_data = pd.read_csv('../data/MARKETPLACE_DATA.csv') # Movimientos en mercadolibre
payments = pd.read_csv('../data/PAYMENTS.csv') # Datos transaccionales en MP

In [16]:
# Active users
active_users.head(3)

Unnamed: 0,CUS_CUST_ID_BUY,MAU_MP_3,MAU_ML_3,MAU_MP_2,MAU_ML_2,MAU_MP_1,MAU_ML_1,last_login_mp_date_1,last_login_ml_date_1
0,424070401,,,1.0,0.0,,,,
1,271894453,1.0,14.0,11.0,21.0,2.0,12.0,2019-05-17,2019-05-24
2,327402711,,,0.0,10.0,4.0,26.0,2019-05-28,2019-05-31


In [17]:
# Demograficos
demograficos.head(3)

Unnamed: 0,CITY,CUS_CUST_ID_BUY,GENDER,RANGO_EDAD,TARJETAS,ESTADO
0,caba,47629682,male,03.Entre 26 y 30 años,Credit Card,capital federal
1,belen de escobar,184288617,male,03.Entre 26 y 30 años,Account Money,buenos aires
2,san fernando del valle de catamarca,134359455,female,03.Entre 26 y 30 años,Debit Card,catamarca


In [18]:
# Dinero en cuenta
dinero_cuenta.head(3)

Unnamed: 0,CUS_CUST_ID_BUY,PLATA_CUENTA_1,PLATA_CUENTA_2,INVERSION
0,54018.0,250.99,0.0,pending
1,23513.0,20.05,2220.05,warmup
2,21230.0,0.0,0.0,warmup


In [19]:
# Evaluate
evaluate.head(3)

Unnamed: 0,CUS_CUST_ID_BUY,churn
0,20663512,1.0
1,444286946,0.0
2,348084528,1.0


In [20]:
# Marketplace 
marketplace_data.head(3)

Unnamed: 0,CUS_CUST_ID_BUY,SPENT_ML,RECENCY_ML,FREQUENCY_ML
0,53621056,71.51,2019-03-30,1
1,308208758,269.2,2019-05-30,11
2,30591970,149.22,2019-03-24,4


In [21]:
# Payments
payments.head(3)

Unnamed: 0,FECHA,CUS_CUST_ID_SEL,CUS_CUST_ID_BUY,SPENT,TPV_SEGMENT_DETAIL,DESCUENTO
0,2019-05-25,314941456,20663512,11.3566,Instore,2.2312
1,2019-06-20,251693291,444286946,4.619,Transport,0.9238
2,2019-01-10,251693291,20663512,5.3519,Transport,1.0704


In [22]:
# Revision de payments dataframe

payments.info() # La información esta ok con sus clases excepto por la fecha, se convierte a datetime

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555516 entries, 0 to 555515
Data columns (total 6 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   FECHA               555516 non-null  object 
 1   CUS_CUST_ID_SEL     555516 non-null  int64  
 2   CUS_CUST_ID_BUY     555516 non-null  int64  
 3   SPENT               555516 non-null  float64
 4   TPV_SEGMENT_DETAIL  555516 non-null  object 
 5   DESCUENTO           555516 non-null  float64
dtypes: float64(2), int64(2), object(2)
memory usage: 25.4+ MB


In [68]:
# Convierto la columna fecha en datetime
payments['FECHA'] = [datetime.datetime.strptime(x, '%Y-%m-%d') for x in payments['FECHA']]

In [69]:
# La idea es unificar la data por usuario de forma que cada linea tenga toda la información
# Por eso primero se dummiza el dataframe payments para que los distintos segmentos estén por columna

payments = pd.get_dummies(payments, columns = ['TPV_SEGMENT_DETAIL'], prefix = ['segment'])

In [None]:
# Multiplico los valores de spent y discount por las variables dummies
for col in payments.iloc[:, 5:11]:
    name1 = col + '_spent'
    name2 = col + '_discount'
    payments[name1] = payments[col].multiply(payments['SPENT'], axis = 'index')
    payments[name2] = payments[col].multiply(payments['SPENT'], axis = 'index')

In [43]:
# Funcion de agregacion por ventanas de tiempo
payments_group_date = payments.groupby([pd.Grouper(key = 'FECHA', freq = '3M'), 'CUS_CUST_ID_BUY'], as_index = True).agg(segment_cellphone_recharge = ('SPENT', 'sum'), 
                                                                                                                         spent_count = ('SPENT', 'count'), 
                                                                                                                         spent_mean = ('SPENT', 'mean'), 
                                                                                                                         discount = ('DESCUENTO','sum'),
                                                                                                                         discount_mean = ('DESCUENTO','mean'))  
payments_group_date = payments_group_date.reset_index() 

# Creo la variable mes como objeto para dummizar 
payments_group_date['MES'] = [str(x.month) for x in payments_group_date['FECHA']]

# Variable util para automatizar la agregacion
k = len(payments_group_date['MES'].unique())

# Creo variables dummies
payments_group_date = pd.get_dummies(payments_group_date, columns = ['MES'])

# Obtengo los indices inicial y final de las columnas dummies porque despues se iran agregando nuevas 
# columnas en la medida que las multiplique por los features obtenidos anteriormente 
max_index = payments_group_date.shape[1] - 1
low_index = max_index  + 1 - k

# Lista de columnas a elminar una vez finalizado el proceso
drop_cols = payments_group_date.columns[low_index:max_index + 1].values.tolist() + ['spent','spent_count','spent_mean','discount','discount_mean']

# Loop que ira greando una nueva columna por cada feature y variable dummie existente.
# La idea es que se multiplica la variable dummie por el valor de cada feature 
for col1 in payments_group_date.columns[2:low_index]:
    for col2 in payments_group_date.columns[low_index:max_index + 1]:
        col_name = col1 + '_' + col2
        payments_group_date[col_name] = payments_group_date[col2].multiply(payments_group_date[col1], axis = 'index')

# Elimino columnas que ya no uso
payments_group_date.drop(columns = drop_cols, axis = 1, inplace = True)

# Group by customer y obtengo un registro unico por cada uno
payments_group_date = payments_group_date.groupby('CUS_CUST_ID_BUY').sum()
payments_group_date.head()


Unnamed: 0_level_0,spent_MES_1,spent_MES_4,spent_MES_7,spent_count_MES_1,spent_count_MES_4,spent_count_MES_7,spent_mean_MES_1,spent_mean_MES_4,spent_mean_MES_7,discount_MES_1,discount_MES_4,discount_MES_7,discount_mean_MES_1,discount_mean_MES_4,discount_mean_MES_7
CUS_CUST_ID_BUY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1276,0.0,6.5167,1.1198,0,3,1,0.0,2.172233,1.1198,0.0,0.0,0.0,0.0,0.0,0.0
14525,14.2774,271.9272,143.1102,2,29,15,7.1387,9.3768,9.54068,2.8555,11.3489,4.4531,1.42775,0.391341,0.296873
21230,0.0,0.0,4.4198,0,0,1,0.0,0.0,4.4198,0.0,0.0,2.2099,0.0,0.0,2.2099
22083,69.0491,65.7796,61.7749,3,10,5,23.016367,6.57796,12.35498,7.7778,5.5951,1.5736,2.5926,0.55951,0.31472
23513,0.0,30.6686,0.4459,0,9,1,0.0,3.407622,0.4459,0.0,0.0,0.0,0.0,0.0,0.0


In [79]:
#payments.iloc[:,-6:].multiply(payments['SPENT'], axis = 'index')

payments.groupby([pd.Grouper(key = 'FECHA', freq = '3M'), 'CUS_CUST_ID_BUY'], as_index = True).agg(
    spent_segment_cellphone_recharge = ('segment_Cellphone Recharge_spent', 'sum'), 
    spent_segment_cellphone_recharge_mean = ('segment_Cellphone Recharge_spent', 'mean'),
    spent_segment_cellphone_recharge_max = ('segment_Cellphone Recharge_spent', 'max'),
    spent_segment_instore = ('segment_Instore_spent','sum'),
    spent_segment_instore_mean = ('segment_Instore_spent','mean'),
    spent_segment_instore_max = ('segment_Instore_spent','max'),
    spent_segment_money_transfer = ('segment_Money Transfer_spent','sum'),
    spent_segment_money_transfer_mean = ('segment_Money Transfer_spent','mean'),
    spent_segment_money_transfer_max = ('segment_Money Transfer_spent','max'),
    spent_segment_other_single_players = ('segment_Other Single Players_spent','sum'),
    spent_segment_other_single_players_mean = ('segment_Other Single Players_spent','mean'),
    spent_segment_other_single_players_max = ('segment_Other Single Players_spent','max'),
    spent_segment_transport = ('segment_Transport_spent','sum'),
    spent_segment_transport_mean = ('segment_Transport_spent','mean'),
    spent_segment_transport_max = ('segment_Transport_spent','max'),
    spent_segment_utilities = ('segment_Utilities_spent','sum'),
    spent_segment_utilities_mean = ('segment_Utilities_spent','mean'),
    spent_segment_utilities_max = ('segment_Utilities_spent','max'),
    
    discount_segment_cellphone_recharge = ('segment_Cellphone Recharge_discount', 'sum'), 
    discount_segment_cellphone_recharge_mean = ('segment_Cellphone Recharge_discount', 'mean'),
    discount_segment_cellphone_recharge_max = ('segment_Cellphone Recharge_discount', 'max'),
    discount_segment_instore = ('segment_Instore_discount','sum'),
    discount_segment_instore_mean = ('segment_Instore_discount','mean'),
    discount_segment_instore_max = ('segment_Instore_discount','max'),
    discount_segment_money_transfer = ('segment_Money Transfer_discount','sum'),
    discount_segment_money_transfer_mean = ('segment_Money Transfer_discount','mean'),
    discount_segment_money_transfer_max = ('segment_Money Transfer_discount','max'),
    discount_segment_other_single_players = ('segment_Other Single Players_discount','sum'),
    discount_segment_other_single_players_mean = ('segment_Other Single Players_discount','mean'),
    discount_segment_other_single_players_max = ('segment_Other Single Players_discount','max'),
    discount_segment_transport = ('segment_Transport_discount','sum'),
    discount_segment_transport_mean = ('segment_Transport_discount','mean'),
    discount_segment_transport_max = ('segment_Transport_discount','max'),
    discount_segment_utilities = ('segment_Utilities_discount','sum'),
    discount_segment_utilities_mean = ('segment_Utilities_discount','mean'),
    discount_segment_utilities_max = ('segment_Utilities_discount','max'))

Unnamed: 0_level_0,Unnamed: 1_level_0,spent_segment_cellphone_recharge,spent_segment_cellphone_recharge_mean,spent_segment_cellphone_recharge_max,spent_segment_instore,spent_segment_instore_mean,spent_segment_instore_max,spent_segment_money_transfer,spent_segment_money_transfer_mean,spent_segment_money_transfer_max,spent_segment_other_single_players,...,discount_segment_money_transfer_max,discount_segment_other_single_players,discount_segment_other_single_players_mean,discount_segment_other_single_players_max,discount_segment_transport,discount_segment_transport_mean,discount_segment_transport_max,discount_segment_utilities,discount_segment_utilities_mean,discount_segment_utilities_max
FECHA,CUS_CUST_ID_BUY,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2019-01-31,14525,0.0,0.0,0.0,14.2774,7.138700,7.4721,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0000,0.00000,0.0000,0.0000,0.00000,0.0000
2019-01-31,22083,0.0,0.0,0.0,69.0491,23.016367,36.0177,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0000,0.00000,0.0000,0.0000,0.00000,0.0000
2019-01-31,34173,0.0,0.0,0.0,0.0000,0.000000,0.0000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0000,0.00000,0.0000,136.9079,68.45395,91.7432
2019-01-31,36482,0.0,0.0,0.0,0.0000,0.000000,0.0000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,23.9667,4.79334,5.3836,0.0000,0.00000,0.0000
2019-01-31,51395,0.0,0.0,0.0,36.4893,3.317209,13.0389,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0000,0.00000,0.0000,0.0000,0.00000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-07-31,448290283,0.0,0.0,0.0,0.0000,0.000000,0.0000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.7100,4.71000,4.7100,0.0000,0.00000,0.0000
2019-07-31,448313152,0.0,0.0,0.0,0.0000,0.000000,0.0000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.7100,4.71000,4.7100,0.0000,0.00000,0.0000
2019-07-31,448326912,0.0,0.0,0.0,7.7715,7.771500,7.7715,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0000,0.00000,0.0000,0.0000,0.00000,0.0000
2019-07-31,448330186,0.0,0.0,0.0,9.7025,9.702500,9.7025,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0000,0.00000,0.0000,0.0000,0.00000,0.0000


In [71]:
payments

Unnamed: 0,FECHA,CUS_CUST_ID_SEL,CUS_CUST_ID_BUY,SPENT,DESCUENTO,segment_Cellphone Recharge,segment_Instore,segment_Money Transfer,segment_Other Single Players,segment_Transport,...,segment_Instore_spent,segment_Instore_discount,segment_Money Transfer_spent,segment_Money Transfer_discount,segment_Other Single Players_spent,segment_Other Single Players_discount,segment_Transport_spent,segment_Transport_discount,segment_Utilities_spent,segment_Utilities_discount
0,2019-05-25,314941456,20663512,11.3566,2.2312,0,1,0,0,0,...,11.3566,11.3566,0.0,0.0,0.0,0.0,0.0000,0.0000,0.0,0.0
1,2019-06-20,251693291,444286946,4.6190,0.9238,0,0,0,0,1,...,0.0000,0.0000,0.0,0.0,0.0,0.0,4.6190,4.6190,0.0,0.0
2,2019-01-10,251693291,20663512,5.3519,1.0704,0,0,0,0,1,...,0.0000,0.0000,0.0,0.0,0.0,0.0,5.3519,5.3519,0.0,0.0
3,2019-03-28,251693291,20663512,4.5589,0.0000,0,0,0,0,1,...,0.0000,0.0000,0.0,0.0,0.0,0.0,4.5589,4.5589,0.0,0.0
4,2019-01-07,251693291,20663512,8.0278,0.0000,0,0,0,0,1,...,0.0000,0.0000,0.0,0.0,0.0,0.0,8.0278,8.0278,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555511,2019-01-08,254303242,312909117,2.6806,0.0000,1,0,0,0,0,...,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0000,0.0000,0.0,0.0
555512,2019-03-05,341479786,390459868,20.1261,1.0063,0,1,0,0,0,...,20.1261,20.1261,0.0,0.0,0.0,0.0,0.0000,0.0000,0.0,0.0
555513,2019-01-15,251693291,39226986,5.4054,0.0000,0,0,0,0,1,...,0.0000,0.0000,0.0,0.0,0.0,0.0,5.4054,5.4054,0.0,0.0
555514,2019-06-24,251693291,349278665,4.6740,0.0000,0,0,0,0,1,...,0.0000,0.0000,0.0,0.0,0.0,0.0,4.6740,4.6740,0.0,0.0


In [115]:
payments_group_spent = payments.groupby('CUS_CUST_ID_BUY', as_index = False).agg({'SPENT':['sum','mean','count'], 'DESCUENTO':['sum','mean']},)
payments_group_spent.columns = ['_'.join(col).strip() for col in payments_group_spent.columns.values]
payments_group_spent.rename(columns = {'CUS_CUST_ID_BUY_':'CUS_CUST_ID_BUY'}, inplace = True)        
payments_group_spent                                                           

Unnamed: 0,CUS_CUST_ID_BUY,SPENT_sum,SPENT_mean,SPENT_count,DESCUENTO_sum,DESCUENTO_mean
0,1276,7.6365,1.909125,4,0.0000,0.000000
1,14525,429.3148,9.332930,46,18.6575,0.405598
2,21230,4.4198,4.419800,1,2.2099,2.209900
3,22083,196.6036,10.922422,18,14.9465,0.830361
4,23513,31.1145,3.111450,10,0.0000,0.000000
...,...,...,...,...,...,...
43497,448290283,4.7100,4.710000,1,0.0000,0.000000
43498,448313152,4.7100,4.710000,1,0.0000,0.000000
43499,448326912,7.7715,7.771500,1,1.5543,1.554300
43500,448330186,9.7025,9.702500,1,1.9405,1.940500


In [120]:
dates = payments_group_date['FECHA'].unique()
subset 1= payments_group_date[payments_group_date['FECHA'] == [dates0]]
[['CUS_CUST_ID_BUY','spent','discount']]

subset2 = payments_group_date[payments_group_date['FECHA'] == dates[1]][['CUS_CUST_ID_BUY','spent','discount']]
psubset3 = payments_group_date[payments_group_date['FECHA'] == dates[2]][['CUS_CUST_ID_BUY','spent','discount']]d.merge(payments_group_spent,  subset on = 'CUS_CUST_ID_BUY', how = 'left')

Unnamed: 0,CUS_CUST_ID_BUY,SPENT_sum,SPENT_mean,SPENT_count,DESCUENTO_sum,DESCUENTO_mean,spent,discount
0,1276,7.6365,1.909125,4,0.0000,0.000000,,
1,14525,429.3148,9.332930,46,18.6575,0.405598,14.2774,2.8555
2,21230,4.4198,4.419800,1,2.2099,2.209900,,
3,22083,196.6036,10.922422,18,14.9465,0.830361,69.0491,7.7778
4,23513,31.1145,3.111450,10,0.0000,0.000000,,
...,...,...,...,...,...,...,...,...
43497,448290283,4.7100,4.710000,1,0.0000,0.000000,,
43498,448313152,4.7100,4.710000,1,0.0000,0.000000,,
43499,448326912,7.7715,7.771500,1,1.5543,1.554300,,
43500,448330186,9.7025,9.702500,1,1.9405,1.940500,,


In [117]:
payments[payments['CUS_CUST_ID_BUY'] == 1276]

Unnamed: 0,FECHA,CUS_CUST_ID_SEL,CUS_CUST_ID_BUY,SPENT,DESCUENTO,segment_Cellphone Recharge,segment_Instore,segment_Money Transfer,segment_Other Single Players,segment_Transport,segment_Utilities
372096,2019-03-20,295153733,1276,1.25,0.0,1,0,0,0,0,0
373227,2019-02-16,295153733,1276,2.5893,0.0,1,0,0,0,0,0
375585,2019-05-07,295153733,1276,1.1198,0.0,1,0,0,0,0,0
377042,2019-02-01,295153733,1276,2.6774,0.0,1,0,0,0,0,0
