In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split

#Librerias para visualizacion
import seaborn as sb
import matplotlib.pyplot as plt

In [2]:
labels_training = pd.read_csv('/home/lucas/Documentos/Facultad/OrgaDeDatos/tp/labels_training_set.csv',
                              low_memory= False)
events_data = pd.read_csv('/home/lucas/Documentos/Facultad/OrgaDeDatos/tp/events_up_to_01062018.csv', 
                              low_memory= False)

## Generacion de features

In [3]:
labels_training.head()

Unnamed: 0,person,label
0,0566e9c1,0
1,6ec7ee77,0
2,abe7a2fb,0
3,34728364,0
4,87ed62de,0


In [4]:
events_data.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,...,,,,,,,,,,
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,...,,,,,,,,,,
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,...,,,,,,,,,,
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,Bom,128GB,Vermelho,,...,,,,,,,,,,
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,Excelente,32GB,Branco,,...,,,,,,,,,,


In [5]:
#Mapeo las diferentes condiciones a un valor numerico
map_condicion = {'Novo':5,'Excelente':4, 'Muito Bom':3,'Bom':2,'Bom - Sem Touch ID':1}

#Mapeo los almacenamientos a con sus numeros. Por el momento despues hay que ver que otra cosa hacer
map_storage = {'16GB':16,'32GB':32, '64GB':64,'128GB':128,'8GB':8,'256GB': 256, '512MB': 0.5, '4GB':4}

In [6]:
#Creo archivos para hacer los sets de train y test
train_set = pd.DataFrame()
test_set = pd.DataFrame()

In [7]:
train_set = events_data

In [8]:
train_set.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,...,,,,,,,,,,
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,...,,,,,,,,,,
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,...,,,,,,,,,,
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,Bom,128GB,Vermelho,,...,,,,,,,,,,
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,Excelente,32GB,Branco,,...,,,,,,,,,,


In [9]:
train_set.replace({'condition': map_condicion}, inplace = True)
train_set.replace({'storage': map_storage}, inplace = True)

In [10]:
train_set.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,4.0,32.0,Dourado,,...,,,,,,,,,,
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,3.0,32.0,Cinza espacial,,...,,,,,,,,,,
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,3.0,64.0,Prateado,,...,,,,,,,,,,
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,2.0,128.0,Vermelho,,...,,,,,,,,,,
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,4.0,32.0,Branco,,...,,,,,,,,,,


In [11]:
#Mapear 63 colores a numeros de alguna forma
#Usar el promedio de aparicion de cada uno podria ser
colors = events_data.loc[events_data['event'] == 'conversion', ['color']]

In [12]:
colors.head()

Unnamed: 0,color
195,Rosa
234,Rosa
724,Rosa
744,Preto
1198,Dourado


In [13]:
colors_convertions = pd.DataFrame(colors['color'].value_counts()/events_data.shape[0])

In [14]:
colors_convertions = colors_convertions.reset_index()

In [15]:
colors_convertions.head()

Unnamed: 0,index,color
0,Preto,0.00099
1,Dourado,0.000734
2,Branco,0.000388
3,Cinza espacial,0.000213
4,Prateado,0.000132


In [16]:
colors_convertions.rename({'index':'color', 'color':'porcentaje de conversiones por colores'}, axis = 1, inplace = True)
colors_convertions.head()

Unnamed: 0,color,porcentaje de conversiones por colores
0,Preto,0.00099
1,Dourado,0.000734
2,Branco,0.000388
3,Cinza espacial,0.000213
4,Prateado,0.000132


In [17]:
train_set = train_set.merge(colors_convertions, on = 'color',how= 'left')

In [18]:
train_set.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version,porcentaje de conversiones por colores
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,4.0,32.0,Dourado,,...,,,,,,,,,,0.000734
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,3.0,32.0,Cinza espacial,,...,,,,,,,,,,0.000213
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,3.0,64.0,Prateado,,...,,,,,,,,,,0.000132
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,2.0,128.0,Vermelho,,...,,,,,,,,,,6e-06
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,4.0,32.0,Branco,,...,,,,,,,,,,0.000388


In [19]:
train_set['storage'] = train_set['storage'].fillna(0)

In [20]:
train_set['condition'] = train_set['condition'].fillna(0)

In [21]:
train_set['color'] = train_set['color'].fillna(0)

In [22]:
#Me creo un dataframe con la cantidad de conversiones por usuario para mergear con el train set y crear el feature
events =pd.DataFrame()
events = events_data.loc[events_data['event'] == 'conversion', ['person']]

In [23]:
convertions = pd.DataFrame(events['person'].value_counts())

In [24]:
convertions.rename({'person':'convertions'},axis=1, inplace= True)
convertions = convertions.reset_index()

In [25]:
convertions.rename({'index':'person'}, axis = 1, inplace = True)

In [26]:
train_set = train_set.merge(convertions, on = 'person',how= 'left')

In [27]:
train_set['convertions'] = train_set['convertions'].fillna(0)

In [28]:
#Convierto los modelos de cel a sus valores segun la cantidad de conversiones sobre el total
models = pd.DataFrame()
models = events_data.loc[events_data['event'] == 'conversion', ['model']]

In [29]:
models_convertions = pd.DataFrame(models['model'].value_counts()/events_data.shape[0])

In [30]:
models_convertions = models_convertions.reset_index()

In [31]:
models_convertions.rename({'index':'model', 'model':'porcentaje de conversiones por modelos'}, axis = 1, inplace = True)
models_convertions.head()

Unnamed: 0,model,porcentaje de conversiones por modelos
0,Samsung Galaxy J5,0.000222
1,iPhone 5s,0.000212
2,iPhone 6,0.00017
3,iPhone 6S,0.000131
4,Motorola Moto G2 3G Dual,0.000116


In [32]:
train_set = train_set.merge(models_convertions, on = 'model',how= 'left')

In [33]:
train_set.rename({'convertions':'conversiones por usuario'},axis=1, inplace= True)

In [34]:
train_set.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,city,region,country,device_type,screen_resolution,operating_system_version,browser_version,porcentaje de conversiones por colores,conversiones por usuario,porcentaje de conversiones por modelos
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,4.0,32.0,Dourado,,...,,,,,,,,0.000734,0.0,9.4e-05
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,3.0,32.0,Cinza espacial,,...,,,,,,,,0.000213,0.0,0.000212
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,3.0,64.0,Prateado,,...,,,,,,,,0.000132,0.0,0.000131
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,2.0,128.0,Vermelho,,...,,,,,,,,6e-06,0.0,3.7e-05
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,4.0,32.0,Branco,,...,,,,,,,,0.000388,0.0,3e-06


In [35]:
#Feature de porcentaje de conversiones por condicion
conditions = pd.DataFrame()
conditions = events_data.loc[events_data['event'] == 'conversion', ['condition']]
condition_convertions = pd.DataFrame(conditions['condition'].value_counts()/events_data.shape[0])
condition_convertions = condition_convertions.reset_index()
condition_convertions.rename({'index':'condition', 'condition':'porcentaje de conversiones por condicion'}, axis = 1, inplace = True)
condition_convertions.head()

Unnamed: 0,condition,porcentaje de conversiones por condicion
0,2.0,0.001247
1,3.0,0.000915
2,4.0,0.000812
3,1.0,5.2e-05
4,5.0,1e-06


In [36]:
train_set = train_set.merge(condition_convertions, on = 'condition',how= 'left')

In [37]:
train_set['porcentaje de conversiones por colores'] = train_set['porcentaje de conversiones por colores'].fillna(0)

In [38]:
train_set['porcentaje de conversiones por modelos'] = train_set['porcentaje de conversiones por modelos'].fillna(0)

In [49]:
train_set.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,screen_resolution,operating_system_version,browser_version,porcentaje de conversiones por colores,conversiones por usuario,porcentaje de conversiones por modelos,porcentaje de conversiones por condicion,time,date,hour
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,4.0,32.0,Dourado,,...,,,,0.000734,0.0,9.4e-05,0.000812,00:11:59,2018-05-18,0
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,3.0,32.0,Cinza espacial,,...,,,,0.000213,0.0,0.000212,0.000915,00:11:27,2018-05-18,0
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,3.0,64.0,Prateado,,...,,,,0.000132,0.0,0.000131,0.000915,00:11:16,2018-05-18,0
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,2.0,128.0,Vermelho,,...,,,,6e-06,0.0,3.7e-05,0.001247,00:11:14,2018-05-18,0
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,4.0,32.0,Branco,,...,,,,0.000388,0.0,3e-06,0.000812,00:11:09,2018-05-18,0


In [48]:
train_set['hour'] = pd.to_datetime(train_set['timestamp']).dt.hour

In [41]:
train_set['date'] = pd.to_datetime(train_set['timestamp']).dt.date

In [52]:
#Porcentaje de conversiones por hora del dia
hours = pd.DataFrame()
hours = train_set.loc[train_set['event'] == 'conversion', ['hour']]
hour_convertions = pd.DataFrame(hours['hour'].value_counts()/events_data.shape[0])
hour_convertions = hour_convertions.reset_index()
hour_convertions.rename({'index':'hour', 'condition':'porcentaje de conversiones por hora del dia'}, axis = 1, inplace = True)
hour_convertions.head()

Unnamed: 0,hour,hour.1
0,21,0.000211
1,18,0.000203
2,17,0.000199
3,19,0.000193
4,15,0.00019
