In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
from datetime import datetime

# Load the three input CSV files into `events`, `training` and `kaggle`.
events, training, kaggle = (
    pd.read_csv(path, low_memory=False)
    for path in (
        'events_up_to_01062018.csv',
        'labels_training_set.csv',
        'trocafone_kaggle_test.csv',
    )
)

In [17]:
# Preview the first rows of the events table.
events.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,...,,,,,,,,,,
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,...,,,,,,,,,,
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,...,,,,,,,,,,
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,Bom,128GB,Vermelho,,...,,,,,,,,,,
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,Excelente,32GB,Branco,,...,,,,,,,,,,


In [18]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [19]:
# Left-join `training` onto every event row by person id; rows whose person is
# absent from `training` keep missing values in the joined columns.
eventsWithLabel = pd.merge(events, training, how='left', on='person')

In [20]:
# Parse the timestamp column into datetimes so the .dt accessor can be used.
eventsWithLabel['timestamp'] = pd.to_datetime(eventsWithLabel.timestamp)

In [21]:
# Maps pandas' `dt.dayofweek` numbering (0 = Monday ... 6 = Sunday) to a name.
diccWeek = {0: "Monday",
            1: "Tuesday",
            2: "Wednesday",
            3: "Thursday",
            4: "Friday",
            5: "Saturday",
            6: "Sunday"}

# Maps a calendar month number (1-12) to its English name. These names are
# later used as one-hot column names, so the spelling matters; all twelve
# months are listed so a lookup never yields None.
diccMonth = {1: "January",
             2: "February",
             3: "March",
             4: "April",
             5: "May",
             6: "June",
             7: "July",
             8: "August",
             9: "September",
             10: "October",
             11: "November",
             12: "December"}

In [22]:
# Day of the week of each event, as an English name.
eventsWithLabel['dayweek'] = eventsWithLabel['timestamp'].dt.dayofweek.map(diccWeek.get)
# Calendar date (without the time component) of each event.
eventsWithLabel['fecha'] = eventsWithLabel['timestamp'].dt.date
# Brand = first whitespace-separated token of the model name. The .str accessor
# keeps a missing model as NaN (instead of the literal string "nan") and does
# not raise on a blank model name.
eventsWithLabel['marca'] = eventsWithLabel['model'].str.split().str[0]
# Month of each event, as an English name.
eventsWithLabel['month'] = eventsWithLabel['timestamp'].dt.month.map(diccMonth.get)

In [23]:
# Number of non-null `event` entries per person. `reset_index(name=...)` names
# the count column directly; the dict-renaming form of `agg` was removed in
# pandas 1.0. (The previous unassigned `sort_values` call had no effect.)
groupbyCantEventos = (
    eventsWithLabel.groupby('person')['event']
    .count()
    .reset_index(name='cantEventos')
)
# Attach the per-person count to every event row.
eventsWithLabel = pd.merge(eventsWithLabel, groupbyCantEventos, on='person')

In [24]:
# Keep only the people who have at least one 'visited site' event.
# Note: despite its name, `dataNotVisit` holds the rows of people who DO have
# such an event.
personasConVisitas = (
    eventsWithLabel.loc[eventsWithLabel['event'] == 'visited site', 'person']
    .unique()
    .tolist()
)
dataNotVisit = eventsWithLabel[eventsWithLabel['person'].isin(personasConVisitas)]

In [25]:
# For every (person, date) pair, take the first non-null value of each
# visit-level attribute found on that day's 'visited site' rows.
eventsWithLabel2 = (
    dataNotVisit[dataNotVisit['event'] == 'visited site']
    .groupby(['person', 'fecha'], as_index=False)
    .agg(dict.fromkeys(
        ['city', 'region', 'country', 'device_type',
         'operating_system_version', 'channel', 'new_vs_returning'],
        'first',
    ))
)

In [26]:
# Spread the per-day visit attributes onto every event of the same person and
# date (inner join), then keep only the joined copies (`_y`) of the overlapping
# columns under their plain names; the OS column is renamed to `SO`.
eventsWithLabel3 = (
    dataNotVisit
    .merge(eventsWithLabel2, on=['person', 'fecha'])
    .drop(columns=[
        'region_x', 'device_type_x', 'operating_system_version_x',
        'country_x', 'city_x', 'channel_x', 'new_vs_returning_x',
    ])
    .rename(columns={
        'region_y': 'region',
        'device_type_y': 'device_type',
        'operating_system_version_y': 'SO',
        'city_y': 'city',
        'country_y': 'country',
        'channel_y': 'channel',
        'new_vs_returning_y': 'new_vs_returning',
    })
)

In [27]:
# Display the merged event table to inspect the result of the previous step.
eventsWithLabel3

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,marca,month,cantEventos,city,SO,country,region,device_type,channel,new_vs_returning
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,...,Samsung,May,9,Cantagalo,Android 7,Brazil,Rio de Janeiro,Smartphone,Organic,New
1,2018-05-18 00:30:30,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,...,Samsung,May,9,Cantagalo,Android 7,Brazil,Rio de Janeiro,Smartphone,Organic,New
2,2018-05-18 00:07:23,search engine hit,4886f805,,,,,,,,...,,May,9,Cantagalo,Android 7,Brazil,Rio de Janeiro,Smartphone,Organic,New
3,2018-05-18 00:11:56,checkout,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,...,Samsung,May,9,Cantagalo,Android 7,Brazil,Rio de Janeiro,Smartphone,Organic,New
4,2018-05-18 00:11:35,viewed product,4886f805,,9287.0,Samsung Galaxy J7 Prime,Muito Bom,32GB,Dourado,,...,Samsung,May,9,Cantagalo,Android 7,Brazil,Rio de Janeiro,Smartphone,Organic,New
5,2018-05-18 00:10:52,searched products,4886f805,,,,,,,1194792729286108401086810854641363996385,...,,May,9,Cantagalo,Android 7,Brazil,Rio de Janeiro,Smartphone,Organic,New
6,2018-05-18 00:11:53,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,...,Samsung,May,9,Cantagalo,Android 7,Brazil,Rio de Janeiro,Smartphone,Organic,New
7,2018-05-18 00:07:22,generic listing,4886f805,,,,,,,"6594,6650,6664,7238,1059,6706,6721,480,12606,7...",...,,May,9,Cantagalo,Android 7,Brazil,Rio de Janeiro,Smartphone,Organic,New
8,2018-05-18 00:07:22,visited site,4886f805,,,,,,,,...,,May,9,Cantagalo,Android 7,Brazil,Rio de Janeiro,Smartphone,Organic,New
9,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,...,iPhone,May,65,São Paulo,Android 5.1.1,Brazil,Sao Paulo,Smartphone,Paid,Returning


In [28]:
# Keep only the first space-separated token of the colour and of the operating
# system string (e.g. "Android 7" -> "Android"). Selecting the token with
# `.str[0]` is explicit; assigning the multi-column result of
# `split(expand=True)` to a single column raises on current pandas.
eventsWithLabel3["color"] = eventsWithLabel3["color"].str.split(" ").str[0]
eventsWithLabel3["SO"] = eventsWithLabel3["SO"].str.split(" ").str[0]

In [29]:
# Rows with a label form `train`; rows without one form `test`, which drops
# the (entirely missing) label column.
train = eventsWithLabel3[eventsWithLabel3["label"].notnull()]
test = eventsWithLabel3[eventsWithLabel3["label"].isnull()].drop(columns="label")

In [30]:
# Attach the unlabeled event rows to each person listed in the Kaggle file;
# a left join keeps people that have no rows in `test`.
kaggle = kaggle.merge(test, how="left", on="person")

In [31]:
# kaggle2.loc[kaggle2["campaign_source"].notnull()].groupby("person")["campaign_source"].first().value_counts()

In [32]:
# Number of rows with a timestamp per (person, date).
dataVisitasTrain = train.groupby(["person", "fecha"])["timestamp"].count().reset_index()
dataVisitasKaggle = kaggle.groupby(["person", "fecha"])["timestamp"].count().reset_index()

In [33]:
# Average of the per-day counts for each person, stored as `visitasProm`.
# `reset_index(name=...)` replaces the dict-renaming `agg`, which was removed
# in pandas 1.0.
dataVisitasPromTrain = (
    dataVisitasTrain.groupby("person")["timestamp"]
    .mean()
    .reset_index(name="visitasProm")
)
dataVisitasPromKaggle = (
    dataVisitasKaggle.groupby("person")["timestamp"]
    .mean()
    .reset_index(name="visitasProm")
)

In [34]:
# Attach the per-person daily average to every row of each data set.
train = train.merge(dataVisitasPromTrain, how="left", on="person")
kaggle = kaggle.merge(dataVisitasPromKaggle, how="left", on="person")

In [35]:
# Determine whether each user is a returning one: `returning` is 1 when the
# person has at least one row marked "Returning", otherwise 0. The original
# `new_vs_returning` column is replaced by this per-person flag.
dataNewReturningTrain = (
    train.groupby(["person", "new_vs_returning"])["timestamp"]
    .count()
    .unstack()
    .fillna(0)
)
dataNewReturningTrain["returning"] = (dataNewReturningTrain["Returning"] > 0).astype("int64")
dataNewReturningTrain = dataNewReturningTrain.reset_index().drop(columns=["New", "Returning"])
train = train.drop(columns=["new_vs_returning"]).merge(dataNewReturningTrain, how="left", on="person")

# Same computation for the Kaggle set.
dataNewReturningKaggle = (
    kaggle.groupby(["person", "new_vs_returning"])["timestamp"]
    .count()
    .unstack()
    .fillna(0)
)
dataNewReturningKaggle["returning"] = (dataNewReturningKaggle["Returning"] > 0).astype("int64")
dataNewReturningKaggle = dataNewReturningKaggle.reset_index().drop(columns=["New", "Returning"])
kaggle = kaggle.drop(columns=["new_vs_returning"]).merge(dataNewReturningKaggle, how="left", on="person")


In [36]:
# Number of 'conversion' events per person, stored as `pastConversions`.
# People without conversions are absent from these frames.
# `reset_index(name=...)` replaces the dict-renaming `agg`, which was removed
# in pandas 1.0.
dataPastConversionsTrain = (
    train.loc[train["event"] == "conversion"]
    .groupby("person")["timestamp"]
    .count()
    .reset_index(name="pastConversions")
)
dataPastConversionsKaggle = (
    kaggle.loc[kaggle["event"] == "conversion"]
    .groupby("person")["timestamp"]
    .count()
    .reset_index(name="pastConversions")
)

In [37]:
# Attach the conversion counts; people with no conversion rows get 0.
train = (
    train.merge(dataPastConversionsTrain, how="left", on="person")
    .fillna({"pastConversions": 0})
)
kaggle = (
    kaggle.merge(dataPastConversionsKaggle, how="left", on="person")
    .fillna({"pastConversions": 0})
)

In [38]:
# One row per person with the first non-null region, label and timestamp.
dataPersonRegion = train.groupby("person")[["region", "label", "timestamp"]].first()

In [39]:
# Number of people per (region, label) pair.
dataRegiones = dataPersonRegion.groupby(["region", "label"]).count()

# Keep only the pairs with more than 10 people, then pivot the label values
# into columns; pairs removed by the filter show up as 0.
dataRegionesFilter = (
    dataRegiones.loc[dataRegiones["timestamp"] > 10, "timestamp"]
    .unstack()
    .fillna(value=0)
)

In [40]:
# Ratio of people with label 1.0 to people with label 0.0 in each region.
dataRegionesFilter["ratioCompra"] = dataRegionesFilter[1.0].div(dataRegionesFilter[0.0])

In [41]:
# Show the regions ordered from highest to lowest ratio.
dataRegionesFilter.sort_values("ratioCompra", ascending=False)

label,0.0,1.0,ratioCompra
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Santa Catarina,334.0,26.0,0.077844
Maranhao,317.0,24.0,0.07571
Para,291.0,22.0,0.075601
Rio Grande do Sul,539.0,36.0,0.06679
Minas Gerais,1487.0,88.0,0.05918
Parana,558.0,33.0,0.05914
Federal District,342.0,20.0,0.05848
Sao Paulo,5090.0,288.0,0.056582
Unknown,3099.0,168.0,0.054211
Espirito Santo,319.0,17.0,0.053292


In [42]:
# Hand-picked list of regions used further down to build the
# `regionMorePurchase` flag.
regionesMasCompras = [
    "Santa Catarina",
    "Maranhao",
    "Para",
    "Rio Grande do Sul",
    "Minas Gerais",
    "Parana",
    "Federal District",
    "Sao Paulo",
    "Espirito Santo",
    "Rio de Janeiro",
    "Bahia",
    "Goias",
    "Pernambuco",
]

In [43]:
# Remove the columns that are not used for feature building from both sets.
train, kaggle = (
    frame.drop(columns=[
        "url", "sku", "skus", "search_term", "staticpage",
        "campaign_source", "search_engine", "screen_resolution",
        "browser_version",
    ])
    for frame in (train, kaggle)
)

In [44]:
# Order the events chronologically within each person.
eventsWithSessionsTrain, eventsWithSessionsKaggle = (
    frame.sort_values(by=["person", "timestamp"]) for frame in (train, kaggle)
)

In [45]:
# First (earliest, given the sort above) non-null timestamp of each person.
dataPrimerIngresoTrain = eventsWithSessionsTrain.groupby("person", as_index=False)["timestamp"].first()
dataPrimerIngresoKaggle = eventsWithSessionsKaggle.groupby("person", as_index=False)["timestamp"].first()

In [46]:
# Reference date against which the "days since first visit" feature is measured.
date_str, format_str = '2018-06-01', '%Y-%m-%d'
finalDate = datetime.strptime(date_str, format_str)

In [47]:
# Whole days between each person's first timestamp and `finalDate`; the raw
# timestamp is dropped once the day count has been derived.
dataPrimerIngresoTrain["primerIngreso"] = (finalDate - dataPrimerIngresoTrain["timestamp"]).dt.days
dataPrimerIngresoTrain = dataPrimerIngresoTrain.drop(columns="timestamp")

dataPrimerIngresoKaggle["primerIngreso"] = (finalDate - dataPrimerIngresoKaggle["timestamp"]).dt.days
dataPrimerIngresoKaggle = dataPrimerIngresoKaggle.drop(columns="timestamp")

In [48]:
# Attach `primerIngreso` to every event row of the corresponding person.
eventsWithSessionsTrain = eventsWithSessionsTrain.merge(dataPrimerIngresoTrain, how="left", on="person")
eventsWithSessionsKaggle = eventsWithSessionsKaggle.merge(dataPrimerIngresoKaggle, how="left", on="person")

In [49]:
# Session reconstruction starts here.

# Seconds elapsed since the previous row; the first row of each person is
# forced to 0 so gaps are never measured across two different people.
eventsWithSessionsTrain["time_diff"] = eventsWithSessionsTrain["timestamp"].diff() / np.timedelta64(1, 's')
eventsWithSessionsTrain.loc[
    eventsWithSessionsTrain["person"].ne(eventsWithSessionsTrain["person"].shift()),
    "time_diff",
] = 0

eventsWithSessionsKaggle["time_diff"] = eventsWithSessionsKaggle["timestamp"].diff() / np.timedelta64(1, 's')
eventsWithSessionsKaggle.loc[
    eventsWithSessionsKaggle["person"].ne(eventsWithSessionsKaggle["person"].shift()),
    "time_diff",
] = 0

In [50]:
# A row opens a new session when any of these holds:
#   * it is a 'visited site' event more than 1800 s after the previous row,
#   * more than 36000 s passed since the previous row,
#   * it is the first row of a person.
# NOTE(review): 36000 s is 10 hours - confirm this threshold is intended.
# `session_id` is the running count of session starts within each person.
eventsWithSessionsTrain["session_change"] = (
    (eventsWithSessionsTrain["event"].eq("visited site") & eventsWithSessionsTrain["time_diff"].gt(1800.0))
    | eventsWithSessionsTrain["time_diff"].gt(36000.0)
    | eventsWithSessionsTrain["person"].ne(eventsWithSessionsTrain["person"].shift())
)
eventsWithSessionsTrain["session_id"] = eventsWithSessionsTrain.groupby("person")["session_change"].cumsum()

eventsWithSessionsKaggle["session_change"] = (
    (eventsWithSessionsKaggle["event"].eq("visited site") & eventsWithSessionsKaggle["time_diff"].gt(1800.0))
    | eventsWithSessionsKaggle["time_diff"].gt(36000.0)
    | eventsWithSessionsKaggle["person"].ne(eventsWithSessionsKaggle["person"].shift())
)
eventsWithSessionsKaggle["session_id"] = eventsWithSessionsKaggle.groupby("person")["session_change"].cumsum()

In [51]:
# #Obtener las personas con conversiones
# dataConversionTrain = eventsWithSessionsTrain.loc[eventsWithSessionsTrain['event'] == 'conversion']
# personasCompraronTrain = dataConversionTrain.drop_duplicates(subset = 'person', keep = 'first')['person'].tolist()
# dataPersonasCompraronTrain = eventsWithSessionsTrain.loc[eventsWithSessionsTrain['person'].isin(personasCompraronTrain)]

# dataConversionKaggle = eventsWithSessionsKaggle.loc[eventsWithSessionsKaggle['event'] == 'conversion']
# personasCompraronKaggle = dataConversionKaggle.drop_duplicates(subset = 'person', keep = 'first')['person'].tolist()
# dataPersonasCompraronKaggle = eventsWithSessionsKaggle.loc[eventsWithSessionsKaggle['person'].isin(personasCompraronKaggle)]

In [52]:
# dataPersonasCompraron["aConversion"] = (dataPersonasCompraron.event == "conversion") | (dataPersonasCompraron.person != dataPersonasCompraron.person.shift())
# dataPersonasCompraron["conversions"] = dataPersonasCompraron.groupby("person")["aConversion"].cumsum()
# dataFirstConversion = dataPersonasCompraron.loc[dataPersonasCompraron.conversions == 1.0].groupby("person", as_index= False)["time_diff"].agg({"timeFirstConversion": "sum" })
# eventsWithSessions = pd.merge(eventsWithSessions, dataFirstConversion, how='left', on = 'person')
# eventsWithSessions['timeFirstConversion'] = eventsWithSessions['timeFirstConversion'].fillna(value=0)

In [53]:
# Duration of every (person, session) in minutes: the sum of the gaps between
# consecutive rows of the session (seconds) divided by 60.
# `reset_index(name=...)` replaces the dict-renaming `agg`, which was removed
# in pandas 1.0.
dataSessionTrain = (
    eventsWithSessionsTrain.groupby(["person", "session_id"])["time_diff"]
    .sum()
    .reset_index(name="sessionDuration")
)
dataSessionTrain["sessionDuration"] = dataSessionTrain["sessionDuration"] / 60

dataSessionKaggle = (
    eventsWithSessionsKaggle.groupby(["person", "session_id"])["time_diff"]
    .sum()
    .reset_index(name="sessionDuration")
)
dataSessionKaggle["sessionDuration"] = dataSessionKaggle["sessionDuration"] / 60

In [54]:
# Per-person session statistics, computed over sessions with a positive
# duration only:
#   cantSessions        - number of such sessions
#   sessionDurationMean - their mean duration in minutes
# Columns are renamed by name (not by position) so the labels cannot be
# swapped, and both data sets end up with the same column names. A flat
# {column: function} dict is used because nested renaming dicts were removed
# in pandas 1.0.
dataSessionGroupByPersonTrain = dataSessionTrain.loc[dataSessionTrain.sessionDuration > 0.0].groupby("person", as_index = False)
dataSessionTrain = (
    dataSessionGroupByPersonTrain
    .agg({"session_id": "count", "sessionDuration": "mean"})
    .rename(columns={"session_id": "cantSessions", "sessionDuration": "sessionDurationMean"})
    [["person", "cantSessions", "sessionDurationMean"]]
)

dataSessionGroupByPersonKaggle = dataSessionKaggle.loc[dataSessionKaggle.sessionDuration > 0.0].groupby("person", as_index = False)
dataSessionKaggle = (
    dataSessionGroupByPersonKaggle
    .agg({"session_id": "count", "sessionDuration": "mean"})
    .rename(columns={"session_id": "cantSessions", "sessionDuration": "sessionDurationMean"})
    [["person", "cantSessions", "sessionDurationMean"]]
)

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


In [55]:
# Helper that performs the one-hot encoding.
def oneHotEncoding(column, uniqueArray, dataFrame):
    """Add one 0.0/1.0 indicator column per category to ``dataFrame`` in place.

    For every value ``item`` in ``uniqueArray`` a float column named ``item``
    is created (or overwritten) holding 1.0 on the rows where
    ``dataFrame[column]`` equals ``item`` and 0.0 everywhere else.

    Parameters
    ----------
    column : label of the column of ``dataFrame`` to encode.
    uniqueArray : iterable with the category values to create columns for.
    dataFrame : pandas.DataFrame that is mutated in place.

    Returns
    -------
    None

    Notes
    -----
    The indicators are built by direct comparison rather than through
    scikit-learn encoders, so the result does not depend on the scikit-learn
    version and stays correct when ``uniqueArray`` lists a category that does
    not occur in the data (that column is simply all zeros). Values of
    ``column`` that are not listed in ``uniqueArray`` get no column of their
    own and are 0.0 in every indicator.
    """
    # Snapshot the source values first so that an indicator column that happens
    # to share its name with `column` cannot affect the remaining comparisons.
    source = dataFrame[column].copy()
    for item in uniqueArray:
        # A missing-value category has to be matched with isnull(): NaN never
        # compares equal to anything.
        matches = source.isnull() if pd.isnull(item) else source == item
        dataFrame[item] = matches.astype(float)


In [56]:
# Two-column (attribute, person) tables, one per attribute, restricted to the
# rows where the attribute is usable: non-null for the product attributes,
# different from "Unknown" for channel and device type.
# Each table is an explicit copy selected with a single `.loc[rows, columns]`
# call, because indicator columns are added to these tables in place later on;
# mutating a chained-indexing slice triggers pandas' SettingWithCopyWarning.
eventsColorTrain = eventsWithSessionsTrain.loc[eventsWithSessionsTrain["color"].notnull(), ["color", "person"]].copy()
eventsConditionTrain = eventsWithSessionsTrain.loc[eventsWithSessionsTrain["condition"].notnull(), ["condition", "person"]].copy()
eventsStorageTrain = eventsWithSessionsTrain.loc[eventsWithSessionsTrain["storage"].notnull(), ["storage", "person"]].copy()
eventsModelTrain = eventsWithSessionsTrain.loc[eventsWithSessionsTrain["model"].notnull(), ["model", "person"]].copy()
eventsChannelTrain = eventsWithSessionsTrain.loc[eventsWithSessionsTrain["channel"] != "Unknown", ["channel", "person"]].copy()
eventsDeviceTrain = eventsWithSessionsTrain.loc[eventsWithSessionsTrain["device_type"] != "Unknown", ["device_type", "person"]].copy()

eventsColorKaggle = eventsWithSessionsKaggle.loc[eventsWithSessionsKaggle["color"].notnull(), ["color", "person"]].copy()
eventsConditionKaggle = eventsWithSessionsKaggle.loc[eventsWithSessionsKaggle["condition"].notnull(), ["condition", "person"]].copy()
eventsStorageKaggle = eventsWithSessionsKaggle.loc[eventsWithSessionsKaggle["storage"].notnull(), ["storage", "person"]].copy()
eventsModelKaggle = eventsWithSessionsKaggle.loc[eventsWithSessionsKaggle["model"].notnull(), ["model", "person"]].copy()
eventsChannelKaggle = eventsWithSessionsKaggle.loc[eventsWithSessionsKaggle["channel"] != "Unknown", ["channel", "person"]].copy()
eventsDeviceKaggle = eventsWithSessionsKaggle.loc[eventsWithSessionsKaggle["device_type"] != "Unknown", ["device_type", "person"]].copy()

In [57]:
# eventsColor["color"] = eventsColor.color.str.split(" ", expand=True)
# top20Colors = eventsColor['color'].value_counts().nlargest(20).index
# top50models = eventsWithLabel3['model'].value_counts().nlargest(50).index

In [58]:
# def colorGrouping(x):
#     return x if x in top20Colors else 'OtroColor'

# def modelGrouping(x):
#     return x if x in top50models else 'OtroModelo'

In [59]:
# eventsColor['color'] = eventsColor['color'].apply(colorGrouping)
# eventsModel['model'] = eventsModel['model'].apply(modelGrouping)

In [60]:
# Distinct values of every categorical column, in order of first appearance,
# as Series with a fresh 0..n-1 index. They are the category lists handed to
# `oneHotEncoding`. (The operating system column `SO` is not encoded.)
eventosUnicosTrain = eventsWithSessionsTrain["event"].drop_duplicates().reset_index(drop=True)
coloresUnicosTrain = eventsColorTrain["color"].drop_duplicates().reset_index(drop=True)
conditionUnicosTrain = eventsConditionTrain["condition"].drop_duplicates().reset_index(drop=True)
storageUnicosTrain = eventsStorageTrain["storage"].drop_duplicates().reset_index(drop=True)
modelosUnicosTrain = eventsModelTrain["model"].drop_duplicates().reset_index(drop=True)
channelUnicosTrain = eventsChannelTrain["channel"].drop_duplicates().reset_index(drop=True)
deviceTypeUnicosTrain = eventsDeviceTrain["device_type"].drop_duplicates().reset_index(drop=True)
daysWeekUnicosTrain = eventsWithSessionsTrain["dayweek"].drop_duplicates().reset_index(drop=True)
monthUnicosTrain = eventsWithSessionsTrain["month"].drop_duplicates().reset_index(drop=True)

eventosUnicosKaggle = eventsWithSessionsKaggle["event"].drop_duplicates().reset_index(drop=True)
coloresUnicosKaggle = eventsColorKaggle["color"].drop_duplicates().reset_index(drop=True)
conditionUnicosKaggle = eventsConditionKaggle["condition"].drop_duplicates().reset_index(drop=True)
storageUnicosKaggle = eventsStorageKaggle["storage"].drop_duplicates().reset_index(drop=True)
modelosUnicosKaggle = eventsModelKaggle["model"].drop_duplicates().reset_index(drop=True)
channelUnicosKaggle = eventsChannelKaggle["channel"].drop_duplicates().reset_index(drop=True)
deviceTypeUnicosKaggle = eventsDeviceKaggle["device_type"].drop_duplicates().reset_index(drop=True)
daysWeekUnicosKaggle = eventsWithSessionsKaggle["dayweek"].drop_duplicates().reset_index(drop=True)
monthUnicosKaggle = eventsWithSessionsKaggle["month"].drop_duplicates().reset_index(drop=True)


In [61]:
# One-hot encode every categorical column in place: first all the training
# tables, then all the Kaggle tables. Each entry is
# (column to encode, its distinct values, table that receives the columns).
for _columna, _unicos, _tabla in (
    ("event", eventosUnicosTrain, eventsWithSessionsTrain),
    ("color", coloresUnicosTrain, eventsColorTrain),
    ("condition", conditionUnicosTrain, eventsConditionTrain),
    ("storage", storageUnicosTrain, eventsStorageTrain),
    ("model", modelosUnicosTrain, eventsModelTrain),
    ("channel", channelUnicosTrain, eventsChannelTrain),
    ("device_type", deviceTypeUnicosTrain, eventsDeviceTrain),
    ("dayweek", daysWeekUnicosTrain, eventsWithSessionsTrain),
    ("month", monthUnicosTrain, eventsWithSessionsTrain),
    ("event", eventosUnicosKaggle, eventsWithSessionsKaggle),
    ("color", coloresUnicosKaggle, eventsColorKaggle),
    ("condition", conditionUnicosKaggle, eventsConditionKaggle),
    ("storage", storageUnicosKaggle, eventsStorageKaggle),
    ("model", modelosUnicosKaggle, eventsModelKaggle),
    ("channel", channelUnicosKaggle, eventsChannelKaggle),
    ("device_type", deviceTypeUnicosKaggle, eventsDeviceKaggle),
    ("dayweek", daysWeekUnicosKaggle, eventsWithSessionsKaggle),
    ("month", monthUnicosKaggle, eventsWithSessionsKaggle),
):
    oneHotEncoding(_columna, _unicos, _tabla)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [62]:
# Drop the raw columns that have been encoded, or that served only as
# intermediate values, leaving the person id plus the derived features.
eventsWithSessions2Train, eventsWithSessions2Kaggle = (
    frame.drop(columns=[
        "timestamp", "dayweek", "month", "event", "cantEventos",
        "condition", "model", "storage", "color", "fecha", "SO", "marca",
        "city", "country", "channel", "device_type", "session_id",
        "session_change", "time_diff",
    ])
    for frame in (eventsWithSessionsTrain, eventsWithSessionsKaggle)
)


In [63]:
# Working aliases (same objects, no copy) used to build the event counts.
eventsFinaleTrain, eventsFinaleKaggle = eventsWithSessions2Train, eventsWithSessions2Kaggle

In [64]:
# Column selections: every event-type column plus the person id.
eventosUnicosWithPersonTrain = eventosUnicosTrain.tolist() + ["person"]

eventosUnicosWithPersonKaggle = eventosUnicosKaggle.tolist() + ["person"]

In [65]:
# Per-person count of every event type: keep the event indicator columns plus
# the person id and add the indicators up by person.
eventsFinaleTrain = eventsFinaleTrain.loc[:, eventosUnicosWithPersonTrain]
eventsFinaleFilterTrain = eventsFinaleTrain.groupby("person").sum()

# The Kaggle table is deliberately given the TRAINING event columns so both
# feature sets share the same schema. `reindex` creates a zero-filled column
# for an event type that never occurs in the Kaggle data, whereas selecting a
# missing label with `.loc` raises a KeyError on current pandas.
eventsFinaleKaggle = eventsFinaleKaggle.reindex(columns=eventosUnicosWithPersonTrain, fill_value=0.0)
eventsFinaleFilterKaggle = eventsFinaleKaggle.groupby("person").sum()

In [66]:
# Per-person totals of the one-hot indicator columns of each attribute table.
# `numeric_only=True` is explicit so the raw string column of each table
# (color, condition, ...) is left out of the result; pandas 2 would otherwise
# concatenate those strings per person.
eventsColor1Train = eventsColorTrain.groupby("person").sum(numeric_only=True)
eventsCondition1Train = eventsConditionTrain.groupby("person").sum(numeric_only=True)
eventsDevice1Train = eventsDeviceTrain.groupby("person").sum(numeric_only=True)
eventsModel1Train = eventsModelTrain.groupby("person").sum(numeric_only=True)
eventsStorage1Train = eventsStorageTrain.groupby("person").sum(numeric_only=True)
eventsChannel1Train = eventsChannelTrain.groupby("person").sum(numeric_only=True)

eventsColor1Kaggle = eventsColorKaggle.groupby("person").sum(numeric_only=True)
eventsCondition1Kaggle = eventsConditionKaggle.groupby("person").sum(numeric_only=True)
eventsDevice1Kaggle = eventsDeviceKaggle.groupby("person").sum(numeric_only=True)
eventsModel1Kaggle = eventsModelKaggle.groupby("person").sum(numeric_only=True)
eventsStorage1Kaggle = eventsStorageKaggle.groupby("person").sum(numeric_only=True)
eventsChannel1Kaggle = eventsChannelKaggle.groupby("person").sum(numeric_only=True)

In [67]:
# Person-level features: remove the event indicator columns and keep, for each
# person, the first non-null value of every remaining column.
eventsWithSessionsFilterTrain = (
    eventsWithSessions2Train.drop(eventosUnicosTrain, axis=1)
    .groupby("person")
    .first()
)
eventsWithSessionsFilterKaggle = (
    eventsWithSessions2Kaggle.drop(eventosUnicosKaggle, axis=1)
    .groupby("person")
    .first()
)

In [68]:
# 1 when the person's region is listed in `regionesMasCompras`, otherwise 0.
eventsWithSessionsFilterTrain["regionMorePurchase"] = (
    eventsWithSessionsFilterTrain["region"].isin(regionesMasCompras).astype("int64")
)
eventsWithSessionsFilterKaggle["regionMorePurchase"] = (
    eventsWithSessionsFilterKaggle["region"].isin(regionesMasCompras).astype("int64")
)

In [69]:
# The raw region is no longer needed once the flag has been derived.
eventsWithSessionsFilterTrain, eventsWithSessionsFilterKaggle = (
    frame.drop(columns="region")
    for frame in (eventsWithSessionsFilterTrain, eventsWithSessionsFilterKaggle)
)

In [70]:
# Assemble the final per-person feature tables: person-level features joined
# with the event counts (inner join), then the device-type and storage counts
# (left joins). The model, colour, channel and condition counts are
# intentionally not merged in.
trainDF = (
    eventsWithSessionsFilterTrain
    .merge(eventsFinaleFilterTrain, on="person")
    .merge(eventsDevice1Train, how="left", on="person")
    .merge(eventsStorage1Train, how="left", on="person")
)

kaggleDF = (
    eventsWithSessionsFilterKaggle
    .merge(eventsFinaleFilterKaggle, on="person")
    .merge(eventsDevice1Kaggle, how="left", on="person")
    .merge(eventsStorage1Kaggle, how="left", on="person")
)

In [71]:
# Replace every remaining missing value with 0 in both feature tables.
trainDF, kaggleDF = (frame.fillna(value=0) for frame in (trainDF, kaggleDF))

In [72]:
# from sklearn.preprocessing import MinMaxScaler

# scaler = MinMaxScaler()
# scaled_values = scaler.fit_transform(trainDF)
# trainDF.loc[:, :] = scaled_values
# scaled_values = scaler.fit_transform(testDFFinal)
# testDFFinal.loc[:, :] = scaled_values


In [73]:
# Write the CSV files that are ready for training the algorithms and for
# evaluating how good they are.
for _tabla, _destino in ((trainDF, 'setEntrenamiento.csv'), (kaggleDF, 'setKaggle.csv')):
    _tabla.to_csv(_destino)