In [60]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk

events = pd.read_csv('events_up_to_01062018.csv', low_memory = False)
training = pd.read_csv('labels_training_set.csv', low_memory = False)

In [61]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [62]:
eventsWithLabel = events.merge(training, how='left', left_on='person', right_on='person')

In [63]:
eventsWithLabel['timestamp'] = pd.to_datetime(eventsWithLabel['timestamp'])

In [64]:
diccWeek = {0: "Monday",
            1: "Tuesday",
            2: "Wednesday",
            3: "Thursday",
            4: "Friday",
            5: "Saturday",
            6: "Sunday"}

In [65]:
#Obtengo los dias de la semana
eventsWithLabel['dayweek'] = eventsWithLabel.timestamp.dt.dayofweek
eventsWithLabel["dayweek"] = eventsWithLabel["dayweek"].map(lambda x: diccWeek.get(x))
#Obtengo la fecha
eventsWithLabel['fecha'] = eventsWithLabel.timestamp.dt.date
#Separo la marca
eventsWithLabel['marca'] = eventsWithLabel['model'].map(lambda x: str(x).split()[0])

In [66]:
groupbyCantEventos = eventsWithLabel.groupby('person', as_index = False)['event'].agg({'cantEventos': 'count'})
groupbyCantEventos.sort_values(by = 'cantEventos', ascending = False)
eventsWithLabel = pd.merge(eventsWithLabel, groupbyCantEventos, on = 'person')

In [67]:
#Filtro a la gente que no tiene el evento visited_site
personasConVisitas = eventsWithLabel.loc[eventsWithLabel['event'] == 'visited site']\
                             .drop_duplicates(subset = 'person', keep = 'first')['person'].tolist()
dataNotVisit = eventsWithLabel.loc[eventsWithLabel['person'].isin(personasConVisitas)]

In [68]:
eventsWithLabel2 = dataNotVisit.loc[dataNotVisit['event'] == 'visited site'].groupby(['person', 'fecha'], as_index = False).agg({'city': 'first', 'region': 'first', 'country': 'first', 'device_type': 'first', 'operating_system_version': 'first', 'channel': 'first', 'new_vs_returning': 'first'})

In [69]:
eventsWithLabel3 = pd.merge(dataNotVisit, eventsWithLabel2, on = ['person', 'fecha'])
eventsWithLabel3.drop(['region_x', 'device_type_x', 'operating_system_version_x', 'country_x', 'city_x', 'channel_x', 'new_vs_returning_x'], axis = 1, inplace = True)
eventsWithLabel3.rename(columns = {'region_y': 'region', 'device_type_y': 'device_type', 'operating_system_version_y': 'SO', 'city_y': 'city', 'country_y': 'country', 'channel_y': 'channel', 'new_vs_returning_y': 'new_vs_returning'}, inplace =True)

In [70]:
def obtenerSistema(so):
    sistema = so.split()
    if len(sistema) > 1:
        otro = sistema[1].split('.')[0]
        return sistema[0] + ' ' + otro
    else:
        return sistema[0]

In [71]:
eventsWithLabel3['SO'] = eventsWithLabel3['SO'].map(obtenerSistema)

In [107]:
eventsWithLabel3["color"] = eventsWithLabel3.color.str.split(" ", expand=True)

In [None]:
#veo a los tipos que compraron (label == 1)

In [108]:
dataCompraron = eventsWithLabel3.loc[eventsWithLabel3["label"] == 1]

In [109]:
dataCompraron["model"].value_counts()

iPhone 6                                       4132
iPhone 5s                                      4011
iPhone 6S                                      3551
Samsung Galaxy S7 Edge                         2530
iPhone 7                                       2297
Samsung Galaxy S7                              1763
Samsung Galaxy S6 Edge                         1694
Samsung Galaxy S6 Flat                         1551
Samsung Galaxy S8                              1411
iPhone 5c                                      1364
iPhone 7 Plus                                  1313
iPhone SE                                      1130
iPhone 6S Plus                                 1093
Samsung Galaxy J7 Prime                         991
Samsung Galaxy J5                               899
Samsung Galaxy A7 2017                          888
iPhone 6 Plus                                   851
Motorola Moto G4 Plus                           844
Samsung Galaxy S5                               826
Samsung Gala

In [146]:
dataPersonRegion = eventsWithLabel3.loc[eventsWithLabel3["label"].notnull()].groupby("person")[["region", "label", "timestamp"]].agg("first")

In [161]:
dataRegiones = dataPersonRegion.groupby(["region", "label"]).count()
# dataRegionesFilter = dataPersonRegion.groupby(["region", "label" ])["timestamp"].count().unstack().fillna(value = 0)

dataRegionesFilter = dataRegiones.loc[dataRegiones["timestamp"] > 10]
dataRegionesFilter = dataRegionesFilter["timestamp"].unstack().fillna(value = 0)

In [162]:
dataRegionesFilter["ratioCompra"] = dataRegionesFilter[1.0]/dataRegionesFilter[0.0]

In [163]:
dataRegionesFilter.sort_values(by = "ratioCompra", ascending = False)

label,0.0,1.0,ratioCompra
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Santa Catarina,334.0,26.0,0.077844
Maranhao,317.0,24.0,0.07571
Para,291.0,22.0,0.075601
Rio Grande do Sul,539.0,36.0,0.06679
Minas Gerais,1487.0,88.0,0.05918
Parana,558.0,33.0,0.05914
Federal District,342.0,20.0,0.05848
Sao Paulo,5090.0,288.0,0.056582
Unknown,3099.0,168.0,0.054211
Espirito Santo,319.0,17.0,0.053292


In [165]:
regionesMasCompras = ["Santa Catarina", "Maranhao", "Para", "Rio Grande do Sul", "Minas Gerais", "Parana", "Federal District", "Sao Paulo", "Espirito Santo", "Rio de Janeiro", "Bahia", "Goias", "Pernambuco"]

In [72]:
eventsWithLabel3 = eventsWithLabel3.drop(columns=["url", "sku", "skus", "search_term", "staticpage", "campaign_source", "search_engine", "screen_resolution", "browser_version"])

In [73]:
#Empezamos con las sesiones.
eventsWithSessions = eventsWithLabel3.sort_values(by = ["person", "timestamp"])
eventsWithSessions["time_diff"] = (eventsWithSessions.timestamp - (eventsWithSessions.timestamp.shift())) / np.timedelta64(1, 's')
eventsWithSessions.loc[eventsWithSessions.person != eventsWithSessions.person.shift(), "time_diff"] = 0

In [74]:
eventsWithSessions["session_change"] = ((eventsWithSessions.event == "visited site") & (eventsWithSessions.time_diff > 1800.0)) | (eventsWithSessions.time_diff > 36000.0) | (eventsWithSessions.person != eventsWithSessions.person.shift())
eventsWithSessions["session_id"] = eventsWithSessions.groupby("person")["session_change"].cumsum()

In [75]:
#Obtener las personas con conversiones
dataConversion = eventsWithSessions.loc[eventsWithSessions['event'] == 'conversion']
personasCompraron = dataConversion.drop_duplicates(subset = 'person', keep = 'first')['person'].tolist()
dataPersonasCompraron = eventsWithSessions.loc[eventsWithSessions['person'].isin(personasCompraron)]

In [76]:
dataPersonasCompraron["aConversion"] = (dataPersonasCompraron.event == "conversion") | (dataPersonasCompraron.person != dataPersonasCompraron.person.shift())
dataPersonasCompraron["conversions"] = dataPersonasCompraron.groupby("person")["aConversion"].cumsum()
dataFirstConversion = dataPersonasCompraron.loc[dataPersonasCompraron.conversions == 1.0].groupby("person")["time_diff"].agg({"timeFirstConversion": "sum" })
eventsWithSessions = pd.merge(eventsWithSessions, dataFirstConversion, how='left', on = 'person')
eventsWithSessions['timeFirstConversion'] = eventsWithSessions['timeFirstConversion'].fillna(value=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
is deprecated and will be removed in a future version
  This is separate from the ipykernel package so we can avoid doing imports until


In [77]:
dataSession =  eventsWithSessions.groupby(["person", "session_id"], as_index=False)["time_diff"].agg({"sessionDuration": "sum" })
dataSession["sessionDuration"] = dataSession["sessionDuration"]/60

In [78]:
dataSessionGroupByPerson = dataSession.loc[dataSession.sessionDuration > 0.0].groupby("person")
dataSession = dataSessionGroupByPerson.agg({"sessionDuration": {"sessionDuration": "mean"}, "session_id": {"cantSessions": "count"}})
dataSession.columns = dataSession.columns.droplevel(0)

In [79]:
eventsWithSessions = pd.merge(eventsWithSessions, dataSession, how ='left', on = 'person')
eventsWithSessions["sessionDuration"] = eventsWithSessions["sessionDuration"].fillna(value = "0.0")
eventsWithSessions["cantSessions"] = eventsWithSessions["cantSessions"].fillna(value = "0.0")

In [80]:
#Funcion que realiza el oneHotEncoding
def oneHotEncoding(column, uniqueArray, dataFrame):
    label_encoder = LabelEncoder()
    label_encoder.fit(uniqueArray)
    integer_encoded = label_encoder.transform(dataFrame[column])
    onehot_encoder = OneHotEncoder(sparse=False)
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
    for i, item in enumerate(uniqueArray):
        dataFrame[item] = onehot_encoded[:, label_encoder.transform(uniqueArray)[i]]


In [81]:
eventsColor = eventsWithLabel3.loc[eventsWithLabel3["color"].notnull()][["color", "person"]]
eventsCondition = eventsWithLabel3.loc[eventsWithLabel3["condition"].notnull()][["condition", "person"]]
eventsStorage = eventsWithLabel3.loc[eventsWithLabel3["storage"].notnull()][["storage", "person"]]
eventsModel = eventsWithLabel3.loc[eventsWithLabel3["model"].notnull()][["model", "person"]]
eventsChannel = eventsWithLabel3.loc[eventsWithLabel3["channel"] != "Unknown"][["channel", "person"]]
eventsDevice = eventsWithLabel3.loc[eventsWithLabel3["device_type"] != "Unknown"][["device_type", "person"]]

In [82]:
eventsColor["color"] = eventsColor.color.str.split(" ", expand=True)
top20Colors = eventsColor['color'].value_counts().nlargest(20).index
top50models = eventsWithLabel3['model'].value_counts().nlargest(50).index

In [83]:
def colorGrouping(x):
    return x if x in top20Colors else 'OtroColor'

def modelGrouping(x):
    return x if x in top50models else 'OtroModelo'

In [84]:
eventsColor['color'] = eventsColor['color'].apply(colorGrouping)
eventsModel['model'] = eventsModel['model'].apply(modelGrouping)

In [85]:
eventsWithLabel3["SO"] = eventsWithLabel3.SO.str.split(" ", expand=True)

In [86]:
eventosUnicos = eventsWithLabel3.drop_duplicates(subset="event", keep="first").reset_index()["event"]
coloresUnicos = eventsColor.drop_duplicates(subset='color', keep='first').reset_index()["color"]
conditionUnicos = eventsCondition.drop_duplicates(subset='condition', keep='first').reset_index()["condition"]
storageUnicos = eventsStorage.drop_duplicates(subset='storage', keep='first').reset_index()["storage"]
modelosUnicos = eventsModel.drop_duplicates(subset='model', keep='first').reset_index()['model']
channelUnicos = eventsChannel.drop_duplicates(subset="channel", keep="first").reset_index()["channel"]
SOUnicos = eventsWithLabel3.drop_duplicates(subset="SO", keep="first").reset_index()["SO"]
deviceTypeUnicos = eventsDevice.drop_duplicates(subset="device_type", keep="first").reset_index()["device_type"]
daysWeekUnicos = eventsWithLabel3.drop_duplicates(subset="dayweek", keep="first").reset_index()["dayweek"]

In [87]:
oneHotEncoding("event", eventosUnicos, eventsWithLabel3)
oneHotEncoding("color", coloresUnicos, eventsColor)
oneHotEncoding("condition", conditionUnicos, eventsCondition)
oneHotEncoding("storage", storageUnicos, eventsStorage)
oneHotEncoding("model", modelosUnicos, eventsModel)
oneHotEncoding("channel", channelUnicos, eventsChannel)
oneHotEncoding("SO", SOUnicos, eventsWithLabel3)
oneHotEncoding("device_type", deviceTypeUnicos, eventsDevice)
oneHotEncoding("dayweek", daysWeekUnicos, eventsWithLabel3)

In [88]:
eventsFinale = eventsWithLabel3.drop(columns=["timestamp", "dayweek","event", "cantEventos","condition", "model", "storage", "color", "fecha", "SO", "marca", "city", "country", "region", "channel", "device_type", "new_vs_returning" ])

In [89]:
eventsColor1 = eventsColor.groupby("person").sum()
eventsCondition1 = eventsCondition.groupby("person").sum()
eventsDevice1 = eventsDevice.groupby("person").sum()
eventsModel1 = eventsModel.groupby("person").sum()
eventsStorage1 = eventsStorage.groupby("person").sum()
eventsChannel1 = eventsChannel.groupby("person").sum()

In [90]:
eventsWithSessionsFilter = eventsWithSessions.groupby("person")[["timeFirstConversion", "sessionDuration", "cantSessions"]].agg("first")

In [91]:
testDF = eventsFinale.loc[eventsFinale['label'].isnull()]
trainingDF = eventsFinale.loc[eventsFinale['label'].notnull()]

In [92]:
labels = trainingDF.groupby("person")["label"].agg("first")
testDFFinal = testDF.drop(columns=["label"]).groupby("person").sum()
trainingDFFinal = trainingDF.drop(columns=["label"]).groupby("person").sum()

In [93]:
testDFFinal = pd.merge(testDFFinal, eventsWithSessionsFilter, how="left" , on = "person")
testDFFinal = pd.merge(testDFFinal, eventsColor1, how="left" , on = "person")
testDFFinal = pd.merge(testDFFinal, eventsChannel1, how="left" , on = "person")
testDFFinal = pd.merge(testDFFinal, eventsCondition1, how="left" , on = "person")
testDFFinal = pd.merge(testDFFinal, eventsDevice1, how="left" , on = "person")
testDFFinal = pd.merge(testDFFinal, eventsModel1, how="left" , on = "person")
testDFFinal = pd.merge(testDFFinal, eventsStorage1, how="left" , on = "person")

In [94]:
testDFFinal = testDFFinal.fillna(value=0)

In [95]:
trainingDFFinal = pd.merge(trainingDFFinal, eventsWithSessionsFilter, how="left" , on = "person")
trainingDFFinal = pd.merge(trainingDFFinal,  eventsColor1, how="left" , on = "person")
trainingDFFinal = pd.merge(trainingDFFinal, eventsChannel1, how="left" , on = "person")
trainingDFFinal = pd.merge(trainingDFFinal, eventsCondition1, how="left" , on = "person")
trainingDFFinal = pd.merge(trainingDFFinal, eventsDevice1, how="left" , on = "person")
trainingDFFinal = pd.merge(trainingDFFinal, eventsModel1, how="left" , on = "person")
trainingDFFinal = pd.merge(trainingDFFinal, eventsStorage1, how="left" , on = "person")


In [96]:
trainingDFFinal = trainingDFFinal.fillna(value=0)
trainingDFFinal["label"] = labels

In [129]:
#Creamos los csv ya listo para entrenar a los algoritmos y para testear que tan buenos son.
trainingDFFinal.to_csv('setEntrenamiento.csv')
testDFFinal.to_csv('setTesteo.csv')