In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk

# Raw browsing/event log: one row per event.
events = pd.read_csv(filepath_or_buffer='events_up_to_01062018.csv')
# Training labels: one row per person with a `label` column.
training = pd.read_csv(filepath_or_buffer='labels_training_set.csv')

In [4]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score, classification_report, mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [5]:
# Inspect the column names, dtypes and memory footprint of the raw event log.
events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2341681 entries, 0 to 2341680
Data columns (total 23 columns):
timestamp                   object
event                       object
person                      object
url                         object
sku                         float64
model                       object
condition                   object
storage                     object
color                       object
skus                        object
search_term                 object
staticpage                  object
campaign_source             object
search_engine               object
channel                     object
new_vs_returning            object
city                        object
region                      object
country                     object
device_type                 object
screen_resolution           object
operating_system_version    object
browser_version             object
dtypes: float64(1), object(22)
memory usage: 410.9+ MB


In [6]:
# Peek at the first rows of the labels table (columns: person, label).
training.head()

Unnamed: 0,person,label
0,0566e9c1,0
1,6ec7ee77,0
2,abe7a2fb,0
3,34728364,0
4,87ed62de,0


In [7]:
# Attach each person's label to every one of their events. This is a left join,
# so events of people absent from the labels table are kept with a NaN label.
eventsWithLabel = pd.merge(events, training, how='left', on='person')
eventsWithLabel.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version,label
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,...,,,,,,,,,,
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,...,,,,,,,,,,0.0
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,...,,,,,,,,,,
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,Bom,128GB,Vermelho,,...,,,,,,,,,,
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,Excelente,32GB,Branco,,...,,,,,,,,,,


In [8]:
# Parse the timestamp strings into datetime64 values so the `.dt` accessors
# (day of week, date) and time arithmetic can be used in later cells.
eventsWithLabel['timestamp'] = pd.to_datetime(eventsWithLabel['timestamp'])

In [9]:
# Build `dicc`: person -> list of the distinct event types that person generated,
# in order of first appearance.
groupbyPersonas = eventsWithLabel.loc[:, ['person', 'event']].groupby('person')
# One vectorised `unique()` per group replaces the per-person Python loop that
# called `drop_duplicates` on every group (and its dead `lista = []` assignment).
dicc = {
    persona: list(eventos)
    for persona, eventos in groupbyPersonas['event'].unique().items()
}

In [10]:
# List of all the distinct event types this person has generated
eventsWithLabel['eventos'] = eventsWithLabel['person'].map(dicc.__getitem__)
# Day of the week (integer) of the event
eventsWithLabel['dayweek'] = eventsWithLabel['timestamp'].dt.dayofweek
# Calendar date of the event
eventsWithLabel['fecha'] = eventsWithLabel['timestamp'].dt.date
# Brand = first whitespace-separated word of the model name.
# NOTE: a missing model (NaN) is stringified first, so its brand is the literal 'nan'.
eventsWithLabel['marca'] = [str(modelo).split()[0] for modelo in eventsWithLabel['model']]

In [11]:
# Count how many events each person generated and attach that count to every
# event row as `cantEventos`.
# (Original author's note: number of people who visited: 27624.)
#
# `count().reset_index(name=...)` replaces `agg({'cantEventos': 'count'})`:
# dict-based renaming on a SeriesGroupBy was deprecated and then removed in
# pandas 1.0, where it raises SpecificationError. The previous un-assigned
# `sort_values(...)` call was a no-op and has been dropped.
groupbyCantEventos = (
    eventsWithLabel.groupby('person')['event']
    .count()
    .reset_index(name='cantEventos')
)
eventsWithLabel = pd.merge(eventsWithLabel, groupbyCantEventos, on='person')

In [12]:
# Filter out the people that have no 'visited site' event at all.
# Despite its name, `dataNotVisit` holds the events of people who DO have a visit.
personasConVisitas = (
    eventsWithLabel.loc[eventsWithLabel['event'] == 'visited site', 'person']
    .unique()
    .tolist()
)
dataNotVisit = eventsWithLabel[eventsWithLabel['person'].isin(personasConVisitas)]

In [13]:
# For every (person, date) pair, take the location / device / channel attributes
# reported by that day's 'visited site' events (aggregated with 'first').
eventsWithLabel2 = (
    dataNotVisit[dataNotVisit['event'] == 'visited site']
    .groupby(['person', 'fecha'], as_index=False)
    .agg(dict.fromkeys(
        ['city', 'region', 'country', 'device_type',
         'operating_system_version', 'channel', 'new_vs_returning'],
        'first',
    ))
)

In [14]:
# Propagate the per-day visit attributes to every event of that person on that day.
# The original per-event copies (suffix `_x`) are discarded and the visit-level
# values (suffix `_y`) take their place; the OS column is renamed to `SO`.
eventsWithLabel3 = (
    pd.merge(dataNotVisit, eventsWithLabel2, on=['person', 'fecha'])
    .drop(
        ['region_x', 'device_type_x', 'operating_system_version_x', 'country_x',
         'city_x', 'channel_x', 'new_vs_returning_x'],
        axis=1,
    )
    .rename(columns={
        'region_y': 'region',
        'device_type_y': 'device_type',
        'operating_system_version_y': 'SO',
        'city_y': 'city',
        'country_y': 'country',
        'channel_y': 'channel',
        'new_vs_returning_y': 'new_vs_returning',
    })
)

In [15]:
def obtenerSistema(so):
    """Reduce an operating-system string to '<first word> <major version>'.

    The first two whitespace-separated tokens are kept and the second one is cut
    at its first dot, e.g. 'Android 7.1.1' -> 'Android 7'. A single-token value
    such as 'Linux' is returned as is. Note that only the first two tokens are
    considered, so 'Mac OS X 10.11' becomes 'Mac OS'.

    Parameters
    ----------
    so : str or missing value
        Raw operating-system description.

    Returns
    -------
    The shortened string. Non-string input (None, NaN) is returned unchanged and
    an empty / whitespace-only string yields '' instead of raising.
    """
    # Missing values reach this function as None/float NaN when mapped over a
    # column; pass them through rather than failing on `.split`.
    if not isinstance(so, str):
        return so
    sistema = so.split()
    if not sistema:
        # Empty or whitespace-only string: nothing to extract.
        return ''
    if len(sistema) == 1:
        return sistema[0]
    version_mayor = sistema[1].split('.')[0]
    return sistema[0] + ' ' + version_mayor

In [16]:
# Collapse every OS description to '<name> <major version>' via obtenerSistema.
eventsWithLabel3['SO'] = eventsWithLabel3['SO'].apply(obtenerSistema)

In [17]:
# Check the columns and dtypes of the enriched event table after the merge/rename.
eventsWithLabel3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2319285 entries, 0 to 2319284
Data columns (total 29 columns):
timestamp            datetime64[ns]
event                object
person               object
url                  object
sku                  float64
model                object
condition            object
storage              object
color                object
skus                 object
search_term          object
staticpage           object
campaign_source      object
search_engine        object
screen_resolution    object
browser_version      object
label                float64
eventos              object
dayweek              int64
fecha                object
marca                object
cantEventos          int64
city                 object
SO                   object
country              object
region               object
device_type          object
channel              object
new_vs_returning     object
dtypes: datetime64[ns](1), float64(2), int64(2), object(24)
memory 

In [18]:
# Events of people that have no label (label is null).
testDF = eventsWithLabel3[eventsWithLabel3['label'].isna()]
testDF.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,fecha,marca,cantEventos,city,SO,country,region,device_type,channel,new_vs_returning
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,...,2018-05-18,Samsung,9,Cantagalo,Android 7,Brazil,Rio de Janeiro,Smartphone,Organic,New
1,2018-05-18 00:30:30,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,...,2018-05-18,Samsung,9,Cantagalo,Android 7,Brazil,Rio de Janeiro,Smartphone,Organic,New
2,2018-05-18 00:07:23,search engine hit,4886f805,,,,,,,,...,2018-05-18,,9,Cantagalo,Android 7,Brazil,Rio de Janeiro,Smartphone,Organic,New
3,2018-05-18 00:11:56,checkout,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,...,2018-05-18,Samsung,9,Cantagalo,Android 7,Brazil,Rio de Janeiro,Smartphone,Organic,New
4,2018-05-18 00:11:35,viewed product,4886f805,,9287.0,Samsung Galaxy J7 Prime,Muito Bom,32GB,Dourado,,...,2018-05-18,Samsung,9,Cantagalo,Android 7,Brazil,Rio de Janeiro,Smartphone,Organic,New


In [19]:
# Events of people that do have a label (label is not null).
trainingDF = eventsWithLabel3[eventsWithLabel3['label'].notna()]
trainingDF.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,fecha,marca,cantEventos,city,SO,country,region,device_type,channel,new_vs_returning
9,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,...,2018-05-18,iPhone,65,São Paulo,Android 5,Brazil,Sao Paulo,Smartphone,Paid,Returning
10,2018-05-18 00:23:33,viewed product,ad93850f,,318.0,iPhone 5s,Muito Bom,64GB,Prateado,,...,2018-05-18,iPhone,65,São Paulo,Android 5,Brazil,Sao Paulo,Smartphone,Paid,Returning
11,2018-05-18 00:16:10,viewed product,ad93850f,,5907.0,iPhone 5s,Bom - Sem Touch ID,16GB,Cinza espacial,,...,2018-05-18,iPhone,65,São Paulo,Android 5,Brazil,Sao Paulo,Smartphone,Paid,Returning
12,2018-05-18 00:14:55,viewed product,ad93850f,,6023.0,iPhone 5s,Bom - Sem Touch ID,16GB,Dourado,,...,2018-05-18,iPhone,65,São Paulo,Android 5,Brazil,Sao Paulo,Smartphone,Paid,Returning
13,2018-05-18 00:11:26,ad campaign hit,ad93850f,/comprar/iphone/iphone-5s,,,,,,,...,2018-05-18,,65,São Paulo,Android 5,Brazil,Sao Paulo,Smartphone,Paid,Returning


In [20]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [21]:
# Distinct event types, in order of first appearance, as a Series named "event".
eventosUnicos = pd.Series(eventsWithLabel3["event"].unique(), name="event")

In [22]:
# Integer-encode the event names, then one-hot encode those integer codes.
label_encoder = LabelEncoder()
label_encoder.fit(eventosUnicos)
# OneHotEncoder expects a 2-D (n_samples, 1) array, hence the reshape.
integer_encoded = label_encoder.transform(eventsWithLabel3["event"]).reshape(-1, 1)

# categories='auto' derives the categories from the unique values. This silences
# the FutureWarning about legacy integer handling and yields the same columns
# here, because LabelEncoder codes are contiguous (0..n-1).
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`;
# adjust this argument if the environment is upgraded.
onehot_encoder = OneHotEncoder(categories='auto', sparse=False)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [23]:
# Add one 0/1 indicator column per event type, named after the event itself.
# The encoder column index of each event is computed once up front instead of
# re-running `label_encoder.transform` on every loop iteration.
# NOTE: one event type is called 'staticpage', the same name as an existing
# column, so that column is overwritten by its indicator here.
posiciones = label_encoder.transform(eventosUnicos)
for event, posicion in zip(eventosUnicos, posiciones):
    eventsWithLabel3[event] = onehot_encoded[:, posicion]

In [24]:
# Confirm that the per-event indicator columns were appended to the table.
eventsWithLabel3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2319285 entries, 0 to 2319284
Data columns (total 39 columns):
timestamp            datetime64[ns]
event                object
person               object
url                  object
sku                  float64
model                object
condition            object
storage              object
color                object
skus                 object
search_term          object
staticpage           float64
campaign_source      object
search_engine        object
screen_resolution    object
browser_version      object
label                float64
eventos              object
dayweek              int64
fecha                object
marca                object
cantEventos          int64
city                 object
SO                   object
country              object
region               object
device_type          object
channel              object
new_vs_returning     object
viewed product       float64
search engine hit    float64
checkout

In [25]:
#Obtener las personas con conversiones
dataConversion = eventsWithLabel3.loc[eventsWithLabel3['event'] == 'conversion']
personasCompraron = dataConversion.drop_duplicates(subset = 'person', keep = 'first')['person'].tolist()
dataPersonasCompraron = eventsWithLabel3.loc[eventsWithLabel3['person'].isin(personasCompraron)]

In [26]:
# Chronological event timeline of one example buyer.
dataPersonasCompraron[dataPersonasCompraron["person"] == "bbe27ed9"].sort_values("timestamp")

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,viewed product,search engine hit,checkout,searched products,generic listing,visited site,ad campaign hit,brand listing,lead,conversion
2318405,2018-01-03 15:02:58,visited site,bbe27ed9,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2318403,2018-01-03 15:02:58,generic listing,bbe27ed9,,,,,,,"3889,3853,7083,4285,12675,4033,1413,4177,4151,...",...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2318392,2018-01-03 15:02:58,search engine hit,bbe27ed9,,,,,,,,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2318391,2018-01-03 15:05:00,viewed product,bbe27ed9,,2697.0,iPhone 5s,Bom,64GB,Cinza espacial,,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2318398,2018-01-03 15:05:04,viewed product,bbe27ed9,,2694.0,iPhone 5s,Bom,32GB,Cinza espacial,,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2318402,2018-01-03 15:05:06,viewed product,bbe27ed9,,2692.0,iPhone 5s,Bom,16GB,Cinza espacial,,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2318400,2018-01-03 15:05:14,viewed product,bbe27ed9,,283.0,iPhone 5s,Muito Bom,16GB,Cinza espacial,,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2318401,2018-01-03 15:05:21,viewed product,bbe27ed9,,284.0,iPhone 5s,Excelente,16GB,Cinza espacial,,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2318390,2018-01-03 15:05:32,viewed product,bbe27ed9,,291.0,iPhone 5s,Excelente,16GB,Dourado,,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2318397,2018-01-03 15:05:34,viewed product,bbe27ed9,,277.0,iPhone 5s,Excelente,16GB,Prateado,,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# Non-null counts per column, broken down by event type, for one example buyer.
dataPersonasCompraron[dataPersonasCompraron["person"] == "99abca5a"].groupby("event").count()

Unnamed: 0_level_0,timestamp,person,url,sku,model,condition,storage,color,skus,search_term,...,viewed product,search engine hit,checkout,searched products,generic listing,visited site,ad campaign hit,brand listing,lead,conversion
event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ad campaign hit,12,12,12,0,0,0,0,0,0,0,...,12,12,12,12,12,12,12,12,12,12
brand listing,29,29,0,0,0,0,0,0,29,0,...,29,29,29,29,29,29,29,29,29,29
checkout,17,17,0,17,17,17,17,17,1,0,...,17,17,17,17,17,17,17,17,17,17
conversion,2,2,0,2,2,2,2,2,0,0,...,2,2,2,2,2,2,2,2,2,2
generic listing,59,59,0,0,0,0,0,0,59,0,...,59,59,59,59,59,59,59,59,59,59
lead,1,1,0,0,1,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
search engine hit,1,1,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
searched products,5,5,0,0,0,0,0,0,0,5,...,5,5,5,5,5,5,5,5,5,5
staticpage,2,2,0,0,0,0,0,0,0,0,...,2,2,2,2,2,2,2,2,2,2
viewed product,617,617,0,617,617,617,617,617,9,0,...,617,617,617,617,617,617,617,617,617,617


In [28]:
# Order each person's events chronologically.
eventsWithSeasons = eventsWithLabel3.sort_values(by=["person", "timestamp"])
# Seconds elapsed since the previous row.
eventsWithSeasons["time_diff"] = eventsWithSeasons["timestamp"].diff() / np.timedelta64(1, 's')
# The first row of every person has no previous event of their own (the diff was
# taken against another person's last event), so reset it to 0.
eventsWithSeasons.loc[eventsWithSeasons["person"].ne(eventsWithSeasons["person"].shift()), "time_diff"] = 0


In [44]:
# A new "season" starts on a row when any of these holds:
#   * it is the person's first event,
#   * more than 36000 s (10 h) passed since their previous event,
#   * it is a 'visited site' event more than 1800 s (30 min) after the previous one.
eventsWithSeasons["season_change"] = (
    (eventsWithSeasons["person"] != eventsWithSeasons["person"].shift())
    | (eventsWithSeasons["time_diff"] > 36000.0)
    | (
        (eventsWithSeasons["event"] == "visited site")
        & (eventsWithSeasons["time_diff"] > 1800.0)
    )
)
# Running count of season starts per person gives a per-person season number.
eventsWithSeasons["season_id"] = eventsWithSeasons.groupby("person")["season_change"].cumsum()

In [45]:
# Show only the first rows; rendering the whole multi-million-row frame is unnecessary.
eventsWithSeasons.head(10)

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,searched products,generic listing,visited site,ad campaign hit,brand listing,lead,conversion,time_diff,season_change,season_id
2246122,2018-05-17 12:27:47,checkout,0008ed71,,3372.0,Samsung Galaxy S6 Flat,Muito Bom,32GB,Dourado,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,1.0
2246125,2018-05-17 13:44:59,visited site,0008ed71,,,,,,,,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4632.0,True,2.0
2246123,2018-05-17 13:45:00,checkout,0008ed71,,8247.0,iPhone SE,Bom,64GB,Cinza espacial,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,False,2.0
2246126,2018-05-17 16:21:54,visited site,0008ed71,,,,,,,,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,9414.0,True,3.0
2246124,2018-05-17 16:22:06,generic listing,0008ed71,,,,,,,"6594,6651,6664,7253,2820,6706,6721,12606,480,1...",...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,12.0,False,3.0
2246121,2018-05-17 16:28:37,checkout,0008ed71,,7505.0,LG G4 H818P,Bom,32GB,Preto,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,391.0,False,3.0
10989,2018-05-03 22:08:29,visited site,00091926,,,,,,,,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,True,1.0
10988,2018-05-03 22:08:35,viewed product,00091926,,8568.0,Motorola Moto X Style,Muito Bom,32GB,Preto,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,False,1.0
10983,2018-05-03 22:08:51,viewed product,00091926,,14734.0,Samsung Galaxy A7 2017,Novo,32GB,Preto,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,False,1.0
10987,2018-05-03 22:09:25,viewed product,00091926,,8568.0,Motorola Moto X Style,Muito Bom,32GB,Preto,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.0,False,1.0


In [30]:
def logisticReg(x_train, x_test, y_train, y_test):
    """Fit a default LogisticRegression and score it on the test split.

    The model is trained on (x_train, y_train) and used to predict x_test.
    Returns the square root of the mean squared error between y_test and the
    predictions.

    Accuracy can also be used as the metric: accuracy_score(y_test, y_pred).
    """
    classifier = LogisticRegression()
    classifier.fit(x_train, y_train)
    predicted = classifier.predict(x_test)
    squared_error = mean_squared_error(y_test, predicted)
    return np.sqrt(squared_error)

In [31]:
def knn(k, x_train, x_test, y_train, y_test):
    """Fit a k-nearest-neighbours classifier (Manhattan metric) and score it.

    `k` is the number of neighbours. The model is trained on (x_train, y_train)
    and used to predict x_test. Returns the square root of the mean squared
    error between y_test and the predictions.

    Accuracy can also be used as the metric: accuracy_score(y_test, y_pred).
    """
    # Named `classifier` so the local no longer shadows this function's own name.
    classifier = KNeighborsClassifier(metric='manhattan', n_neighbors=k)
    classifier.fit(x_train, y_train)
    predicted = classifier.predict(x_test)
    squared_error = mean_squared_error(y_test, predicted)
    return np.sqrt(squared_error)

In [32]:
def decisionTree(max_depth, max_features, x_train, x_test, y_train, y_test, random_state=None):
    """Fit a DecisionTreeClassifier and score it on the test split.

    Parameters
    ----------
    max_depth, max_features
        Forwarded unchanged to DecisionTreeClassifier.
    x_train, y_train
        Data the tree is fitted on.
    x_test, y_test
        Data the tree is evaluated on.
    random_state : optional
        Seed forwarded to DecisionTreeClassifier so that runs can be reproduced.
        The default (None) keeps the previous unseeded behaviour.

    Returns
    -------
    The square root of the mean squared error between y_test and the predictions.

    Accuracy can also be used as the metric: accuracy_score(y_test, y_pred).
    """
    dt = DecisionTreeClassifier(
        max_depth=max_depth,
        max_features=max_features,
        random_state=random_state,
    )
    dt.fit(x_train, y_train)
    y_pred = dt.predict(x_test)
    return np.sqrt(mean_squared_error(y_test, y_pred))

In [33]:
def xgboost(max_depth, n_estimators, x_train, x_test, y_train, y_test):
    """Fit an XGBClassifier and score it on the test split.

    `max_depth` and `n_estimators` are forwarded to XGBClassifier. The model is
    trained on (x_train, y_train) and used to predict x_test. Returns the square
    root of the mean squared error between y_test and the predictions.

    Accuracy can also be used as the metric: accuracy_score(y_test, y_pred).
    """
    booster = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth)
    booster.fit(x_train, y_train)
    predicted = booster.predict(x_test)
    squared_error = mean_squared_error(y_test, predicted)
    return np.sqrt(squared_error)