In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk

events = pd.read_csv('events_up_to_01062018.csv', low_memory = False)
training = pd.read_csv('labels_training_set.csv', low_memory = False)

In [2]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score, classification_report, mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier

In [3]:
events.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,...,,,,,,,,,,
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,...,,,,,,,,,,
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,...,,,,,,,,,,
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,Bom,128GB,Vermelho,,...,,,,,,,,,,
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,Excelente,32GB,Branco,,...,,,,,,,,,,


In [4]:
training.head()

Unnamed: 0,person,label
0,0566e9c1,0
1,6ec7ee77,0
2,abe7a2fb,0
3,34728364,0
4,87ed62de,0


In [5]:
eventsWithLabel = events.merge(training, how='left', left_on='person', right_on='person')
eventsWithLabel.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version,label
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,...,,,,,,,,,,
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,...,,,,,,,,,,0.0
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,...,,,,,,,,,,
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,Bom,128GB,Vermelho,,...,,,,,,,,,,
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,Excelente,32GB,Branco,,...,,,,,,,,,,


In [6]:
eventsWithLabel['timestamp'] = pd.to_datetime(eventsWithLabel['timestamp'])

In [7]:
diccWeek = {0: "Monday",
            1: "Tuesday",
            2: "Wednesday",
            3: "Thursday",
            4: "Friday",
            5: "Saturday",
            6: "Sunday"}

In [8]:
#Obtengo los dias de la semana
eventsWithLabel['dayweek'] = eventsWithLabel.timestamp.dt.dayofweek
eventsWithLabel["dayweek"] = eventsWithLabel["dayweek"].map(lambda x: diccWeek.get(x))
#Obtengo la fecha
eventsWithLabel['fecha'] = eventsWithLabel.timestamp.dt.date
#Separo la marca
eventsWithLabel['marca'] = eventsWithLabel['model'].map(lambda x: str(x).split()[0])

In [9]:
groupbyCantEventos = eventsWithLabel.groupby('person', as_index = False)['event'].agg({'cantEventos': 'count'})
groupbyCantEventos.sort_values(by = 'cantEventos', ascending = False)
eventsWithLabel = pd.merge(eventsWithLabel, groupbyCantEventos, on = 'person')

In [10]:
#Filtro a la gente que no tiene el evento visited_site
personasConVisitas = eventsWithLabel.loc[eventsWithLabel['event'] == 'visited site']\
                             .drop_duplicates(subset = 'person', keep = 'first')['person'].tolist()
dataNotVisit = eventsWithLabel.loc[eventsWithLabel['person'].isin(personasConVisitas)]

In [11]:
eventsWithLabel2 = dataNotVisit.loc[dataNotVisit['event'] == 'visited site'].groupby(['person', 'fecha'], as_index = False).agg({'city': 'first', 'region': 'first', 'country': 'first', 'device_type': 'first', 'operating_system_version': 'first', 'channel': 'first', 'new_vs_returning': 'first'})

In [12]:
eventsWithLabel3 = pd.merge(dataNotVisit, eventsWithLabel2, on = ['person', 'fecha'])
eventsWithLabel3.drop(['region_x', 'device_type_x', 'operating_system_version_x', 'country_x', 'city_x', 'channel_x', 'new_vs_returning_x'], axis = 1, inplace = True)
eventsWithLabel3.rename(columns = {'region_y': 'region', 'device_type_y': 'device_type', 'operating_system_version_y': 'SO', 'city_y': 'city', 'country_y': 'country', 'channel_y': 'channel', 'new_vs_returning_y': 'new_vs_returning'}, inplace =True)

In [13]:
def obtenerSistema(so):
    sistema = so.split()
    if len(sistema) > 1:
        otro = sistema[1].split('.')[0]
        return sistema[0] + ' ' + otro
    else:
        return sistema[0]

In [14]:
eventsWithLabel3['SO'] = eventsWithLabel3['SO'].map(obtenerSistema)

In [15]:
eventsWithLabel3.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2319285 entries, 0 to 2319284
Data columns (total 28 columns):
timestamp            2319285 non-null datetime64[ns]
event                2319285 non-null object
person               2319285 non-null object
url                  189408 non-null object
sku                  1305987 non-null float64
model                1306837 non-null object
condition            1305987 non-null object
storage              1305987 non-null object
color                1305987 non-null object
skus                 501073 non-null object
search_term          112253 non-null object
staticpage           11034 non-null object
campaign_source      189559 non-null object
search_engine        105685 non-null object
screen_resolution    204066 non-null object
browser_version      204069 non-null object
label                1160977 non-null float64
dayweek              2319285 non-null object
fecha                2319285 non-null object
marca                2319285 no

In [16]:
eventsWithLabel3 = eventsWithLabel3.drop(columns=["url", "sku", "skus", "search_term", "staticpage", "campaign_source", "search_engine", "screen_resolution", "browser_version"])

In [17]:
#Empezamos con las sesiones.
eventsWithSessions = eventsWithLabel3.sort_values(by = ["person", "timestamp"])
eventsWithSessions["time_diff"] = (eventsWithSessions.timestamp - (eventsWithSessions.timestamp.shift())) / np.timedelta64(1, 's')
eventsWithSessions.loc[eventsWithSessions.person != eventsWithSessions.person.shift(), "time_diff"] = 0


In [18]:
eventsWithSessions["session_change"] = ((eventsWithSessions.event == "visited site") & (eventsWithSessions.time_diff > 1800.0)) | (eventsWithSessions.time_diff > 36000.0) | (eventsWithSessions.person != eventsWithSessions.person.shift())
eventsWithSessions["session_id"] = eventsWithSessions.groupby("person")["session_change"].cumsum()

In [19]:
#Obtener las personas con conversiones
dataConversion = eventsWithSessions.loc[eventsWithSessions['event'] == 'conversion']
personasCompraron = dataConversion.drop_duplicates(subset = 'person', keep = 'first')['person'].tolist()
dataPersonasCompraron = eventsWithSessions.loc[eventsWithSessions['person'].isin(personasCompraron)]

In [20]:
dataPersonasCompraron["aConversion"] = (dataPersonasCompraron.event == "conversion") | (dataPersonasCompraron.person != dataPersonasCompraron.person.shift())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [21]:
dataPersonasCompraron["conversions"] = dataPersonasCompraron.groupby("person")["aConversion"].cumsum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [22]:
dataFirstConversion = dataPersonasCompraron.loc[dataPersonasCompraron.conversions == 1.0].groupby("person")["time_diff"].agg({"timeFirstConversion": "sum" })

is deprecated and will be removed in a future version
  """Entry point for launching an IPython kernel.


In [23]:
eventsWithSessions = pd.merge(eventsWithSessions, dataFirstConversion, how='left', on = 'person')

In [24]:
eventsWithSessions['timeFirstConversion'] = eventsWithSessions['timeFirstConversion'].fillna(value='0.0')

In [25]:
dataSession =  eventsWithSessions.groupby(["person", "session_id"], as_index=False)["time_diff"].agg({"sessionDuration": "sum" })
dataSession["sessionDuration"] = dataSession["sessionDuration"]/60

In [26]:
dataSessionGroupByPerson = dataSession.loc[dataSession.sessionDuration > 0.0].groupby("person")
dataSession = dataSessionGroupByPerson.agg({"sessionDuration": {"sessionDuration": "mean"}, "session_id": {"cantSessions": "count"}})
dataSession.columns = dataSession.columns.droplevel(0)


  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


In [27]:
eventsWithSessions = pd.merge(eventsWithSessions, dataSession, how ='left', on = 'person')
eventsWithSessions["sessionDuration"] = eventsWithSessions["sessionDuration"].fillna(value = "0.0")
eventsWithSessions["cantSessions"] = eventsWithSessions["cantSessions"].fillna(value = "0.0")

In [28]:
eventsWithSessions

Unnamed: 0,timestamp,event,person,model,condition,storage,color,label,dayweek,fecha,...,region,device_type,channel,new_vs_returning,time_diff,session_change,session_id,timeFirstConversion,sessionDuration,cantSessions
0,2018-05-17 12:27:47,checkout,0008ed71,Samsung Galaxy S6 Flat,Muito Bom,32GB,Dourado,0.0,Thursday,2018-05-17,...,Unknown,Computer,Referral,New,0.0,True,1.0,0.0,120.417,2
1,2018-05-17 13:44:59,visited site,0008ed71,,,,,0.0,Thursday,2018-05-17,...,Unknown,Computer,Referral,New,4632.0,True,2.0,0.0,120.417,2
2,2018-05-17 13:45:00,checkout,0008ed71,iPhone SE,Bom,64GB,Cinza espacial,0.0,Thursday,2018-05-17,...,Unknown,Computer,Referral,New,1.0,False,2.0,0.0,120.417,2
3,2018-05-17 16:21:54,visited site,0008ed71,,,,,0.0,Thursday,2018-05-17,...,Unknown,Computer,Referral,New,9414.0,True,3.0,0.0,120.417,2
4,2018-05-17 16:22:06,generic listing,0008ed71,,,,,0.0,Thursday,2018-05-17,...,Unknown,Computer,Referral,New,12.0,False,3.0,0.0,120.417,2
5,2018-05-17 16:28:37,checkout,0008ed71,LG G4 H818P,Bom,32GB,Preto,0.0,Thursday,2018-05-17,...,Unknown,Computer,Referral,New,391.0,False,3.0,0.0,120.417,2
6,2018-05-03 22:08:29,visited site,00091926,,,,,,Thursday,2018-05-03,...,Rio Grande do Sul,Computer,Direct,New,0.0,True,1.0,0.0,1545.52,26
7,2018-05-03 22:08:35,viewed product,00091926,Motorola Moto X Style,Muito Bom,32GB,Preto,,Thursday,2018-05-03,...,Rio Grande do Sul,Computer,Direct,New,6.0,False,1.0,0.0,1545.52,26
8,2018-05-03 22:08:51,viewed product,00091926,Samsung Galaxy A7 2017,Novo,32GB,Preto,,Thursday,2018-05-03,...,Rio Grande do Sul,Computer,Direct,New,16.0,False,1.0,0.0,1545.52,26
9,2018-05-03 22:09:25,viewed product,00091926,Motorola Moto X Style,Muito Bom,32GB,Preto,,Thursday,2018-05-03,...,Rio Grande do Sul,Computer,Direct,New,34.0,False,1.0,0.0,1545.52,26


In [29]:
eventosUnicos = eventsWithLabel3.drop_duplicates(subset="event", keep="first").reset_index()["event"]

In [30]:
#Funcion que realiza el oneHotEncoding
def oneHotEncoding(column, uniqueArray, dataFrame):
    label_encoder = LabelEncoder()
    label_encoder.fit(uniqueArray)
    integer_encoded = label_encoder.transform(dataFrame[column])
    onehot_encoder = OneHotEncoder(sparse=False)
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
    for i, item in enumerate(uniqueArray):
        dataFrame[item] = onehot_encoded[:, label_encoder.transform(uniqueArray)[i]]


In [31]:
oneHotEncoding("event", eventosUnicos, eventsWithLabel3)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [32]:
eventsWithLabel3.head()

Unnamed: 0,timestamp,event,person,model,condition,storage,color,label,dayweek,fecha,...,search engine hit,checkout,searched products,generic listing,visited site,ad campaign hit,brand listing,lead,staticpage,conversion
0,2018-05-18 00:11:59,viewed product,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,Friday,2018-05-18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2018-05-18 00:30:30,viewed product,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,Friday,2018-05-18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2018-05-18 00:07:23,search engine hit,4886f805,,,,,,Friday,2018-05-18,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2018-05-18 00:11:56,checkout,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,Friday,2018-05-18,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2018-05-18 00:11:35,viewed product,4886f805,Samsung Galaxy J7 Prime,Muito Bom,32GB,Dourado,,Friday,2018-05-18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
eventsWithLabel3["color"] = eventsWithLabel3.color.str.split(" ", expand=True)

In [34]:
top20Colors = eventsWithLabel3['color'].value_counts().nlargest(20).index
top20Colors

Index([u'Preto', u'Dourado', u'Cinza', u'Branco', u'Prateado', u'Ouro',
       u'Rosa', u'Prata', u'Azul', u'Platinum', u'Ametista', u'Verde',
       u'Vermelho', u'Bambu', u'Black', u'Titânio', u'Indigo', u'Amarelo',
       u'Cabernet', u'Olympic'],
      dtype='object')

In [35]:
def colorGrouping(x):
    return x if x in top20Colors else 'OtroColor'

In [36]:
eventsWithLabel3['color'] = eventsWithLabel3['color'].apply(colorGrouping)

In [37]:
coloresUnicos = eventsWithLabel3.drop_duplicates(subset='color', keep='first').reset_index()["color"]

In [38]:
oneHotEncoding("color", coloresUnicos, eventsWithLabel3)

In [39]:
eventsWithLabel3['condition'].value_counts()

Bom                   541428
Excelente             374870
Muito Bom             354901
Bom - Sem Touch ID     31391
Novo                    3397
Name: condition, dtype: int64

In [40]:
eventsWithLabel3['condition'] = eventsWithLabel3['condition'].fillna(value='idk')
conditionUnicos = eventsWithLabel3.drop_duplicates(subset='condition', keep='first').reset_index()["condition"]

In [41]:
oneHotEncoding("condition", conditionUnicos, eventsWithLabel3)

In [42]:
eventsWithLabel3['storage'].value_counts()

16GB     436689
32GB     425751
64GB     226577
128GB     98510
8GB       94320
256GB     17501
4GB        5318
512MB      1321
Name: storage, dtype: int64

In [43]:
eventsWithLabel3['storage'] = eventsWithLabel3['storage'].fillna(value='idk')
storageUnicos = eventsWithLabel3.drop_duplicates(subset='storage', keep='first').reset_index()["storage"]

In [44]:
oneHotEncoding("storage", storageUnicos, eventsWithLabel3)

In [45]:
eventsWithLabel3.head(10)

Unnamed: 0,timestamp,event,person,model,condition,storage,color,label,dayweek,fecha,...,Bom,Novo,32GB,64GB,16GB,128GB,256GB,8GB,4GB,512MB
0,2018-05-18 00:11:59,viewed product,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,Friday,2018-05-18,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2018-05-18 00:30:30,viewed product,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,Friday,2018-05-18,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2018-05-18 00:07:23,search engine hit,4886f805,,idk,idk,OtroColor,,Friday,2018-05-18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2018-05-18 00:11:56,checkout,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,Friday,2018-05-18,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2018-05-18 00:11:35,viewed product,4886f805,Samsung Galaxy J7 Prime,Muito Bom,32GB,Dourado,,Friday,2018-05-18,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2018-05-18 00:10:52,searched products,4886f805,,idk,idk,OtroColor,,Friday,2018-05-18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,2018-05-18 00:11:53,viewed product,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,Friday,2018-05-18,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,2018-05-18 00:07:22,generic listing,4886f805,,idk,idk,OtroColor,,Friday,2018-05-18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,2018-05-18 00:07:22,visited site,4886f805,,idk,idk,OtroColor,,Friday,2018-05-18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,2018-05-18 00:11:27,viewed product,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza,0.0,Friday,2018-05-18,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
eventsWithLabel3['model'].value_counts()

iPhone 6                                     106118
iPhone 5s                                    100448
iPhone 6S                                     96485
iPhone 7                                      58598
Samsung Galaxy S7 Edge                        49915
iPhone 7 Plus                                 44627
Samsung Galaxy S7                             39279
iPhone 6S Plus                                37653
iPhone SE                                     36671
Samsung Galaxy S8                             32513
iPhone 6 Plus                                 32000
Samsung Galaxy S6 Edge                        30555
Samsung Galaxy J5                             29121
iPhone 5c                                     27763
Samsung Galaxy S6 Flat                        27485
Motorola Moto G4 Plus                         24888
Samsung Galaxy J7 Prime                       24808
Samsung Galaxy S8 Plus                        20804
iPhone 5                                      18388
Samsung Gala

In [47]:
top50models = eventsWithLabel3['model'].value_counts().nlargest(50).index
top50models

Index([u'iPhone 6', u'iPhone 5s', u'iPhone 6S', u'iPhone 7',
       u'Samsung Galaxy S7 Edge', u'iPhone 7 Plus', u'Samsung Galaxy S7',
       u'iPhone 6S Plus', u'iPhone SE', u'Samsung Galaxy S8', u'iPhone 6 Plus',
       u'Samsung Galaxy S6 Edge', u'Samsung Galaxy J5', u'iPhone 5c',
       u'Samsung Galaxy S6 Flat', u'Motorola Moto G4 Plus',
       u'Samsung Galaxy J7 Prime', u'Samsung Galaxy S8 Plus', u'iPhone 5',
       u'Samsung Galaxy A5 2017', u'Samsung Galaxy A7 2017', u'iPhone 4S',
       u'Samsung Galaxy J7', u'Motorola Moto G5 Plus',
       u'Motorola Moto X Play 4G Dual', u'Motorola Moto X2',
       u'Samsung Galaxy S5', u'Motorola Moto G3 4G', u'Samsung Galaxy Note 8',
       u'Motorola Moto G5 ', u'Samsung Galaxy A5', u'Motorola Moto G2 3G Dual',
       u'Samsung Galaxy S6 Edge Plus', u'Motorola Moto Z Play',
       u'Samsung Galaxy A5 2016', u'Samsung Galaxy J5 Prime',
       u'Motorola Moto Z', u'Samsung Galaxy Gran Prime Duos TV',
       u'Lenovo Vibe K5', u'Motorola Mo

In [48]:
def modelGrouping(x):
    return x if x in top50models else 'OtroModelo'

In [49]:
eventsWithLabel3['model'] = eventsWithLabel3['model'].apply(modelGrouping)
modelosUnicos = eventsWithLabel3.drop_duplicates(subset='model', keep='first').reset_index()['model']

In [50]:
eventsWithLabel3.head()

Unnamed: 0,timestamp,event,person,model,condition,storage,color,label,dayweek,fecha,...,Bom,Novo,32GB,64GB,16GB,128GB,256GB,8GB,4GB,512MB
0,2018-05-18 00:11:59,viewed product,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,Friday,2018-05-18,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2018-05-18 00:30:30,viewed product,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,Friday,2018-05-18,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2018-05-18 00:07:23,search engine hit,4886f805,OtroModelo,idk,idk,OtroColor,,Friday,2018-05-18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2018-05-18 00:11:56,checkout,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,Friday,2018-05-18,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2018-05-18 00:11:35,viewed product,4886f805,Samsung Galaxy J7 Prime,Muito Bom,32GB,Dourado,,Friday,2018-05-18,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
oneHotEncoding("model", modelosUnicos, eventsWithLabel3)

In [52]:
eventsWithLabel3.head()

Unnamed: 0,timestamp,event,person,model,condition,storage,color,label,dayweek,fecha,...,iPhone 5c,iPhone 4S,Samsung Galaxy S5 New Edition Duos,Samsung Galaxy J5,Samsung Galaxy S5,Samsung Galaxy Note 5,iPhone 4G,Samsung Galaxy J7 2016 Metal,Motorola Moto G2 3G Dual,Samsung Galaxy Gran Prime Duos TV
0,2018-05-18 00:11:59,viewed product,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,Friday,2018-05-18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2018-05-18 00:30:30,viewed product,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,Friday,2018-05-18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2018-05-18 00:07:23,search engine hit,4886f805,OtroModelo,idk,idk,OtroColor,,Friday,2018-05-18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2018-05-18 00:11:56,checkout,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,Friday,2018-05-18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2018-05-18 00:11:35,viewed product,4886f805,Samsung Galaxy J7 Prime,Muito Bom,32GB,Dourado,,Friday,2018-05-18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
regionWithMorePurchase = eventsWithLabel3.loc[eventsWithLabel3["region"] != "Unknown"]["region"].value_counts().nlargest(10).index
regionWithMorePurchase

Index([u'Sao Paulo', u'Minas Gerais', u'Rio de Janeiro', u'Bahia',
       u'Pernambuco', u'Parana', u'Ceara', u'Rio Grande do Sul',
       u'Federal District', u'Maranhao'],
      dtype='object')

In [54]:
eventsWithLabel3["regionWithMorePurchase"] = eventsWithLabel3["region"].map(lambda x: 1 if x in regionWithMorePurchase else 0)

In [55]:
channelUnicos = eventsWithLabel3.drop_duplicates(subset="channel", keep="first").reset_index()["channel"]
channelUnicos

0     Organic
1        Paid
2    Referral
3      Direct
4      Social
5       Email
6     Unknown
Name: channel, dtype: object

In [56]:
oneHotEncoding("channel", channelUnicos, eventsWithLabel3)

In [57]:
SOUnicos = eventsWithLabel3.drop_duplicates(subset="SO", keep="first").reset_index()["SO"]
SOUnicos

0         Android 7
1         Android 5
2         Android 6
3         Windows 8
4         Windows 7
5        Windows 10
6         Android 4
7            Mac OS
8            iOS 10
9     Windows Phone
10           iOS 11
11            Linux
12            iOS 7
13            iOS 9
14        Android 8
15       Windows XP
16        Chrome OS
17           Ubuntu
18          Android
19            Other
20            iOS 8
21    Windows Vista
22          Tizen 3
23    BlackBerry OS
24          FreeBSD
25        Android 2
26        Android 3
27           Fedora
28            iOS 3
29          Tizen 2
30       Windows RT
31            iOS 6
32            iOS 5
33       Symbian OS
34       Android 10
Name: SO, dtype: object

In [58]:
oneHotEncoding("SO", SOUnicos, eventsWithLabel3)

In [59]:
deviceTypeUnicos = eventsWithLabel3.drop_duplicates(subset="device_type", keep="first").reset_index()["device_type"]
deviceTypeUnicos

0    Smartphone
1      Computer
2        Tablet
3       Unknown
Name: device_type, dtype: object

In [60]:
oneHotEncoding("device_type", deviceTypeUnicos, eventsWithLabel3)

In [61]:
eventsFinale = eventsWithLabel3.drop(columns=["timestamp", "event", "cantEventos","condition", "model", "storage", "color", "fecha", "SO", "marca", "city", "country", "region", "channel", "device_type", "new_vs_returning" ])

In [62]:
eventsFinale.head()

Unnamed: 0,person,label,dayweek,viewed product,search engine hit,checkout,searched products,generic listing,visited site,ad campaign hit,...,iOS 3,Tizen 2,Windows RT,iOS 6,iOS 5,Symbian OS,Android 10,Smartphone,Computer,Tablet
0,4886f805,,Friday,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,4886f805,,Friday,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,4886f805,,Friday,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,4886f805,,Friday,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,4886f805,,Friday,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [63]:
eventsWithSessionsFilter = eventsWithSessions.groupby("person")[["timeFirstConversion", "sessionDuration", "cantSessions"]].agg("first")

In [64]:
testDF = eventsFinale.loc[eventsFinale['label'].isnull()]
testDF.head()

Unnamed: 0,person,label,dayweek,viewed product,search engine hit,checkout,searched products,generic listing,visited site,ad campaign hit,...,iOS 3,Tizen 2,Windows RT,iOS 6,iOS 5,Symbian OS,Android 10,Smartphone,Computer,Tablet
0,4886f805,,Friday,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,4886f805,,Friday,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,4886f805,,Friday,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,4886f805,,Friday,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,4886f805,,Friday,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [65]:
testDFFinal = testDF.groupby("person").sum()

In [66]:
testDFFinal

Unnamed: 0_level_0,label,viewed product,search engine hit,checkout,searched products,generic listing,visited site,ad campaign hit,brand listing,lead,...,iOS 3,Tizen 2,Windows RT,iOS 6,iOS 5,Symbian OS,Android 10,Smartphone,Computer,Tablet
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00091926,0.0,372.0,0.0,2.0,0.0,0.0,34.0,15.0,25.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,448.0,0.0
00091a7a,0.0,3.0,0.0,0.0,0.0,0.0,1.0,1.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0
000ba417,0.0,153.0,1.0,6.0,0.0,14.0,6.0,1.0,24.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,206.0,0.0
000e4d9e,0.0,339.0,5.0,1.0,0.0,17.0,13.0,19.0,17.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,411.0,0.0
000e619d,0.0,28.0,3.0,1.0,6.0,8.0,5.0,6.0,11.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,68.0,0.0
001001be,0.0,41.0,0.0,3.0,17.0,3.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,68.0,0.0,0.0
0010e89a,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
0016c4b5,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0
001804a2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
001a2273,0.0,6.0,4.0,1.0,0.0,0.0,1.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,0.0


In [67]:
testDFFinal2 = pd.merge(testDFFinal, eventsWithSessionsFilter, how="left" , on = "person")

In [68]:
testDFFinal2

Unnamed: 0_level_0,label,viewed product,search engine hit,checkout,searched products,generic listing,visited site,ad campaign hit,brand listing,lead,...,iOS 6,iOS 5,Symbian OS,Android 10,Smartphone,Computer,Tablet,timeFirstConversion,sessionDuration,cantSessions
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00091926,0.0,372.0,0.0,2.0,0.0,0.0,34.0,15.0,25.0,0.0,...,0.0,0.0,0.0,0.0,0.0,448.0,0.0,0.0,1545.52,26
00091a7a,0.0,3.0,0.0,0.0,0.0,0.0,1.0,1.0,5.0,0.0,...,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,5.78333,1
000ba417,0.0,153.0,1.0,6.0,0.0,14.0,6.0,1.0,24.0,0.0,...,0.0,0.0,0.0,0.0,0.0,206.0,0.0,784277,3269.4,4
000e4d9e,0.0,339.0,5.0,1.0,0.0,17.0,13.0,19.0,17.0,0.0,...,0.0,0.0,0.0,0.0,0.0,411.0,0.0,0.0,2051.05,10
000e619d,0.0,28.0,3.0,1.0,6.0,8.0,5.0,6.0,11.0,0.0,...,0.0,0.0,0.0,0.0,0.0,68.0,0.0,0.0,1051.38,4
001001be,0.0,41.0,0.0,3.0,17.0,3.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,68.0,0.0,0.0,2702,2762.49,3
0010e89a,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.716667,1
0016c4b5,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0333333,1
001804a2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
001a2273,0.0,6.0,4.0,1.0,0.0,0.0,1.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,16.0,0.0,0.0,0.0,14.5833,1


In [69]:
trainingDF = eventsFinale.loc[eventsFinale['label'].notnull()]
trainingDF.head()

Unnamed: 0,person,label,dayweek,viewed product,search engine hit,checkout,searched products,generic listing,visited site,ad campaign hit,...,iOS 3,Tizen 2,Windows RT,iOS 6,iOS 5,Symbian OS,Android 10,Smartphone,Computer,Tablet
9,ad93850f,0.0,Friday,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
10,ad93850f,0.0,Friday,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
11,ad93850f,0.0,Friday,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
12,ad93850f,0.0,Friday,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
13,ad93850f,0.0,Friday,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [70]:
trainingDFFinal = trainingDF.groupby("person").sum()
trainingDFFinal.head()

Unnamed: 0_level_0,label,viewed product,search engine hit,checkout,searched products,generic listing,visited site,ad campaign hit,brand listing,lead,...,iOS 3,Tizen 2,Windows RT,iOS 6,iOS 5,Symbian OS,Android 10,Smartphone,Computer,Tablet
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0008ed71,0.0,0.0,0.0,3.0,0.0,1.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0
000c79fe,0.0,3.0,1.0,1.0,9.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0,0.0
001802e4,0.0,4.0,0.0,1.0,4.0,4.0,1.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0,0.0,0.0
0019e639,0.0,189.0,13.0,15.0,11.0,28.0,19.0,29.0,165.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,471.0,0.0
001ca5ee,0.0,52.0,6.0,1.0,0.0,8.0,15.0,7.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,96.0,0.0,0.0


In [71]:
trainingDFFinal2 = pd.merge(trainingDFFinal, eventsWithSessionsFilter, how="left" , on = "person")

In [72]:
eventsWithSessionsFilter.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 38242 entries, 0008ed71 to fffd1246
Data columns (total 3 columns):
timeFirstConversion    38242 non-null object
sessionDuration        38242 non-null object
cantSessions           38242 non-null object
dtypes: object(3)
memory usage: 1.2+ MB


In [73]:
label_array = np.array(trainingDFFinal2['label'])
data_array = np.array(trainingDFFinal2.drop(columns=['label']))

In [74]:
x_train, x_test, y_train, y_test = train_test_split(data_array, label_array, test_size = 0.33, random_state = 42)

In [75]:
def logisticReg(x_train, x_test, y_train, y_test):
    logReg = LogisticRegression(solver='sag', random_state=1)
    logReg.fit(x_train, y_train)
    y_pred = logReg.predict(x_test)
    return np.sqrt(mean_squared_error(y_test, y_pred))
#TAMBIEN PUEDEN USAR PARA VER EL ACCURACY
#ACCURACY = accuracy_score(y_test, y_pred)

In [76]:
def knn(k, x_train, x_test, y_train, y_test):
    knn = KNeighborsClassifier(n_neighbors=k, metric='manhattan')
    knn.fit(x_train, y_train)
    y_pred = knn.predict(x_test)
    return np.sqrt(mean_squared_error(y_test, y_pred))
#TAMBIEN PUEDEN USAR PARA VER EL ACCURACY
#ACCURACY = accuracy_score(y_test, y_pred)

In [77]:
def decisionTree(max_depth, x_train, x_test, y_train, y_test):
    dt = DecisionTreeClassifier(max_depth=max_depth)
    dt.fit(x_train, y_train)
    y_pred = dt.predict(x_test)
    return np.sqrt(mean_squared_error(y_test, y_pred))
#TAMBIEN PUEDEN USAR PARA VER EL ACCURACY
#ACCURACY = accuracy_score(y_test, y_pred)

In [78]:
def xgboost(max_depth, n_estimators, x_train, x_test, y_train, y_test):
    xgb = XGBClassifier(max_depth=max_depth, n_estimators=n_estimators)
    xgb.fit(x_train, y_train)
    y_pred = xgb.predict(x_test)
    return np.sqrt(mean_squared_error(y_test, y_pred))
#TAMBIEN PUEDEN USAR PARA VER EL ACCURACY
#ACCURACY = accuracy_score(y_test, y_pred)

In [None]:
def adaboost(nEstimators, learningRate, x_train, x_test, y_train, y_test):
    ada = AdaBoostClassifier(n_estimators=nEstimators, learning_rate=learningRate, random_state=0)
    ada.fit(x_train, y_train)
    y_pred = xgb.predict(x_test)
    return np.sqrt(mean_squared_error(y_test, y_pred))
#TAMBIEN PUEDEN USAR PARA VER EL ACCURACY
#ACCURACY = accuracy_score(y_test, y_pred)
#n_estimators is the number of models to iteratively train.
#learning_rate is the contribution of each model to the weights and defaults to 1. 
#Reducing the learning rate will mean the weights will be increased or decreased to a small degree, 
#forcing the model train slower (but sometimes resulting in better performance scores).

In [None]:
def bagging(baseEstimator, nEstimators, learningRate, max_feature, bootstrap, bootstrap_feature, x_train, x_test, y_train, y_test):
    bag = BaggingClassifier(base_estimator=base_estimator, n_estimators=nEstimators, max_features=max_feature, bootstrap=bootstrap, bootstrap_features=bootstrap_feature, random_state=seed)
    bag.fit(x_train, y_train)
    y_pred = bag.predict(x_test)
    return np.sqrt(mean_squared_error(y_test, y_pred))
#TAMBIEN PUEDEN USAR PARA VER EL ACCURACY
#ACCURACY = accuracy_score(y_test, y_pred)

In [None]:
def gradientboost(nEstimators, learningRate, maxDepth, x_train, x_test, y_train, y_test):
    grad = GradientBoostingClassifier(n_estimators=nEstimators, learning_rate=learningRate, max_depth=maxDepth)
    grad.fit(x_train, y_train)
    y_pred = xgb.predict(x_test)
    return np.sqrt(mean_squared_error(y_test, y_pred))
#TAMBIEN PUEDEN USAR PARA VER EL ACCURACY
#ACCURACY = accuracy_score(y_test, y_pred)
#The maximum depth limits the number of nodes in the tree. 
#Tune this parameter for best performance; the best value depends on the interaction of the input variables.

In [None]:
result = logisticReg(x_train, x_test, y_train, y_test)
result

In [None]:
result1 = knn(10, x_train, x_test, y_train, y_test)
result1

In [80]:
resultDecTree = decisionTree(150, x_train, x_test, y_train, y_test)
resultDecTree

60.53387193512027

In [None]:
resultXgboost = xgboost(10, 50,x_train, x_test, y_train, y_test )
resultXgboost