In [47]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk

events = pd.read_csv('events_up_to_01062018.csv')
training = pd.read_csv('labels_training_set.csv')

In [48]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score, classification_report, mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [49]:
events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2341681 entries, 0 to 2341680
Data columns (total 23 columns):
timestamp                   object
event                       object
person                      object
url                         object
sku                         float64
model                       object
condition                   object
storage                     object
color                       object
skus                        object
search_term                 object
staticpage                  object
campaign_source             object
search_engine               object
channel                     object
new_vs_returning            object
city                        object
region                      object
country                     object
device_type                 object
screen_resolution           object
operating_system_version    object
browser_version             object
dtypes: float64(1), object(22)
memory usage: 410.9+ MB


In [50]:
training.head()

Unnamed: 0,person,label
0,0566e9c1,0
1,6ec7ee77,0
2,abe7a2fb,0
3,34728364,0
4,87ed62de,0


In [51]:
training["person"].value_counts().count()

19414

In [52]:
events["person"].value_counts().count()

38829

In [53]:
eventsWithLabel = events.merge(training, how='left', left_on='person', right_on='person')
eventsWithLabel.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version,label
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,...,,,,,,,,,,
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,...,,,,,,,,,,0.0
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,...,,,,,,,,,,
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,Bom,128GB,Vermelho,,...,,,,,,,,,,
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,Excelente,32GB,Branco,,...,,,,,,,,,,


In [54]:
eventsWithLabel['timestamp'] = pd.to_datetime(eventsWithLabel['timestamp'])

In [55]:
# groupbyPersonas = eventsWithLabel.loc[:,['person', 'event']].groupby('person')
# dicc = {}
# for clave, data in groupbyPersonas:
#     lista = []
#     lista = data.drop_duplicates(subset = 'event', keep = 'first')['event'].tolist()
#     dicc[clave] = lista

In [56]:
#Obtengo una lista de todos los eventos que tiene esa persona
# eventsWithLabel['eventos'] = eventsWithLabel['person'].map(lambda x: dicc[x])
#Obtengo los dias de la semana
eventsWithLabel['dayweek'] = eventsWithLabel.timestamp.dt.dayofweek
#Obtengo la fecha
eventsWithLabel['fecha'] = eventsWithLabel.timestamp.dt.date
#Separo la marca
eventsWithLabel['marca'] = eventsWithLabel['model'].map(lambda x: str(x).split()[0])

In [57]:
#cantidad de personas que ingresaron 27624.
groupbyCantEventos = eventsWithLabel.groupby('person', as_index = False)['event'].agg({'cantEventos': 'count'})
groupbyCantEventos.sort_values(by = 'cantEventos', ascending = False)
eventsWithLabel = pd.merge(eventsWithLabel, groupbyCantEventos, on = 'person')

In [58]:
#Filtro a la gente que no tiene el evento visited_site
personasConVisitas = eventsWithLabel.loc[eventsWithLabel['event'] == 'visited site']\
                             .drop_duplicates(subset = 'person', keep = 'first')['person'].tolist()
dataNotVisit = eventsWithLabel.loc[eventsWithLabel['person'].isin(personasConVisitas)]

In [59]:
eventsWithLabel2 = dataNotVisit.loc[dataNotVisit['event'] == 'visited site'].groupby(['person', 'fecha'], as_index = False).agg({'city': 'first', 'region': 'first', 'country': 'first', 'device_type': 'first', 'operating_system_version': 'first', 'channel': 'first', 'new_vs_returning': 'first'})

In [60]:
eventsWithLabel3 = pd.merge(dataNotVisit, eventsWithLabel2, on = ['person', 'fecha'])
eventsWithLabel3.drop(['region_x', 'device_type_x', 'operating_system_version_x', 'country_x', 'city_x', 'channel_x', 'new_vs_returning_x'], axis = 1, inplace = True)
eventsWithLabel3.rename(columns = {'region_y': 'region', 'device_type_y': 'device_type', 'operating_system_version_y': 'SO', 'city_y': 'city', 'country_y': 'country', 'channel_y': 'channel', 'new_vs_returning_y': 'new_vs_returning'}, inplace =True)

In [61]:
def obtenerSistema(so):
    sistema = so.split()
    if len(sistema) > 1:
        otro = sistema[1].split('.')[0]
        return sistema[0] + ' ' + otro
    else:
        return sistema[0]

In [62]:
eventsWithLabel3['SO'] = eventsWithLabel3['SO'].map(obtenerSistema)

In [63]:
eventsWithLabel3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2319285 entries, 0 to 2319284
Data columns (total 28 columns):
timestamp            datetime64[ns]
event                object
person               object
url                  object
sku                  float64
model                object
condition            object
storage              object
color                object
skus                 object
search_term          object
staticpage           object
campaign_source      object
search_engine        object
screen_resolution    object
browser_version      object
label                float64
dayweek              int64
fecha                object
marca                object
cantEventos          int64
city                 object
SO                   object
country              object
region               object
device_type          object
channel              object
new_vs_returning     object
dtypes: datetime64[ns](1), float64(2), int64(2), object(23)
memory usage: 513.1+ MB


In [64]:
testDF = eventsWithLabel3.loc[eventsWithLabel3['label'].isnull()]
testDF.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,fecha,marca,cantEventos,city,SO,country,region,device_type,channel,new_vs_returning
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,...,2018-05-18,Samsung,9,Cantagalo,Android 7,Brazil,Rio de Janeiro,Smartphone,Organic,New
1,2018-05-18 00:30:30,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,...,2018-05-18,Samsung,9,Cantagalo,Android 7,Brazil,Rio de Janeiro,Smartphone,Organic,New
2,2018-05-18 00:07:23,search engine hit,4886f805,,,,,,,,...,2018-05-18,,9,Cantagalo,Android 7,Brazil,Rio de Janeiro,Smartphone,Organic,New
3,2018-05-18 00:11:56,checkout,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,...,2018-05-18,Samsung,9,Cantagalo,Android 7,Brazil,Rio de Janeiro,Smartphone,Organic,New
4,2018-05-18 00:11:35,viewed product,4886f805,,9287.0,Samsung Galaxy J7 Prime,Muito Bom,32GB,Dourado,,...,2018-05-18,Samsung,9,Cantagalo,Android 7,Brazil,Rio de Janeiro,Smartphone,Organic,New


In [65]:
trainingDF = eventsWithLabel3.loc[eventsWithLabel3['label'].notnull()]
trainingDF.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,fecha,marca,cantEventos,city,SO,country,region,device_type,channel,new_vs_returning
9,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,...,2018-05-18,iPhone,65,São Paulo,Android 5,Brazil,Sao Paulo,Smartphone,Paid,Returning
10,2018-05-18 00:23:33,viewed product,ad93850f,,318.0,iPhone 5s,Muito Bom,64GB,Prateado,,...,2018-05-18,iPhone,65,São Paulo,Android 5,Brazil,Sao Paulo,Smartphone,Paid,Returning
11,2018-05-18 00:16:10,viewed product,ad93850f,,5907.0,iPhone 5s,Bom - Sem Touch ID,16GB,Cinza espacial,,...,2018-05-18,iPhone,65,São Paulo,Android 5,Brazil,Sao Paulo,Smartphone,Paid,Returning
12,2018-05-18 00:14:55,viewed product,ad93850f,,6023.0,iPhone 5s,Bom - Sem Touch ID,16GB,Dourado,,...,2018-05-18,iPhone,65,São Paulo,Android 5,Brazil,Sao Paulo,Smartphone,Paid,Returning
13,2018-05-18 00:11:26,ad campaign hit,ad93850f,/comprar/iphone/iphone-5s,,,,,,,...,2018-05-18,,65,São Paulo,Android 5,Brazil,Sao Paulo,Smartphone,Paid,Returning


In [66]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [67]:
eventosUnicos = eventsWithLabel3.drop_duplicates(subset="event", keep="first").reset_index()["event"]

In [68]:
label_encoder = LabelEncoder()
label_encoder.fit(eventosUnicos)
integer_encoded = label_encoder.transform(eventsWithLabel3["event"])

onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)


In [69]:
for i, event in enumerate(eventosUnicos):
    eventsWithLabel3[event] = onehot_encoded[:, label_encoder.transform(eventosUnicos)[i]]

In [70]:
eventsWithLabel3.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2319285 entries, 0 to 2319284
Data columns (total 38 columns):
timestamp            2319285 non-null datetime64[ns]
event                2319285 non-null object
person               2319285 non-null object
url                  189408 non-null object
sku                  1305987 non-null float64
model                1306837 non-null object
condition            1305987 non-null object
storage              1305987 non-null object
color                1305987 non-null object
skus                 501073 non-null object
search_term          112253 non-null object
staticpage           2319285 non-null float64
campaign_source      189559 non-null object
search_engine        105685 non-null object
screen_resolution    204066 non-null object
browser_version      204069 non-null object
label                1160977 non-null float64
dayweek              2319285 non-null int64
fecha                2319285 non-null object
marca                2319285 

In [73]:
eventsWithSessions = eventsWithLabel3.sort_values(by = ["person", "timestamp"])
eventsWithSessions["time_diff"] = (eventsWithSessions.timestamp - (eventsWithSessions.timestamp.shift())) / np.timedelta64(1, 's')
# eventsWithSeasons["time_diff"] = eventsWithSeasons.timestamp.diff()
eventsWithSessions.loc[eventsWithSessions.person != eventsWithSessions.person.shift(), "time_diff"] = 0


In [74]:
eventsWithSessions["season_change"] = ((eventsWithSessions.event == "visited site") & (eventsWithSessions.time_diff > 1800.0)) | (eventsWithSessions.time_diff > 36000.0) | (eventsWithSessions.person != eventsWithSessions.person.shift())
eventsWithSessions["season_id"] = eventsWithSessions.groupby("person")["season_change"].cumsum()

In [75]:
eventsWithSessions.info(null_counts=True, verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2319285 entries, 2246122 to 2218426
Data columns (total 41 columns):
timestamp            2319285 non-null datetime64[ns]
event                2319285 non-null object
person               2319285 non-null object
url                  189408 non-null object
sku                  1305987 non-null float64
model                1306837 non-null object
condition            1305987 non-null object
storage              1305987 non-null object
color                1305987 non-null object
skus                 501073 non-null object
search_term          112253 non-null object
staticpage           2319285 non-null float64
campaign_source      189559 non-null object
search_engine        105685 non-null object
screen_resolution    204066 non-null object
browser_version      204069 non-null object
label                1160977 non-null float64
dayweek              2319285 non-null int64
fecha                2319285 non-null object
marca                23

In [76]:
#Eliminar URL, SKU, campaign_source, search_engine, screen_resolution, browser_version, search_term, staticpage
eventsWithSessions

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,searched products,generic listing,visited site,ad campaign hit,brand listing,lead,conversion,time_diff,season_change,season_id
2246122,2018-05-17 12:27:47,checkout,0008ed71,,3372.0,Samsung Galaxy S6 Flat,Muito Bom,32GB,Dourado,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,1.0
2246125,2018-05-17 13:44:59,visited site,0008ed71,,,,,,,,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4632.0,True,2.0
2246123,2018-05-17 13:45:00,checkout,0008ed71,,8247.0,iPhone SE,Bom,64GB,Cinza espacial,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,False,2.0
2246126,2018-05-17 16:21:54,visited site,0008ed71,,,,,,,,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,9414.0,True,3.0
2246124,2018-05-17 16:22:06,generic listing,0008ed71,,,,,,,"6594,6651,6664,7253,2820,6706,6721,12606,480,1...",...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,12.0,False,3.0
2246121,2018-05-17 16:28:37,checkout,0008ed71,,7505.0,LG G4 H818P,Bom,32GB,Preto,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,391.0,False,3.0
10989,2018-05-03 22:08:29,visited site,00091926,,,,,,,,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,True,1.0
10988,2018-05-03 22:08:35,viewed product,00091926,,8568.0,Motorola Moto X Style,Muito Bom,32GB,Preto,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,False,1.0
10983,2018-05-03 22:08:51,viewed product,00091926,,14734.0,Samsung Galaxy A7 2017,Novo,32GB,Preto,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,False,1.0
10987,2018-05-03 22:09:25,viewed product,00091926,,8568.0,Motorola Moto X Style,Muito Bom,32GB,Preto,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.0,False,1.0


In [77]:
eventsWithSessions.groupby(eventsWithSessions.new_vs_returning).count()

Unnamed: 0_level_0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,searched products,generic listing,visited site,ad campaign hit,brand listing,lead,conversion,time_diff,season_change,season_id
new_vs_returning,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
New,666133,666133,666133,59643,362841,362973,362841,362841,362841,148415,...,666133,666133,666133,666133,666133,666133,666133,666133,666133,666133
Returning,1653152,1653152,1653152,129765,943146,943864,943146,943146,943146,352658,...,1653152,1653152,1653152,1653152,1653152,1653152,1653152,1653152,1653152,1653152


In [78]:
eventsWithSessions.loc[eventsWithSessions["person"] == "fe4029d3"]

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,searched products,generic listing,visited site,ad campaign hit,brand listing,lead,conversion,time_diff,season_change,season_id
556036,2018-05-23 11:56:47,visited site,fe4029d3,,,,,,,,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,True,1.0
555997,2018-05-23 11:56:48,conversion,fe4029d3,,12618.0,Samsung Galaxy Gran Neo Duos,Bom,8GB,Preto,,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,False,1.0
555994,2018-05-23 11:57:04,viewed product,fe4029d3,,2820.0,Samsung Galaxy Win Duos,Bom,8GB,Branco,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,False,1.0
555995,2018-05-23 11:57:09,viewed product,fe4029d3,,2820.0,Samsung Galaxy Win Duos,Bom,8GB,Branco,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,False,1.0
556013,2018-05-23 12:04:17,generic listing,fe4029d3,,,,,,,"6636,1061,6707,2750,7224,12605,12619,11346,277...",...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,428.0,False,1.0
556014,2018-05-23 12:04:29,brand listing,fe4029d3,,,,,,,"6357,3371,6371,2777,10896,2718,3191,6791,2893,...",...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,12.0,False,1.0
556015,2018-05-23 12:05:00,brand listing,fe4029d3,,,,,,,2796,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,31.0,False,1.0
555970,2018-05-23 12:05:07,viewed product,fe4029d3,,2796.0,Samsung Galaxy Note 3,Bom,32GB,Preto,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,False,1.0
556016,2018-05-23 12:05:09,brand listing,fe4029d3,,,,,,,"6357,3371,6371,2777,10896,2718,3191,6791,2893,...",...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,False,1.0
555966,2018-05-23 12:05:17,viewed product,fe4029d3,,10896.0,Samsung Galaxy A7 2017,Bom,32GB,Dourado,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,False,1.0


In [79]:
#Obtener las personas con conversiones
dataConversion = eventsWithSessions.loc[eventsWithSessions['event'] == 'conversion']
personasCompraron = dataConversion.drop_duplicates(subset = 'person', keep = 'first')['person'].tolist()
dataPersonasCompraron = eventsWithSessions.loc[eventsWithSessions['person'].isin(personasCompraron)]

In [80]:
# eventsWithSeasons["season_duration"] = eventsWithSeasons.groupby(["person", "season_id"])["time_diff"].

dataPersonasCompraron["aConversion"] = (dataPersonasCompraron.event == "conversion") | (dataPersonasCompraron.person != dataPersonasCompraron.person.shift())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [81]:
dataPersonasCompraron["conversions"] = dataPersonasCompraron.groupby("person")["aConversion"].cumsum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [82]:
dataPersonasCompraron

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,visited site,ad campaign hit,brand listing,lead,conversion,time_diff,season_change,season_id,aConversion,conversions
657435,2018-05-17 11:11:45,generic listing,000ba417,,,,,,,"6594,6650,6663,1059,12606,11346,2774,7337,2773...",...,0.0,0.0,0.0,0.0,0.0,0.0,True,1.0,True,1.0
657455,2018-05-17 11:11:45,visited site,000ba417,,,,,,,,...,1.0,0.0,0.0,0.0,0.0,0.0,False,1.0,False,1.0
657401,2018-05-17 11:13:28,viewed product,000ba417,,9454.0,Samsung Galaxy A7 2016,Bom,16GB,Dourado,,...,0.0,0.0,0.0,0.0,0.0,103.0,False,1.0,False,1.0
657394,2018-05-17 11:13:38,viewed product,000ba417,,9468.0,Samsung Galaxy A7 2016,Bom,16GB,Rosa,,...,0.0,0.0,0.0,0.0,0.0,10.0,False,1.0,False,1.0
657397,2018-05-17 11:13:40,viewed product,000ba417,,9440.0,Samsung Galaxy A7 2016,Bom,16GB,Preto,,...,0.0,0.0,0.0,0.0,0.0,2.0,False,1.0,False,1.0
657399,2018-05-17 11:13:45,viewed product,000ba417,,9442.0,Samsung Galaxy A7 2016,Excelente,16GB,Preto,,...,0.0,0.0,0.0,0.0,0.0,5.0,False,1.0,False,1.0
657396,2018-05-17 11:13:46,viewed product,000ba417,,9441.0,Samsung Galaxy A7 2016,Muito Bom,16GB,Preto,,...,0.0,0.0,0.0,0.0,0.0,1.0,False,1.0,False,1.0
657434,2018-05-17 11:13:54,generic listing,000ba417,,,,,,,"6594,6650,6663,1059,12606,11346,2774,7337,2773...",...,0.0,0.0,0.0,0.0,0.0,8.0,False,1.0,False,1.0
657390,2018-05-17 11:14:04,viewed product,000ba417,,10744.0,Samsung Galaxy A3 2016,Excelente,16GB,Rose,,...,0.0,0.0,0.0,0.0,0.0,10.0,False,1.0,False,1.0
657400,2018-05-17 11:14:07,viewed product,000ba417,,10742.0,Samsung Galaxy A3 2016,Bom,16GB,Rose,,...,0.0,0.0,0.0,0.0,0.0,3.0,False,1.0,False,1.0


In [83]:
dataFirstConversion = dataPersonasCompraron.loc[dataPersonasCompraron.conversions == 1.0].groupby("person")["time_diff"].agg({"timeFirstConversion": "sum" })

is deprecated and will be removed in a future version
  """Entry point for launching an IPython kernel.


In [84]:
eventsWithSessions = pd.merge(eventsWithSessions, dataFirstConversion, how='left', on = 'person')

In [85]:
eventsWithSessions['timeFirstConversion'] = eventsWithSessions['timeFirstConversion'].fillna(value='0.0')

In [86]:
dataSession =  eventsWithSessions.groupby(["person", "season_id"], as_index=False)["time_diff"].agg({"seasonDuration": "sum" })
dataSession["seasonDuration"] = dataSession["seasonDuration"]/60

In [87]:
dataSession = dataSession.loc[dataSession.seasonDuration > 0.0].groupby("person")["seasonDuration"].agg({"seasonDuration": "mean" })

is deprecated and will be removed in a future version
  """Entry point for launching an IPython kernel.


In [88]:
eventsWithSessions = pd.merge(eventsWithSessions, dataSession, how='left', on = 'person')

In [96]:
eventsWithSessions.groupby("person").sum()

Unnamed: 0_level_0,sku,staticpage,label,dayweek,cantEventos,viewed product,search engine hit,checkout,searched products,generic listing,visited site,ad campaign hit,brand listing,lead,conversion,time_diff,season_change,season_id,seasonDuration
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0008ed71,19124.0,0.0,0.0,18,36,0.0,0.0,3.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,14450.0,3.0,14.0,7.225000e+02
00091926,2689082.0,0.0,0.0,1665,200704,372.0,0.0,2.0,0.0,0.0,34.0,15.0,25.0,0.0,0.0,2411014.0,26.0,6241.0,6.923938e+05
00091a7a,16463.0,0.0,0.0,0,100,3.0,0.0,0.0,0.0,0.0,1.0,1.0,5.0,0.0,0.0,347.0,1.0,10.0,5.783333e+01
000ba417,1049583.0,0.0,0.0,736,42436,153.0,1.0,6.0,0.0,14.0,6.0,1.0,24.0,0.0,1.0,784657.0,4.0,543.0,6.734973e+05
000c79fe,39776.0,0.0,0.0,17,289,3.0,1.0,1.0,9.0,1.0,1.0,1.0,0.0,0.0,0.0,620.0,1.0,17.0,1.756667e+02
000e4d9e,1469071.0,0.0,0.0,673,168921,339.0,5.0,1.0,0.0,17.0,13.0,19.0,17.0,0.0,0.0,1230627.0,10.0,1319.0,8.429795e+05
000e619d,200221.0,0.0,0.0,197,4624,28.0,3.0,1.0,6.0,8.0,5.0,6.0,11.0,0.0,0.0,252330.0,4.0,197.0,7.149350e+04
001001be,117727.0,0.0,0.0,138,4624,41.0,0.0,3.0,17.0,3.0,3.0,0.0,0.0,0.0,1.0,497248.0,3.0,71.0,1.878492e+05
0010e89a,9142.0,0.0,0.0,8,16,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,43.0,1.0,4.0,2.866667e+00
0016c4b5,16942.0,0.0,0.0,4,16,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,1.0,4.0,1.333333e-01


In [90]:
def logisticReg(x_train, x_test, y_train, y_test):
    logReg = LogisticRegression()
    logReg.fit(x_train, y_train)
    y_pred = logReg.predict(x_test)
    return np.sqrt(mean_squared_error(y_test, y_pred))
#TAMBIEN PUEDEN USAR PARA VER EL ACCURACY
#ACCURACY = accuracy_score(y_test, y_pred)

In [91]:
def knn(k, x_train, x_test, y_train, y_test):
    knn = KNeighborsClassifier(n_neighbors=k, metric='manhattan')
    knn.fit(x_train, y_train)
    y_pred = knn.predict(x_test)
    return np.sqrt(mean_squared_error(y_test, y_pred))
#TAMBIEN PUEDEN USAR PARA VER EL ACCURACY
#ACCURACY = accuracy_score(y_test, y_pred)

In [92]:
def decisionTree(max_depth, max_features, x_train, x_test, y_train, y_test):
    dt = DecisionTreeClassifier(max_depth=max_depth, max_features=max_features)
    dt.fit(x_train, y_train)
    y_pred = dt.predict(x_test)
    return np.sqrt(mean_squared_error(y_test, y_pred))
#TAMBIEN PUEDEN USAR PARA VER EL ACCURACY
#ACCURACY = accuracy_score(y_test, y_pred)

In [93]:
def xgboost(max_depth, n_estimators, x_train, x_test, y_train, y_test):
    xgb = XGBClassifier(max_depth=max_depth, n_estimators=n_estimators)
    xgb.fit(x_train, y_train)
    y_pred = xgb.predict(x_test)
    return np.sqrt(mean_squared_error(y_test, y_pred))
#TAMBIEN PUEDEN USAR PARA VER EL ACCURACY
#ACCURACY = accuracy_score(y_test, y_pred)