In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from time import time
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

from sklearn.metrics import roc_auc_score


In [2]:
datos = pd.read_csv("events_up_to_01062018.csv",low_memory=False)

In [3]:
datos['timestamp'] = pd.to_datetime(datos['timestamp'])
datos['dia'] = datos['timestamp'].dt.day
datos['mes'] = datos['timestamp'].dt.month


In [4]:
quincenas = [False,0,2,4,6,8,10]

datos['mes_q'] = datos['mes'].apply(lambda x: quincenas[x])

datos['principio_fin'] = 0
datos.loc[datos['dia'] >15,'principio_fin'] = 1

datos['quincena'] = datos['principio_fin'] + datos['mes_q']

In [5]:
compras = datos.loc[datos['event'] == 'conversion',['person','quincena','timestamp']]
compras = compras.groupby(['person','quincena']).count()

compras = compras.unstack().fillna(0)
compras.columns = compras.columns.droplevel(0)
compras_por_q = ['primera_enero','segunda_enero','primera_febrero','segunda_febrero','primera_marzo'\
                        ,'segunda_marzo','primera_abril','segunda_abril','primera_mayo','segunda_mayo']

compras.columns= compras_por_q

In [6]:
for i in compras_por_q:
    compras.loc[compras[i]>0,i] = 1
compras

Unnamed: 0_level_0,primera_enero,segunda_enero,primera_febrero,segunda_febrero,primera_marzo,segunda_marzo,primera_abril,segunda_abril,primera_mayo,segunda_mayo
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
000ba417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
001001be,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
001804a2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
0019e639,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
001b0bf9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
0020152e,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
002aea56,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
002ed810,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
0038a117,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0043a48e,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [7]:
eventos_x_person = datos[['person','event','mes']]
eventos_x_person = eventos_x_person.loc[eventos_x_person['mes']<5]
eventos_x_person = eventos_x_person.groupby(['person','event']).agg({'mes':'count'})
eventos_x_person = eventos_x_person.unstack(-1)
eventos_x_person.columns = eventos_x_person.columns.droplevel(0)
eventos_x_person = eventos_x_person.reset_index(level=0,drop=False)
eventos_x_person.fillna(0,inplace=True)

In [8]:
model = pd.merge(eventos_x_person,compras,on='person',how='left')
model.fillna(0,inplace=True)
model.head()

Unnamed: 0,person,ad campaign hit,brand listing,checkout,conversion,generic listing,lead,search engine hit,searched products,staticpage,...,primera_enero,segunda_enero,primera_febrero,segunda_febrero,primera_marzo,segunda_marzo,primera_abril,segunda_abril,primera_mayo,segunda_mayo
0,00091a7a,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0019e639,13.0,52.0,11.0,1.0,15.0,0.0,6.0,10.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,001ca5ee,1.0,3.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,001dfc31,3.0,0.0,0.0,0.0,2.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,001e9aea,11.0,4.0,3.0,0.0,0.0,0.0,10.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
model.drop(columns={'conversion','segunda_mayo'},inplace=True)
features = model.iloc[:,1:19].columns
features

Index(['ad campaign hit', 'brand listing', 'checkout', 'generic listing',
       'lead', 'search engine hit', 'searched products', 'staticpage',
       'viewed product', 'visited site', 'primera_enero', 'segunda_enero',
       'primera_febrero', 'segunda_febrero', 'primera_marzo', 'segunda_marzo',
       'primera_abril', 'segunda_abril'],
      dtype='object')

In [10]:
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier,\
                             GradientBoostingClassifier, ExtraTreesClassifier,\
                             BaggingClassifier, VotingClassifier)
rnd_clf = RandomForestClassifier(n_estimators = 100 , criterion = 'entropy',random_state = 0)
rnd_clf.fit(model.iloc[:,1:19],model['primera_mayo'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [11]:
for name, importance in zip(features, rnd_clf.feature_importances_):
    print('"' + name + '"'+" : ",importance,',')

"ad campaign hit" :  0.11116898149671445 ,
"brand listing" :  0.11627447275048773 ,
"checkout" :  0.07562332388002177 ,
"generic listing" :  0.10586079085468096 ,
"lead" :  0.0205583376536257 ,
"search engine hit" :  0.09302297402912287 ,
"searched products" :  0.0918162266823346 ,
"staticpage" :  0.0310447335944957 ,
"viewed product" :  0.1832806971088985 ,
"visited site" :  0.10979415066599184 ,
"primera_enero" :  0.0015601337166751105 ,
"segunda_enero" :  0.002375365127647378 ,
"primera_febrero" :  0.0055464850425284064 ,
"segunda_febrero" :  0.004488260648631165 ,
"primera_marzo" :  0.00999861520338791 ,
"segunda_marzo" :  0.008994795301255826 ,
"primera_abril" :  0.00750963264259272 ,
"segunda_abril" :  0.021082023600907336 ,


In [12]:
from sklearn.model_selection import train_test_split

test_s = 0.25
random_s = 0

x = np.array(model[features])
y = np.array(model['primera_mayo'])

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_s, random_state=random_s)

In [13]:
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score

from sklearn.metrics import roc_auc_score

print('VALORES INICIALES PARA K')

k_valores = [5,7,10,20,50]
mejor_k = 0
mejor_precision = 0

for k in k_valores:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train, y_train)
    pred = knn.predict(x_test)
    precision = roc_auc_score(np.array(y_test), pred)
    print('La precisión para k=', k, 'es:',precision)
    if precision > mejor_precision:
        mejor_precision = precision
        mejor_k = k
        
print('Con k=', mejor_k, 'se obtuvo la mayor precisión:', mejor_precision)

VALORES INICIALES PARA K
La precisión para k= 5 es: 0.4992934526613283
La precisión para k= 7 es: 0.4997644842204428
La precisión para k= 10 es: 0.5
La precisión para k= 20 es: 0.5
La precisión para k= 50 es: 0.5
Con k= 10 se obtuvo la mayor precisión: 0.5


In [14]:
k_valores = []

for i in range(mejor_k-2, mejor_k+3):
    k_valores.append(i)
    
for k in k_valores:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train, y_train)
    pred = knn.predict(x_test)
    precision = roc_auc_score(y_test, pred)
    print('La precisión para k =', k, 'es:',precision)
    if precision > mejor_precision:
        mejor_precision = precision
        mejor_k = k
        
print('Con k=', mejor_k, 'se obtuvo la mayor precisión:', mejor_precision)

La precisión para k = 8 es: 0.5
La precisión para k = 9 es: 0.5
La precisión para k = 10 es: 0.5
La precisión para k = 11 es: 0.5
La precisión para k = 12 es: 0.5
Con k= 10 se obtuvo la mayor precisión: 0.5


In [15]:
print(pd.Series(y_test).value_counts())

0.0    2123
1.0      61
dtype: int64
