In [114]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [115]:
df1 = pd.read_csv('data/features_basicas.csv', low_memory=False, index_col='person')
df1["proprosion"] = np.where(df1["cant_viewed_product"]>0, (df1["cant_checkouts"]/df1["cant_viewed_product"]),0)

In [116]:
df2 = pd.read_csv('data/vistas_promedio.csv', low_memory=False, index_col='person')
df3 = pd.read_csv('data/featureUsuarioRealizaAlMenos10EventosEn20Minutos.csv', low_memory=False, index_col='person')

df_unidos = df1.join(df2).fillna(0)
df_unidos = df_unidos.join(df3).fillna(0)

In [117]:
labels = pd.read_csv('data/labels_training_set.csv', low_memory=False, index_col='person')
df_test = df_unidos.join(labels, how= "inner")
df_test

Unnamed: 0_level_0,cant_conversions,cant_checkouts,cant_viewed_product,cant_searched_product,cant_ad_campaign_hit,ad_campaign_hit,lead,cant_brand_listing,brand listing,total_sesiones,promedio_eventos_por_sesion,proprosion,mas_prom,conte_10,label
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
ad93850f,0.0,1.0,20.0,0.0,10.0,True,False,15.0,True,5,13.000000,0.050000,False,True,0
1b9f7cf6,0.0,1.0,9.0,8.0,0.0,False,False,3.0,True,3,8.333333,0.111111,False,True,0
de8fe91b,0.0,1.0,27.0,13.0,0.0,False,False,3.0,True,2,26.500000,0.037037,False,True,0
45baf068,0.0,1.0,6.0,0.0,5.0,True,False,3.0,True,1,20.000000,0.166667,False,True,0
99abca5a,2.0,18.0,630.0,5.0,12.0,True,True,30.0,True,49,16.714286,0.028571,True,True,0
64f45e8d,0.0,2.0,32.0,1.0,1.0,True,False,1.0,True,3,15.666667,0.062500,False,True,0
ade6b3bf,0.0,1.0,92.0,5.0,13.0,True,False,3.0,True,12,11.583333,0.010870,True,True,0
d73ae9ef,0.0,1.0,5.0,0.0,0.0,False,False,4.0,True,1,17.000000,0.200000,False,True,0
b2cde41a,0.0,1.0,27.0,0.0,1.0,True,False,0.0,False,4,9.250000,0.037037,False,True,0
15ea8012,5.0,4.0,23.0,6.0,7.0,True,False,0.0,False,3,25.000000,0.173913,False,True,0


In [118]:
features = df_test.columns.tolist()
features.remove('label')
print(features)

['cant_conversions', 'cant_checkouts', 'cant_viewed_product', 'cant_searched_product', 'cant_ad_campaign_hit', 'ad_campaign_hit', 'lead', 'cant_brand_listing', 'brand listing', 'total_sesiones', 'promedio_eventos_por_sesion', 'proprosion', 'mas_prom', 'conte_10']


In [119]:
rf = RandomForestClassifier(n_estimators=10000, n_jobs=-1, min_samples_split=150,\
                            class_weight='balanced')

In [120]:
def partir_set(inicio,fin):
    df_fit = pd.concat([df_test.iloc[ : inicio],df_test.iloc[ fin:]])
    df_prub = df_test.iloc[inicio:fin]
    return df_fit,df_prub


In [130]:
df_1,df_2 = partir_set(1942,3848)
X_train = df_1[features]
Y_train = df_1['label']

X_test = df_2[features]
Y_test = df_2['label']


In [131]:
rf.fit(X_train,Y_train)
rf.score(X_test,Y_test)

0.82371458551941235

In [133]:
Y_pred = rf.predict(X_test)
print(roc_auc_score(Y_test, Y_pred))

0.75921961326


In [134]:
Y_truths = Y_test
print(classification_report(Y_truths, Y_pred))


             precision    recall  f1-score   support

          0       0.98      0.83      0.90      1810
          1       0.18      0.69      0.28        96

avg / total       0.94      0.82      0.87      1906



In [135]:
print(confusion_matrix(Y_truths, Y_pred))

[[1504  306]
 [  30   66]]


In [137]:
df_1,df_2 = partir_set(0,1942)
X_train = df_1[features]
Y_train = df_1['label']

X_test = df_2[features]
Y_test = df_2['label']

In [138]:
rf.fit(X_train,Y_train)
rf.score(X_test,Y_test)

0.79042224510813597

In [139]:
Y_pred = rf.predict(X_test)
print(roc_auc_score(Y_test, Y_pred))

0.70691071496


In [140]:
Y_truths = Y_test
print(classification_report(Y_truths, Y_pred))



             precision    recall  f1-score   support

          0       0.97      0.80      0.88      1826
          1       0.16      0.61      0.26       116

avg / total       0.92      0.79      0.84      1942



In [141]:
print(confusion_matrix(Y_truths, Y_pred))

[[1464  362]
 [  45   71]]
