In [None]:
import pandas as pd
import numpy as np
import calendar

# TODO: remove these imports from here; they belong in the modelling notebook.
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Random seed passed to the train/test split and to the decision tree further
# down, so that both are reproducible across runs.
seed = 42

In [None]:
# Raw inputs: events, sessions and the three lookup tables.
df_events = pd.read_csv('./data/events_up_to_01062018.csv', low_memory=False)
df_sessions = pd.read_csv('./data/sessions.csv', low_memory=False)
df_brands = pd.read_csv('data/brands.csv')
df_os = pd.read_csv('data/os.csv')
df_browsers = pd.read_csv('data/browsers.csv')

# Training labels, collapsed to one row per person (indexed by person).
df_y = pd.read_csv('data/labels_training_set.csv').groupby('person').sum()

# Build one wide events frame by attaching sessions and the lookup tables.
# NOTE(review): events and sessions are joined on their row index, which
# presumes both files are row-aligned - confirm against the data.
df = (
    df_events
    .merge(df_sessions, how='left', left_index=True, right_index=True)
    .merge(df_browsers, how='left', on='browser_version')
    .merge(df_os, how='left', on='operating_system_version')
    .merge(df_brands, how='left', on='model')
)

In [None]:
# Attributes with few distinct values are stored as categoricals to save memory.
categorical_columns = [
    'event', 'condition', 'storage', 'search_engine', 'channel', 'device_type',
    'brand', 'operating_system', 'browser',
]
for column in categorical_columns:
    df[column] = df[column].astype('category')

# Timestamps are easier to work with as real datetimes.
df['timestamp'] = pd.to_datetime(df['timestamp'])

In [None]:
# Calendar features derived from the event timestamp.
df['month_number'] = df['timestamp'].dt.month
df['month_name'] = df['month_number'].apply(lambda x: calendar.month_abbr[x])
df['week_day'] = df['timestamp'].dt.weekday
# `Series.dt.week` and `Series.dt.weekday_name` are not available in current
# pandas; `isocalendar().week` (ISO week number) and `day_name()` replace them.
# NOTE(review): isocalendar() yields a nullable unsigned integer column rather
# than int64 - confirm nothing downstream depends on the exact dtype.
df['week_number'] = df['timestamp'].dt.isocalendar().week
df['week_day_name'] = df['timestamp'].dt.day_name()
df['day_date'] = df['timestamp'].dt.to_period('D')
df['day_dom'] = df['timestamp'].dt.day
df['day_doy'] = df['timestamp'].dt.dayofyear
df['hour_count'] = df['timestamp'].dt.hour

In [None]:
# sort_values returns a new frame, so the result has to be assigned for `df`
# to actually end up ordered by person and time. Only a preview is displayed.
df = df.sort_values(['person', 'timestamp'])
df.head()

In [None]:
# Boolean flag columns marking the two event types that are counted per user.
for flag_column, event_name in (('is_conversion', 'conversion'),
                                ('is_checkout', 'checkout')):
    df[flag_column] = df['event'] == event_name

### Users DataFrame

In [None]:
# Total checkouts and conversions per person.
# Columns are renamed through an explicit mapping (not by position) so a
# different output order from `agg` can never swap the two totals.
udf_tmp1 = (
    df.groupby('person')
    .agg({'is_conversion': 'sum', 'is_checkout': 'sum'})
    .rename(columns={'is_conversion': 'total_conversions',
                     'is_checkout': 'total_checkouts'})
    [['total_conversions', 'total_checkouts']]
    .astype('int')
)

# 0/1 indicators: did the person ever convert / check out?
udf_tmp1['has_conversion'] = (udf_tmp1['total_conversions'] > 0).astype('int')
udf_tmp1['has_checkout'] = (udf_tmp1['total_checkouts'] > 0).astype('int')

In [None]:
# Number of users, followed by the users with more than 10 conversions
# (largest first).
display(len(udf_tmp1))
frequent_converters = udf_tmp1.loc[udf_tmp1['total_conversions'] > 10]
frequent_converters.sort_values(by='total_conversions', ascending=False)

In [None]:
# Conversions and checkouts per person restricted to May (month_number == 5).
# Columns are renamed through an explicit mapping (not by position) so a
# different output order from `agg` can never swap the two totals.
gb = df[df['month_number'] == 5].groupby('person')
udf_tmp2 = (
    gb.agg({'is_conversion': 'sum', 'is_checkout': 'sum'})
    .rename(columns={'is_conversion': 'total_conversions_month_5',
                     'is_checkout': 'total_checkouts_month_5'})
    [['total_conversions_month_5', 'total_checkouts_month_5']]
    .astype('int')
)

# 0/1 indicators: any conversion / checkout during May?
udf_tmp2['has_conversion_month_5'] = (udf_tmp2['total_conversions_month_5'] > 0).astype('int')
udf_tmp2['has_checkout_month_5'] = (udf_tmp2['total_checkouts_month_5'] > 0).astype('int')

In [None]:
# Preview the first 10 rows of the May-only aggregates.
udf_tmp2.head(10)

In [None]:
# Combine the all-time and the May-only aggregates into one per-user frame.
# Users without any May activity get 0 in the May columns.
udf = (
    udf_tmp1
    .merge(udf_tmp2, how='outer', on='person')
    .fillna(0)
    .astype('int')
)
display(len(udf))
udf.head()

In [None]:
# Sanity check: rows in the user frame versus distinct persons in the raw
# events and in the merged events frame.
display(
    len(udf),
    len(df_events['person'].unique()),
    len(df['person'].unique()),
)
udf.head(20)

In [None]:
# Constant-label baselines (all 0s / all 1s) for the people that have no
# entry in the training labels.
gb = df[['person', 'week_number']].groupby('person')
dummy = gb.agg('sum')

# Multiplying by 0 keeps the per-person index while zeroing every value.
dummy = (dummy['week_number'] * 0).to_frame()
print("dummy-size: "+str(len(dummy))+" | y-size: "+str(len(df_y)))

# `df_y` is already indexed by person (it is grouped by person when loaded),
# so the label column can be taken directly. The previous code referenced an
# undefined `y_df` here.
y_df_tmp = df_y[['label']]

dummy_final = dummy.merge(y_df_tmp, how='outer', left_index=True, right_index=True, indicator=True)

# All-zeros frame restricted to people without a training label.
dummy['label'] = dummy['week_number'] * 0
dummy = dummy[['label']]
dummy = dummy[~dummy.index.isin(df_y.index)]
display(len(dummy))

# Same population obtained through the merge indicator.
unlabeled = dummy_final.query('_merge != "both"')
display(len(unlabeled))
# rename() returns a new frame, so the assignment below does not write into a
# slice of the merged frame.
dummy_final = unlabeled[['week_number']].rename(columns={'week_number': 'label'})
# NOTE(review): adding 1 turns every label into 1, which contradicts the
# "sum is zero" message printed below - confirm which baseline is intended.
dummy_final['label'] = dummy_final['label'] + 1
print("checking sum is zero: ")
display(dummy_final.sum())
dummy_final.head()

In [None]:
# Write the constant-label baseline (index plus `label` column, with header).
# NOTE(review): the file is named "zeros", but the cell that builds
# `dummy_final` adds 1 to every label - confirm the file name matches the content.
dummy_final.to_csv('submit-zeros.csv', header=True)

<Siguiente minado>

# Submission Framework

In [None]:
def require(x1, x2):
    """Check that two values are equal.

    Args:
        x1: Actual value.
        x2: Expected value.

    Raises:
        ValueError: If ``x1 != x2``. The message names both values, so the
            failure is understandable from the exception alone.
    """
    if x1 != x2:
        message = 'ERROR - {} must be equal to {}'.format(str(x1), str(x2))
        # Still printed so the message shows up in the cell output.
        print(message)
        raise ValueError(message)

def df_label_xor(df1, df2):
    """Return the rows whose index is present in only one of the two frames.

    Both frames are outer-merged on their indexes with a ``_merge`` indicator
    column; rows found in both frames are dropped. The indicator column is
    kept in the result.
    """
    outer = df1.merge(df2, how='outer', left_index=True, right_index=True, indicator=True)
    in_one_side_only = outer['_merge'] != 'both'
    return outer[in_one_side_only]
    
# Builds the X matrix and the y vector used for training
def fr1_extract_X_y(df, df_y, expected_rows=38829, expected_labels=19414):
    """Build the training matrix ``X`` and target ``y``.

    ``df`` (features) and ``df_y`` (labels) are inner-merged on their indexes,
    so only rows that have a label are kept.

    Args:
        df: Feature frame.
        df_y: Label frame containing a ``label`` column.
        expected_rows: Required number of rows in ``df``.
        expected_labels: Required number of rows in ``df_y`` and in the
            merged frame.

    Returns:
        Tuple ``(X, y)``: ``X`` holds the values of every merged column except
        ``label``; ``y`` holds the ``label`` column as an ``(n, 1)`` array.

    Raises:
        ValueError: If any of the size checks fails.
    """
    require(len(df), expected_rows)
    require(len(df_y), expected_labels)

    # Merge the frame that was passed in, not a notebook-level global.
    data = df.merge(df_y, how='inner', left_index=True, right_index=True)
    require(len(data), expected_labels)

    X = data.drop('label', axis=1).values
    # Labels are read from the merged frame so every y row belongs to the
    # same index entry as the matching X row.
    y = data[['label']].values

    return X, y

# Splits the data into the training and the test sets
def fr2_train_test_split(X, y, seed):
    """Stratified split holding out 34% of the rows for testing.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(X, y, test_size=0.34, stratify=y, random_state=seed)
    return tuple(parts)

# Model-specific step: one function like this is needed per algorithm
def fr3_decision_tree(X_train, X_test, y_train):
    """Fit a depth-3 gini decision tree on the training data and return its
    predictions for ``X_test``.

    The random state comes from the notebook-level ``seed``.
    """
    hyperparams = dict(
        criterion='gini',
        min_samples_leaf=5,
        min_samples_split=5,
        max_depth=3,
        random_state=seed,
    )
    classifier = DecisionTreeClassifier(**hyperparams)
    classifier.fit(X_train, y_train)
    return classifier.predict(X_test)
    
def fr4_accuracy_score(y_test, y_pred, model_name):
    """Print the accuracy of ``y_pred`` against ``y_test``, prefixed with
    ``model_name``. Nothing is returned."""
    score = accuracy_score(y_test, y_pred)
    report = '{} accuracy score: {}'.format(model_name, score)
    print(report)
    
# Builds the X matrix to predict on
def fr5_extract_X_to_predict(df, df_y, model, expected_rows=38829,
                             expected_labels=19414, expected_to_predict=19415):
    """Select the rows that have no training label and predict them.

    Args:
        df: Feature frame.
        df_y: Label frame containing a ``label`` column.
        model: Fitted estimator exposing ``predict``.
        expected_rows: Required number of rows in ``df``.
        expected_labels: Required number of rows in ``df_y``.
        expected_to_predict: Required number of rows left to predict.

    Returns:
        Tuple ``(data, predictions)``: the feature rows that were predicted
        and the model output for them.

    Raises:
        ValueError: If any of the size checks fails.
    """
    require(len(df), expected_rows)
    require(len(df_y), expected_labels)

    # Rows present in only one of the two frames, minus the bookkeeping
    # columns added by the merge.
    data = df_label_xor(df, df_y)
    data = data.drop(['label', '_merge'], axis=1)

    require(len(data), expected_to_predict)

    predictions = model.predict(data.values)
    return data, predictions

# Reports how many 1s were predicted
def fr6_1s_predicted(predictions):
    """Print the sum of ``predictions``, which for 0/1 predictions is the
    number of 1s. Nothing is returned."""
    ones_total = predictions.sum()
    print('Ammount of 1s: {}'.format(ones_total))
    
def fr7_to_csv(df, predictions, name_csv, expected_rows=19415):
    """Write the predictions to ``name_csv`` as an index + ``label`` CSV.

    Args:
        df: Frame whose index identifies each predicted row. It is not
            modified.
        predictions: One prediction per row of ``df``.
        name_csv: Destination path of the CSV file.
        expected_rows: Required number of rows in the submission.

    Raises:
        ValueError: If the submission does not have ``expected_rows`` rows.
    """
    # Build a standalone Series rather than adding a column to the caller's
    # frame, which would leak a `label` column into the features.
    submission = pd.Series(predictions, index=df.index, name='label')

    require(len(submission), expected_rows)

    submission.to_csv(name_csv, header=True)

In [None]:
# Training framework

# ONLY CHANGE THESE PARAMETERS
fr_df = udf
fr_df_y = df_y

# Nothing below this line needs to be touched
X, y = fr1_extract_X_y(df=fr_df, df_y=fr_df_y)
X_train, X_test, y_train, y_test = fr2_train_test_split(X=X, y=y, seed=seed)
y_pred = fr3_decision_tree(X_train=X_train, X_test=X_test, y_train=y_train)
fr4_accuracy_score(y_test=y_test, y_pred=y_pred, model_name='DecissionTreeClassifier')

In [None]:
# Prediction framework: the generated file is the one that gets submitted

# ONLY CHANGE THESE PARAMETERS
# The classifier built inside fr3_decision_tree is local to that function, so
# no notebook-level `tree` exists. Fit a model with the same hyperparameters
# on the training split here.
fr_model = DecisionTreeClassifier(criterion='gini',
                                  min_samples_leaf=5,
                                  min_samples_split=5,
                                  max_depth=3,
                                  random_state=seed)
fr_model.fit(X_train, y_train)

# Nothing below this line needs to be touched
X_to_predict, predictions = fr5_extract_X_to_predict(fr_df, fr_df_y, fr_model)
fr6_1s_predicted(predictions)
fr7_to_csv(X_to_predict, predictions, 'submission-decission-tree-1.csv')