# [75.06 / 95.58] Organización de Datos <br> Trabajo Práctico 2: Machine Learning

# Feature Engineering

**Grupo 30: Datatouille**

**http://fdelmazo.github.io/7506-Datos/**

En este notebook se buscan atributos nuevos para concatenar al set de datos original y así poder armar un modelo predictivo más robusto y eficiente.

In [None]:
import pandas as pd
import numpy as np
import calendar

from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Fixed random seed.
# NOTE(review): not referenced in the cells visible here; presumably passed as
# `random_state` to the scikit-learn utilities imported above -- confirm.
seed = 42

In [None]:
# Load the raw event log into `df`; every feature below is derived from it.
# `low_memory=False` asks pandas to parse the file in a single pass rather than
# in chunks, which (per the pandas docs) avoids mixed-type inference per column.
df = pd.read_csv('./data/events_up_to_01062018.csv', low_memory=False)

In [None]:
# Parse the event time once so that the `.dt` accessor can be used on it.
event_time = pd.to_datetime(df['timestamp'])
df['timestamp'] = event_time
df['month_number'] = event_time.dt.month

# Boolean flags for the two event types that the following cells aggregate.
for flag_col, event_name in (('is_conversion', 'conversion'), ('is_checkout', 'checkout')):
    df[flag_col] = df['event'].eq(event_name)



### user-features: Checkouts y conversiones por usuario

In [None]:
# Lifetime totals per user: how many conversion and checkout events each has.
event_totals = df.groupby('person').agg({'is_conversion': 'sum', 'is_checkout': 'sum'})
# Bind the output labels by name so each total is tied to its source flag.
udf_tmp1 = event_totals.rename(columns={
    'is_conversion': 'total_conversions',
    'is_checkout': 'total_checkouts',
})
for total_col in ('total_checkouts', 'total_conversions'):
    udf_tmp1[total_col] = udf_tmp1[total_col].astype('int')

# Indicator flags: does the user have at least one event of each type?
udf_tmp1['has_checkout'] = udf_tmp1['total_checkouts'].gt(0)
udf_tmp1['has_conversion'] = udf_tmp1['total_conversions'].gt(0)

# Store every column, the boolean flags included, as 0/1 integers.
udf_tmp1 = udf_tmp1.astype('int')

In [None]:
# Number of users that have a row in the per-user totals.
display(len(udf_tmp1))
# Users with more than 10 conversions overall, heaviest buyers first.
heavy_buyers = udf_tmp1.loc[udf_tmp1['total_conversions'] > 10]
heavy_buyers.sort_values('total_conversions', ascending=False)

### user-monthly-features: Checkouts y conversiones por usuario por mes

In [None]:
# Start from a column-less frame indexed by every distinct user, so that users
# with no events in a given month still keep a row after the outer merges.
udf_tmp2 = df[['person']].drop_duplicates().set_index('person')
display(udf_tmp2.head())

# One set of totals/flags per month, for months 1 through 5.
for month in range(1, 6):
    conv_col = 'total_conversions_month_{}'.format(month)
    chk_col = 'total_checkouts_month_{}'.format(month)

    month_events = df[df['month_number'] == month]
    month_totals = month_events.groupby('person').agg({'is_conversion': 'sum', 'is_checkout': 'sum'})
    month_totals.columns = [conv_col, chk_col]

    month_totals[chk_col] = month_totals[chk_col].astype('int')
    month_totals[conv_col] = month_totals[conv_col].astype('int')

    month_totals['has_checkout_month_{}'.format(month)] = month_totals[chk_col] > 0
    month_totals['has_conversion_month_{}'.format(month)] = month_totals[conv_col] > 0

    udf_tmp2 = udf_tmp2.merge(month_totals, how='outer', left_index=True, right_index=True)

# Users without events in a month come out of the merge as NaN: treat as 0,
# then store counts and flags alike as integers.
udf_tmp2 = udf_tmp2.fillna(0).astype('int')
udf_tmp2.head(2)

### user-may-features: Checkouts y conversiones por usuario en los meses previos a mayo (enero a abril; mayo es el último mes registrado)

In [None]:
# Conversions and checkouts per user over every month EXCEPT May (month 5);
# the resulting columns carry the `month_1_4` suffix.
non_may_events = df[df['month_number'] != 5]
udf_tmp3 = non_may_events.groupby('person').agg({'is_conversion': 'sum', 'is_checkout': 'sum'})
udf_tmp3 = udf_tmp3.rename(columns={
    'is_conversion': 'total_conversions_month_1_4',
    'is_checkout': 'total_checkouts_month_1_4',
})

for total_col in ('total_checkouts_month_1_4', 'total_conversions_month_1_4'):
    udf_tmp3[total_col] = udf_tmp3[total_col].astype('int')

# Indicator flags: at least one event of each type outside May.
udf_tmp3['has_checkout_month_1_4'] = udf_tmp3['total_checkouts_month_1_4'].gt(0)
udf_tmp3['has_conversion_month_1_4'] = udf_tmp3['total_conversions_month_1_4'].gt(0)

# Store counts and flags alike as integers.
udf_tmp3 = udf_tmp3.astype('int')
udf_tmp3.head(10)

### user-lastweek-features: Checkouts y conversiones por usuario en la última semana registrada

In [None]:
# Only events strictly after 2018-05-23 00:00 are counted as "last week".
last_week_start = pd.to_datetime('2018-05-23')
recent_events = df.loc[df['timestamp'] > last_week_start]

udf_tmp4 = recent_events.groupby('person').agg({'is_conversion': 'sum', 'is_checkout': 'sum'})
udf_tmp4 = udf_tmp4.rename(columns={
    'is_conversion': 'total_conversions_last_week',
    'is_checkout': 'total_checkouts_last_week',
})

for total_col in ('total_checkouts_last_week', 'total_conversions_last_week'):
    udf_tmp4[total_col] = udf_tmp4[total_col].astype('int')

# Indicator flags: at least one event of each type in that window.
udf_tmp4['has_checkout_last_week'] = udf_tmp4['total_checkouts_last_week'].gt(0)
udf_tmp4['has_conversion_last_week'] = udf_tmp4['total_conversions_last_week'].gt(0)

# Store counts and flags alike as integers.
udf_tmp4 = udf_tmp4.astype('int')
udf_tmp4.head(10)

### user-differentmonths-features: Cuán esparcidas (en meses) están las conversiones de los usuarios

In [None]:
# Add up the five monthly 0/1 conversion flags row by row: the result is the
# number of distinct months (out of months 1-5) in which each user converted.
monthly_flag_cols = ['has_conversion_month_{}'.format(month) for month in range(1, 6)]
udf_tmp5 = udf_tmp2[monthly_flag_cols].sum(axis=1)
display(len(udf_tmp2))
display(len(udf_tmp5))

udf_tmp5 = udf_tmp5.to_frame('amount_of_months_that_has_bought')

months_bought = udf_tmp5['amount_of_months_that_has_bought']
for threshold in range(6):
    # The count is cumulative: users whose value is >= threshold, so the
    # first line (threshold 0) covers every user.
    users_at_or_above = len(udf_tmp5[months_bought >= threshold])
    print('Users that have bought in {} different months: {}'.format(threshold, users_at_or_above))

udf_tmp5.head(5)

---

**Se guarda todo en `user-features.csv`**

In [None]:
# Outer-join every partial feature table onto the monthly table by user id.
udf = udf_tmp2
for partial_features in (udf_tmp3, udf_tmp4, udf_tmp5, udf_tmp1):
    udf = udf.merge(partial_features, how='outer', on='person')

# Users absent from a partial table come out as NaN: treat as 0, then store
# every feature as an integer.
udf = udf.fillna(0).astype('int')
display(len(udf))
udf.head(20)

In [None]:
# Confirm that no data was lost along the way: the number of rows in the
# feature table is shown next to the number of distinct users in the raw
# events, and the two are expected to match.

feature_rows = len(udf)
distinct_users = len(df['person'].unique())
display(feature_rows)
display(distinct_users)

In [None]:
# Persist the per-user feature table.
# `udf` keeps the user id in its index (named `person`), so writing it with
# `index=False` as-is would drop the id and leave rows that cannot be joined
# back to their users. Move `person` into a regular column first; if it is
# already a column the frame is written unchanged.
features_out = udf if 'person' in udf.columns else udf.reset_index()
features_out.to_csv('data/user-features.csv', index=False)

In [None]:
# Compare the column names of the feature table against those of the raw events.
final = udf.columns.tolist()
inicio = df.columns.tolist()

# Raw-event columns that do not appear in the feature table.
dif = [columna for columna in inicio if columna not in final]

# Starts at 1 and adds one for every feature column that is also a raw column.
# NOTE(review): the initial 1 presumably accounts for `person`, which lives in
# the index of `udf` rather than in its columns -- confirm.
contador = 1 + sum(1 for columna in final if columna in inicio)
print(len(inicio) == contador)
# Bare expression kept from the original; only the last expression of the
# cell (`final`) is echoed by the notebook.
contador
final