# [75.06 / 95.58] Organización de Datos <br> Trabajo Práctico 2: Machine Learning

# Feature Engineering

**Grupo 30: Datatouille**

**http://fdelmazo.github.io/7506-Datos/**

En este notebook se buscan atributos nuevos para concatenar al set de datos original, así pudiendo armar un modelo predictivo más robusto y eficiente.

In [None]:
import pandas as pd

# Raw inputs: the event log, the per-session table and three lookup tables.
df_events = pd.read_csv('./data/events_up_to_01062018.csv', low_memory=False)
df_sessions = pd.read_csv('./data/sessions.csv', low_memory=False)
df_browsers = pd.read_csv('./data/browsers.csv')
df_os = pd.read_csv('./data/os.csv')
df_brands = pd.read_csv('./data/brands.csv')

# Sessions are joined row index to row index; each lookup table is then
# left-joined on its own key column, in the order browsers -> os -> brands.
df = df_events.merge(df_sessions, how='left', left_index=True, right_index=True)
for lookup, key in ((df_browsers, 'browser_version'),
                    (df_os, 'operating_system_version'),
                    (df_brands, 'model')):
    df = df.merge(lookup, how='left', on=key)

In [None]:
# Parse the event timestamps once and keep the calendar month as a column.
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['month_number'] = df['timestamp'].dt.month

# One boolean flag column per event type of interest.
for flag, event_name in (('is_conversion', 'conversion'),
                         ('is_checkout', 'checkout'),
                         ('is_viewed_product', 'viewed product')):
    df[flag] = df['event'] == event_name

In [None]:
# Preview the merged frame with the column display option temporarily overridden.
with pd.option_context('display.max_column', 0):
    preview = df.head()
    display(preview)

### Eventos en total

Checkouts y conversiones en total por usuario.

A fines prácticos, en todo este notebook, cuando hablamos de «eventos» nos referimos solamente a:

* Checkout: Un usuario ingresa al checkout de compra de un producto

* Conversión: Un usuario realiza una conversión comprando un producto.

In [None]:
# Per-user totals over the whole dataset.
#
# Each source column of `df` is mapped explicitly to the feature it produces.
# The previous version listed 'session_total_conversions' twice in the agg
# dict and renamed the result positionally, so the session checkout and
# session conversion totals ended up under each other's names.
total_features = {
    'is_checkout': 'total_checkouts',
    'is_conversion': 'total_conversions',
    'session_total_events': 'total_events',
    'session_first': 'total_sessions',
    'session_total_checkouts': 'total_session_checkouts',
    'session_total_conversions': 'total_session_conversions',
}

udf_tmp1 = df.groupby('person').agg({source: 'sum' for source in total_features})

# Select the columns in mapping order before renaming, so the labels cannot
# drift from the data even if the aggregation returns a different order.
udf_tmp1 = udf_tmp1[list(total_features)]
udf_tmp1.columns = list(total_features.values())

for column in total_features.values():
    udf_tmp1[column] = udf_tmp1[column].astype('int')

udf_tmp1['avg_events_per_session'] = udf_tmp1['total_events'] / udf_tmp1['total_sessions']

udf_tmp1['has_checkout'] = udf_tmp1['total_checkouts'] > 0
udf_tmp1['has_conversion'] = udf_tmp1['total_conversions'] > 0

# Turns the boolean flags into 0/1.
# NOTE(review): this cast also truncates 'avg_events_per_session' to an
# integer; kept as-is to preserve the output schema -- confirm it is intended.
udf_tmp1 = udf_tmp1.astype('int')

udf_tmp1.head()

### Eventos por mes

Checkouts y conversiones por usuario por mes

In [None]:
# Per-user checkout/conversion counts for each of months 1..5, plus 0/1
# "has_*" flags. Starts from an empty frame indexed by every distinct person
# so that users with no activity in a given month still get a row.
udf_tmp2 = df['person'].drop_duplicates().to_frame().set_index('person')

for month in range(1, 6):
    conversions_col = 'total_conversions_month_{}'.format(month)
    checkouts_col = 'total_checkouts_month_{}'.format(month)
    has_checkout_col = 'has_checkout_month_{}'.format(month)
    has_conversion_col = 'has_conversion_month_{}'.format(month)

    monthly = (df[df['month_number'] == month]
               .groupby('person')
               .agg({'is_conversion': 'sum', 'is_checkout': 'sum'}))
    monthly.columns = [conversions_col, checkouts_col]

    monthly[checkouts_col] = monthly[checkouts_col].astype('int')
    monthly[conversions_col] = monthly[conversions_col].astype('int')

    monthly[has_checkout_col] = monthly[checkouts_col] > 0
    monthly[has_conversion_col] = monthly[conversions_col] > 0

    udf_tmp2 = udf_tmp2.merge(monthly, how='outer', left_index=True, right_index=True)

# Users absent from a month come out of the outer merge as NaN; count them as 0.
udf_tmp2 = udf_tmp2.fillna(0).astype('int')

udf_tmp2.head()

### Eventos sin contar mayo

Checkouts y conversiones por usuario sin contar mayo (último mes registrado)

In [None]:
# Per-user checkout/conversion counts over every month except month 5.
events_before_may = df[df['month_number'] != 5]
udf_tmp3 = (events_before_may
            .groupby('person')
            .agg({'is_conversion': 'sum', 'is_checkout': 'sum'}))
udf_tmp3.columns = ['total_conversions_months_1_to_4', 'total_checkouts_months_1_to_4']

for total_col in ('total_checkouts_months_1_to_4', 'total_conversions_months_1_to_4'):
    udf_tmp3[total_col] = udf_tmp3[total_col].astype('int')

udf_tmp3['has_checkout_months_1_to_4'] = udf_tmp3['total_checkouts_months_1_to_4'] > 0
udf_tmp3['has_conversion_months_1_to_4'] = udf_tmp3['total_conversions_months_1_to_4'] > 0

# Booleans become 0/1.
udf_tmp3 = udf_tmp3.astype('int')

udf_tmp3.head()

### Eventos en última semana

Checkouts y conversiones por usuario en la última semana registrada

In [None]:
# Per-user checkout/conversion counts restricted to events with a timestamp
# strictly after 2018-05-23 00:00.
last_week_events = df[df['timestamp'] > pd.to_datetime('2018-05-23')]
udf_tmp4 = (last_week_events
            .groupby('person')
            .agg({'is_conversion': 'sum', 'is_checkout': 'sum'}))
udf_tmp4.columns = ['total_conversions_last_week', 'total_checkouts_last_week']

udf_tmp4['total_checkouts_last_week'] = udf_tmp4['total_checkouts_last_week'].astype('int')
udf_tmp4['total_conversions_last_week'] = udf_tmp4['total_conversions_last_week'].astype('int')

udf_tmp4['has_checkout_last_week'] = udf_tmp4['total_checkouts_last_week'] > 0
udf_tmp4['has_conversion_last_week'] = udf_tmp4['total_conversions_last_week'] > 0

# Booleans become 0/1.
udf_tmp4 = udf_tmp4.astype('int')

# Preview the frame built in this cell (the cell used to preview udf_tmp1).
udf_tmp4.head()

### Distribución mensual de las conversiones

Cuán esparcidas (en meses) están las conversiones de los usuarios.

In [None]:
# Number of distinct months (out of months 1..5) in which each user converted,
# obtained by adding up the per-month 0/1 conversion flags of udf_tmp2.
monthly_flags = ['has_conversion_month_{}'.format(month) for month in range(1, 6)]
udf_tmp5 = udf_tmp2[monthly_flags].sum(axis=1).to_frame()
udf_tmp5.columns = ['amount_of_months_that_has_bought']

# The printed counts are cumulative: each line counts users whose number of
# buying months is greater than or equal to the threshold.
months_bought = udf_tmp5['amount_of_months_that_has_bought']
for threshold in range(6):
    matching_users = len(udf_tmp5[months_bought >= threshold])
    print('Users that have bought in {} different months: {}'.format(threshold, matching_users))

udf_tmp5.head()

### Información de los últimos eventos registrados por usuario

Información como los días hasta la última conversión, qué día de la semana fue esta, y demás.

(En esta celda por evento nos referimos a cualquier tipo de evento)

In [None]:
# Latest timestamp per user: for any event, and for each event type of interest.
df_event = df.groupby('person').agg({'timestamp': 'max'})
df_event.columns = ['timestamp_last_event']

df_checkout = df[df['event'] == 'checkout']
df_checkout = df_checkout.groupby('person').agg({'timestamp': 'max'})
df_checkout.columns = ['timestamp_last_checkout']

df_conversion = df[df['event'] == 'conversion']
df_conversion = df_conversion.groupby('person').agg({'timestamp': 'max'})
df_conversion.columns = ['timestamp_last_conversion']

# The event label is 'viewed product' (with a space), the same literal used to
# build the 'is_viewed_product' flag. The previous 'viewed_product' filter did
# not match that label, which left every user with the placeholder date below.
df_viewed_product = df[df['event'] == 'viewed product']
df_viewed_product = df_viewed_product.groupby('person').agg({'timestamp': 'max'})
df_viewed_product.columns = ['timestamp_last_viewed_product']

df_timelapse = df_event.merge(df_checkout, how='outer', on='person')
df_timelapse = df_timelapse.merge(df_conversion, how='outer', on='person')
df_timelapse = df_timelapse.merge(df_viewed_product, how='outer', on='person')

# Arbitrary placeholder date for users who never had a
# checkout / conversion / viewed-product event.
df_timelapse = df_timelapse.fillna(pd.to_datetime('2018-01-01'))

df_timelapse.head()

In [None]:
# Work on an explicit copy so the new feature columns are never written into
# (or flagged as writes on a view of) df_timelapse.
udf_tmp6 = df_timelapse.copy()

last_event_doy = df_timelapse['timestamp_last_event'].dt.dayofyear

# Days between the user's last event and the 2018-06-01 reference date.
# All gaps here are day-of-year differences, so they are only meaningful
# while every timestamp falls in the same calendar year.
udf_tmp6['days_to_last_event'] = pd.to_datetime('2018-06-01').dayofyear - last_event_doy

# Days between the user's last event (of any type) and their last event of
# each specific type. Users without such an event carry the placeholder
# timestamp that df_timelapse was filled with.
for kind in ('checkout', 'conversion', 'viewed_product'):
    last_kind_doy = df_timelapse['timestamp_last_{}'.format(kind)].dt.dayofyear
    udf_tmp6['days_to_last_{}'.format(kind)] = last_event_doy - last_kind_doy

udf_tmp6.head()

In [None]:
# Calendar parts of each "last ..." timestamp: day of year, day of week,
# day of month and week of year.
for kind in ('event', 'checkout', 'conversion', 'viewed_product'):
    last_ts = udf_tmp6['timestamp_last_{}'.format(kind)]

    udf_tmp6['doy_last_{}'.format(kind)] = last_ts.dt.dayofyear
    udf_tmp6['dow_last_{}'.format(kind)] = last_ts.dt.dayofweek
    udf_tmp6['dom_last_{}'.format(kind)] = last_ts.dt.day

    # `Series.dt.weekofyear` no longer exists in pandas 2.x; newer releases
    # expose the ISO week through `isocalendar()`. Fall back to the legacy
    # accessor on pandas versions that predate `isocalendar()`.
    if hasattr(last_ts.dt, 'isocalendar'):
        week_of_year = last_ts.dt.isocalendar().week.astype('int64')
    else:
        week_of_year = last_ts.dt.weekofyear
    udf_tmp6['woy_last_{}'.format(kind)] = week_of_year


udf_tmp6.head()

---

**Se guarda todo en `user-features.csv`**

In [None]:
# Combine every per-user feature frame into one table keyed by 'person'.
udf = udf_tmp1
for feature_frame in (udf_tmp2, udf_tmp3, udf_tmp4, udf_tmp5, udf_tmp6):
    udf = udf.merge(feature_frame, how='outer', on='person')

# Users missing from a partial frame get 0, then every column is cast to int.
udf = udf.fillna(0).astype('int')

udf.head()

In [None]:
# Confirm that no user was lost along the way: the feature table must have
# exactly one row per distinct person in the event data.

n_feature_rows = len(udf)
assert(n_feature_rows == len(df['person'].unique()))
display(n_feature_rows)

In [None]:
# Move 'person' back from the index into a regular column and write the table.
udf_with_person = udf.reset_index()
udf_with_person.to_csv('data/user-features.csv', index=False)