# [75.06 / 95.58] Organización de Datos <br> Trabajo Práctico 2: Machine Learning

# Feature Engineering

**Grupo 30: Datatouille**

**http://fdelmazo.github.io/7506-Datos/**

En este notebook se buscan atributos nuevos para concatenar al set de datos original, para así poder armar un modelo predictivo más robusto y eficiente.

In [None]:
import pandas as pd
import numpy as np

# Raw event log plus the lookup tables that are joined onto it below.
df_events = pd.read_csv('./data/events_up_to_01062018.csv', low_memory=False)
df_sessions = pd.read_csv('./data/sessions.csv', low_memory=False)
df_brands = pd.read_csv('./data/brands.csv')
df_os = pd.read_csv('./data/os.csv')
df_browsers = pd.read_csv('./data/browsers.csv')
df_prices = pd.read_csv('./data/prices.csv')

# Session columns are attached index-to-index.
# NOTE(review): this presumes sessions.csv is row-aligned with the events
# file -- confirm against the notebook that produces it.
df = df_events.merge(df_sessions, how='left', left_index=True, right_index=True)

# The remaining tables are left-joined on their key column, in this order.
lookup_tables = (
    (df_browsers, 'browser_version'),
    (df_os, 'operating_system_version'),
    (df_brands, 'model'),
    (df_prices, 'sku'),
)
for lookup, key in lookup_tables:
    df = df.merge(lookup, how='left', on=key)

In [None]:
# Parse timestamps once so the .dt accessors and date comparisons work.
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['month_number'] = df['timestamp'].dt.month

# Per-event boolean flags.
# The product-view flag compares against 'viewed product' (with a space).
# An earlier assignment compared against 'viewed_product' and was immediately
# overwritten by this one, so only the spaced spelling ever took effect; the
# dead assignment has been removed.
df['is_viewed_product'] = df['event'] == 'viewed product'
df['is_conversion'] = df['event'] == 'conversion'
df['is_checkout'] = df['event'] == 'checkout'

# Flags set on the first row of a session ('session_first') when the session
# also has the given property, so summing them counts sessions, not events.
df['session_checkout_first'] = df['session_first'] & df['session_has_checkout']
df['session_conversion_first'] = df['session_first'] & df['session_has_conversion']
df['session_ad_first'] = df['session_first'] & df['session_ad']

# NOTE(review): the next two columns are computed exactly like
# 'session_ad_first'; their names suggest a checkout/conversion condition was
# intended. Left unchanged because the intended definition is not visible
# here -- confirm or remove.
df['session_ad_checkout_event'] = df['session_first'] & df['session_ad']
df['session_ad_conversion_event'] = df['session_first'] & df['session_ad']

In [None]:
# Show a random sample of five rows with a temporary column-display setting.
# The option key is written out in full: abbreviated keys only resolve while
# they match a single pandas option, so they can break on a pandas upgrade.
# NOTE(review): 0 is kept from the original; pandas documents None as the
# "show all columns" value -- confirm which one is wanted in the notebook.
with pd.option_context('display.max_columns', 0):
    display(df.sample(5))

### Eventos en total

Checkouts y conversiones en total por usuario.

A fines prácticos, en todo este notebook, al hablar de eventos nos referimos solamente a:

* Checkout: Un usuario ingresa al checkout de compra de un producto

* Conversión: Un usuario realiza una conversión comprando un producto.

In [None]:
# Per-user totals over the whole event log.
# Boolean flags are summed and 'event' is counted per person; each result is
# renamed by source column, so the naming does not depend on dict order.
# The original also aggregated 'session_total_conversions' and deleted it
# right after; that aggregate is simply no longer requested.
udf_tmp1 = (
    df.groupby('person')
    .agg({
        'is_viewed_product': 'sum',
        'is_checkout': 'sum',
        'is_conversion': 'sum',
        'event': 'count',
        'session_first': 'sum',
        'session_checkout_first': 'sum',
        'session_conversion_first': 'sum',
        'session_ad': 'sum',
        'session_ad_first': 'sum',
    })
    .rename(columns={
        'is_viewed_product': 'total_viewed_products',
        'is_checkout': 'total_checkouts',
        'is_conversion': 'total_conversions',
        'event': 'total_events',
        'session_first': 'total_sessions',
        'session_checkout_first': 'total_session_checkouts',
        'session_conversion_first': 'total_session_conversions',
        'session_ad': 'total_events_ad_session',
        'session_ad_first': 'total_ad_sessions',
    })
)

# Ratios derived from the totals above.
udf_tmp1['avg_events_per_session'] = udf_tmp1['total_events'] / udf_tmp1['total_sessions']
# Users with no ad sessions divide by zero; those ratios are reported as 0.
udf_tmp1['avg_events_per_ad_session'] = (
    (udf_tmp1['total_events_ad_session'] / udf_tmp1['total_ad_sessions'])
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)
udf_tmp1['percentage_of_ad_sessions'] = udf_tmp1['total_ad_sessions'] / udf_tmp1['total_sessions']

# Whether the user ever reached checkout / ever converted.
udf_tmp1['has_checkout'] = udf_tmp1['total_checkouts'] > 0
udf_tmp1['has_conversion'] = udf_tmp1['total_conversions'] > 0


udf_tmp1.head()

### Eventos por mes

Checkouts y conversiones por usuario por mes

In [None]:
# One row per user seen anywhere in the log. Monthly features are
# outer-merged onto this frame so users inactive in a month keep their row.
udf_tmp2 = df['person'].drop_duplicates().to_frame().set_index('person')

# Source column -> aggregation. 'session_total_conversions' used to be
# aggregated and deleted right after; it is no longer requested.
monthly_aggregations = {
    'is_viewed_product': 'sum',
    'is_checkout': 'sum',
    'is_conversion': 'sum',
    'event': 'count',
    'session_first': 'sum',
    'session_checkout_first': 'sum',
    'session_conversion_first': 'sum',
    'session_ad': 'sum',
    'session_ad_first': 'sum',
}

# Source column -> base feature name; the month suffix is appended below.
monthly_feature_names = {
    'is_viewed_product': 'total_viewed_products',
    'is_checkout': 'total_checkouts',
    'is_conversion': 'total_conversions',
    'event': 'total_events',
    'session_first': 'total_sessions',
    'session_checkout_first': 'total_session_checkouts',
    'session_conversion_first': 'total_session_conversions',
    'session_ad': 'total_events_ad_session',
    'session_ad_first': 'total_ad_sessions',
}

# Months 1 to 5 of the log.
for month in range(1, 6):
    suffix = '_month_{}'.format(month)
    monthly = (
        df[df['month_number'] == month]
        .groupby('person')
        .agg(monthly_aggregations)
        .rename(columns=monthly_feature_names)
        .add_suffix(suffix)
    )

    monthly['has_checkout' + suffix] = monthly['total_checkouts' + suffix] > 0
    monthly['has_conversion' + suffix] = monthly['total_conversions' + suffix] > 0

    udf_tmp2 = udf_tmp2.merge(monthly, how='outer', left_index=True, right_index=True)

# A user missing from a month had no activity in it: every feature becomes 0.
udf_tmp2 = udf_tmp2.fillna(0)

udf_tmp2.head()

### Eventos sin contar mayo

Checkouts y conversiones por usuario sin contar mayo (último mes registrado)

In [None]:
# Per-user totals over every event outside month 5.
# NOTE(review): the filter keeps every month other than 5, so events dated
# after May (if any exist in the log) would also be counted under the
# 'months_1_to_4' names -- confirm the log ends in May.
gb = df[df['month_number'] != 5].groupby('person')

# Columns are renamed by source column rather than by position. The original
# also aggregated 'session_total_conversions' only to delete it afterwards;
# that aggregate is no longer requested.
udf_tmp3 = (
    gb.agg({
        'is_viewed_product': 'sum',
        'is_checkout': 'sum',
        'is_conversion': 'sum',
        'event': 'count',
        'session_first': 'sum',
        'session_checkout_first': 'sum',
        'session_conversion_first': 'sum',
        'session_ad': 'sum',
        'session_ad_first': 'sum',
    })
    .rename(columns={
        'is_viewed_product': 'total_viewed_products_months_1_to_4',
        'is_checkout': 'total_checkouts_months_1_to_4',
        'is_conversion': 'total_conversions_months_1_to_4',
        'event': 'total_events_months_1_to_4',
        'session_first': 'total_sessions_months_1_to_4',
        'session_checkout_first': 'total_session_checkouts_months_1_to_4',
        'session_conversion_first': 'total_session_conversions_months_1_to_4',
        'session_ad': 'total_events_ad_session_months_1_to_4',
        'session_ad_first': 'total_ad_sessions_months_1_to_4',
    })
)

# Whether the user reached checkout / converted in that period.
udf_tmp3['has_checkout_months_1_to_4'] = udf_tmp3['total_checkouts_months_1_to_4'] > 0
udf_tmp3['has_conversion_months_1_to_4'] = udf_tmp3['total_conversions_months_1_to_4'] > 0


udf_tmp3.head()

### Eventos en última semana

Checkouts y conversiones por usuario en la última semana registrada

In [None]:
# Per-user totals restricted to events strictly after 2018-05-23 00:00,
# which this notebook treats as the last recorded week.
gb = df[df['timestamp'] > pd.to_datetime('2018-05-23')].groupby('person')

# Columns are renamed by source column rather than by position. The original
# also aggregated 'session_total_conversions' only to delete it afterwards;
# that aggregate is no longer requested.
udf_tmp4 = (
    gb.agg({
        'is_viewed_product': 'sum',
        'is_checkout': 'sum',
        'is_conversion': 'sum',
        'event': 'count',
        'session_first': 'sum',
        'session_checkout_first': 'sum',
        'session_conversion_first': 'sum',
        'session_ad': 'sum',
        'session_ad_first': 'sum',
    })
    .rename(columns={
        'is_viewed_product': 'total_viewed_products_last_week',
        'is_checkout': 'total_checkouts_last_week',
        'is_conversion': 'total_conversions_last_week',
        'event': 'total_events_last_week',
        'session_first': 'total_sessions_last_week',
        'session_checkout_first': 'total_session_checkouts_last_week',
        'session_conversion_first': 'total_session_conversions_last_week',
        'session_ad': 'total_events_ad_session_last_week',
        'session_ad_first': 'total_ad_sessions_last_week',
    })
)

# Whether the user reached checkout / converted during that week.
udf_tmp4['has_checkout_last_week'] = udf_tmp4['total_checkouts_last_week'] > 0
udf_tmp4['has_conversion_last_week'] = udf_tmp4['total_conversions_last_week'] > 0


# Preview the frame built in this cell (the original displayed udf_tmp1).
udf_tmp4.head()

### Distribución mensual de las conversiones

Cuán esparcidas (en meses) están las conversiones de los usuarios.

In [None]:
# Number of distinct months (out of months 1-5) in which each user converted.
# The monthly flags are cast to int before summing: adding pure boolean
# columns with `+` is a logical OR, which would cap the count at 1.
month_flags = ['has_conversion_month_{}'.format(i) for i in range(1, 6)]
udf_tmp5 = (
    udf_tmp2[month_flags]
    .astype('int')
    .sum(axis=1)
    .to_frame('amount_of_months_that_has_bought')
)

# The counts are cumulative (>= i), so the label says "at least".
for i in range(6):
    users = len(udf_tmp5[udf_tmp5['amount_of_months_that_has_bought'] >= i])
    print('Users that have bought in at least {} different months: {}'.format(i, users))

udf_tmp5.head()

### Información de los últimos eventos registrados por usuario

Información como los días transcurridos desde la última conversión, qué día de la semana ocurrió y demás.

(En esta celda por evento nos referimos a cualquier tipo de evento)

In [None]:
def _last_timestamp_per_user(events, column_name):
    """Return a one-column frame, indexed by person, with the latest
    timestamp found in `events`, stored under `column_name`."""
    latest = events.groupby('person').agg({'timestamp': 'max'})
    latest.columns = [column_name]
    return latest


# Latest timestamp per user: over all events, then per event type.
df_event = _last_timestamp_per_user(df, 'timestamp_last_event')
df_checkout = _last_timestamp_per_user(df[df['event'] == 'checkout'],
                                       'timestamp_last_checkout')
df_conversion = _last_timestamp_per_user(df[df['event'] == 'conversion'],
                                         'timestamp_last_conversion')
# The label is 'viewed product' (with a space). The original filtered on
# 'viewed_product', unlike the other product-view filters in this notebook.
# NOTE(review): if the log only uses the spaced label, that filter matched
# nothing and every user got the placeholder date below -- confirm.
df_viewed_product = _last_timestamp_per_user(df[df['event'] == 'viewed product'],
                                             'timestamp_last_viewed_product')

# Outer-join everything on the user id so every user keeps a row.
df_timelapse = df_event.merge(df_checkout, how='outer', on='person')
df_timelapse = df_timelapse.merge(df_conversion, how='outer', on='person')
df_timelapse = df_timelapse.merge(df_viewed_product, how='outer', on='person')

# Arbitrary placeholder date for users who never had a
# checkout/conversion/viewed product event.
df_timelapse = df_timelapse.fillna(pd.to_datetime('2018-01-01'))

df_timelapse.head()

In [None]:
# Work on an explicit copy so the new columns never touch df_timelapse and
# pandas does not warn about assigning into a slice.
udf_tmp6 = df_timelapse.copy()

# All gaps are day-of-year differences.
# NOTE(review): this is only meaningful while every timestamp falls in the
# same calendar year as the 2018-06-01 reference -- confirm.
reference_doy = pd.to_datetime('2018-06-01').dayofyear
last_event_doy = df_timelapse['timestamp_last_event'].dt.dayofyear

# Days from the user's last event to the reference date.
udf_tmp6['days_to_last_event'] = reference_doy - last_event_doy

# Days from the last event of each type to the user's last event of any type.
for kind in ('checkout', 'conversion', 'viewed_product'):
    last_of_kind_doy = df_timelapse['timestamp_last_{}'.format(kind)].dt.dayofyear
    udf_tmp6['days_to_last_{}'.format(kind)] = last_event_doy - last_of_kind_doy

udf_tmp6.head()

In [None]:
# Calendar breakdown of each "last ..." timestamp: day of year (doy), day of
# week (dow), day of month (dom) and week of year (woy).
for kind in ('event', 'checkout', 'conversion', 'viewed_product'):
    last_seen = udf_tmp6['timestamp_last_{}'.format(kind)]

    udf_tmp6['doy_last_{}'.format(kind)] = last_seen.dt.dayofyear
    udf_tmp6['dow_last_{}'.format(kind)] = last_seen.dt.dayofweek
    udf_tmp6['dom_last_{}'.format(kind)] = last_seen.dt.day

    # `.dt.weekofyear` is deprecated and removed in pandas 2; prefer the ISO
    # calendar accessor when it exists and fall back on older releases.
    if hasattr(last_seen.dt, 'isocalendar'):
        week_of_year = last_seen.dt.isocalendar().week.astype('int64')
    else:
        week_of_year = last_seen.dt.weekofyear
    udf_tmp6['woy_last_{}'.format(kind)] = week_of_year


udf_tmp6.head()

### Precio de la última conversión realizada por el usuario

In [None]:
# Preview the price lookup table loaded from prices.csv.
df_prices.head()

In [None]:
# SKU and price of each user's most recent conversion.
# The original aggregated 'sku' with 'max', which returns the largest SKU a
# user ever converted on, not the SKU of the latest conversion. Here the
# conversions are ordered by time and the last row per user is kept.
last_conversions = (
    df[df['event'] == 'conversion']
    .dropna(subset=['sku'])                      # as before, rows without a SKU are ignored
    .sort_values('timestamp', kind='mergesort')  # stable sort: ties keep log order
    .drop_duplicates(subset='person', keep='last')
)

# Attach the price; conversions whose SKU has no price entry are dropped.
udf_tmp7 = last_conversions[['person', 'sku']].merge(df_prices, how='inner', on='sku')
udf_tmp7['sku'] = udf_tmp7['sku'].astype('int')
udf_tmp7 = udf_tmp7.rename(columns={'precio_reales': 'last_conversion_price',
                                    'sku': 'last_conversion_sku'})

udf_tmp7 = udf_tmp7.set_index('person')
udf_tmp7['last_conversion_price'] = udf_tmp7['last_conversion_price'].astype('float')
udf_tmp7.head()

### Porcentaje de la actividad de la última semana
Cantidad de eventos de la última semana sobre el total de eventos del usuario

In [None]:
# Event counts per user: over the whole log, and strictly after
# 2018-05-23 00:00 (the window this notebook uses as the last week).
events_per_user = df.groupby('person')['event'].count()
df_last_week = df[df['timestamp'] > pd.to_datetime('2018-05-23')]
events_last_week = df_last_week.groupby('person')['event'].count()

# Users with no activity in that window count 0 events, so their share is 0.
events_last_week = events_last_week.reindex(events_per_user.index).fillna(0)

# Only the ratio is kept as a feature.
share_last_week = events_last_week / events_per_user
udf_tmp8 = share_last_week.to_frame('percentage_last_week_activity')

udf_tmp8.head()

### Porcentaje de la actividad del último mes
Cantidad de eventos del último mes sobre el total

In [None]:
# Share of each user's events that happened from 2018-05-01 onwards.
udf_tmp9 = df.groupby('person').agg({'event': 'count'})
udf_tmp9 = udf_tmp9.rename(columns={'event': 'total_events'})

# `>=` so events stamped exactly at 2018-05-01 00:00:00 count as May too;
# the original strict `>` left them out of the month.
df_last_month = df[df['timestamp'] >= pd.to_datetime('2018-05-01')]
df_last_month = df_last_month.groupby('person').agg({'event': 'count'})
df_last_month = df_last_month.rename(columns={'event': 'events_last_month'})

udf_tmp9 = udf_tmp9.merge(df_last_month, how='outer', on='person')
# No activity in the last month means a 0 share.
udf_tmp9 = udf_tmp9.fillna(0)
udf_tmp9['percentage_last_month_activity'] = (
    udf_tmp9['events_last_month'] / udf_tmp9['total_events']
)

# Only the ratio is kept as a feature.
udf_tmp9 = udf_tmp9.drop(columns=['total_events', 'events_last_month'])

udf_tmp9.head()

### Días entre el último checkout y última actividad
Se guardan los días entre el último evento del usuario y el último checkout

In [None]:
# Latest checkout timestamp per user.
df_last_checkout = (
    df[df['event'] == 'checkout']
    .groupby('person')
    .agg({'timestamp': 'max'})
    .rename(columns={'timestamp': 'timestamp_last_checkout'})
)
# Latest timestamp of any event per user.
df_last_event = (
    df.groupby('person')
    .agg({'timestamp': 'max'})
    .rename(columns={'timestamp': 'timestamp_last_event'})
)

udf_tmp10 = df_last_checkout.merge(df_last_event, how='outer', on='person')

# Day-of-year difference between the last event and the last checkout.
day_gap = (udf_tmp10['timestamp_last_event'].dt.dayofyear
           - udf_tmp10['timestamp_last_checkout'].dt.dayofyear)

# Users who never reached checkout get the cap of 180 days. Only the feature
# column is filled: the original filled the whole frame, which also wrote
# the integer 180 into the datetime columns before they were dropped.
udf_tmp10['days_between_last_event_and_checkout'] = day_gap.fillna(180).astype('int')
udf_tmp10 = udf_tmp10.drop(columns=['timestamp_last_event', 'timestamp_last_checkout'])

udf_tmp10.head()

### Estados de celulares

Partiendo de la idea de que hay empresas que compran celulares en mal estado y que, en una primera instancia, (obviamente) los miran, se agrega una columna que indica el porcentaje de celulares vistos en estado «Bom» o «Bom - Sem Touch ID» sobre el total de celulares vistos por el usuario.

In [None]:
# Product-view events only.
df_viewed_product = df[df['event'] == 'viewed product']

# Views whose condition is one of the two "regular" states.
regular_states = ['Bom', 'Bom - Sem Touch ID']
is_regular = df_viewed_product['condition'].isin(regular_states)

# Per-user counts of non-null conditions: regular ones and all of them.
regular_per_user = df_viewed_product[is_regular].groupby('person')['condition'].count()
total_per_user = df_viewed_product.groupby('person')['condition'].count()

# Users with no regular views stay NaN, exactly as the outer merge left them.
regular_share = regular_per_user.reindex(total_per_user.index) / total_per_user
udf_tmp11 = regular_share.to_frame('percentage_regular_celphones_activity')

udf_tmp11.head()

## Varianza logarítmica de productos vistos

In [None]:
import numpy as np

# Variance of the prices each user viewed, on a log scale.
viewed_prices = df[df['event'] == 'viewed product'].groupby('person')['precio_reales']
log_variance = np.log(viewed_prices.var())

# log(0) yields -inf for users whose viewed prices never vary; report 0 there.
log_variance = log_variance.replace([np.inf, -np.inf], 0)

udf_tmp12 = log_variance.to_frame('var_viewed')
udf_tmp12.head()

## Compró más de la media

In [None]:
# Mean price over every conversion event (missing prices are skipped by both
# the sum and the count).
conversions = df[df['event'] == 'conversion']
conversion_prices = conversions['precio_reales']
media = conversion_prices.sum() / conversion_prices.count()

# 1 when the user's most expensive conversion is above that mean, else 0.
max_price_per_user = conversions.groupby('person')['precio_reales'].max()
above_mean = (max_price_per_user > media).astype('int')
udf_tmp13 = above_mean.to_frame('conversion_gt_media')

udf_tmp13.head()

----

**Se guarda todo en `user-features.csv`**

In [None]:
# Outer-join every per-user feature frame on the user id, in order.
udf = udf_tmp1
for features in (udf_tmp2, udf_tmp3, udf_tmp4, udf_tmp5, udf_tmp6, udf_tmp7,
                 udf_tmp8, udf_tmp9, udf_tmp10, udf_tmp11, udf_tmp12, udf_tmp13):
    udf = udf.merge(features, how='outer', on='person')

# A user missing from a feature frame gets 0 for that feature.
udf = udf.fillna(0)

# Cast to int only where nothing is lost. The original cast the whole frame,
# truncating every fractional feature (percentages, averages, log-variance,
# prices) to an integer. Booleans, timestamps and whole-valued float columns
# are still converted exactly as before.
fractional_cols = {
    col for col in udf.select_dtypes(include='float').columns
    if (udf[col] % 1 != 0).any()
}
udf = udf.astype({col: 'int' for col in udf.columns if col not in fractional_cols})

udf.head()

In [None]:
# Confirmamos no haber perdido datos en el medio

# The feature table must have exactly one row per distinct user in the log.
expected_users = len(df['person'].unique())
assert len(udf) == expected_users
display(len(udf))

In [None]:
# Move the index into a regular column, then write the table without the
# (now meaningless) positional index.
features_out = udf.reset_index()
features_out.to_csv('data/user-features.csv', index=False)