In [None]:
import pandas as pd
import numpy as np
import calendar



In [None]:
# Raw inputs: the event log, the session table and three lookup tables.
df_events = pd.read_csv('./data/events_up_to_01062018.csv', low_memory=False)
df_sessions = pd.read_csv('./data/sessions.csv', low_memory=False)
df_brands = pd.read_csv('data/brands.csv')
df_os = pd.read_csv('data/os.csv')
df_browsers = pd.read_csv('data/browsers.csv')

# Build one wide frame with a chain of left joins, so every event row is kept.
# NOTE(review): events and sessions are aligned by row index rather than by a
# key column -- confirm both files share the same row order.
df = (
    df_events
    .merge(df_sessions, how='left', left_index=True, right_index=True)
    .merge(df_browsers, how='left', on='browser_version')
    .merge(df_os, how='left', on='operating_system_version')
    .merge(df_brands, how='left', on='model')
)

In [None]:
# Attributes with few distinct values are converted to categoricals to save memory.
for column in (
    'event', 'condition', 'storage', 'search_engine', 'channel', 'device_type',
    'brand', 'operating_system', 'browser',
):
    df[column] = df[column].astype('category')

# Time is better handled as a real datetime than as text.
df['timestamp'] = pd.to_datetime(df['timestamp'])

In [None]:
# Calendar features derived from the event timestamp.
df['month_number'] = df['timestamp'].dt.month
df['month_name'] = df['month_number'].apply(lambda x: calendar.month_abbr[x])
df['week_day'] = df['timestamp'].dt.weekday
# `.dt.week` was deprecated in pandas 1.1 and removed in 2.0. `isocalendar()`
# (pandas >= 1.1) is the documented replacement; it yields a nullable UInt32
# column, cast back to the plain integer dtype the old accessor produced.
df['week_number'] = df['timestamp'].dt.isocalendar().week.astype('int64')
# `.dt.weekday_name` was removed in pandas 1.0; `day_name()` replaces it.
df['week_day_name'] = df['timestamp'].dt.day_name()
df['day_date'] = df['timestamp'].dt.to_period('D')
df['day_dom'] = df['timestamp'].dt.day
df['day_doy'] = df['timestamp'].dt.dayofyear
df['hour_count'] = df['timestamp'].dt.hour

In [None]:
# sort_values returns a new frame; without the assignment the sorted result
# was discarded and `df` kept its original row order.
df = df.sort_values(['person', 'timestamp'])
# Show a bounded preview instead of rendering the whole frame.
df.head()

In [None]:
# Row-level boolean flags: True where the event is a conversion / a checkout.
df['is_conversion'] = df['event'].eq('conversion')
df['is_checkout'] = df['event'].eq('checkout')

### Users DataFrame

In [None]:
# One row per person: how many conversion and checkout events they have.
udf = (
    df.groupby('person')
    .agg({'is_conversion': 'sum', 'is_checkout': 'sum'})
    .rename(columns={
        'is_conversion': 'total_conversions',
        'is_checkout': 'total_checkouts',
    })
    .astype({'total_conversions': 'int', 'total_checkouts': 'int'})
)

# Convenience flags: did the person convert / check out at least once?
udf['has_conversion'] = udf['total_conversions'].gt(0)
udf['has_checkout'] = udf['total_checkouts'].gt(0)

In [None]:
# People with more than 10 conversions, highest count first.
udf.loc[udf['total_conversions'].gt(10)].sort_values(by='total_conversions', ascending=False)

In [None]:
# Per-person conversion and checkout counts restricted to events in May.
# NOTE(review): the filter checks only the month number, not the year --
# confirm the data covers a single year.
gb = df.loc[df['month_number'].eq(5)].groupby('person')
udf_tmp = (
    gb.agg({'is_conversion': 'sum', 'is_checkout': 'sum'})
    .rename(columns={
        'is_conversion': 'total_conversions_month_5',
        'is_checkout': 'total_checkouts_month_5',
    })
    .astype({'total_conversions_month_5': 'int', 'total_checkouts_month_5': 'int'})
)

# Flags: at least one conversion / checkout during May.
udf_tmp['has_conversion_month_5'] = udf_tmp['total_conversions_month_5'].gt(0)
udf_tmp['has_checkout_month_5'] = udf_tmp['total_checkouts_month_5'].gt(0)

In [None]:
# Preview the first ten rows of the May aggregates.
udf_tmp.head(n=10)

In [None]:
# Attach the May aggregates to the per-person table.
# A left merge keeps every person: the default inner merge silently dropped
# everyone who had no events in May.
udf = udf.merge(udf_tmp, how='left', on='person')

# People absent from the May table come back as NaN: their counts are 0, and
# each flag is recomputed from its count using the same "count > 0" rule that
# defined it.
for count_col, flag_col in (
    ('total_conversions_month_5', 'has_conversion_month_5'),
    ('total_checkouts_month_5', 'has_checkout_month_5'),
):
    udf[count_col] = udf[count_col].fillna(0).astype('int')
    udf[flag_col] = udf[count_col] > 0

<Siguiente minado>