# [75.06 / 95.58] Organización de Datos <br> Trabajo Práctico 2: Machine Learning

# Investigación Previa

**Grupo 30: Datatouille**

**http://fdelmazo.github.io/7506-Datos/**

Este notebook pretende, con mucha ayuda del trabajo realizado en el [TP1](https://fdelmazo.github.io/7506-Datos/TP1/TP1.html), encontrar y recopilar información a utilizar en el resto del trabajo.

In [None]:
import pandas as pd
import numpy as np
from ggplot import * # pip install ggplot # https://stackoverflow.com/a/50607072/10728610
import time
from sklearn.manifold import TSNE
import sklearn.cluster as cluster
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# The TP1 events ship zipped; uncomment the next line to extract them.
# !unzip -q ../TP1/data/events.zip -d ../TP1/data
df_tp1, df_tp2, df_labels = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        '../TP1/data/events.csv',
        './data/events_up_to_01062018.csv',
        './data/labels_training_set.csv',
    )
)

## Comparación TP1 y TP2

In [None]:
def labelize(df):
    df_tmp = df
    df_tmp['timestamp'] = pd.to_datetime(df_tmp['timestamp'])
    
    df_tmp['is_useful_conversion'] = (df_tmp['event'] == 'conversion') \
        & (df_tmp['timestamp'] > pd.to_datetime('2018-06-01'))
    
    df_tmp = df_tmp[['person', 'is_useful_conversion']]
    gb = df_tmp.groupby('person')
    df_tmp = gb.sum()
    df_tmp.columns = ['label']
    df_tmp['label'] = df_tmp['label'].astype('bool')
    df_tmp['label'] = df_tmp['label'].astype('int')
    
    return df_tmp

In [None]:
# Label every TP1 user and report how the two classes are split.
df_tp1_labels = labelize(df_tp1)
tp1_label = df_tp1_labels['label']

display(df_tp1_labels.head())
display(f"Usuarios en TP1: {len(df_tp1_labels)}")
display(f"Usuarios que no convirtieron: {len(df_tp1_labels[tp1_label == 0])}")
display(f"Usuarios que convirtieron: {len(df_tp1_labels[tp1_label == 1])}")

In [None]:
# Label every TP2 user and report how the two classes are split.
df_tp2_labels = labelize(df_tp2)
tp2_label = df_tp2_labels['label']

display(df_tp2_labels.head())
display(f"Usuarios en TP2: {len(df_tp2_labels)}")
display(f"Usuarios que no convirtieron: {len(df_tp2_labels[tp2_label == 0])}")
display(f"Usuarios que convirtieron: {len(df_tp2_labels[tp2_label == 1])}")

In [None]:
# Count TP1 users overall, those with activity after 2018-06-01, and those
# whose post-2018-06-01 activity falls before 2018-06-16.
df_tp1['timestamp'] = pd.to_datetime(df_tp1['timestamp'])
june_start = pd.to_datetime('2018-06-01')
june_16th = pd.to_datetime('2018-06-16')

df_tmp = df_tp1.loc[df_tp1['timestamp'] > june_start]
df_first_half = df_tmp[df_tmp['timestamp'] < june_16th]

for frame in (df_tp1, df_tmp, df_first_half):
    display(len(frame['person'].unique()))

In [None]:
# Class balance of the provided training labels.
training_label = df_labels['label']
display(f"Usuarios en set de entrenamiento: {len(df_labels)}")
display(f"Usuarios que no convirtieron: {len(df_labels[training_label == 0])}")
display(f"Usuarios que convirtieron: {len(df_labels[training_label == 1])}")

**Información Recopilada**

1. No se repiten usuarios entre los datasets.
2. En el **primer** dataset (TP1) hay 27624 usuarios.
3. De los cuales 13967 tuvieron actividad en junio.
4. Entre el 1 y el 15 (inclusive) de junio, 82 usuarios compraron productos.
5. En el **segundo** dataset (TP2) hay 19414 usuarios.
6. De los cuales 980 compraron en junio.

**Conclusión**

Hacer un merge de los datos del TP1 con los del TP2 introduciría un sesgo (*skewness*) en el set de datos: los usuarios del TP1 que compraron en junio (82) son despreciables frente a los del TP2 (980). Es mejor no hacerlo.

## Visualización con reducción de dimensiones

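Usamos t-SNE (*t-distributed Stochastic Neighbor Embedding*) para proyectar los features de cada usuario a dos dimensiones.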

In [None]:
# Load the per-user feature matrix and the training labels, both indexed by person.
df_users = pd.read_csv('data/user-features.csv',low_memory=False).set_index('person')
df_labels = pd.read_csv('data/labels_training_set.csv').groupby('person').sum()

# The inner merge only filters df_users down to the labelled persons; the
# label column is dropped again so it is not part of the matrix fed to t-SNE.
df_users = df_labels.merge(df_users, how='inner', on='person')
del df_users['label']

# Show the frame just built. The previous `display(df)` referenced a name that
# is not assigned until a later cell and failed on a top-to-bottom run.
display(df_users)

In [None]:
# Project the user feature matrix onto two dimensions with t-SNE and time the fit.
time_start = time.time()
tsne = TSNE(n_components=2, verbose=1, perplexity=3, n_iter=300)
tsne_results = tsne.fit_transform(df_users.values)
elapsed_seconds = time.time() - time_start

print(f't-SNE done! Time elapsed: {elapsed_seconds} seconds')

In [None]:
# Re-attach the labels to the features and add the two t-SNE coordinates.
df = df_users.merge(df_labels, how='inner', on='person')
df_tsne = df.assign(**{
    'x-tsne': tsne_results[:, 0],
    'y-tsne': tsne_results[:, 1],
})

# Scatter of the embedding, coloured by the training label.
chart = (
    ggplot(df_tsne, aes(x='x-tsne', y='y-tsne', color='label'))
    + geom_point(size=70, alpha=0.1)
    + ggtitle("tSNE dimensions colored by has_conversion")
)
chart

##### Ahora ploteamos con K-means

In [None]:
# Plot styling shared by the clustering scatter plots below.
sns.set_context('poster')
sns.set_color_codes()
plot_kwds = dict(alpha=0.25, s=80, linewidths=0)

In [None]:
# Copy of the t-SNE frame with the coordinates exposed under the short names
# 'x' and 'y', which plot_clusters reads.
data = df_tsne.assign(x=df_tsne['x-tsne'], y=df_tsne['y-tsne'])

# Plain scatter of the embedding with both axes hidden.
plt.scatter(data['x'], data['y'], c='b', **plot_kwds)
frame = plt.gca()
for axis in (frame.axes.get_xaxis(), frame.axes.get_yaxis()):
    axis.set_visible(False)

In [None]:
def plot_clusters(data, algorithm, args, kwds, plot_kwds):
    """Cluster ``data`` with ``algorithm`` and scatter-plot the outcome.

    ``algorithm`` is instantiated as ``algorithm(*args, **kwds)`` and its
    ``fit_predict`` is run on the whole of ``data``. The points are drawn at
    ``data['x']`` / ``data['y']`` (extra scatter options come from
    ``plot_kwds``), coloured per cluster; negative labels are painted black.
    Both axes are hidden, the title names the algorithm and an annotation
    reports the elapsed time.

    Returns:
        The label array produced by ``fit_predict``.
    """
    black = (0.0, 0.0, 0.0)

    # Timing covers both building the estimator and fitting it.
    started = time.time()
    labels = algorithm(*args, **kwds).fit_predict(data)
    elapsed = time.time() - started

    n_colors = np.unique(labels).max() + 1
    palette = sns.color_palette('deep', n_colors)
    colors = []
    for cluster_id in labels:
        colors.append(palette[cluster_id] if cluster_id >= 0 else black)

    plt.scatter(data['x'], data['y'], c=colors, **plot_kwds)
    current_axes = plt.gca()
    for axis in (current_axes.axes.get_xaxis(), current_axes.axes.get_yaxis()):
        axis.set_visible(False)
    plt.title('Clusters found by {}'.format(str(algorithm.__name__)), fontsize=24)
    plt.text(-0.5, 0.7, 'Clustering took {:.2f} s'.format(elapsed), fontsize=14)
    return labels

In [None]:
# Cluster with K-means (6 clusters) and draw the result on the t-SNE plane.
# NOTE(review): `data` is passed whole, so fit_predict clusters on every column
# of the frame (user features, `label`, t-SNE coordinates), not only x/y —
# confirm this is intended.
labels = plot_clusters(data, cluster.KMeans, (), {'n_clusters':6}, plot_kwds)

In [None]:
# Show the cluster id assigned to each row and store it as a new column.
# NOTE(review): assumes `labels` is row-aligned with df_users — confirm.
display(labels)
df_users['k_mean'] = labels



##### Ahora ploteamos con HDBScan

In [None]:
# Same plot using HDBSCAN (minimum cluster size 22); plot_clusters paints
# points with a negative label in black.
# NOTE(review): as in the K-means cell, the whole `data` frame is clustered.
import hdbscan
plot_clusters(data, hdbscan.HDBSCAN, (), {'min_cluster_size':22}, plot_kwds)

## Análisis comportamiento antes de compra

In [None]:
# Widen pandas' display limits so the large frames below are shown in full.
display_options = {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 1000,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [None]:
# Raw events plus the auxiliary lookup tables.
df_events = pd.read_csv('./data/events_up_to_01062018.csv', low_memory=False)
df_sessions = pd.read_csv('./data/sessions.csv', low_memory=False)
df_brands = pd.read_csv('./data/brands.csv')
df_os = pd.read_csv('./data/os.csv')
df_browsers = pd.read_csv('./data/browsers.csv')
df_prices = pd.read_csv('./data/prices.csv')

# Enrich every event with session, browser, OS, brand and price information.
# NOTE(review): sessions are joined on the row index, which presumes
# sessions.csv is row-aligned with the events file — confirm.
df = (
    df_events
    .merge(df_sessions, how='left', left_index=True, right_index=True)
    .merge(df_browsers, how='left', on='browser_version')
    .merge(df_os, how='left', on='operating_system_version')
    .merge(df_brands, how='left', on='model')
    .merge(df_prices, how='left', on='sku')
)

In [None]:
# Parse every timestamp column and derive the calendar month of each event.
for ts_column in ('timestamp', 'session_timestamp_first', 'session_timestamp_last'):
    df[ts_column] = pd.to_datetime(df[ts_column])
df['month_number'] = df['timestamp'].dt.month

# One boolean flag per event type of interest.
event_flags = (
    ('is_viewed_product', 'viewed product'),
    ('is_conversion', 'conversion'),
    ('is_checkout', 'checkout'),
)
for flag_column, event_name in event_flags:
    df[flag_column] = df['event'] == event_name

# Session-level flags restricted to rows where session_first is set.
session_first = df['session_first']
df['session_checkout_first'] = session_first & df['session_has_checkout']
df['session_conversion_first'] = session_first & df['session_has_conversion']
df['session_ad_first'] = session_first & df['session_ad']
# NOTE(review): the two columns below never read session_ad, so they are equal
# to session_checkout_first / session_conversion_first — confirm the intent.
df['session_ad_checkout_event'] = session_first & df['session_checkout_first']
df['session_ad_conversion_event'] = session_first & df['session_conversion_first']

In [None]:
# Display the enriched events frame.
df

In [None]:
# Order events chronologically per person, then slice the time windows used
# below: last month (lm), before last month (blm) and last week (lw).
df = df.sort_values(['person', 'timestamp'])

last_month_start = pd.to_datetime('2018-05-01')
last_week_start = pd.to_datetime('2018-05-23')
in_last_month = df['timestamp'] >= last_month_start

df_lm = df[in_last_month]
df_blm = df[df['timestamp'] < last_month_start]
df_lw = df[df['timestamp'] >= last_week_start]

In [None]:
def _buyers(frame):
    """Return the distinct persons with at least one row where session_conversion_first is set."""
    has_conversion = frame['session_conversion_first'].astype(bool)
    return frame.loc[has_conversion, 'person'].unique()


# Number of rows flagged session_conversion_first in each time window.
amount_of_boughts = df['session_conversion_first'].sum()
amount_of_boughts_lm = df_lm['session_conversion_first'].sum()
amount_of_boughts_blm = df_blm['session_conversion_first'].sum()
amount_of_boughts_lw = df_lw['session_conversion_first'].sum()

# Persons behind those rows. The previous version selected the flag column but
# never filtered on it, so every person present in the window was counted.
users_that_bought = _buyers(df)
users_that_bought_lm = _buyers(df_lm)
users_that_bought_blm = _buyers(df_blm)
users_that_bought_lw = _buyers(df_lw)

amount_of_users_that_bought = len(users_that_bought)
amount_of_users_that_bought_lm = len(users_that_bought_lm)
amount_of_users_that_bought_blm = len(users_that_bought_blm)
amount_of_users_that_bought_lw = len(users_that_bought_lw)

print(f"{amount_of_users_that_bought} made {amount_of_boughts} boughts.")
print(f"{amount_of_users_that_bought_lm} made {amount_of_boughts_lm} boughts last month.")
print(f"{amount_of_users_that_bought_blm} made {amount_of_boughts_blm} boughts before last month.")
print(f"{amount_of_users_that_bought_lw} made {amount_of_boughts_lw} boughts last week.")

# Persons that bought both before and during the last month, in the order they
# appear in users_that_bought_blm. A set makes each membership test O(1).
bought_lm_lookup = set(users_that_bought_lm)
users_that_bought_before_and_in_lm = [
    person for person in users_that_bought_blm if person in bought_lm_lookup
]

In [None]:
# Per-month purchase figures for months 1 to 5 (only the month number is
# compared, not the year).
for i in range(1, 6):
    df_month = df[df['timestamp'].dt.month == i]
    converted = df_month['session_conversion_first'].astype(bool)

    amount_of_boughts_in_month = df_month['session_conversion_first'].sum()
    # Keep only persons with a flagged row; without this filter every person
    # active in the month was counted as a buyer.
    users_that_bought_in_month = df_month.loc[converted, 'person'].unique()
    amount_of_users_that_bought_in_month = len(users_that_bought_in_month)

    print(f"{amount_of_users_that_bought_in_month} made {amount_of_boughts_in_month} boughts in month {i}.")

In [None]:
# Last-month rows whose session_total_conversions is greater than zero.
df_lm[df_lm['session_total_conversions'] > 0]

---

##### Sujeto 000ba417

In [None]:
# All conversion events recorded for person 000ba417.
df[(df['person'] == '000ba417') & (df['event'] == 'conversion')]

In [None]:
# Every event of person 000ba417 with session_id 4.
df[(df['person'] == '000ba417') & (df['session_id'] == 4)]

In [None]:
# Every event of person 000ba417 with session_id 3.
df[(df['person'] == '000ba417') & (df['session_id'] == 3)]

In [None]:
# Every event of person 000ba417 with session_id 1.
df[(df['person'] == '000ba417') & (df['session_id'] == 1)]

---
##### Sujeto 001001be

In [None]:
# All conversion events recorded for person 001001be.
df[(df['person'] == '001001be') & (df['event'] == 'conversion')]

In [None]:
# Every event of person 001001be with session_id 0.
df[(df['person'] == '001001be') & (df['session_id'] == 0)]

---
##### Sujeto 0019e639

In [None]:
# All conversion events recorded for person 0019e639.
df[(df['person'] == '0019e639') & (df['event'] == 'conversion')]

In [None]:
# Every event of person 0019e639 with session_id 18.
df[(df['person'] == '0019e639') & (df['session_id'] == 18)]

In [None]:
# Every event of person 0019e639 with session_id 17.
df[(df['person'] == '0019e639') & (df['session_id'] == 17)]

In [None]:
# Every event of person 0019e639 with session_id 16.
df[(df['person'] == '0019e639') & (df['session_id'] == 16)]

In [None]:
# Every event of person 0019e639 with session_id 15.
df[(df['person'] == '0019e639') & (df['session_id'] == 15)]

In [None]:
# Every event of person 0019e639 with session_id 14.
df[(df['person'] == '0019e639') & (df['session_id'] == 14)]

##### Sujeto 002ed810

In [None]:
# All conversion events recorded for person 002ed810.
df[(df['person'] == '002ed810') & (df['event'] == 'conversion')]

In [None]:
# Every event of person 002ed810 with session_id 2.
df[(df['person'] == '002ed810') & (df['session_id'] == 2)]

In [None]:
# Every event of person 002ed810 with session_id 1.
df[(df['person'] == '002ed810') & (df['session_id'] == 1)]

In [None]:
# Every event of person 002ed810 with session_id 0.
df[(df['person'] == '002ed810') & (df['session_id'] == 0)]