### Configuración inicial

In [150]:
#Importo librerias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import ScalarFormatter

In [79]:
# Render matplotlib figures inline, below the cell that produces them.
%matplotlib inline
# Reset matplotlib to its default style, then apply seaborn's default theme on top.
# The order matters: running plt.style.use('default') last would undo sns.set().
plt.style.use('default')
sns.set()

In [80]:
#Funciones auxiliares
def mostrar_porcentaje_barplot(ax):
    """Label every bar of ``ax`` with its share of the total bar height.

    Each label is the bar's height expressed as a percentage of the summed
    heights of all bars, rounded to two decimals and followed by ``%``. The
    label is horizontally centered on the bar and offset 10 points above
    the bar's top.

    Bars whose height is not finite (NaN / inf) are left out of the total and
    are not labeled. When there is nothing to normalize by (no bars, or a
    total of zero) the function returns without drawing any label.

    Parameters
    ----------
    ax : object
        Anything exposing ``patches`` (objects with ``get_x``, ``get_width``
        and ``get_height``) and an ``annotate`` method, e.g. a matplotlib Axes.

    Returns
    -------
    None
    """
    alturas = [p.get_height() for p in ax.patches]
    # NaN-aware total: a single NaN bar must not turn every label into "nan%".
    total = np.nansum(alturas) if alturas else 0
    if not np.isfinite(total) or total == 0:
        # A percentage is undefined here; avoid dividing by zero.
        return
    for p, altura in zip(ax.patches, alturas):
        if not np.isfinite(altura):
            continue
        porcentaje = np.round(100 * (altura / total), decimals=2)
        ax.annotate(
            str(porcentaje) + "%",
            (p.get_x() + p.get_width() / 2., altura),
            ha='center',
            va='center',
            xytext=(0, 10),
            textcoords='offset points',
        )
def mostrar_valores_barplot(ax):
    """Label every bar of ``ax`` with its own height.

    The label is the bar's height rounded to two decimals, horizontally
    centered on the bar and offset 10 points above the bar's top.

    Bars whose height is not finite (NaN / inf) are skipped, since they have
    no meaningful value or position to annotate.

    Parameters
    ----------
    ax : object
        Anything exposing ``patches`` (objects with ``get_x``, ``get_width``
        and ``get_height``) and an ``annotate`` method, e.g. a matplotlib Axes.

    Returns
    -------
    None
    """
    for p in ax.patches:
        altura = p.get_height()
        if not np.isfinite(altura):
            continue
        ax.annotate(
            # annotate() expects text, so convert the rounded number explicitly.
            str(np.round(altura, decimals=2)),
            (p.get_x() + p.get_width() / 2., altura),
            ha='center',
            va='center',
            xytext=(0, 10),
            textcoords='offset points',
        )

In [81]:
# Load the events file: 'timestamp' is parsed as a datetime and the listed
# text columns are read with the 'category' dtype.
df = pd.read_csv(
    'events.csv',
    low_memory=False,
    parse_dates=['timestamp'],
    infer_datetime_format=True,
    dtype=dict.fromkeys(
        [
            'event', 'condition', 'storage', 'color', 'staticpage',
            'campaign_source', 'search_engine', 'channel',
            'new_vs_returning', 'region', 'country', 'device_type',
        ],
        'category',
    ),
)
df.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-31 23:38:05,ad campaign hit,0004b0a2,/comprar/iphone/iphone-5s,,,,,,,...,,,,,,,,,,
1,2018-05-31 23:38:05,visited site,0004b0a2,,,,,,,,...,,Paid,New,Camaragibe,Pernambuco,Brazil,Smartphone,360x640,Android 6,Chrome Mobile 39
2,2018-05-31 23:38:09,viewed product,0004b0a2,,2694.0,iPhone 5s,Bom,32GB,Cinza espacial,,...,,,,,,,,,,
3,2018-05-31 23:38:40,checkout,0004b0a2,,2694.0,iPhone 5s,Bom,32GB,Cinza espacial,,...,,,,,,,,,,
4,2018-05-29 13:29:25,viewed product,0006a21a,,15338.0,Samsung Galaxy S8,Bom,64GB,Dourado,,...,,,,,,,,,,


## Chequeos de integridad y calidad de los datos


In [82]:
# Print each column's dtype and non-null count to see where data is missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1011288 entries, 0 to 1011287
Data columns (total 23 columns):
timestamp                   1011288 non-null datetime64[ns]
event                       1011288 non-null category
person                      1011288 non-null object
url                         82756 non-null object
sku                         563838 non-null object
model                       564284 non-null object
condition                   563836 non-null category
storage                     563836 non-null category
color                       563836 non-null category
skus                        221699 non-null object
search_term                 48967 non-null object
staticpage                  3598 non-null category
campaign_source             82796 non-null category
search_engine               50957 non-null category
channel                     87378 non-null category
new_vs_returning            87378 non-null category
city                        87378 non-null object


Como podemos ver, las 1.011.288 filas tienen valores no nulos en 'timestamp', 'event' y 'person'; el resto de las columnas solo está presente en una parte de las filas.

### Analizamos los valores que pueden tomar las columnas categóricas

In [83]:
# Distinct values found in 'condition', printed on one line, each followed by ', '.
valores = df['condition'].unique()
print(''.join(str(valor) + ', ' for valor in valores), end='')

nan, Bom, Muito Bom, Excelente, Bom - Sem Touch ID, Novo, 

In [84]:
# Distinct values found in 'storage', printed on one line, each followed by ', '.
valores = df['storage'].unique()
print(''.join(str(valor) + ', ' for valor in valores), end='')

nan, 32GB, 64GB, 256GB, 16GB, 8GB, 128GB, 4GB, 512MB, 

In [85]:
# Distinct values found in 'channel', printed on one line, each followed by ', '.
valores = df['channel'].unique()
print(''.join(str(valor) + ', ' for valor in valores), end='')

nan, Paid, Organic, Direct, Social, Referral, Email, Unknown, 

In [86]:
# Distinct values found in 'staticpage', printed on one line, each followed by ', '.
valores = df['staticpage'].unique()
print(''.join(str(valor) + ', ' for valor in valores), end='')

nan, how-to-buy, trust-trocafone, Quiosks, FaqEcommerce, AboutUs, Conditions, TermsAndConditionsEcommerce, CustomerService, galaxy-s8, how-to-sell, TermsAndConditionsReturnEcommerce, club-trocafone, black_friday, PrivacyEcommerce, 

## Análisis de los tipos de evento

In [87]:
# For each event type, count the non-null entries of every other column.
colUsadasEventos = df.groupby('event').count()
colUsadasEventos

Unnamed: 0_level_0,timestamp,person,url,sku,model,condition,storage,color,skus,search_term,...,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ad campaign hit,82827,82827,82756,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
brand listing,98635,98635,0,0,0,0,0,0,98324,0,...,0,0,0,0,0,0,0,0,0,0
checkout,33735,33735,0,33735,33733,33733,33733,33733,0,0,...,0,0,0,0,0,0,0,0,0,0
conversion,1172,1172,0,1172,1172,1172,1172,1172,0,0,...,0,0,0,0,0,0,0,0,0,0
generic listing,67534,67534,0,0,0,0,0,0,67421,0,...,0,0,0,0,0,0,0,0,0,0
lead,448,448,0,0,448,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
search engine hit,50957,50957,0,0,0,0,0,0,0,0,...,50957,0,0,0,0,0,0,0,0,0
searched products,56073,56073,0,0,0,0,0,0,55954,48967,...,0,0,0,0,0,0,0,0,0,0
staticpage,3598,3598,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
viewed product,528931,528931,0,528931,528931,528931,528931,528931,0,0,...,0,0,0,0,0,0,0,0,0,0


In [88]:
# Print, for every event type, the columns that hold at least one value.
columns = list(df)
for event, row in colUsadasEventos.iterrows():
    # 'event' is the index of colUsadasEventos, so row.get('event') is None;
    # None != 0 is True, which is why 'event' is listed for every event type.
    usadas = [column for column in columns if row.get(column) != 0]
    print(str(event) + ':' + ''.join(' ' + column + ',' for column in usadas))

ad campaign hit: timestamp, event, person, url, campaign_source,
brand listing: timestamp, event, person, skus,
checkout: timestamp, event, person, sku, model, condition, storage, color,
conversion: timestamp, event, person, sku, model, condition, storage, color,
generic listing: timestamp, event, person, skus,
lead: timestamp, event, person, model,
search engine hit: timestamp, event, person, search_engine,
searched products: timestamp, event, person, skus, search_term,
staticpage: timestamp, event, person, staticpage,
viewed product: timestamp, event, person, sku, model, condition, storage, color,
visited site: timestamp, event, person, channel, new_vs_returning, city, region, country, device_type, screen_resolution, operating_system_version, browser_version,


## Columnas usadas por cada evento

Todos los eventos contienen información sobre **'timestamp'**, **'event'** y **'person'**, y además utilizan las siguientes columnas:

**ad campaign hit:**
    url, campaign_source

**brand listing:**
    skus

**checkout:** 
    sku, model, condition, storage, color

**conversion:**
    sku, model, condition, storage, color

**generic listing:**
    skus

**lead:**
    model

**search engine hit:** 
    search_engine

**searched products:** 
    skus, search_term

**staticpage:** 
    staticpage

**viewed product:** 
    sku, model, condition, storage, color

**visited site:** 
    channel, new_vs_returning, city, region, country, device_type, screen_resolution, operating_system_version, browser_version
