# Análisis de events.csv

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

pd.set_option('max_columns', 8, 'max_rows', 21)

## Limpieza del data frame

In [2]:
# Columnas a eliminar:
# 'index', 'ref_hash', 'device_countrycode', 'session_user_agent', 'trans_id', 'event_uuid'
# Columnas a utilizar:
cols = [ 'date', 'event_id', 'ref_type', 'application_id', 'attributed',
            'device_countrycode', 'device_os_version', 'device_brand',
            'device_model', 'device_city', 'user_agent', 'carrier', 'kind',
            'device_os', 'wifi' ,'connection_type', 'ip_address', 'device_language' ]

In [3]:
events = pd.read_csv('data/events.csv.gzip', compression='gzip', low_memory=False, parse_dates=['date'], index_col=['date'], usecols=cols)
events.head()

Unnamed: 0_level_0,event_id,ref_type,application_id,attributed,...,wifi,connection_type,ip_address,device_language
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2019-04-20 01:42:49.120,0,1891515180541284343,210,False,...,False,,7544543351571901618,3.301378e+18
2019-04-20 01:42:49.340,1,1891515180541284343,210,False,...,False,,6949523255335024165,
2019-04-20 01:42:49.365,1,1891515180541284343,210,False,...,False,,6428537280982666957,
2019-04-20 01:42:51.438,2,1891515180541284343,210,False,...,False,,7607371352198017145,
2019-04-20 01:42:51.838,1,1891515180541284343,210,False,...,False,,2901772839007473756,


In [4]:
events.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 7744581 entries, 2019-04-20 01:42:49.120000 to 2019-04-23 00:01:13.518000
Data columns (total 17 columns):
event_id              int64
ref_type              int64
application_id        int64
attributed            bool
device_countrycode    int64
device_os_version     float64
device_brand          float64
device_model          float64
device_city           float64
user_agent            float64
carrier               float64
kind                  float64
device_os             float64
wifi                  bool
connection_type       object
ip_address            int64
device_language       float64
dtypes: bool(2), float64(9), int64(5), object(1)
memory usage: 960.2+ MB


In [5]:
events.isnull().all().any()

False

In [6]:
events['event_id'].nunique(), events['event_id'].max(), events['event_id'].min()

(516, 576, 0)

In [7]:
events['event_id'] = events['event_id'].astype(np.uint16)

In [8]:
events['ref_type'].value_counts()

1891515180541284343    6421584
1494519392962156891    1322997
Name: ref_type, dtype: int64

In [9]:
events['ref_type'], x = events['ref_type'].factorize()
events['ref_type'] = events['ref_type'].astype(np.uint8)

In [10]:
events['application_id'].nunique(), events['application_id'].max(), events['application_id'].min()

(250, 370, 1)

In [11]:
events['application_id'], x = events['application_id'].factorize()
events['application_id'] = events['application_id'].astype(np.uint8)

In [12]:
events['device_os_version'].nunique(), events['device_os_version'].max(), events['device_os_version'].min()

(96, 9.135481865204238e+18, 1.0040843832529456e+16)

In [13]:
events['device_os_version'], x = events['device_os_version'].factorize()
events['device_os_version'] = events['device_os_version'].astype(np.uint8)

In [14]:
events['device_brand'].nunique(), events['device_brand'].max(), events['device_brand'].min()

(506, 9.221510769222537e+18, 4359145860523661.0)

In [15]:
events['device_brand'], x = events['device_brand'].factorize()
events['device_brand'] = events['device_brand'].astype(np.uint16)

In [16]:
events['device_model'].nunique(), events['device_model'].max(), events['device_model'].min()

(4720, 9.223357288405795e+18, 796734562250222.0)

In [17]:
events['device_model'], x = events['device_model'].factorize()
events['device_model'] = events['device_model'].astype(np.uint16)

In [18]:
events['device_city'].nunique(), events['device_city'].max(), events['device_city'].min()

(753, 9.221371124193763e+18, 6480067067412837.0)

In [19]:
events['device_city'], x = events['device_city'].factorize()
events['device_city'] = events['device_city'].astype(np.uint16)

In [20]:
events['user_agent'].value_counts(dropna=False)

NaN             4403098
7.683619e+18      91393
9.329817e+17      80125
9.172915e+18      74717
2.146196e+18      70985
1.883833e+17      56682
8.402407e+18      52560
8.986422e+18      42673
4.477329e+18      40154
3.143542e+18      39673
                 ...   
1.341526e+17          1
4.504366e+18          1
3.555248e+18          1
5.692165e+18          1
2.942781e+17          1
5.175955e+18          1
4.570518e+17          1
7.746894e+18          1
2.314199e+18          1
1.897129e+18          1
Name: user_agent, Length: 14646, dtype: int64

In [21]:
events['user_agent'].nunique(), events['user_agent'].max(), events['user_agent'].min()

(14645, 9.222756238767758e+18, 504638159916931.0)

In [22]:
events['user_agent'], x = events['user_agent'].factorize()
events['user_agent'] = events['user_agent'].astype(np.uint16)

In [23]:
events['carrier'].nunique(), events['carrier'].max(), events['carrier'].min()

(259, 9.220464057766128e+18, 2.359612869853312e+16)

In [24]:
events['carrier'], x = events['carrier'].factorize()
events['carrier'] = events['carrier'].astype(np.uint16)

In [25]:
events['kind'].nunique(), events['kind'].max(), events['kind'].min()

(513, 9.2200236982374e+18, 1.6215264622799048e+16)

In [26]:
events['kind'], x = events['kind'].factorize()
events['kind'] = events['kind'].astype(np.uint16)

In [27]:
events['device_city'].nunique(), events['device_city'].max(), events['device_city'].min()

(754, 65535, 0)

In [28]:
events['device_os'], x = events['device_os'].factorize()
events['device_os'] = events['device_os'].astype(np.uint8)

In [29]:
events['connection_type'].value_counts(dropna=False)

NaN          5935285
Cable/DSL    1291512
Cellular      517204
Corporate        527
Dialup            53
Name: connection_type, dtype: int64

In [30]:
events['connection_type'] = events['connection_type'].astype('category')

In [31]:
events['ip_address'].nunique()

381784

In [32]:
events['ip_address'], x = events['ip_address'].factorize()
events['ip_address'] = events['ip_address'].astype(np.uint16)

In [33]:
events['device_language'].nunique(), events['carrier'].max(), events['carrier'].min()

(209, 65535, 0)

In [34]:
events['device_language'], x = events['device_language'].factorize()
events['device_language'] = events['device_language'].astype(np.uint16)

In [35]:
events.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 7744581 entries, 2019-04-20 01:42:49.120000 to 2019-04-23 00:01:13.518000
Data columns (total 17 columns):
event_id              uint16
ref_type              uint8
application_id        uint8
attributed            bool
device_countrycode    int64
device_os_version     uint8
device_brand          uint16
device_model          uint16
device_city           uint16
user_agent            uint16
carrier               uint16
kind                  uint16
device_os             uint8
wifi                  bool
connection_type       category
ip_address            uint16
device_language       uint16
dtypes: bool(2), category(1), int64(1), uint16(9), uint8(4)
memory usage: 302.8 MB
