# Análisis de auctions.csv

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('max_columns', 8, 'max_rows', 10)

## Limpieza del data frame

In [2]:
# La lectura es lenta y require mucha memoria
# Es mejor leer el DataFrame en la sección siguiente directamente
auctions = pd.read_csv('data/auctions.csv.gzip', compression='gzip', low_memory=False, parse_dates = ['date'])
auctions.head()

Unnamed: 0,date,device_id,ref_type_id,source_id
0,2019-04-23 18:58:00.842116,2564673204772915246,1,0
1,2019-04-23 18:58:01.530771,4441121667607578179,7,0
2,2019-04-23 18:58:01.767562,7721769811471055264,1,0
3,2019-04-23 18:58:02.363468,6416039086842158968,1,0
4,2019-04-23 18:58:02.397559,1258642015983312729,1,0


In [3]:
auctions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47409528 entries, 0 to 47409527
Data columns (total 4 columns):
date           datetime64[ns]
device_id      int64
ref_type_id    int64
source_id      int64
dtypes: datetime64[ns](1), int64(3)
memory usage: 1.4 GB


In [4]:
auctions['platform'].value_counts()

KeyError: 'platform'

In [None]:
auctions['ref_type_id'].value_counts()

In [None]:
auctions['source_id'].value_counts()

In [None]:
auctions['source_id'] = auctions['source_id'].astype(np.uint8)
auctions['platform'] = auctions['platform'].astype(np.uint8)
auctions['ref_type_id'] = auctions['ref_type_id'].astype(np.uint8)

In [None]:
auctions['country'].value_counts()

In [None]:
auctions['auction_type_id'].isnull().all()

In [None]:
auctions = auctions.drop('country', axis=1)
auctions = auctions.drop('auction_type_id', axis=1)
auctions.head()

In [None]:
auctions['date'] = auctions['date'].astype("datetime64")

In [None]:
auctions.info()

In [None]:
auctions.head()

In [None]:
len(auctions)

## Lectura y limpieza en un paso del dataframe

In [None]:
types = {'platform': np.uint8, 'ref_type_id': np.uint8, 'source_id': np.uint8, 'device_id': np.uint64}
auctions = pd.read_csv('data/auctions.csv.gzip', compression='gzip', low_memory=False, dtype=types,
                       parse_dates=['date'], index_col=['date'],
                       usecols=['date', 'platform', 'ref_type_id', 'source_id', 'device_id'])
auctions.head()

In [None]:
auctions.isnull().any()

## Análisis exploratorio

In [None]:
# Ordenamos el índice para acelerar las operaciones
auctions.sort_index()

In [None]:
# Veamos cuántas subastas hay en promedio por día
daily_auctions = auctions.resample('D').size()
daily_auctions.mean()

In [None]:
hourly_auctions = auctions.resample('H').size()
hourly_auctions.plot(kind='line', figsize=(15, 8), title='Cantidad de subastas por hora', grid=True)

In [None]:
ax = hourly_auctions.plot(kind='box', figsize=(12, 8), title='Boxplot de la cantidad de subastas por hora', grid=True)
ax.set_xlabel('')
ax.set_xticklabels('')
ax.title.set_size(20)

In [None]:
def hourly_auctions_per_day(date, ax=None):
    auctions[date].resample('H').size()\
                  .plot(kind='line', figsize=(20, 8), title=date, ax=ax)

In [None]:
fig, ax_array = plt.subplots(2, 3, figsize=(20,8))
fig.suptitle('Cantidad de subastas por hora', size=20)

dates = ['2019-03-05', '2019-03-06', '2019-03-08', '2019-03-10', '2019-03-12', '2019-03-13']
for day, ax in zip(dates, ax_array.flatten()):
    hourly_auctions_per_day(day, ax)
    ax.set_xlabel('')

In [None]:
daily_auctions = auctions.resample('D').size()
ax = daily_auctions.plot(kind='bar', figsize=(20, 8), title='Cantidad de subastas por día', rot=0)
ax.set_xlabel('')
ax.set_xticklabels(list(map(str, daily_auctions.index.date)))

In [None]:
platform_size = auctions.groupby([pd.Grouper(freq='H'), 'platform']).size()
platform_total = platform_size.unstack('platform').cumsum()
row_total = platform_total.sum(axis='columns')
platform_cum_pct = platform_total.div(row_total, axis='index')

ax = platform_cum_pct.plot(kind='area', figsize=(20,8), cmap='coolwarm', xlim=('2019-03-05', None), ylim=(0, 1), legend=False)
ax.figure.suptitle('Evolución del porcentaje de subastas por plataforma', size=20)
ax.set_xlabel('')
ax.yaxis.tick_right()

plot_kwargs = dict(xycoords='axes fraction', size=20)
ax.annotate(xy=(.6, .4), s='Plataforma 1 (posiblemente Android)', color='w', **plot_kwargs)
ax.annotate(xy=(.2, .85), s='Plataforma 2 (posiblemente iOS)', color='w', **plot_kwargs)

In [None]:
# Veamos cuántos device_id diferentes hay
auctions['device_id'].nunique()

In [None]:
# Veamos cuántos dispositivos aparecen menos de 5 veces
(auctions.groupby('device_id').size() <= 5).sum()

In [None]:
# Veamos cuántos dispositivos aparecen más de 3000 veces
(auctions.groupby('device_id').size() >= 3000).sum()

In [None]:
devices_count = auctions.groupby('device_id').size()
devices_count_clipped = devices_count.clip(lower=0, upper=3000)
ax = devices_count_clipped.hist(bins=15, log=True, figsize=(20,8))
ax.set_title('Histograma de apariciones de dispositivos', size=20)

In [None]:
clicks = pd.read_csv('data/clicks.csv.gzip', compression='gzip', parse_dates=['created'], index_col=['created'])
clicks.sort_index()
hourly_clicks = clicks.resample('6H').size()
hourly_clicks.plot(kind='line', figsize=(20, 5), label='Clicks',
                   title='Cantidad de clicks cada 6 horas a lo largo de los 9 días', xlim=['07-Mar-2019', '14-Mar-2019'],
                   grid=True, fontsize=16)

In [None]:
installs = pd.read_csv('data/installs.csv.gzip', compression='gzip', parse_dates=['created'], index_col=['created'])
installs.sort_index()
hourly_installs = installs.resample('H').size()
ax = hourly_installs.plot(kind='line', figsize=(20, 5),
                   title='Cantidad de instalaciones por hora a lo largo de los 9 días',
                   grid=True, fontsize=16)
ax.set_xlabel('')

In [None]:
events = pd.read_csv('data/events.csv.gzip', compression='gzip', parse_dates=['date'], index_col=['date'], low_memory=False)
events.sort_index()
hourly_events = events.resample('H').size()
ax = hourly_events.plot(kind='line', figsize=(20, 5), title='Cantidad de eventos por hora a lo largo de los 9 días',
                        grid=True, fontsize=16)
ax.set_xlabel('')