# Análisis de auctions.csv

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn import metrics
%matplotlib inline

# Limit how much of a DataFrame is rendered in cell output.
# The fully-qualified "display." keys are required: on pandas >= 1.4 the bare
# patterns 'max_columns' / 'max_rows' also match 'styler.render.max_*' and
# raise OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', 8)
pd.set_option('display.max_rows', 10)

## Limpieza del DataFrame

In [None]:
# Reading the raw file this way is slow and requires a lot of memory;
# prefer loading the DataFrame directly in the next section.
# compression is passed explicitly because the file uses a '.gzip' extension.
auctions = pd.read_csv(
    'data/auctions.csv.gzip',
    compression='gzip',
    low_memory=False,
    parse_dates=['date'],
)
auctions.head()

In [None]:
# Print a summary of the frame: column dtypes, non-null counts and memory usage.
auctions.info()

In [None]:
# Number of distinct device_id values.
auctions['device_id'].nunique()

In [None]:
# Frequency of each ref_type_id value, most common first.
auctions['ref_type_id'].value_counts()

In [None]:
# Frequency of each source_id value, most common first.
auctions['source_id'].value_counts()

In [None]:
# Downcast both id columns to unsigned 8-bit integers to reduce memory use.
# NOTE(review): assumes every value fits in 0..255 and neither column has NaN — confirm.
auctions['source_id'] = auctions['source_id'].astype(np.uint8)
auctions['ref_type_id'] = auctions['ref_type_id'].astype(np.uint8)

In [None]:
# For each column, report whether it contains at least one missing value.
auctions.isnull().any()

In [None]:
# Show dtypes and memory usage again, now that the id columns are uint8.
auctions.info()

In [None]:
# Preview the first rows of the frame.
auctions.head()

In [None]:
# Total number of rows.
len(auctions)

## Lectura y limpieza del DataFrame en un solo paso

In [None]:
# Load the file with compact dtypes declared up front and the parsed 'date'
# column as the index, so no separate cleaning pass is needed afterwards.
types = {
    'device_id': np.uint64,
    'ref_type_id': np.uint8,
    'source_id': np.uint8,
}
auctions = pd.read_csv(
    'data/auctions.csv.gzip',
    compression='gzip',
    low_memory=False,
    dtype=types,
    parse_dates=['date'],
    index_col=['date'],
)
auctions.head()

In [None]:
# For each column, report whether it contains at least one missing value.
auctions.isnull().any()

In [None]:
# Print a summary of the frame: column dtypes, non-null counts and memory usage.
auctions.info()

## Análisis exploratorio

In [None]:
# Sort the index to speed up the operations that follow.
auctions = auctions.sort_index()

In [None]:
# Average number of auctions per day: count rows in each daily bin,
# then take the mean of those daily counts.
daily_auctions = auctions.resample('D').size()
daily_auctions.mean()

In [None]:
# Count auctions in each hourly bin and draw the full series as a line chart.
hourly_auctions = auctions.resample('H').size()
hourly_auctions.plot.line(
    figsize=(15, 8),
    title='Cantidad de subastas por hora',
    grid=True,
)

In [None]:
# Box plot of the hourly auction counts.
ax = hourly_auctions.plot.box(
    figsize=(12, 8),
    title='Boxplot de la cantidad de subastas por hora',
    grid=True,
)
ax.title.set_size(20)
# Remove the x label and tick labels: there is only one box to show.
ax.set_xticklabels([])
ax.set_xlabel('')

In [None]:
def hourly_auctions_per_day(date, ax=None):
    """Plot the number of auctions per hour for a single day.

    Reads the notebook-level ``auctions`` DataFrame, which is expected to be
    indexed by a DatetimeIndex.

    Parameters
    ----------
    date : str
        Day to plot, as a date string such as ``'2019-04-18'``. It is also
        used as the plot title.
    ax : matplotlib Axes, optional
        Axes to draw on. When omitted, pandas creates a new figure.

    Returns
    -------
    The matplotlib Axes the line was drawn on.
    """
    # Row selection by partial date string must go through .loc: the bare
    # auctions[date] form was deprecated in pandas 1.2 and removed in 2.0,
    # where it is treated as a column lookup and raises KeyError.
    hourly_counts = auctions.loc[date].resample('H').size()
    return hourly_counts.plot(kind='line', figsize=(20, 8), title=date, ax=ax)

In [None]:
# Draw the hourly auction counts of six selected days on a 2x3 grid,
# one day per subplot.
dates = [
    '2019-04-18', '2019-04-19', '2019-04-21',
    '2019-04-23', '2019-04-25', '2019-04-26',
]

fig, ax_array = plt.subplots(nrows=2, ncols=3, figsize=(20, 8))
fig.suptitle('Cantidad de subastas por hora', size=20)

for day, ax in zip(dates, ax_array.flat):
    hourly_auctions_per_day(day, ax=ax)
    # The subplot title already names the day, so drop the x-axis label.
    ax.set_xlabel('')

In [None]:
# Bar chart of the number of auctions per day.
daily_auctions = auctions.resample('D').size()
ax = daily_auctions.plot.bar(
    figsize=(20, 8),
    title='Cantidad de subastas por día',
    rot=0,
)
ax.set_xlabel('')
# Label each bar with the plain date instead of the full timestamp.
ax.set_xticklabels([str(d) for d in daily_auctions.index.date])

In [None]:
# How many distinct device_id values are there?
auctions['device_id'].nunique()

In [None]:
# How many devices appear at most 5 times (the comparison is inclusive: <= 5).
# NOTE(review): the original comment said "fewer than 5 times", which would be
# `< 5` — confirm which bound is intended.
(auctions.groupby('device_id').size() <= 5).sum()

In [None]:
# How many devices appear at least 5000 times (the comparison is inclusive: >= 5000).
# NOTE(review): the original comment said "more than 5000 times", which would be
# `> 5000` — confirm which bound is intended.
(auctions.groupby('device_id').size() >= 5000).sum()

In [None]:
# Number of auction rows per device.
devices_count = auctions.groupby('device_id').size()
# Cap the counts at 5000 so every heavier device falls into the last bin.
devices_count_clipped = devices_count.clip(lower=0, upper=5000)
# Log-scaled histogram of the capped counts.
ax = devices_count_clipped.hist(bins=15, log=True, figsize=(20,8))
ax.set_title('Histograma de apariciones de dispositivos', size=20)

# Target

In [None]:
# Load the competition target ids and preview the first rows.
target = pd.read_csv('data/target_competencia_ids.csv', low_memory=False)
target.head()

In [None]:
# Frequency of each value in the 'obj' column.
target['obj'].value_counts()

In [None]:
# Number of rows in the target frame.
len(target)

In [None]:
# Build the list of ref_hash values without their "_sc" suffix: keep the rows
# whose second underscore-separated segment is 'sc' and take the first segment.
# NOTE(review): the original comment calls these "unique" ref_hash values; that
# holds only if each hash carries the "_sc" suffix once — confirm.
# Vectorised .str accessors replace the per-row lambdas. A ref_hash without an
# underscore (or a missing value) is now simply excluded instead of raising.
hash_parts = target['ref_hash'].str.split('_')
is_sc = hash_parts.str[1] == 'sc'
target_hash = hash_parts.str[0][is_sc]
# NOTE(review): device_id is read as uint64 earlier in this notebook; int64 is
# kept here as in the original — confirm no hash exceeds the int64 range.
target_hash = target_hash.astype(np.int64)
target_hash = target_hash.to_frame()
target_hash.head()

In [None]:
# Number of ref_hash values kept after the "_sc" filter.
len(target_hash)