# Trabajo Práctico N° 2
## Objetivo:

Para cada dispositivo presentado por Jampp, determinar el tiempo que transcurrirá hasta que el mismo aparezca nuevamente en una subasta, y el tiempo hasta que el usuario del mismo decida instalar una nueva aplicación.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import auc, accuracy_score
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
import xgboost as xgb
import datetime as dt

## Analizo los datos de la primer ventana

In [2]:
auct = pd.read_csv('data/auctions_ventana1.csv', dtype = {"device_id": 'category', "ref_type_id": np.int8, "source_id": np.int8})

In [3]:
auct.dtypes

date             object
device_id      category
ref_type_id        int8
source_id          int8
dtype: object

In [4]:
auct['date'] = pd.to_datetime(auct['date'])

In [5]:
auct.head()

Unnamed: 0,date,device_id,ref_type_id,source_id
0,2019-04-20 23:57:27.912838,1109595589636746168,7,0
1,2019-04-20 23:57:28.381114,5896614299191635403,1,0
2,2019-04-20 23:57:28.515423,4172466725848941608,1,0
3,2019-04-20 23:57:28.700884,2616279795187318849,7,0
4,2019-04-20 23:57:28.868312,8034952072073026056,1,0


## Veo cuantas veces aparece cada dispositivo en una subasta

Inicio sencillamente contando la cantidad de subastas en las que participó cada dispositivo, y lo agrego como un nuevo feature

In [6]:
dfApT = auct.groupby('device_id').agg({'date': 'count'}).reset_index()
dfApT.columns = ['ref_hash', '#subastas']

In [7]:
dfApT.head()

Unnamed: 0,ref_hash,#subastas
0,1000061425870948777,8
1,1000503394293263005,343
2,1001008640113335510,24
3,1001123163431776865,68
4,1001144380199556647,78


## Veo la cantidad de veces que aparece cada dispositivo segun el source_id

Ahora me interesa ver la cantidad de veces que un dispositivo participó en una subasta desde cada tipo de fuente desde donde se produce la subasta. De esta manera los source_id más populares tomaran un valor mayor, luego tomo la desviación estándar de la cantidad para cada device_id

In [8]:
auct['apariciones'] = 1

In [9]:
dfA = auct.groupby(['device_id', 'source_id']).agg({'apariciones':'sum'}).unstack(1).fillna(0).reset_index()
dfA.columns = dfA.columns.droplevel(0)
dfA.columns = ['ref_hash', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
dfA.head()

Unnamed: 0,ref_hash,0,1,2,3,4,5,6,7,8,9
0,1000061425870948777,1.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1000503394293263005,225.0,79.0,1.0,18.0,1.0,0.0,0.0,0.0,19.0,0.0
2,1001008640113335510,9.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1001123163431776865,63.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
4,1001144380199556647,46.0,15.0,1.0,0.0,1.0,0.0,1.0,0.0,14.0,0.0


In [10]:
dfA['Std_#src_id'] = dfA.iloc[:,1:].std(axis = 1)
dfA.head()

Unnamed: 0,ref_hash,0,1,2,3,4,5,6,7,8,9,Std_#src_id
0,1000061425870948777,1.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.20101
1,1000503394293263005,225.0,79.0,1.0,18.0,1.0,0.0,0.0,0.0,19.0,0.0,71.32718
2,1001008640113335510,9.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.25357
3,1001123163431776865,63.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,19.809089
4,1001144380199556647,46.0,15.0,1.0,0.0,1.0,0.0,1.0,0.0,14.0,0.0,14.649991


## Veo la cantidad de veces que aparece cada dispositivo en una subasta según el ref_type

In [11]:
auct['ref_type_id'].value_counts()

1    13313564
7     2331211
Name: ref_type_id, dtype: int64

In [57]:
dfApRef = auct.groupby(['device_id', 'ref_type_id']).agg({'apariciones':'sum'}).unstack(1).fillna(0).reset_index()
dfApRef.columns = dfApRef.columns.droplevel(0)
dfApRef['median_ref_type'] = dfApRef.median(axis = 1)
dfApRef.head()

ref_type_id,Unnamed: 1,1,7,median_ref_type
0,1000061425870948777,8.0,0.0,8.0
1,1000503394293263005,0.0,343.0,343.0
2,1001008640113335510,24.0,0.0,24.0
3,1001123163431776865,68.0,0.0,68.0
4,1001144380199556647,0.0,78.0,78.0


## Cantidad de apariciones de un dispositivo en las subastas por día

Calculo la cantidad de apariciones para cada dispositivo en las subastas por día, luego tomo la mediana para las apariciones.

In [13]:
auct['fecha'] = auct['date'].dt.date

In [14]:
dfApDay = auct.groupby(['device_id', 'fecha']).agg({'apariciones':'sum'}).unstack(1).fillna(0).reset_index()
dfApDay.columns = dfApDay.columns.droplevel(0)
dfApDay.columns = ['ref_hash', '2019-04-18', '2019-04-19', '2019-04-20']
dfApDay['median_auct_day'] = dfApDay.median(axis = 1)

In [15]:
dfApDay.head()

Unnamed: 0,ref_hash,2019-04-18,2019-04-19,2019-04-20,median_auct_day
0,1000061425870948777,2.0,2.0,4.0,3.0
1,1000503394293263005,111.0,99.0,133.0,122.0
2,1001008640113335510,24.0,0.0,0.0,12.0
3,1001123163431776865,16.0,20.0,32.0,26.0
4,1001144380199556647,34.0,27.0,17.0,30.5


## calculo cuanto tiempo tarda un dispositivo en aparecer en una subasta

La idea es determinar el tiempo que transcurrió entre cada aparición de un dispositivo en una subasta, para luego tomar el tiempo promedio de aparición de un dispositivo en una subasta y de acuerdo a eso predecir.

In [18]:
df_auctions = auct.groupby(['device_id', 'date']).agg({'source_id': lambda x: x}).reset_index()
df_auctions['date_posterior'] = pd.to_datetime(df_auctions['date'].shift(1))
df_auctions = pd.DataFrame(df_auctions.groupby('device_id').apply(lambda x: abs(x.date_posterior - x.date)))

In [None]:
df_auctions = df_auctions.reset_index()
df_auctions['time_to_auction'] = df_auctions[0]/np.timedelta64(1,'s')
df_auctions = df_auctions[['device_id', 'time_to_auction']]
df_auctions

Finalmente calculo el tiempo promedio en SEGUNDOS que tarda cada dispositivo para aparecer en una subasta

In [25]:
df_auctions = df_auctions.groupby('device_id').agg({'time_to_auction': 'mean'}).reset_index()

In [58]:
df_auctions.columns = ['ref_hash', 'time_to_auction']
df_auctions.head()

Unnamed: 0,ref_hash,time_to_auction
0,1000061425870948777,32461.086293
1,1000503394293263005,1443.557915
2,1001008640113335510,8115.695813
3,1001123163431776865,4680.438985
4,1001144380199556647,6584.318884


## Creo un solo dataframe con los datos estadísticos

In [60]:
auctions = dfApT.merge(dfA[['ref_hash', 'Std_#src_id']], on = 'ref_hash')


In [61]:
auctions = auctions.merge(dfApDay[['ref_hash', 'median_auct_day']], on = 'ref_hash')
auctions = auctions.merge(df_auctions[['ref_hash', 'time_to_auction']], on = 'ref_hash')
auctions.head()

Unnamed: 0,ref_hash,#subastas,Std_#src_id,median_auct_day,time_to_auction
0,1000061425870948777,8,2.20101,3.0,32461.086293
1,1000503394293263005,343,71.32718,122.0,1443.557915
2,1001008640113335510,24,5.25357,12.0,8115.695813
3,1001123163431776865,68,19.809089,26.0,4680.438985
4,1001144380199556647,78,14.649991,30.5,6584.318884


## Clicks

In [38]:
clks = pd.read_csv("data/clks_ventana1.csv", dtype = {'advertiser_id': np.int8, 'action_id': np.float32, 'source_id': np.int8, 'country_codde': 'category',  'carrier_id': np.float16, 'specs_brand': 'category', 'brand': np.float16, 'ref_type': 'category', 'ref_hash': 'category'})

## Installs

In [39]:
inst = pd.read_csv('data/installs_ventana1.csv', dtype = {'application_id': np.int16, 'ref_type': 'category', 'ref_hash': 'category', 'click_hash': 'category', 'device_country_code': 'category', 'device_brand': 'category', 'device_model': 'category', 'kind': 'category', 'device_language': 'category'})

In [40]:
inst['created'] = pd.to_datetime(inst['created'])
inst.head()

Unnamed: 0,created,application_id,ref_type,ref_hash,click_hash,attributed,implicit,device_countrycode,device_brand,device_model,session_user_agent,user_agent,event_uuid,kind,wifi,trans_id,ip_address,device_language
0,2019-04-20 10:15:36.274,1,1494519392962156891,5230323462636548010,,False,True,6287817205707153877,,8.355495513718673e+18,adjust.com,,dda99e3c-9c4b-487d-891c-79f0a02cb4a8,app_open,,,8291809486355890410,4.060929664968129e+18
1,2019-04-20 21:56:47.151,1,1494519392962156891,5097163995161606833,,False,True,6287817205707153877,,2.3557720913769155e+18,adjust.com,,7010c3ce-0fcf-46c6-9be8-374cc0e20af4,app_open,,,4006811922873399949,3.3013777759777e+18
2,2019-04-20 22:40:41.239,1,1494519392962156891,6328027616411983332,,False,False,6287817205707153877,,6.156971151807135e+18,adjust.com,,,,,,3386455054590810771,3.3013777759777e+18
3,2019-04-20 18:19:27.485,1,1494519392962156891,7522785771858684314,,False,True,6287817205707153877,,6.208879341562586e+18,adjust.com,,35e7504f-cd9b-4a78-b89f-7335a8bd735a,app_open,,,7297343258015606683,3.3013777759777e+18
4,2019-04-20 03:40:21.239,1,1494519392962156891,7882044913917355073,,False,True,6287817205707153877,,4.566898029552894e+18,adjust.com,,f3a8649c-bd47-4874-a20e-8d3d5cedac2e,app_open,,,4764130939738113581,4.060929664968129e+18


In [41]:
inst.dtypes

created               datetime64[ns]
application_id                 int16
ref_type                    category
ref_hash                    category
click_hash                  category
attributed                      bool
implicit                        bool
device_countrycode             int64
device_brand                category
device_model                category
session_user_agent            object
user_agent                    object
event_uuid                    object
kind                        category
wifi                          object
trans_id                      object
ip_address                     int64
device_language             category
dtype: object

## Calculo el tiempo que tarda en convertir cada dispositivo

In [42]:
df_installs = inst.groupby(['ref_hash', 'created']).agg({'attributed': lambda x: x}).reset_index()
df_installs['created_posterior'] = pd.to_datetime(df_installs['created'].shift(1))
df_instals = pd.DataFrame(df_installs.groupby('ref_hash').apply(lambda x: abs(x.created_posterior - x.created)))

In [52]:
df_installs = df_instals.reset_index()[['ref_hash', 0]]
df_installs['time_to_install'] = df_installs[0]/np.timedelta64(1,'s')
df_installs = df_installs[['ref_hash', 'time_to_install']]
df_installs = df_installs.groupby('ref_hash').agg({'time_to_install': 'mean'}).reset_index()

In [55]:
df_installs.head()

Unnamed: 0,ref_hash,time_to_install
0,1000289045777700145,
1,100141508580250250,90345.801
2,1001467977812205098,128349.036
3,1001828092778461413,71246.599
4,1001971889699153637,106397.796


## Agrego los dispositivos con installs y el tiempo que tarda cada dispositivo en realizar una instalación

In [62]:
inst['conversion'] = inst['attributed'].transform(lambda x: 1 if (x == True) else 0)
df_inst_id = inst[['ref_hash', 'conversion']]
auctions = auctions.merge(df_inst_id, on = 'ref_hash')
auctions = auctions.merge(df_installs, on = 'ref_hash')

In [63]:
auctions['conversion'].value_counts()

0    77306
1      289
Name: conversion, dtype: int64

In [64]:
auctions

Unnamed: 0,ref_hash,#subastas,Std_#src_id,median_auct_day,time_to_auction,conversion,time_to_install
0,1000503394293263005,343,71.327180,122.0,1443.557915,0,21273.7045
1,1000503394293263005,343,71.327180,122.0,1443.557915,0,21273.7045
2,1002405839250362430,6,1.264911,2.5,74923.931477,0,3758.6900
3,1003301025712584351,17,3.653005,7.0,25168.605614,0,38461.1590
4,1004194154420705846,40,9.899495,16.0,6604.414547,0,119185.2170
5,100609081247429593,77,17.003594,32.5,5926.372191,0,64853.3180
6,1006126025843610775,96,22.741054,40.5,5044.530492,0,29074.8615
7,1006126025843610775,96,22.741054,40.5,5044.530492,0,29074.8615
8,1006132342130922053,116,36.332416,58.0,3627.787038,0,75106.4630
9,1006882255090464173,159,28.923078,67.5,2074.120490,0,11029.8270


## Pruebo con Gradient Boosting

In [47]:
df_y = auctions['conversion']
df_X = auctions.drop(['ref_hash','conversion'], axis=1)

GBC = GradientBoostingClassifier(random_state=23, n_estimators=50, min_samples_split=50)
scores = cross_val_score(GBC , df_X, df_y, scoring="roc_auc", cv=5)
scores.mean()

0.7330512372256826

## Pruebo con XGBoost

In [48]:
XGBC = xgb.XGBClassifier(learning_rate =0.075, n_estimators=95, max_depth=4, min_child_weight=6, 
                         gamma=0.3, subsample=0.8, colsample_bytree=0.8, objective= 'binary:logistic',
                         scale_pos_weight=0.8, seed = 15)

scores = cross_val_score(XGBC , df_X, df_y, scoring="roc_auc", cv=5)
scores.mean()

0.7473716132776052