# Trabajo Práctico N° 2
## Objetivo:

Para cada dispositivo presentado por Jampp, determinar el tiempo que transcurrirá hasta que el mismo aparezca nuevamente en una subasta, y el tiempo hasta que el usuario del mismo decida instalar una nueva aplicación.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import auc, accuracy_score
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
import xgboost as xgb
import datetime as dt

## Analizo los datos de la primera ventana

In [2]:
# Load the auctions of the first window. Dtypes are fixed up front: device ids
# as categoricals and the two small id columns as 8-bit integers.
auct_dtypes = {
    "device_id": 'category',
    "ref_type_id": np.int8,
    "source_id": np.int8,
}
auct = pd.read_csv('data/auctions_ventana1.csv', dtype=auct_dtypes)

In [3]:
# Preview the first rows of the loaded auctions frame.
auct.head()

Unnamed: 0,date,device_id,ref_type_id,source_id
0,2019-04-20 23:57:27.912838,1109595589636746168,7,0
1,2019-04-20 23:57:28.381114,5896614299191635403,1,0
2,2019-04-20 23:57:28.515423,4172466725848941608,1,0
3,2019-04-20 23:57:28.700884,2616279795187318849,7,0
4,2019-04-20 23:57:28.868312,8034952072073026056,1,0


In [4]:
# Replace the 'date' column with its parsed datetime equivalent so the
# `.dt` accessor can be used on it later.
parsed_dates = pd.to_datetime(auct['date'])
auct['date'] = parsed_dates

In [5]:
# Preview the frame again after converting 'date'.
auct.head()

Unnamed: 0,date,device_id,ref_type_id,source_id
0,2019-04-20 23:57:27.912838,1109595589636746168,7,0
1,2019-04-20 23:57:28.381114,5896614299191635403,1,0
2,2019-04-20 23:57:28.515423,4172466725848941608,1,0
3,2019-04-20 23:57:28.700884,2616279795187318849,7,0
4,2019-04-20 23:57:28.868312,8034952072073026056,1,0


## Veo cuántas veces aparece cada dispositivo en una subasta

In [6]:
# Total number of auction rows per device (non-null 'date' entries),
# returned as a two-column frame: device_id, count_subs.
dfApT = (
    auct.groupby('device_id')['date']
    .count()
    .rename('count_subs')
    .reset_index()
)

In [7]:
# Preview the per-device auction counts.
dfApT.head()

Unnamed: 0,device_id,count_subs
0,1000061425870948777,8
1,1000503394293263005,343
2,1001008640113335510,24
3,1001123163431776865,68
4,1001144380199556647,78


## Veo la cantidad de veces que aparece cada dispositivo según el source_id

In [8]:
# Constant helper column: summing it inside a groupby yields the number of
# auction rows in each group.
auct['apariciones'] = 1

In [9]:
# Auction rows per (device, source): sum the helper column, then spread the
# source ids into columns, filling missing combinations with 0.
per_source = auct.groupby(['device_id', 'source_id']).agg({'apariciones': 'sum'})
dfA = per_source.unstack(level=1).fillna(0).reset_index()
# Drop the outer 'apariciones' level so only the source ids remain as labels.
dfA.columns = dfA.columns.droplevel(0)
dfA.head()

source_id,Unnamed: 1,0,1,2,3,4,5,6,7,8,9
0,1000061425870948777,1.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1000503394293263005,225.0,79.0,1.0,18.0,1.0,0.0,0.0,0.0,19.0,0.0
2,1001008640113335510,9.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1001123163431776865,63.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
4,1001144380199556647,46.0,15.0,1.0,0.0,1.0,0.0,1.0,0.0,14.0,0.0


In [10]:
# Label the first column 'device_id' and prefix every per-source column with
# 'count_src_'. The labels are derived from the source ids actually present in
# the frame instead of assuming exactly ids 0-9 in that order, so a window with
# a different set of sources is neither mislabelled nor rejected.
# Labels that already carry the prefix are kept, so re-running the cell is safe.
dfA.columns = ['device_id'] + [
    col if str(col).startswith('count_src_') else 'count_src_{}'.format(col)
    for col in dfA.columns[1:]
]

In [11]:
# Preview the per-source counts with their final column names.
dfA.head()

Unnamed: 0,device_id,count_src_0,count_src_1,count_src_2,count_src_3,count_src_4,count_src_5,count_src_6,count_src_7,count_src_8,count_src_9
0,1000061425870948777,1.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1000503394293263005,225.0,79.0,1.0,18.0,1.0,0.0,0.0,0.0,19.0,0.0
2,1001008640113335510,9.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1001123163431776865,63.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
4,1001144380199556647,46.0,15.0,1.0,0.0,1.0,0.0,1.0,0.0,14.0,0.0


## Veo la cantidad de veces que aparece cada dispositivo en una subasta según el ref_type

In [12]:
# Number of auction rows for each ref_type_id value.
auct['ref_type_id'].value_counts()

1    13313564
7     2331211
Name: ref_type_id, dtype: int64

In [13]:
# Auction rows per device split by ref_type_id: one row per device, one column
# per ref_type_id value, with 0 where a combination has no count.
dfApRef = auct.groupby(['device_id', 'ref_type_id']).agg({'apariciones':'sum'}).unstack(1).fillna(0).reset_index()
# Drop the outer 'apariciones' level so only the ref_type_id values remain.
dfApRef.columns = dfApRef.columns.droplevel(0)
# Build the labels from the ref_type_id values that are present rather than
# hard-coding them, so data with other ref types is labelled correctly.
dfApRef.columns = ['device_id'] + ['count_ref_{}'.format(ref) for ref in dfApRef.columns[1:]]
dfApRef.head()

Unnamed: 0,device_id,count_ref_1,count_ref_7
0,1000061425870948777,8.0,0.0
1,1000503394293263005,0.0,343.0
2,1001008640113335510,24.0,0.0
3,1001123163431776865,68.0,0.0
4,1001144380199556647,0.0,78.0


## Cantidad de apariciones de cada dispositivo por día

In [14]:
# Calendar day of each auction (time of day dropped); used as the grouping key
# for the per-day counts.
auct['fecha'] = auct['date'].dt.date

In [23]:
# Auction rows per device and calendar day: one row per device, one column per
# day, with 0 on days where no count exists.
dfApDay = auct.groupby(['device_id', 'fecha']).agg({'apariciones':'sum'}).unstack(1).fillna(0).reset_index()
# Drop the outer 'apariciones' level so only the dates remain as labels.
dfApDay.columns = dfApDay.columns.droplevel(0)
# Name each day column with its ISO date (YYYY-MM-DD), taken from the data
# itself instead of a hard-coded list tied to a single three-day window.
dfApDay.columns = ['device_id'] + [
    pd.Timestamp(day).strftime('%Y-%m-%d') for day in dfApDay.columns[1:]
]

In [24]:
# Preview the per-day counts.
dfApDay.head()

Unnamed: 0,device_id,2019-04-18,2019-04-19,2019-04-20
0,1000061425870948777,2.0,2.0,4.0
1,1000503394293263005,111.0,99.0,133.0
2,1001008640113335510,24.0,0.0,0.0
3,1001123163431776865,16.0,20.0,32.0
4,1001144380199556647,34.0,27.0,17.0


## Creo un solo dataframe con los datos estadísticos

In [21]:
# Combine the per-device tables on 'device_id': total counts, then per-source
# counts, then per-ref-type counts.
totals_and_sources = pd.merge(dfApT, dfA, on='device_id')
auctions = pd.merge(totals_and_sources, dfApRef, on='device_id')

In [25]:
# Append the per-day counts to the combined per-device table and display it.
auctions = pd.merge(auctions, dfApDay, on='device_id')
auctions

Unnamed: 0,device_id,count_subs,count_src_0,count_src_1,count_src_2,count_src_3,count_src_4,count_src_5,count_src_6,count_src_7,count_src_8,count_src_9,count_ref_1,count_ref_7,2019-04-18,2019-04-19,2019-04-20
0,1000061425870948777,8,1.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,2.0,2.0,4.0
1,1000503394293263005,343,225.0,79.0,1.0,18.0,1.0,0.0,0.0,0.0,19.0,0.0,0.0,343.0,111.0,99.0,133.0
2,1001008640113335510,24,9.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.0,0.0,24.0,0.0,0.0
3,1001123163431776865,68,63.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,68.0,0.0,16.0,20.0,32.0
4,1001144380199556647,78,46.0,15.0,1.0,0.0,1.0,0.0,1.0,0.0,14.0,0.0,0.0,78.0,34.0,27.0,17.0
5,1001510820906853249,72,44.0,14.0,1.0,0.0,0.0,0.0,5.0,0.0,8.0,0.0,72.0,0.0,38.0,21.0,13.0
6,1001650136929210538,1479,0.0,751.0,0.0,693.0,0.0,30.0,5.0,0.0,0.0,0.0,1479.0,0.0,462.0,687.0,330.0
7,1001967990146853539,18,4.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,0.0,18.0,0.0,0.0
8,1002156562376142390,48,1.0,27.0,0.0,0.0,2.0,18.0,0.0,0.0,0.0,0.0,48.0,0.0,9.0,0.0,39.0
9,1002405839250362430,6,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,1.0,2.0,3.0


## Clicks

In [17]:
# Load the clicks of the first window with explicit, compact dtypes.
# NOTE(review): the original key was spelled 'country_codde', which matches no
# sensible column name and is silently ignored by read_csv. It is corrected
# here on the assumption that the CSV column is 'country_code' — confirm
# against the file header.
# NOTE(review): float16 represents integers exactly only up to 2048; confirm
# that carrier_id and brand values stay within that range.
clks_dtypes = {
    'advertiser_id': np.int8,
    'action_id': np.float32,
    'source_id': np.int8,
    'country_code': 'category',
    'carrier_id': np.float16,
    'specs_brand': 'category',
    'brand': np.float16,
    'ref_type': 'category',
    'ref_hash': 'category',
}
clks = pd.read_csv("data/clks_ventana1.csv", dtype=clks_dtypes)

## Installs

In [18]:
# Load the installs of the first window with explicit, compact dtypes.
# The country column in this file is named 'device_countrycode' (no underscore
# before 'code'); the previous key 'device_country_code' matched no column, so
# the categorical dtype was never applied to it.
inst_dtypes = {
    'application_id': np.int16,
    'ref_type': 'category',
    'ref_hash': 'category',
    'click_hash': 'category',
    'device_countrycode': 'category',
    'device_brand': 'category',
    'device_model': 'category',
    'kind': 'category',
    'device_language': 'category',
}
inst = pd.read_csv('data/installs_ventana1.csv', dtype=inst_dtypes)

In [26]:
# Preview the first rows of the installs frame.
inst.head()

Unnamed: 0,created,application_id,ref_type,ref_hash,click_hash,attributed,implicit,device_countrycode,device_brand,device_model,session_user_agent,user_agent,event_uuid,kind,wifi,trans_id,ip_address,device_language
0,2019-04-24 06:23:29.495,1,1494519392962156891,4716708407362582887,,False,True,6287817205707153877,,3.739127126472163e+17,adjust.com,,79837499-2f2a-4605-a663-e322f759424f,app_open,,,4243443387795468703,3.3013777759777e+18
1,2019-04-24 02:06:01.032,1,1494519392962156891,7143568733100935872,,False,False,6287817205707153877,,7.80553892759877e+18,adjust.com,,,,,,4724288679627032761,3.3013777759777e+18
2,2019-04-20 10:15:36.274,1,1494519392962156891,5230323462636548010,,False,True,6287817205707153877,,8.355495513718673e+18,adjust.com,,dda99e3c-9c4b-487d-891c-79f0a02cb4a8,app_open,,,8291809486355890410,4.060929664968129e+18
3,2019-04-20 21:56:47.151,1,1494519392962156891,5097163995161606833,,False,True,6287817205707153877,,2.3557720913769155e+18,adjust.com,,7010c3ce-0fcf-46c6-9be8-374cc0e20af4,app_open,,,4006811922873399949,3.3013777759777e+18
4,2019-04-20 22:40:41.239,1,1494519392962156891,6328027616411983332,,False,False,6287817205707153877,,6.156971151807135e+18,adjust.com,,,,,,3386455054590810771,3.3013777759777e+18


## Agrego los device_id con installs

In [27]:
# Binary label: 1 where the install is flagged as attributed, 0 otherwise.
inst['conversion'] = (inst['attributed'] == True).astype('int64')
# Keep only the device hash and the label, renaming the hash to the key used
# by the auctions table.
df_inst_id = inst[['ref_hash', 'conversion']].rename(columns={'ref_hash': 'device_id'})
# Inner join: only devices that have install rows are kept.
# NOTE(review): a device with several install rows presumably yields several
# identical feature rows here — confirm this is intended before cross-validating.
auctions = pd.merge(auctions, df_inst_id, on='device_id')

In [29]:
# Class balance of the label after the merge.
auctions['conversion'].value_counts()

0    29849
1      108
Name: conversion, dtype: int64

In [33]:
# Preview the merged table, now including the 'conversion' label.
auctions.head()

Unnamed: 0,device_id,count_subs,count_src_0,count_src_1,count_src_2,count_src_3,count_src_4,count_src_5,count_src_6,count_src_7,count_src_8,count_src_9,count_ref_1,count_ref_7,2019-04-18,2019-04-19,2019-04-20,conversion
0,1002405839250362430,6,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,1.0,2.0,3.0,0
1,100609081247429593,77,0.0,53.0,0.0,0.0,0.0,0.0,5.0,19.0,0.0,0.0,0.0,77.0,45.0,20.0,12.0,0
2,1007262066891761130,202,110.0,83.0,0.0,6.0,0.0,0.0,0.0,3.0,0.0,0.0,202.0,0.0,4.0,12.0,186.0,0
3,1008745782458692845,150,49.0,78.0,0.0,18.0,3.0,0.0,0.0,1.0,1.0,0.0,150.0,0.0,121.0,7.0,22.0,0
4,1012281155022640104,169,112.0,46.0,0.0,0.0,0.0,0.0,2.0,0.0,9.0,0.0,169.0,0.0,88.0,67.0,14.0,0


## Pruebo con Gradient Boosting

In [34]:
# Target is the conversion flag; features are every other column except the id.
df_y = auctions['conversion']
df_X = auctions.drop(columns=['device_id', 'conversion'])

# Mean ROC AUC over 5 cross-validation folds for a gradient boosting classifier.
GBC = GradientBoostingClassifier(
    n_estimators=50,
    min_samples_split=50,
    random_state=23,
)
scores = cross_val_score(estimator=GBC, X=df_X, y=df_y, cv=5, scoring="roc_auc")
scores.mean()

0.6720899565088553

## Pruebo con XGBoost

In [35]:
# XGBoost classifier with hand-picked hyperparameters, scored with the mean
# ROC AUC over 5 cross-validation folds.
# `random_state` replaces the deprecated `seed` alias of the scikit-learn wrapper.
# NOTE(review): scale_pos_weight below 1 down-weights the positive class, which
# is the rare one in the recorded run (108 of 29957 rows) — confirm this is
# intended.
XGBC = xgb.XGBClassifier(
    learning_rate=0.075,
    n_estimators=95,
    max_depth=4,
    min_child_weight=6,
    gamma=0.3,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    scale_pos_weight=0.8,
    random_state=15,
)

scores = cross_val_score(XGBC, df_X, df_y, scoring="roc_auc", cv=5)
scores.mean()

0.750012167550963