# Trabajo Práctico N° 2
## Objetivo:

Para cada dispositivo presentado por Jampp, determinar el tiempo que transcurrirá hasta que el mismo aparezca nuevamente en una subasta, y el tiempo hasta que el usuario del mismo decida instalar una nueva aplicación.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import auc, accuracy_score
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, KFold, RandomizedSearchCV, train_test_split
import datetime as dt

## Primero veo los dispositivos del target

In [17]:
target = pd.read_csv('data/target_competencia_ids.csv')

In [18]:
target.head()

Unnamed: 0,ref_hash,obj
0,1000169251625791246_sc,0
1,1000169251625791246_st,0
2,1000395625957344683_sc,0
3,1000395625957344683_st,0
4,1003027494996471685_sc,0


In [19]:
target.size

16148

Tomo los ref_hash

In [20]:
target['ref_hash'] = target['ref_hash'].transform(lambda x: str(x)[:-3])

In [21]:
target['ref_hash'] = target['ref_hash'].astype(np.int64)
target.head()

Unnamed: 0,ref_hash,obj
0,1000169251625791246,0
1,1000169251625791246,0
2,1000395625957344683,0
3,1000395625957344683,0
4,1003027494996471685,0


In [22]:
target.drop_duplicates(subset = 'ref_hash', inplace = True)

In [23]:
target.count()

ref_hash    4037
obj         4037
dtype: int64

## Calculo cuanto tiempo tarda un dispositivo en aparecer en una subasta contando desde el inicio de la ventana 2

La idea es determinar el tiempo que transcurrió entre cada aparición de un dispositivo en una subasta, para luego tomar el tiempo mínimo de aparición de un dispositivo en una subasta y de acuerdo a eso predecir utilizando los features de la ventana anterior.

In [None]:
auct_predict = pd.read_csv('data/auctions_ventana2.csv', dtype = { "ref_type_id": np.int8, "source_id": np.int8})

In [None]:
auct_predict['date'] = pd.to_datetime(auct_predict['date'])
auct_predict['date_inicial'] = dt.datetime(2019,4, 19)
auct_predict['timeToAuction'] = (auct_predict['date'] - auct_predict['date_inicial'])/np.timedelta64(1,'s')

In [None]:
auct_predict['timeToAuction'] = auct_predict['timeToAuction'].transform(lambda x: (72 * 60 * 60) if (x < 0) else x)
auct_predict.head()

Tomo el tiempo mínimo, en SEGUNDOS, que tardó cada dispositivo en aparecer en una subasta

In [None]:
auction_time = auct_predict.groupby('device_id').agg({'timeToAuction': 'min'}).reset_index()
auction_time.columns = ['ref_hash', 'predict_value']
auction_time.head()

# Analizo los datos de la primer ventana

In [None]:
auct = pd.read_csv('data/auctions_ventana1.csv', dtype = { "ref_type_id": np.int8, "source_id": np.int8})

In [None]:
auct.dtypes

In [None]:
auct['date'] = pd.to_datetime(auct['date'])

In [None]:
auct.head()

## Veo cuantas veces aparece cada dispositivo en una subasta

Inicio sencillamente contando la cantidad de subastas en las que participó cada dispositivo, y lo agrego como un nuevo feature

In [None]:
auction_count = auct.groupby('device_id').agg({'date': 'count'}).reset_index()
auction_count.columns = ['ref_hash', 'auctions_count']

In [None]:
auction_count.head()

Creo un único set de datos con los primeros features creados usando los ids de los dispositivos de la ventana 2

In [None]:
data = auction_count.merge(auction_time, on = 'ref_hash')

In [None]:
data.head()

## Pruebo con Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

y = data['predict_value']
X = data.drop(['ref_hash', 'predict_value'], axis=1)

RFR = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)
RFR.fit(X, y)  

In [None]:
predictions = cross_val_predict(RFR, X, y, cv=10)
predictions

In [None]:
scores = cross_val_score(RFR, X, y, cv=5, scoring='neg_mean_squared_error')

## Calculo el RMSE

In [None]:
# convierto los valores a MSE scores
mse_scores = -scores
# paso de MSE a RMSE
rmse_scores = np.sqrt(mse_scores)

In [None]:
rmse_scores.mean()

## Pruebo con xgboost

In [14]:
import xgboost as xgb

In [None]:
y = data['predict_value']
X = data.drop(['ref_hash', 'predict_value'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

mse_scores = -scores
# paso de MSE a RMSE
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

## Veo la cantidad de veces que aparece cada dispositivo segun el source_id

Ahora me interesa ver la cantidad de veces que un dispositivo participó en una subasta desde cada tipo de fuente desde donde se produce la subasta. De esta manera los source_id más populares tomaran un valor mayor, luego tomo la desviación estándar de la cantidad para cada device_id

In [None]:
auct['apariciones'] = 1

In [None]:
auction_by_sourceID = auct.groupby(['device_id', 'source_id']).agg({'apariciones':'sum'}).unstack(1).fillna(0).reset_index()
auction_by_sourceID.columns = auction_by_sourceID.columns.droplevel(0)
auction_by_sourceID.columns = ['ref_hash', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
auction_by_sourceID.head()

In [None]:
auction_by_sourceID['auctions_by_srcID'] = auction_by_sourceID.iloc[:,1:].std(axis = 1)
auction_by_sourceID.head()

Agrego el nuevo feature a los datos

In [None]:
data1 = data.merge(auction_by_sourceID[['ref_hash', 'auctions_by_srcID']], on = 'ref_hash')


In [None]:
data1.head()

Vuelvo a probar el modelo con el nuevo feature

In [None]:
y = data1['predict_value']
X = data1.drop(['ref_hash', 'predict_value'], axis=1)

RFR = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)
RFR.fit(X, y)  

RFR.feature_importances_

In [None]:
cross_val_predict(RFR, X, y, cv=10)

In [None]:
scores = cross_val_score(RFR, X, y, cv=10, scoring='neg_mean_squared_error')

In [None]:
# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

Pruebo xgboost

In [None]:
y = data1['predict_value']
X = data1.drop(['ref_hash', 'predict_value'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

mse_scores = -scores
# paso de MSE a RMSE
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

El nuevo feature logró bajar el score promedio de RMSE, así que lo mantenemos en el dataset de features.

## Veo la cantidad de veces que aparece cada dispositivo en una subasta según el ref_type

In [None]:
auct['ref_type_id'].value_counts()

In [None]:
dfApRef = auct.groupby(['device_id', 'ref_type_id']).agg({'apariciones':'sum'}).unstack(1).fillna(0).reset_index()
dfApRef.columns = dfApRef.columns.droplevel(0)
dfApRef['median_count_ref_type'] = dfApRef.median(axis = 1)
dfApRef.head()

Me interesa ver si cada usuario sólo tiene apariciones para el mismo ref_type, lo chequeo para ver si me servirá o no el feature

In [None]:
dfApRef.columns = ['ref_hash', 'auctions_ref_type1', 'auctions_ref_type7', 'median_count_ref_type']
dfApRef.head()

In [None]:
dfApRef['aparece_en_distinto_ref_type'] = ((dfApRef['auctions_ref_type1'] > 0) & (dfApRef['auctions_ref_type7'] > 0))

In [None]:
dfApRef.head()

In [None]:
dfApRef['aparece_en_distinto_ref_type'].value_counts()

Como hay valores para ref_types distintos puedo probar agregar este feature y ver que sucede

In [None]:
data2 = data1.merge(dfApRef[['ref_hash', 'median_count_ref_type']], on = 'ref_hash')
data2.head()

In [None]:
y = data2['predict_value']
X = data2.drop(['ref_hash', 'predict_value'], axis=1)

RFR = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)
RFR.fit(X, y)  

RFR.feature_importances_

In [None]:
cross_val_predict(RFR, X, y, cv=10)

In [None]:
scores = cross_val_score(RFR, X, y, cv=5, scoring='neg_mean_squared_error')

In [None]:
# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

## XGBoost

In [None]:
y = data2['predict_value']
X = data2.drop(['ref_hash', 'predict_value'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

mse_scores = -scores
# paso de MSE a RMSE
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

El valor promedio del score volvió a subir, pero el feature importance según Random Forest indica que el feature es importante, además subió muy poco, así que lo mantendremos por ahora para ver qué sucede a medida que agrego más features.

## Cantidad de apariciones de un dispositivo en las subastas por día

Calculo la cantidad de apariciones para cada dispositivo en las subastas por día, luego tomo la mediana para las apariciones.

In [None]:
auct['fecha'] = auct['date'].dt.date

In [None]:
dfApDay = auct.groupby(['device_id', 'fecha']).agg({'apariciones':'sum'}).unstack(1).fillna(0).reset_index()
dfApDay.columns = dfApDay.columns.droplevel(0)
dfApDay.columns = ['ref_hash', '2019-04-18', '2019-04-19', '2019-04-20']
dfApDay['auctions_by_day'] = dfApDay.median(axis = 1)

In [None]:
dfApDay.head()

Agrego el nuevo feature

In [None]:
data3 = data1.merge(dfApDay[['ref_hash', 'auctions_by_day']], on = 'ref_hash')
data3.head()

Ahora puebo el modelo

In [None]:
y = data3['predict_value']
X = data3.drop(['ref_hash', 'predict_value'], axis=1)

RFR = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)
RFR.fit(X, y)  

RFR.feature_importances_

In [None]:
cross_val_predict(RFR, X, y, cv=10)

In [None]:
scores = cross_val_score(RFR, X, y, cv=5, scoring='neg_mean_squared_error')


In [None]:
# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

## Pruebo con Gradient Boosting Regressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

GBR = GradientBoostingRegressor(random_state=23, n_estimators=50, min_samples_split=50)
scores = cross_val_score(GBR , X, y, scoring = "neg_mean_squared_error", cv=5)


In [None]:
# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

## Ahora pruebo con XGBoost 

In [None]:
XGB = xgb.XGBRegressor()

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

In [None]:
# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

El score volvió a subir un poco con random forest, pero con xgboost bajó, así que voy a mantener este feature

## Agrego como feature el tiempo minimo de aparición de un dispositivo durante la ventana anterior a la ventana en la que voy a predecir.

La idea de este feature es estudiar el comportamiento de los dispositivos, sus tiempos de aparición y calcular el mínimo tiempo entre ellos para tener un estimativo del tiempo a predecir.


In [None]:
apariciones_auctions = auct[['date', 'device_id']]
apariciones_auctions.head()

In [None]:
apariciones_auctions['date'] = pd.to_datetime(apariciones_auctions['date'])
apariciones_auctions['date_inicial'] = dt.datetime(2019,4, 18)
apariciones_auctions['timeToAuction'] = (apariciones_auctions['date'] - apariciones_auctions['date_inicial'])/np.timedelta64(1,'s')
apariciones_auctions.head()

In [None]:
apariciones_auctions = apariciones_auctions.groupby('device_id').agg({'timeToAuction': 'min'}).reset_index()
apariciones_auctions.columns = ['ref_hash', 'timeToAuction_min']
apariciones_auctions.head()

Agrego el nuevo feature 

In [None]:
data4 = data3.merge(apariciones_auctions, on = 'ref_hash')

In [None]:
data4.head()

## Pruebo con XGBoost

In [None]:
y = data4['predict_value']
X = data4.drop(['ref_hash', 'predict_value'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

In [None]:
# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

Este feature mejoró muchísimo el score, lo mantengo

## Agrego Features sobre los eventos

In [None]:
evts = pd.read_csv("data/events_ventana1.csv", dtype = {"event_id": np.int16, "application_id": np.int16, 'device_countrycode': 'category', 'device_os_version': 'category', 'device_brand': 'category', 'device_model': 'category', 'device_city': 'category', 'session_user_agent': 'category', 'trans_id': 'category', 'user_agent': 'category', 'carrier' : 'category', 'kind': 'category', 'device_os': 'category', 'connection_type': 'category', 'ip_address': 'category', 'device_language': 'category'})

In [None]:
evts.head()

## Aplicaciones populares

Agrego Features tomando en cuenta cuales la popularidad de las applicaciones. La idea es generar un feature que asigne un número a cada dispositivo de acuerdo al id de la applicación. De esta manera podría inferirse que si una applicación es más popular que otra es entonces más probable que se genere un evento sobre la misma.

In [None]:
apps_populares = evts[['application_id', 'ref_hash']].groupby('application_id').count().reset_index()
apps_populares.columns = ['application_id', 'popularidad_app']
apps_populares.head()

In [None]:
apps_populares = evts[['ref_hash', 'application_id']].merge(apps_populares, on = 'application_id')
apps_populares.head()

In [None]:
apps_counts = evts[['date', 'ref_hash', 'application_id']].groupby(['ref_hash', 'application_id']).count().reset_index()
apps_counts.head()

In [None]:
# Le asigno a cada dispositivo la popularidad de la applicación en la cual generó más eventos
apps_pops = apps_counts.groupby(['ref_hash']).agg({'date': 'max'}).reset_index()
apps_pops.columns = ['ref_hash', 'popularidad_events']
apps_pops.head()

Agrego el nuevo feature a los datos

In [None]:
data5 = data4.merge(apps_pops, on = 'ref_hash', how = 'left')
data.head()

In [None]:
# A los dispositivos que no registran eventos, se le asignará una popularidad de valor 0
#data5['popularidad'] = data5['popularidad_events'].fillna(0.0)

## Pruebo el feature con XGBoost

In [None]:
y = data5['predict_value']
X = data5.drop(['ref_hash', 'predict_value'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

con este feature el score volvió a suir, lo descarto

## Agrego un feature tomando en cuenta la cantidad de eventos en los que participó cada dispositivo

In [None]:
count_events = evts.groupby('ref_hash').agg({'date': 'count'}).reset_index()
count_events.columns = ['ref_hash', 'count_events']
count_events.head()

Agrego el nuevo feature y veo qué sucede con el score

In [None]:
data6 = data4.merge(count_events, on = 'ref_hash', how = 'left')
#data['count_events'] = data['count_events'].fillna(0)

## Vuelvo a probar el modelo con XGBoost

In [None]:
y = data6['predict_value']
X = data6.drop(['ref_hash', 'predict_value'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

se puede ver que este feature tampoco es bueno, actualmente data4 es el set de features con mejor score

# Pruebo los features anteriores para predecir los tiempos de conversiones

In [2]:
inst = pd.read_csv('data/installs_v2.csv.gzip', dtype = {'application_id': np.int16, 'ref_type': 'category', 'click_hash': 'category', 'device_country_code': 'category', 'device_brand': 'category', 'device_model': 'category', 'kind': 'category', 'device_language': 'category'})

  interactivity=interactivity, compiler=compiler, result=result)


## Installs

Tomo los tiempos de la primera conversión de cada dispositivo dentro de la ventana 2 para entrenar el algoritmo luego

In [3]:
inst['created'] = pd.to_datetime(inst['created'])
inst.head()

Unnamed: 0,created,application_id,ref_type,ref_hash,click_hash,attributed,implicit,device_countrycode,device_brand,device_model,session_user_agent,user_agent,event_uuid,kind,wifi,trans_id,ip_address,device_language
0,2019-04-24 06:23:29.495,1,1494519392962156891,4716708407362582887,,False,True,6287817205707153877,,3.739127126472163e+17,adjust.com,,79837499-2f2a-4605-a663-e322f759424f,app_open,,,4243443387795468703,3.3013777759777e+18
1,2019-04-24 02:06:01.032,1,1494519392962156891,7143568733100935872,,False,False,6287817205707153877,,7.80553892759877e+18,adjust.com,,,,,,4724288679627032761,3.3013777759777e+18
2,2019-04-20 10:15:36.274,1,1494519392962156891,5230323462636548010,,False,True,6287817205707153877,,8.355495513718673e+18,adjust.com,,dda99e3c-9c4b-487d-891c-79f0a02cb4a8,app_open,,,8291809486355890410,4.060929664968129e+18
3,2019-04-20 21:56:47.151,1,1494519392962156891,5097163995161606833,,False,True,6287817205707153877,,2.3557720913769155e+18,adjust.com,,7010c3ce-0fcf-46c6-9be8-374cc0e20af4,app_open,,,4006811922873399949,3.3013777759777e+18
4,2019-04-20 22:40:41.239,1,1494519392962156891,6328027616411983332,,False,False,6287817205707153877,,6.156971151807135e+18,adjust.com,,,,,,3386455054590810771,3.3013777759777e+18


## Calculo el tiempo que tarda en convertir cada dispositivo

In [6]:
inst['created_inicial'] = dt.datetime(2019, 4, 19)
inst['created_inicial'] = pd.to_datetime(inst['created_inicial'])
inst['timeToInstall'] = (inst['created'] - inst['created_inicial'])/np.timedelta64(1,'s')
inst['timeToInstall'] = inst['timeToInstall'].transform(lambda x: x if (x >=  0) else (72 * 60 * 60))
inst.head()

Unnamed: 0,created,application_id,ref_type,ref_hash,click_hash,attributed,implicit,device_countrycode,device_brand,device_model,session_user_agent,user_agent,event_uuid,kind,wifi,trans_id,ip_address,device_language,created_inicial,timeToInstall
0,2019-04-24 06:23:29.495,1,1494519392962156891,4716708407362582887,,False,True,6287817205707153877,,3.739127126472163e+17,adjust.com,,79837499-2f2a-4605-a663-e322f759424f,app_open,,,4243443387795468703,3.3013777759777e+18,2019-04-25,259200.0
1,2019-04-24 02:06:01.032,1,1494519392962156891,7143568733100935872,,False,False,6287817205707153877,,7.80553892759877e+18,adjust.com,,,,,,4724288679627032761,3.3013777759777e+18,2019-04-25,259200.0
2,2019-04-20 10:15:36.274,1,1494519392962156891,5230323462636548010,,False,True,6287817205707153877,,8.355495513718673e+18,adjust.com,,dda99e3c-9c4b-487d-891c-79f0a02cb4a8,app_open,,,8291809486355890410,4.060929664968129e+18,2019-04-25,259200.0
3,2019-04-20 21:56:47.151,1,1494519392962156891,5097163995161606833,,False,True,6287817205707153877,,2.3557720913769155e+18,adjust.com,,7010c3ce-0fcf-46c6-9be8-374cc0e20af4,app_open,,,4006811922873399949,3.3013777759777e+18,2019-04-25,259200.0
4,2019-04-20 22:40:41.239,1,1494519392962156891,6328027616411983332,,False,False,6287817205707153877,,6.156971151807135e+18,adjust.com,,,,,,3386455054590810771,3.3013777759777e+18,2019-04-25,259200.0


Tomo el tiempo mínimo, en SEGUNDOS, que tardó un dispositivo en realizar una instalación

In [7]:
install_time = inst.groupby('ref_hash').agg({'timeToInstall': 'min'}).reset_index()
install_time.columns = ['ref_hash', 'predict_time_install']
install_time.head()

Unnamed: 0,ref_hash,predict_time_install
0,40621409780134,259200.0
1,41863526108385,259200.0
2,90072729247980,259200.0
3,135153013040192,259200.0
4,161514654074162,259200.0


## Ahora leo los datos de la ventana 1 para crear features sobre estos ids

In [None]:
installs = pd.read_csv('data/installs_ventana1.csv', dtype = {'application_id': np.int16, 'ref_type': 'category', 'click_hash': 'category', 'device_country_code': 'category', 'device_brand': 'category', 'device_model': 'category', 'kind': 'category', 'device_language': 'category'})
installs.head()

## Agrego un feature sobre la popularidad de las applicaciones

In [None]:
apps_populares_installs = installs.groupby('application_id').agg({'created': 'count'}).reset_index()
apps_populares_installs.columns = ['application_id', 'popularidad_app']
apps_populares_installs = installs[['ref_hash', 'application_id']].merge(apps_populares_installs, on = 'application_id')
apps_populares_installs.head()

In [None]:
apps_counts = installs.groupby(['ref_hash', 'application_id']).agg({'created': 'count'}).reset_index()
# Le asigno a cada dispositivo la popularidad de la applicación en la cual generó más eventos
apps_pops = apps_counts.groupby(['ref_hash']).agg({'created': 'max'}).reset_index()
apps_pops.columns = ['ref_hash', 'popularidad_apps']
apps_pops.head()

## Creo el set de datos par a entrenar los algoritmos

In [None]:
data_installs = apps_pops #install_time.merge(apps_pops, on = 'ref_hash') 
data_installs.head()

## Pruebo XGBoost para predecir

In [None]:
y = data_installs['predict_time_install']
X = data_installs.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

## Agrego como feature el tiempo real que tarda un dispositivo en realizar la primera conversión, dentro de la ventana 1

In [None]:
installs['created'] = pd.to_datetime(inst['created'])
installs['created_inicial'] = dt.datetime(2019, 4, 18)
installs['created_inicial'] = pd.to_datetime(installs['created_inicial'])
installs['timeToInstall'] = (installs['created'] - installs['created_inicial'])/np.timedelta64(1,'s')


In [None]:
#Ahora tomo el tiempo mínimo
time_to_install = installs.groupby('ref_hash').agg({'timeToInstall': 'min'}).reset_index()

In [None]:
time_to_install.head()

Agrego el  nuevo feature y vuelvo a probar el algoritmo

In [None]:
data_installs1 = data_installs.merge(time_to_install, on = 'ref_hash')
data_installs1.head()

In [None]:
y = data_installs1['predict_time_install']
X = data_installs1.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

Se mejoró el score, por lo que mantengo el feature

## ahora agrego los features sobre los eventos

In [None]:
new_data1 = data_installs1.merge(apps_pops, on = 'ref_hash', how = 'left')
new_data1.head()

Pruebo el nuevo feature con xgboost

In [None]:
y = new_data1['predict_time_install']
X = new_data1.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

El score se mantuvo igual, por lo que puedo sospechar que las apps populares en events son las mismas que en installs

Agrego un feature sobre los eventos registrados para cada dispositivo

In [None]:
new_data2 = data_installs1.merge(count_events, on = 'ref_hash', how = 'left')

In [None]:
#new_data['count_events'] = new_data['count_events'].fillna(0.0)
y = new_data2['predict_time_install']
X = new_data2.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

Se logró mejorar el score con este feature, así que lo mantengo

## Agrego los features sobre auctions

In [None]:
#agrego la mediana de subastas registradas por cada dispositivo según el ref_type_id
new_data3 = new_data2.merge(dfApRef[['ref_hash', 'median_count_ref_type']], on = 'ref_hash', how = 'left')

In [None]:
#new_data['median_count_ref_type'] = new_data['median_count_ref_type'].fillna(0.0)
y = new_data3['predict_time_install']
X = new_data3.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

El feature anterior empeoró las performance por lo que no se lo tomará en cuenta para las predicciones

In [None]:
#agrego la cantidad de subastas registradas para cada dispositivo
new_data4 = new_data2.merge(auction_count, on = 'ref_hash', how = 'left')
new_data4.head()

In [None]:
##new_data['auction_count'] = new_data['auctions_count'].fillna(0.0)
y = new_data4['predict_time_install']
X = new_data4.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

El feature anterior tampoco logró mejorar el score, aunque no lo empeoró, por ahora no lo tomo en cuenta

In [None]:
#Agrego el tiempo mínimo de aparición de cada dispositivo en una subasta
new_data5 = new_data2.merge(apariciones_auctions, on = 'ref_hash', how  = 'left')
new_data5.head()

In [None]:
#new_data[ 'auctions_by_srcID'] = new_data[ 'auctions_by_srcID'].fillna(0.0)
y = new_data5['predict_time_install']
X = new_data5.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

Se puede apreciar que con este nuevo feature se mejoró bastante el score, así que lo mantendremos entre los features para las predicciones

Agrego otro feature, tomando en cuenta el promedio de subastas, por día, en las que participa cada dispositivo

In [None]:
new_data6 = new_data5.merge(dfApDay[['ref_hash', 'auctions_by_day']], on = 'ref_hash', how  = 'left')
new_data6.head()

Ahora pruebo el modelo con los nuevos features

In [None]:
#new_data['timeToAuction_min'] = new_data['timeToAuction_min'].fillna(72 * 60 * 60)
y = new_data6['predict_time_install']
X = new_data6.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

Este feature hizo que el score empeore un poco, así que no lo mantendré en cuenta


## Agrego un nuevo feature tomando en cuenta el user_agent desde donde se origina la instalación

In [None]:
installs_by_user_agent = installs[['user_agent', 'ref_hash']].groupby('user_agent').count().reset_index()
installs_by_user_agent.columns = ['user_agent', 'installs_por_user_agent']
installs_by_user_agent = installs_by_user_agent.merge(installs[['ref_hash', 'user_agent']], on = 'user_agent')

In [None]:
installs_by_user_agent.head()

In [None]:
new_data7 = installs_by_user_agent[['installs_por_user_agent', 'ref_hash']].merge(new_data5, on = 'ref_hash')
new_data7.head()

Pruebo el nuevo feature

In [None]:
#new_data[ 'auctions_by_srcID'] = new_data[ 'auctions_by_srcID'].fillna(0.0)
y = new_data7['predict_time_install']
X = new_data7.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

Con el nuevo feature el score mejoró bastante así que lo mantengo, hasta ahora el mejor es new_data7

## Agrego un nuevo feature tomando en cuenta la session_user_agent

In [None]:
session_user_agent = installs[['ref_hash', 'session_user_agent']].groupby('session_user_agent').count().reset_index()
session_user_agent.head()

In [None]:
session_feature = installs[['ref_hash', 'session_user_agent']].merge(session_user_agent, on = 'session_user_agent', how = 'left')
session_feature = session_feature.drop(columns = 'session_user_agent')
session_feature.columns = ['ref_hash', 'session_user_agent']
session_feature.head()

Agrego el nuevo feature al set de features

In [None]:
new_data8 = new_data7.merge(session_feature, on = 'ref_hash')

Pruebo el nuevo feature

In [None]:
y = new_data8['predict_time_install']
X = new_data8.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

Este feature empeoró el score, así que lo descarto

# Guardo los features de cada ventana

## Features sobre st

In [None]:
data4 = data4.drop_duplicates(subset = 'ref_hash')
data4.columns = ['ref_hash', 'auctions_count_v7', 'auctions_by_srcID_v7', 'auctions_by_day_v7',
       'timeToAuction_min_v7']

In [None]:
data4.head()

Leo los features anteriores

In [None]:
features = pd.read_csv( 'data/st_features.csv')
features.head()

In [None]:
features = features.merge(data4, on = 'ref_hash', how = 'outer')
features.head()

In [None]:
features.count()

Guardo los features

In [None]:
features.to_csv(path_or_buf = 'data/st_features.csv', index = False)

## Creo las predicciones st con todos los ids

In [None]:
# Auction_time se calcula con auctions completo a partir del día 25
features = features.merge(auction_time, on = 'ref_hash')

In [None]:
data_final = features.merge(target, on = 'ref_hash').drop(columns = ['ref_hash', 'obj', 'predict_value'])

In [None]:
y = features['predict_value']
X = features.drop(['ref_hash', 'predict_value'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

In [None]:
XGB = xgb.XGBRegressor()
XGB.fit(X, y)
result = XGB.predict(data_final)

In [None]:
target.drop(columns = 'obj', inplace = True)
target.count()

In [None]:
submit = []
for l in result:
    submit.append(l)
target['obj'] = submit

In [None]:
target['ref_hash'] = target['ref_hash'].transform(lambda x: str(x) + '_st')
target.head()

In [None]:
target.to_csv(path_or_buf = "submit_st.csv", index = False)

## Features sobre sc

In [None]:
new_data7 = new_data7[['ref_hash', 'installs_por_user_agent', 'popularidad_apps', 'timeToInstall', 'count_events', 'timeToAuction_min']].drop_duplicates(subset = 'ref_hash')
new_data7.columns = ['ref_hash', 'installs_por_user_agent_v7', 'popularidad_apps_v7', 'timeToInstall_v7', 'count_events_v7', 'timeToAuction_min_v7']
new_data7.head()

Leo los features de las ventanas anteriores

In [None]:
features = pd.read_csv('data/features_sc.csv')
features.head()

In [None]:
features = features.merge(new_data7, on = 'ref_hash', how = 'outer')
features.head()

Guardo los features

In [None]:
features.to_csv(path_or_buf = "data/features_sc.csv", index = False)

## Creo las predicciones sc con todos los ids

In [11]:
features_sc = pd.read_csv("data/features_sc.csv")
features_sc.head()

Unnamed: 0,ref_hash,installs_por_user_agent_v1,popularidad_apps_v1,timeToInstall_v1,count_events_v1,timeToAuction_min_v1,installs_por_user_agent_v2,popularidad_apps_v2,timeToInstall_v2,count_events_v2,...,installs_por_user_agent_v6,popularidad_apps_v6,timeToInstall_v6,count_events_v6,timeToAuction_min_v6,installs_por_user_agent_v7,popularidad_apps_v7,timeToInstall_v7,count_events_v7,timeToAuction_min_v7
0,8670865579348815667,1.0,1.0,27171.476,8.0,,,,,,...,,,,,,,,,,
1,1184543608462124266,1.0,1.0,96638.075,1.0,,1.0,1.0,10238.075,1.0,...,,,,,,,,,,
2,5471994910141133099,1.0,1.0,93676.495,,93437.460651,2.0,1.0,7276.495,,...,,,,,,,,,,
3,6967960820179343958,1.0,1.0,11573.442,3.0,250954.285768,,,,,...,,,,,,,,,,
4,9178440513583202912,2.0,2.0,97135.604,4.0,,2.0,2.0,10735.604,4.0,...,,,,,,,,,,


In [12]:
# Install_time se calcula sobre el set completo de installs, desde el día 25
features_train = features_sc.merge(install_time, on = 'ref_hash')
features_train.head()

Unnamed: 0,ref_hash,installs_por_user_agent_v1,popularidad_apps_v1,timeToInstall_v1,count_events_v1,timeToAuction_min_v1,installs_por_user_agent_v2,popularidad_apps_v2,timeToInstall_v2,count_events_v2,...,popularidad_apps_v6,timeToInstall_v6,count_events_v6,timeToAuction_min_v6,installs_por_user_agent_v7,popularidad_apps_v7,timeToInstall_v7,count_events_v7,timeToAuction_min_v7,predict_time_install
0,8670865579348815667,1.0,1.0,27171.476,8.0,,,,,,...,,,,,,,,,,259200.0
1,1184543608462124266,1.0,1.0,96638.075,1.0,,1.0,1.0,10238.075,1.0,...,,,,,,,,,,259200.0
2,5471994910141133099,1.0,1.0,93676.495,,93437.460651,2.0,1.0,7276.495,,...,,,,,,,,,,259200.0
3,6967960820179343958,1.0,1.0,11573.442,3.0,250954.285768,,,,,...,,,,,,,,,,259200.0
4,9178440513583202912,2.0,2.0,97135.604,4.0,,2.0,2.0,10735.604,4.0,...,,,,,,,,,,259200.0


In [15]:
y = features_train['predict_time_install']
X = features_train.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




14021.673035846014

In [28]:
features_final = features_sc.merge(target, on = 'ref_hash')#.drop(columns = ['ref_hash', 'obj', 'predict_time_install'])
features_final.count()

ref_hash                      2758
installs_por_user_agent_v1     885
popularidad_apps_v1            885
timeToInstall_v1               885
count_events_v1                647
timeToAuction_min_v1           731
installs_por_user_agent_v2     949
popularidad_apps_v2            949
timeToInstall_v2               949
count_events_v2                711
timeToAuction_min_v2           780
installs_por_user_agent_v3     916
popularidad_apps_v3            916
timeToInstall_v3               916
count_events_v3                690
timeToAuction_min_v3           761
installs_por_user_agent_v4     913
popularidad_apps_v4            913
timeToInstall_v4               913
count_events_v4                684
timeToAuction_min_v4           766
installs_por_user_agent_v5     898
popularidad_apps_v5            898
timeToInstall_v5               898
count_events_v5                678
timeToAuction_min_v5           750
installs_por_user_agent_v6     939
popularidad_apps_v6            939
timeToInstall_v6    

In [None]:
XGB = xgb.XGBRegressor()
XGB.fit(X, y)
result = XGB.predict(feature_final)

In [None]:
target.drop(columns = 'obj', inplace = True)
target.count()

In [None]:
submit = []
for l in result:
    submit.append(l)
target['obj'] = submit

In [None]:
target['ref_hash'] = target['ref_hash'].transform(lambda x: str(x) + '_sc')
target.head()

In [None]:
target.to_csv(path_or_buf = "submit_sc.csv", index = False)

## Armo el submit final

In [4]:
st = pd.read_csv('submit_st.csv')
st.head()

Unnamed: 0,ref_hash,obj
0,1000169251625791246_st,42108.023438
1,1000395625957344683_st,259508.109375
2,1003027494996471685_st,84329.234375
3,1006670001679961544_st,7656.664551
4,1007573308966476713_st,67003.21875


In [7]:
sc = pd.read_csv('submit_sc.csv')
sc.head()

Unnamed: 0,ref_hash,obj
0,1000169251625791246_sc,259163.75
1,1000395625957344683_sc,257842.34375
2,1003027494996471685_sc,259163.75
3,1006670001679961544_sc,259163.75
4,1007573308966476713_sc,251596.453125


In [9]:
submit_final = st.merge(sc, on = 'ref_hash', how = 'outer')
submit_final = submit_final.sort_values(by = 'ref_hash')

In [10]:
submit_final.head()

Unnamed: 0,ref_hash,obj_x,obj_y
4037,1000169251625791246_sc,,259163.75
0,1000169251625791246_st,42108.023438,
4038,1000395625957344683_sc,,257842.34375
1,1000395625957344683_st,259508.109375,
4039,1003027494996471685_sc,,259163.75


In [13]:
submit_final['obj_x'] = submit_final['obj_x'].fillna(0)
submit_final['obj_y'] = submit_final['obj_y'].fillna(0)
submit_final['obj'] = submit_final['obj_x'] + submit_final['obj_y']
submit_final = submit_final[['ref_hash', 'obj']]
submit_final.head()

Unnamed: 0,ref_hash,obj
4037,1000169251625791246_sc,259163.75
0,1000169251625791246_st,42108.023438
4038,1000395625957344683_sc,257842.34375
1,1000395625957344683_st,259508.109375
4039,1003027494996471685_sc,259163.75


In [14]:
submit_final.to_csv(path_or_buf = 'submit_final.csv', index = False)

In [15]:
submit_final.count()

ref_hash    8074
obj         8074
dtype: int64