# Trabajo Práctico N° 2
## Objetivo:

Para cada dispositivo presentado por Jampp, determinar el tiempo que transcurrirá hasta que el mismo aparezca nuevamente en una subasta, y el tiempo hasta que el usuario del mismo decida instalar una nueva aplicación.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import auc, accuracy_score
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, KFold, RandomizedSearchCV, train_test_split
import datetime as dt

## Primero veo los dispositivos del target

In [None]:
target = pd.read_csv('data/target_competencia_ids.csv')

In [None]:
target.head()

In [None]:
target.size

Tomo los ref_hash

In [None]:
# Drop the last three characters of every id (read as text), keeping the bare hash.
ref_hash_text = target['ref_hash'].map(lambda raw_id: str(raw_id)[:-3])
target['ref_hash'] = ref_hash_text

In [None]:
target['ref_hash'] = target['ref_hash'].astype(np.int64)
target.head()

In [None]:
target.drop_duplicates(subset = 'ref_hash', inplace = True)

In [None]:
target.count()

## Calculo cuanto tiempo tarda un dispositivo en aparecer en una subasta contando desde el inicio de la ventana que quiero predecir

La idea es determinar el tiempo que transcurrió entre cada aparición de un dispositivo en una subasta, para luego tomar el tiempo mínimo de aparición de un dispositivo en una subasta y de acuerdo a eso predecir utilizando los features de la ventana anterior.

In [None]:
auct_predict = pd.read_csv('data/auctions_ventana7.csv', dtype = { "ref_type_id": np.int8, "source_id": np.int8})

In [None]:
auct_predict['date'] = pd.to_datetime(auct_predict['date'])
auct_predict['date_inicial'] = dt.datetime(2019,4, 24)
auct_predict['timeToAuction'] = (auct_predict['date'] - auct_predict['date_inicial'])/np.timedelta64(1,'s')

In [None]:
# Offsets that fall before the reference date (negative seconds) are replaced
# by the 72-hour horizon expressed in seconds; everything else is left as is.
horizon_seconds = 72 * 60 * 60
elapsed_seconds = auct_predict['timeToAuction']
auct_predict['timeToAuction'] = elapsed_seconds.mask(elapsed_seconds < 0, horizon_seconds)
auct_predict.head()

Tomo el tiempo mínimo, en SEGUNDOS, que tardó cada dispositivo en aparecer en una subasta

In [None]:
# Smallest offset (in seconds) per device, exposed as ref_hash / predict_value.
auction_time = (
    auct_predict.groupby('device_id')['timeToAuction']
    .min()
    .reset_index()
    .rename(columns={'device_id': 'ref_hash', 'timeToAuction': 'predict_value'})
)
auction_time.head()

In [None]:
auction_time = target[['ref_hash']].merge(auction_time, on = 'ref_hash', how = 'left')
auction_time['predict_value'] = auction_time['predict_value'].fillna((72 * 60 * 60)) # Los que no aparecieron 

In [None]:
auction_time.merge(target[['ref_hash']]).nunique()

# Analizo los datos de la ventana anterior

In [None]:
auct = pd.read_csv('data/auctions_ventana7.csv', dtype = { "ref_type_id": np.int8, "source_id": np.int8})

In [None]:
auct.dtypes

In [None]:
auct['date'] = pd.to_datetime(auct['date'])

In [None]:
auct.head()

## Veo cuantas veces aparece cada dispositivo en una subasta

Inicio sencillamente contando la cantidad de subastas en las que participó cada dispositivo, y lo agrego como un nuevo feature

In [None]:
# Number of auction rows (non-null dates) per device, exposed as ref_hash / auctions_count.
auction_count = (
    auct.groupby('device_id')['date']
    .count()
    .reset_index()
    .rename(columns={'device_id': 'ref_hash', 'date': 'auctions_count'})
)

In [None]:
auction_count.head()

Creo un único set de datos con los primeros features creados usando los ids de los dispositivos de la ventana 2

In [None]:
data = auction_time.merge(auction_count, on = 'ref_hash', how = 'left')

In [None]:
data.head()

In [None]:
data.nunique()

## Pruebo con Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

# Target: seconds until the device's first auction. Features: every column
# except the device id and the target itself.
y = data['predict_value']
X = data.drop(['ref_hash', 'predict_value'], axis=1)

# Only the non-default hyperparameters are spelled out. The previous call also
# passed criterion='mse', max_features='auto' and min_impurity_split=None, which
# newer scikit-learn releases reject; leaving them at their defaults fits the
# same model (squared-error criterion, all features per split).
RFR = RandomForestRegressor(max_depth=2, n_estimators=100, random_state=0)
RFR.fit(X, y)
RFR.feature_importances_

In [None]:
predictions = cross_val_predict(RFR, X, y, cv=10)
predictions

In [None]:
scores = cross_val_score(RFR, X, y, cv=5, scoring='neg_mean_squared_error')

## Calculo el RMSE

In [None]:
# convierto los valores a MSE scores
mse_scores = -scores
# paso de MSE a RMSE
rmse_scores = np.sqrt(mse_scores)

In [None]:
rmse_scores.mean()

## Pruebo con xgboost

In [None]:
import xgboost as xgb

In [None]:
y = data['predict_value']
X = data.drop(['ref_hash', 'predict_value'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

mse_scores = -scores
# paso de MSE a RMSE
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

## Veo la cantidad de veces que aparece cada dispositivo segun el source_id

Ahora me interesa ver la cantidad de veces que un dispositivo participó en una subasta desde cada tipo de fuente desde donde se produce la subasta. De esta manera los source_id más populares tomaran un valor mayor, luego tomo la desviación estándar de la cantidad para cada device_id

In [None]:
auct['apariciones'] = 1

In [None]:
# One row per device, one column per source_id, holding how many auctions the
# device had from that source (0 when it never appeared there).
auction_by_sourceID = auct.groupby(['device_id', 'source_id']).agg({'apariciones':'sum'}).unstack(1).fillna(0).reset_index()
auction_by_sourceID.columns = auction_by_sourceID.columns.droplevel(0)
# Build the column names from the source ids actually present instead of
# assuming exactly ids 0..9: a fixed list raises on a different number of
# sources and silently mislabels columns when the ids are not contiguous.
source_ids = list(auction_by_sourceID.columns[1:])
auction_by_sourceID.columns = ['ref_hash'] + ['source_id{}'.format(source_id) for source_id in source_ids]
auction_by_sourceID.head()

In [None]:
auction_by_sourceID['auctions_by_srcID'] = auction_by_sourceID.iloc[:,1:].std(axis = 1)
auction_by_sourceID.head()

Agrego el nuevo feature a los datos

In [None]:
data1 = auction_time.merge(auction_by_sourceID, on = 'ref_hash', how = 'left')#data.merge(auction_by_sourceID[['ref_hash', 'auctions_by_srcID']], on = 'ref_hash')


In [None]:
data1.head()

In [None]:
data1.nunique()

Vuelvo a probar el modelo con el nuevo feature

In [None]:
# Target: seconds until the device's first auction. Features: every column
# except the device id and the target itself.
y = data1['predict_value']
X = data1.drop(['ref_hash', 'predict_value'], axis=1)

# Only the non-default hyperparameters are spelled out. The previous call also
# passed criterion='mse', max_features='auto' and min_impurity_split=None, which
# newer scikit-learn releases reject; leaving them at their defaults fits the
# same model (squared-error criterion, all features per split).
RFR = RandomForestRegressor(max_depth=2, n_estimators=100, random_state=0)
RFR.fit(X, y)

RFR.feature_importances_

In [None]:
cross_val_predict(RFR, X, y, cv=10)

In [None]:
scores = cross_val_score(RFR, X, y, cv=10, scoring='neg_mean_squared_error')

In [None]:
# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

Pruebo xgboost

In [None]:
y = data1['predict_value']
X = data1.drop(['ref_hash', 'predict_value'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

mse_scores = -scores
# paso de MSE a RMSE
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

El nuevo feature logró bajar el score promedio de RMSE, así que lo mantenemos en el dataset de features.

## Veo la cantidad de veces que aparece cada dispositivo en una subasta según el ref_type

In [None]:
auct['ref_type_id'].value_counts()

In [None]:
# One row per device, one column per ref_type_id, holding the number of
# auctions the device had for that ref_type (0 when none).
dfApRef = auct.groupby(['device_id', 'ref_type_id']).agg({'apariciones':'sum'}).unstack(1).fillna(0).reset_index()
dfApRef.columns = dfApRef.columns.droplevel(0)
# The first column is the device id, so it must be left out of the row-wise
# median; otherwise the id itself takes part in the statistic.
dfApRef['median_count_ref_type'] = dfApRef.iloc[:, 1:].median(axis = 1)
dfApRef.head()

Me interesa ver si cada usuario sólo tiene apariciones para el mismo ref_type, lo chequeo para ver si me servirá o no el feature

In [None]:
dfApRef.columns = ['ref_hash', 'auctions_ref_type1', 'auctions_ref_type7', 'median_count_ref_type']
dfApRef.head()

In [None]:
dfApRef['aparece_en_distinto_ref_type'] = ((dfApRef['auctions_ref_type1'] > 0) & (dfApRef['auctions_ref_type7'] > 0))

In [None]:
dfApRef.head()

In [None]:
dfApRef['aparece_en_distinto_ref_type'].value_counts()

Como hay valores para ref_types distintos puedo probar agregar este feature y ver que sucede

In [None]:
data2 = data1.merge(dfApRef[['ref_hash', 'median_count_ref_type']], on = 'ref_hash', how = 'left')
data2.head()

In [None]:
data2.nunique()

In [None]:
# Target: seconds until the device's first auction. Features: every column
# except the device id and the target itself.
y = data2['predict_value']
X = data2.drop(['ref_hash', 'predict_value'], axis=1)

# Only the non-default hyperparameters are spelled out. The previous call also
# passed criterion='mse', max_features='auto' and min_impurity_split=None, which
# newer scikit-learn releases reject; leaving them at their defaults fits the
# same model (squared-error criterion, all features per split).
RFR = RandomForestRegressor(max_depth=2, n_estimators=100, random_state=0)
RFR.fit(X, y)

RFR.feature_importances_

In [None]:
cross_val_predict(RFR, X, y, cv=10)

In [None]:
scores = cross_val_score(RFR, X, y, cv=5, scoring='neg_mean_squared_error')

In [None]:
# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

## XGBoost

In [None]:
y = data2['predict_value']
X = data2.drop(['ref_hash', 'predict_value'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

mse_scores = -scores
# paso de MSE a RMSE
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

El valor del score bajó así que mantenemos estos features

## Cantidad de apariciones de un dispositivo en las subastas por día

Calculo la cantidad de apariciones para cada dispositivo en las subastas por día, luego tomo la mediana para las apariciones.

In [None]:
auct['fecha'] = auct['date'].dt.date

In [None]:
# One row per device, one column per calendar day, holding the number of
# auctions the device had on that day (0 when none).
dfApDay = auct.groupby(['device_id', 'fecha']).agg({'apariciones':'sum'}).unstack(1).fillna(0).reset_index()
dfApDay.columns = dfApDay.columns.droplevel(0)
# Name the day columns after the dates actually present instead of assuming
# exactly three fixed days (a fixed list raises on any other number of days).
dfApDay.columns = ['ref_hash'] + [str(day) for day in dfApDay.columns[1:]]
# Median over the per-day counts only: the first column is the device id and
# must not take part in the statistic.
dfApDay['auctions_by_day'] = dfApDay.iloc[:, 1:].median(axis = 1)

In [None]:
dfApDay.head()

Agrego el nuevo feature

In [None]:
data3 = data1.merge(dfApDay[['ref_hash', 'auctions_by_day']], on = 'ref_hash', how = 'left')
data3.head()

In [None]:
data3.nunique()

Ahora pruebo el modelo

In [None]:
# Target: seconds until the device's first auction. Features: every column
# except the device id and the target itself.
y = data3['predict_value']
X = data3.drop(['ref_hash', 'predict_value'], axis=1)

# Only the non-default hyperparameters are spelled out. The previous call also
# passed criterion='mse', max_features='auto' and min_impurity_split=None, which
# newer scikit-learn releases reject; leaving them at their defaults fits the
# same model (squared-error criterion, all features per split).
RFR = RandomForestRegressor(max_depth=2, n_estimators=100, random_state=0)
RFR.fit(X, y)

RFR.feature_importances_

In [None]:
cross_val_predict(RFR, X, y, cv=10)

In [None]:
scores = cross_val_score(RFR, X, y, cv=5, scoring='neg_mean_squared_error')


In [None]:
# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

## Pruebo con Gradient Boosting Regressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

GBR = GradientBoostingRegressor(random_state=23, n_estimators=50, min_samples_split=50)
scores = cross_val_score(GBR , X, y, scoring = "neg_mean_squared_error", cv=5)


In [None]:
# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

## Ahora pruebo con XGBoost 

In [None]:
XGB = xgb.XGBRegressor()

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

In [None]:
# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

El score con xgboost bajó, así que voy a mantener este feature

## Agrego como feature el tiempo minimo de aparición de un dispositivo durante la ventana anterior a la ventana en la que voy a predecir.

La idea de este feature es estudiar el comportamiento de los dispositivos, sus tiempos de aparición y calcular el mínimo tiempo entre ellos para tener un estimativo del tiempo a predecir.


In [None]:
# Work on an independent copy: new columns are added to this frame afterwards,
# and assigning into a plain column slice of `auct` triggers pandas'
# SettingWithCopyWarning (the write may or may not reach the intended object).
apariciones_auctions = auct[['date', 'device_id']].copy()
apariciones_auctions.head()

In [None]:
# Offset, in seconds, of every auction relative to 2019-04-24 00:00
# (negative for auctions that happened before that date).
# NOTE(review): `auct` is loaded from the same CSV as `auct_predict` and the
# reference date is the same one used to build the target, so the per-device
# minimum of this column presumably mirrors the target before clipping —
# confirm that the features are meant to come from a different window.
apariciones_auctions['date'] = pd.to_datetime(apariciones_auctions['date'])
apariciones_auctions['date_inicial'] = dt.datetime(2019,4, 24)
apariciones_auctions['timeToAuction'] = (apariciones_auctions['date'] - apariciones_auctions['date_inicial'])/np.timedelta64(1,'s')
apariciones_auctions.head()

In [None]:
apariciones_auctions = apariciones_auctions.groupby('device_id').agg({'timeToAuction': 'min'}).reset_index()
apariciones_auctions.columns = ['ref_hash', 'timeToAuction_min']
apariciones_auctions.head()

Agrego el nuevo feature 

In [None]:
data4 = data3.merge(apariciones_auctions, on = 'ref_hash', how = 'left')

In [None]:
data4.head()

In [None]:
data4.nunique()

## Pruebo con XGBoost

In [None]:
y = data4['predict_value']
X = data4.drop(['ref_hash', 'predict_value'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

In [None]:
# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

Este feature mejoró muchísimo el score, lo mantengo

## Agrego Features sobre los eventos

In [None]:
evts = pd.read_csv("data/events_ventana7.csv", dtype = {"event_id": np.int16, "application_id": np.int16, 'device_countrycode': 'category', 'device_os_version': 'category', 'device_brand': 'category', 'device_model': 'category', 'device_city': 'category', 'session_user_agent': 'category', 'trans_id': 'category', 'user_agent': 'category', 'carrier' : 'category', 'kind': 'category', 'device_os': 'category', 'connection_type': 'category', 'ip_address': 'category', 'device_language': 'category'})

In [None]:
evts.head()

## Aplicaciones populares

Agrego features tomando en cuenta la popularidad de las aplicaciones. La idea es generar un feature que asigne un número a cada dispositivo de acuerdo al id de la aplicación.

In [None]:
apps_populares = evts[['application_id', 'ref_hash']].groupby('application_id').count().reset_index()
apps_populares.columns = ['application_id', 'popularidad_app']
apps_populares.head()

In [None]:
apps_populares = evts[['ref_hash', 'application_id']].merge(apps_populares, on = 'application_id')
apps_populares.head()

In [None]:
apps_counts = evts[['date', 'ref_hash', 'application_id']].groupby(['ref_hash', 'application_id']).count().reset_index()
apps_counts.head()

In [None]:
# For each device keep the largest per-application event count, i.e. the number
# of events it generated in the application where it was most active.
# NOTE(review): the application-level popularity computed in `apps_populares`
# is not used here, so this value is a per-device activity count rather than
# the popularity of an application — confirm which one was intended.
apps_pops = apps_counts.groupby(['ref_hash']).agg({'date': 'max'}).reset_index()
apps_pops.columns = ['ref_hash', 'popularidad_events']
apps_pops.head()

Agrego el nuevo feature a los datos

In [None]:
# Add the per-device events feature; devices without events get NaN (left join).
data5 = data4.merge(apps_pops, on = 'ref_hash', how = 'left')
# Preview the frame that was just built (the cell previously displayed `data`).
data5.head()

In [None]:
data5.nunique()

## Pruebo el feature con XGBoost

In [None]:
y = data5['predict_value']
X = data5.drop(['ref_hash', 'predict_value'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

con este feature el score bajó un poco así que lo mantengo

## Agrego un feature tomando en cuenta la cantidad de eventos en los que participó cada dispositivo

In [None]:
count_events = evts.groupby('ref_hash').agg({'date': 'count'}).reset_index()
count_events.columns = ['ref_hash', 'count_events']
count_events.head()

Agrego el nuevo feature y veo qué sucede con el score

In [None]:
data6 = data5.merge(count_events, on = 'ref_hash', how = 'left')
#data['count_events'] = data['count_events'].fillna(0)

In [None]:
data6.nunique()

## Vuelvo a probar el modelo con XGBoost

In [None]:
y = data6['predict_value']
X = data6.drop(['ref_hash', 'predict_value'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

se puede ver que este feature empeoró el score, así que lo descartamos

## Agrego el tiempo que tarda cada dispositivo en generar el primer evento

In [None]:
# Independent copy of the two needed columns: assigning new columns to a plain
# slice of `evts` triggers pandas' SettingWithCopyWarning.
events_time = evts[['date', 'ref_hash']].copy()
events_time['date'] = pd.to_datetime(events_time['date'])
# Offset, in seconds, of every event relative to 2019-04-24 00:00.
events_time['date_inicial'] = dt.datetime(2019,4, 24)
events_time['timeToEvent'] = (events_time['date'] - events_time['date_inicial'])/np.timedelta64(1,'s')
events_time.head()

In [None]:
events_time = events_time.groupby('ref_hash').agg({'timeToEvent': 'min'}).reset_index()
events_time.columns = ['ref_hash', 'timeToEvent_min']
events_time.head()

Agrego el nuevo feature

In [None]:
data7 = data5.merge(events_time, on = 'ref_hash', how = 'left')
data7.head()

In [None]:
data7.nunique()

Pruebo el modelo con el nuevo feature

In [None]:
y = data7['predict_value']
X = data7.drop(['ref_hash', 'predict_value'], axis=1)

XGB = xgb.XGBRegressor(learning_rate =0.075, n_estimators=95, max_depth=4, min_child_weight=6, 
                         gamma=0.3, subsample=0.8, colsample_bytree=0.8,
                         scale_pos_weight=0.8, seed = 15)
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

Con este feature el score volvió a subir, así que lo descartamos 

Busco los mejores hiperparámetros para xgboost utilizando una búsqueda en grilla (Grid Search)

In [None]:
XGB = xgb.XGBRegressor()

In [None]:
# Hyperparameter grid; GridSearchCV fits every combination exhaustively.
# - 'n_jobs' is deliberately not part of the grid: it only sets the number of
#   threads used by each model, so searching over it doubles the number of
#   fits without changing any score.
# - 'reg:squarederror' is the current name of the squared-error objective that
#   used to be exposed as 'reg:linear'.
parameters_for_testing = {'max_depth': [3, 4],
                          'learning_rate': [0.1,0.2],
                          'n_estimators': [50, 100], 
                          'objective': ['reg:squarederror'],
                          'gamma': [0,0.2], 
                          'min_child_weight': [0.5,1],
                          'subsample': [0.9, 1], 
                          'colsample_bytree': [0.9,1],
                          'reg_alpha': [0,0.1], 
                          'reg_lambda': [0.9, 1]}

                    
xgb_model = xgb.XGBRegressor()

# The `iid` argument is no longer passed: it was removed from GridSearchCV and
# raises a TypeError on current scikit-learn releases.
gsearch1 = GridSearchCV(estimator = xgb_model, param_grid = parameters_for_testing, n_jobs=6, verbose=10,scoring='neg_mean_squared_error')
gsearch1.fit(X,y)

In [None]:
print('best params')
print (gsearch1.best_params_)
print('best score')
print (gsearch1.best_score_)


In [None]:
score = -gsearch1.best_score_
rmse = np.sqrt(score)
rmse

# Pruebo los features anteriores para predecir los tiempos de conversiones

In [None]:
inst = pd.read_csv('data/installs_ventana7.csv', dtype = {'application_id': np.int16, 'ref_type': 'category', 'click_hash': 'category', 'device_country_code': 'category', 'device_brand': 'category', 'device_model': 'category', 'kind': 'category', 'device_language': 'category'})

## Installs

Tomo los tiempos de la primera conversión de cada dispositivo dentro de la ventana 2 para entrenar el algoritmo luego

In [None]:
inst['created'] = pd.to_datetime(inst['created'])
inst.head()

## Calculo el tiempo que tarda en convertir cada dispositivo

In [None]:
inst['created_inicial'] = dt.datetime(2019, 4, 24)
inst['created_inicial'] = pd.to_datetime(inst['created_inicial'])
inst['timeToInstall'] = (inst['created'] - inst['created_inicial'])/np.timedelta64(1,'s')
inst['timeToInstall'] = inst['timeToInstall'].transform(lambda x: x if (x >=  0) else (72 * 60 * 60))
inst.head()

Tomo el tiempo mínimo, en SEGUNDOS, que tardó un dispositivo en realizar una instalación

In [None]:
install_time = inst.groupby('ref_hash').agg({'timeToInstall': 'min'}).reset_index()
install_time.columns = ['ref_hash', 'predict_time_install']
install_time.head()

In [None]:
install_time = target[['ref_hash']].merge(install_time, on = 'ref_hash', how = 'left')
install_time['predict_time_install'] = install_time['predict_time_install'].fillna(72 * 60 * 60)

In [None]:
install_time.nunique()

## Ahora leo los datos de la ventana  para crear features sobre estos ids

In [None]:
installs = pd.read_csv('data/installs_ventana7.csv', dtype = {'application_id': np.int16, 'ref_type': 'category', 'click_hash': 'category', 'device_country_code': 'category', 'device_brand': 'category', 'device_model': 'category', 'kind': 'category', 'device_language': 'category'})
installs.head()

## Agrego un feature sobre la popularidad de las aplicaciones

In [None]:
apps_populares_installs = installs.groupby('application_id').agg({'created': 'count'}).reset_index()
apps_populares_installs.columns = ['application_id', 'popularidad_app']
apps_populares_installs = installs[['ref_hash', 'application_id']].merge(apps_populares_installs, on = 'application_id')
apps_populares_installs.head()

In [None]:
# Number of installs per (device, application) pair.
# NOTE(review): this rebinds `apps_counts` and `apps_pops`, which earlier held
# the events-based versions of these tables; any later use of `apps_pops`
# therefore gets the installs-based feature.
apps_counts = installs.groupby(['ref_hash', 'application_id']).agg({'created': 'count'}).reset_index()
# For each device keep the largest per-application install count. The
# application-level popularity in `apps_populares_installs` is not used here.
apps_pops = apps_counts.groupby(['ref_hash']).agg({'created': 'max'}).reset_index()
apps_pops.columns = ['ref_hash', 'popularidad_apps']
apps_pops.head()

## Creo el set de datos para entrenar los algoritmos

In [None]:
data_installs = install_time.merge(apps_pops, on = 'ref_hash', how = 'left') 
data_installs.head()

In [None]:
data_installs.nunique()

## Pruebo XGBoost para predecir

In [None]:
y = data_installs['predict_time_install']
X = data_installs.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor( max_depth=3, learning_rate=0.1, n_estimators=100,
                       verbosity=1, silent=None, objective='reg:linear', n_jobs=1, gamma=0,
                       min_child_weight=1,  max_delta_step=0, reg_alpha=0, reg_lambda=1, 
                       scale_pos_weight=1, base_score=0.5, random_state=0, importance_type='gain')
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

## Agrego como feature el tiempo real que tarda un dispositivo en realizar la primera conversión, dentro de la ventana 1

In [None]:
# Parse the timestamps of this frame itself; the previous version read them
# from the separate `inst` frame and relied on both having the same row order.
installs['created'] = pd.to_datetime(installs['created'])
# Offset, in seconds, of every install relative to 2019-04-24 00:00.
installs['created_inicial'] = pd.to_datetime(dt.datetime(2019, 4, 24))
installs['timeToInstall'] = (installs['created'] - installs['created_inicial'])/np.timedelta64(1,'s')


In [None]:
#Ahora tomo el tiempo mínimo
time_to_install = installs.groupby('ref_hash').agg({'timeToInstall': 'min'}).reset_index()

In [None]:
time_to_install.head()

Agrego el  nuevo feature y vuelvo a probar el algoritmo

In [None]:
data_installs1 = data_installs.merge(time_to_install, on = 'ref_hash', how = 'left')
data_installs1.head()

In [None]:
data_installs1.nunique()

In [None]:
y = data_installs1['predict_time_install']
X = data_installs1.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor( max_depth=3, learning_rate=0.1, n_estimators=100,
                       verbosity=1, silent=None, objective='reg:linear', n_jobs=1, gamma=0,
                       min_child_weight=1,  max_delta_step=0, reg_alpha=0, reg_lambda=1, 
                       scale_pos_weight=1, base_score=0.5, random_state=0, importance_type='gain')

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

Se mejoró el score, por lo que mantengo el feature

## ahora agrego los features sobre los eventos

In [None]:
# `apps_pops` was rebound to the installs-based feature (already present in
# data_installs1 as 'popularidad_apps'), so merging it again only duplicates
# that column. Rebuild the events-based feature explicitly: for every device,
# the largest number of events it generated in a single application.
events_per_app = evts.groupby(['ref_hash', 'application_id']).agg({'date': 'count'}).reset_index()
apps_pops_events = events_per_app.groupby('ref_hash').agg({'date': 'max'}).reset_index()
apps_pops_events.columns = ['ref_hash', 'popularidad_events']
new_data1 = data_installs1.merge(apps_pops_events, on = 'ref_hash', how = 'left')
new_data1.head()

In [None]:
new_data1.nunique()

Pruebo el nuevo feature con xgboost

In [None]:
y = new_data1['predict_time_install']
X = new_data1.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

El score se mantuvo igual, por lo que puedo sospechar que las apps populares en events son las mismas que en installs

Agrego un feature sobre los eventos registrados para cada dispositivo

In [None]:
new_data2 = data_installs1.merge(count_events, on = 'ref_hash', how = 'left')

In [None]:
new_data2.nunique()

In [None]:
#new_data['count_events'] = new_data['count_events'].fillna(0.0)
y = new_data2['predict_time_install']
X = new_data2.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor( max_depth=3, learning_rate=0.1, n_estimators=100,
                       verbosity=1, silent=None, objective='reg:linear', n_jobs=1, gamma=0,
                       min_child_weight=1,  max_delta_step=0, reg_alpha=0, reg_lambda=1, 
                       scale_pos_weight=1, base_score=0.5, random_state=0, importance_type='gain')

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

Se logró mejorar el score con este feature, así que lo mantengo

## Agrego los features sobre auctions

In [None]:
#agrego la mediana de subastas registradas por cada dispositivo según el ref_type_id
new_data3 = new_data2.merge(dfApRef[['ref_hash', 'median_count_ref_type']], on = 'ref_hash', how = 'left')

In [None]:
new_data3.nunique()

In [None]:
#new_data['median_count_ref_type'] = new_data['median_count_ref_type'].fillna(0.0)
y = new_data3['predict_time_install']
X = new_data3.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

El feature anterior empeoró las performance por lo que no se lo tomará en cuenta para las predicciones

In [None]:
#agrego la cantidad de subastas registradas para cada dispositivo
new_data4 = new_data2.merge(auction_count, on = 'ref_hash', how = 'left')
new_data4.head()

In [None]:
new_data4.nunique()

In [None]:
##new_data['auction_count'] = new_data['auctions_count'].fillna(0.0)
y = new_data4['predict_time_install']
X = new_data4.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

El feature anterior tampoco logró mejorar el score, aunque no lo empeoró, por ahora no lo tomo en cuenta

In [None]:
#Agrego el tiempo mínimo de aparición de cada dispositivo en una subasta
new_data5 = new_data2.merge(apariciones_auctions, on = 'ref_hash', how  = 'left')
new_data5.head()

In [None]:
new_data5.nunique()

In [None]:
#new_data[ 'auctions_by_srcID'] = new_data[ 'auctions_by_srcID'].fillna(0.0)
y = new_data5['predict_time_install']
X = new_data5.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

Se puede apreciar que con este nuevo feature se mejoró bastante el score, así que lo mantendremos entre los features para las predicciones

Agrego otro feature, tomando en cuenta el promedio de subastas, por día, en las que participa cada dispositivo

In [None]:
new_data6 = new_data5.merge(dfApDay[['ref_hash', 'auctions_by_day']], on = 'ref_hash', how  = 'left')
new_data6.head()

In [None]:
new_data6.nunique()

Ahora pruebo el modelo con los nuevos features

In [None]:
#new_data['timeToAuction_min'] = new_data['timeToAuction_min'].fillna(72 * 60 * 60)
y = new_data6['predict_time_install']
X = new_data6.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

Este feature hizo que el score empeore un poco, así que no lo mantendré en cuenta


## Agrego un nuevo feature tomando en cuenta el user_agent desde donde se origina la instalación

In [None]:
# Number of installs registered under each user_agent.
installs_by_user_agent = installs[['user_agent', 'ref_hash']].groupby('user_agent').count().reset_index()
installs_by_user_agent.columns = ['user_agent', 'installs_por_user_agent']
# Attach that count to every install row ...
installs_by_user_agent = installs_by_user_agent.merge(installs[['ref_hash', 'user_agent']], on = 'user_agent')
# ... and collapse to a single row per device (largest count among its user
# agents). Without this step a device with several installs contributes
# several rows, and the later left merge on ref_hash duplicates that device in
# the training set, so the frame no longer lines up one-to-one with the ids.
installs_by_user_agent = installs_by_user_agent.groupby('ref_hash', as_index=False)['installs_por_user_agent'].max()

In [None]:
installs_by_user_agent.head()

In [None]:
new_data7 = new_data5.merge(installs_by_user_agent[['installs_por_user_agent', 'ref_hash']], on = 'ref_hash', how = 'left')
new_data7.head()

In [None]:
new_data7.merge(target[['ref_hash']]).nunique()

Pruebo el nuevo feature

In [None]:
#new_data[ 'auctions_by_srcID'] = new_data[ 'auctions_by_srcID'].fillna(0.0)
y = new_data7['predict_time_install']
X = new_data7.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

Con el nuevo feature el score mejoró bastante así que lo mantengo, hasta ahora el mejor es new_data7

## Agrego un nuevo feature tomando en cuenta la session_user_agent

In [None]:
session_user_agent = installs[['ref_hash', 'session_user_agent']].groupby('session_user_agent').count().reset_index()
session_user_agent.head()

In [None]:
session_feature = installs[['ref_hash', 'session_user_agent']].merge(session_user_agent, on = 'session_user_agent', how = 'left')
session_feature = session_feature.drop(columns = 'session_user_agent')
session_feature.columns = ['ref_hash', 'session_user_agent']
session_feature.head()

Agrego el nuevo feature al set de features

In [None]:
new_data8 = new_data7.merge(session_feature, on = 'ref_hash', how = 'left')

In [None]:
new_data8.nunique()

Pruebo el nuevo feature

In [None]:
y = new_data8['predict_time_install']
X = new_data8.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

Este feature empeoró el score, así que lo descarto

# Guardo los features de cada ventana

## Features sobre st

In [None]:
#data7.to_csv(path_or_buf = 'features_anteriores_St.csv', index = False)
#Leo los datos de la ventana anterior
features = pd.read_csv('features_anteriores_St.csv')
features['predict_value'] = features['predict_value'].fillna(72 * 60 * 60)
features.head()

In [None]:
features = features.merge(target[['ref_hash']])
y_train = features['predict_value']
X_train = features.drop(['ref_hash', 'predict_value'], axis=1)

XGB = xgb.XGBRegressor( max_depth=3, learning_rate=0.1, n_estimators=100,
                       verbosity=1, silent=None, objective='reg:linear', n_jobs=1, gamma=0,
                       min_child_weight=1,  max_delta_step=0, reg_alpha=0, reg_lambda=1, 
                       scale_pos_weight=1, base_score=0.5, random_state=0, importance_type='gain')

scores = cross_val_score(XGB , X_train, y_train, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

In [None]:
data_final = data7.merge(target[['ref_hash']], on = 'ref_hash')
data_final = data_final.drop(columns = ['ref_hash', 'predict_value'])

In [None]:
XGB = xgb.XGBRegressor( max_depth=3, learning_rate=0.1, n_estimators=100,
                       verbosity=1, silent=None, objective='reg:linear', n_jobs=1, gamma=0,
                       min_child_weight=1,  max_delta_step=0, reg_alpha=0, reg_lambda=1, 
                       scale_pos_weight=1, base_score=0.5, random_state=0, importance_type='gain')
XGB.fit(X_train, y_train)
result = XGB.predict(data_final)

In [None]:
target_st = target.drop(columns = 'obj')
target_st.head()

In [None]:
len(result)

In [None]:
# Materialise the predictions as a plain list and store them as the 'obj' column.
submit = list(result)
target_st['obj'] = submit

In [None]:
target_st['ref_hash'] = target_st['ref_hash'].transform(lambda x: str(x) + '_st')
target_st.head()

In [None]:
# Persist the frame that carries the predictions and the '_st' ids; the
# previous version wrote the untouched `target` frame instead.
target_st.to_csv(path_or_buf = "submit_st.csv", index = False)

## Creo las predicciones sc con todos los ids

In [None]:
#new_data7.to_csv(path_or_buf = 'features_anteriores_Sc.csv', index = False)
# Cargo los features de la ventana anterior
features_train = pd.read_csv('features_anteriores_Sc.csv')
features_train.head()

In [None]:
y_train = features_train['predict_time_install']
X_train = features_train.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor( max_depth=3, learning_rate=0.1, n_estimators=100,
                       verbosity=1, silent=None, objective='reg:linear', n_jobs=1, gamma=0,
                       min_child_weight=1,  max_delta_step=0, reg_alpha=0, reg_lambda=1, 
                       scale_pos_weight=1, base_score=0.5, random_state=0, importance_type='gain')

scores = cross_val_score(XGB , X_train, y_train, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

In [None]:
X_train.head()

In [None]:
feature_final = new_data7.drop(columns = ['ref_hash','predict_time_install'])
feature_final.head()

In [None]:
XGB = xgb.XGBRegressor()
XGB.fit(X_train, y_train)
result2 = XGB.predict(feature_final)

In [None]:
# Ids without the placeholder 'obj' column; the predictions are attached afterwards.
target_sc = target.drop(columns = 'obj')
# Preview the frame that was just built (the cell previously displayed `target`).
target_sc.head()

In [None]:
# Store the install-time predictions (`result2`) as the 'obj' column. The
# previous version iterated over `result`, i.e. the auction-time predictions.
submit = list(result2)
target_sc['obj'] = submit

In [None]:
# Append the '_sc' suffix to every id. The previous expression,
# str(x)[ + '_sc'], applied unary plus to a string and raised a TypeError.
target_sc['ref_hash'] = target_sc['ref_hash'].transform(lambda x: str(x) + '_sc')
target_sc.head()

In [None]:
# Persist the frame that carries the install-time predictions; the previous
# version wrote the untouched `target` frame instead.
target_sc.to_csv(path_or_buf = "submit_sc.csv", index = False)

## Armo el submit final

In [None]:
st = pd.read_csv('submit_st.csv')
st.head()

In [None]:
sc = pd.read_csv('submit_sc.csv')
sc.head()

In [None]:
# Outer merge on the id keeps the rows of both frames; since both carry an
# 'obj' column, the merged frame exposes them as 'obj_x' (from target_st) and
# 'obj_y' (from target_sc). Rows are then ordered by id.
submit_final = target_st.merge(target_sc, on = 'ref_hash', how = 'outer')
submit_final = submit_final.sort_values(by = 'ref_hash')

In [None]:
submit_final.head()

In [None]:
submit_final['obj_x'] = submit_final['obj_x'].fillna(0)
submit_final['obj_y'] = submit_final['obj_y'].fillna(0)
submit_final['obj'] = submit_final['obj_x'] + submit_final['obj_y']
submit_final = submit_final[['ref_hash', 'obj']]
submit_final.head()

In [None]:
submit_final.to_csv(path_or_buf = 'submit_final2.csv', index = False)

In [None]:
submit_final.count()