# Trabajo Práctico N° 2
## Objetivo:

Para cada dispositivo presentado por Jampp, determinar el tiempo que transcurrirá hasta que el mismo aparezca nuevamente en una subasta, y el tiempo hasta que el usuario del mismo decida instalar una nueva aplicación.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import auc, accuracy_score
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, KFold, RandomizedSearchCV, train_test_split
import datetime as dt
import xgboost as xgb

## Primero veo los dispositivos del target

In [2]:
target = pd.read_csv('data/target_competencia_ids.csv')

In [3]:
target.head()

Unnamed: 0,ref_hash,obj
0,1000169251625791246_sc,0
1,1000169251625791246_st,0
2,1000395625957344683_sc,0
3,1000395625957344683_st,0
4,1003027494996471685_sc,0


In [4]:
target.size

16148

Tomo los ref_hash

In [5]:
target['ref_hash'] = target['ref_hash'].transform(lambda x: str(x)[:-3])

In [6]:
target['ref_hash'] = target['ref_hash'].astype(np.int64)
target.head()

Unnamed: 0,ref_hash,obj
0,1000169251625791246,0
1,1000169251625791246,0
2,1000395625957344683,0
3,1000395625957344683,0
4,1003027494996471685,0


In [7]:
target.drop_duplicates(subset = 'ref_hash', inplace = True)

In [8]:
target.count()

ref_hash    4037
obj         4037
dtype: int64

## Calculo cuanto tiempo tarda un dispositivo en aparecer en una subasta contando desde el inicio de la ventana que quiero predecir

La idea es determinar el tiempo que transcurrió entre cada aparición de un dispositivo en una subasta, para luego tomar el tiempo mínimo de aparición de un dispositivo en una subasta y de acuerdo a eso predecir utilizando los features de la ventana anterior.

In [9]:
auct_predict = pd.read_csv('data/auctions_ventana7.csv', dtype = { "ref_type_id": np.int8, "source_id": np.int8})

In [10]:
auct_predict['date'] = pd.to_datetime(auct_predict['date'])
auct_predict['date_inicial'] = dt.datetime(2019,4, 24)
auct_predict['timeToAuction'] = (auct_predict['date'] - auct_predict['date_inicial'])/np.timedelta64(1,'s')

In [11]:
auct_predict['timeToAuction'] = auct_predict['timeToAuction'].transform(lambda x: (72 * 60 * 60) if (x < 0) else x)
auct_predict.head()

Unnamed: 0,date,device_id,ref_type_id,source_id,date_inicial,timeToAuction
0,2019-04-26 23:52:29.135354,1384623003476985820,1,7,2019-04-24,258749.135354
1,2019-04-26 23:52:39.367477,3714738743084512188,1,7,2019-04-24,258759.367477
2,2019-04-26 23:52:54.714361,5697386557321863111,1,7,2019-04-24,258774.714361
3,2019-04-26 23:53:13.729835,5583037045722622336,1,7,2019-04-24,258793.729835
4,2019-04-26 23:53:48.577115,6383034009915294411,1,7,2019-04-24,258828.577115


Tomo el tiempo mínimo, en SEGUNDOS, que tardó cada dispositivo en aparecer en una subasta

In [12]:
auction_time = auct_predict.groupby('device_id').agg({'timeToAuction': 'min'}).reset_index()
auction_time.columns = ['ref_hash', 'predict_value']
auction_time.head()

Unnamed: 0,ref_hash,predict_value
0,69039685746313,126258.597103
1,345999128501141,250362.048531
2,360710529886978,46687.256609
3,365882020742330,96627.763854
4,416301579449694,12569.446617


In [13]:
auction_time = target[['ref_hash']].merge(auction_time, on = 'ref_hash', how = 'left')
auction_time['predict_value'] = auction_time['predict_value'].fillna((72 * 60 * 60)) # Los que no aparecieron 

In [14]:
auction_time.merge(target[['ref_hash']]).nunique()

ref_hash         4037
predict_value    3352
dtype: int64

# Analizo los datos de la ventana anterior

In [15]:
auct = pd.read_csv('data/auctions_ventana7.csv', dtype = { "ref_type_id": np.int8, "source_id": np.int8})

In [16]:
auct.dtypes

date           object
device_id       int64
ref_type_id      int8
source_id        int8
dtype: object

In [17]:
auct['date'] = pd.to_datetime(auct['date'])

In [18]:
auct.head()

Unnamed: 0,date,device_id,ref_type_id,source_id
0,2019-04-26 23:52:29.135354,1384623003476985820,1,7
1,2019-04-26 23:52:39.367477,3714738743084512188,1,7
2,2019-04-26 23:52:54.714361,5697386557321863111,1,7
3,2019-04-26 23:53:13.729835,5583037045722622336,1,7
4,2019-04-26 23:53:48.577115,6383034009915294411,1,7


## Veo cuantas veces aparece cada dispositivo en una subasta

Inicio sencillamente contando la cantidad de subastas en las que participó cada dispositivo, y lo agrego como un nuevo feature

In [19]:
auction_count = auct.groupby('device_id').agg({'date': 'count'}).reset_index()
auction_count.columns = ['ref_hash', 'auctions_count']

In [20]:
auction_count.head()

Unnamed: 0,ref_hash,auctions_count
0,69039685746313,4
1,345999128501141,2
2,360710529886978,42
3,365882020742330,2
4,416301579449694,62


Creo un único set de datos con los primeros features creados usando los ids de los dispositivos de la ventana 2

In [21]:
data = auction_time.merge(auction_count, on = 'ref_hash', how = 'left')

In [22]:
data.head()

Unnamed: 0,ref_hash,predict_value,auctions_count
0,1000169251625791246,76114.647428,13.0
1,1000395625957344683,8034.974209,15.0
2,1003027494996471685,12171.691046,168.0
3,1006670001679961544,64857.60634,3.0
4,1007573308966476713,18726.239096,7.0


In [23]:
data['auctions_count'] = data['auctions_count'].fillna(0)

In [24]:
data.nunique()

ref_hash          4037
predict_value     3352
auctions_count     371
dtype: int64

## Pruebo con Random Forest

In [25]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

y = data['predict_value']
X = data.drop(['ref_hash', 'predict_value'], axis=1)

RFR = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)
RFR.fit(X, y)  
RFR.feature_importances_

array([1.])

In [26]:
predictions = cross_val_predict(RFR, X, y, cv=10)
predictions

array([ 59718.14794922,  55069.18476557,  53882.99850567, ...,
       259200.        , 259200.        ,  56184.05360033])

In [27]:
scores = cross_val_score(RFR, X, y, cv=5, scoring='neg_mean_squared_error')

## Calculo el RMSE

In [28]:
# convierto los valores a MSE scores
mse_scores = -scores
# paso de MSE a RMSE
rmse_scores = np.sqrt(mse_scores)

In [29]:
rmse_scores.mean()

61916.22167534436

## Pruebo con xgboost

In [30]:
y = data['predict_value']
X = data.drop(['ref_hash', 'predict_value'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

mse_scores = -scores
# paso de MSE a RMSE
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \


60998.438109624745

## Veo la cantidad de veces que aparece cada dispositivo segun el source_id

Ahora me interesa ver la cantidad de veces que un dispositivo participó en una subasta desde cada tipo de fuente desde donde se produce la subasta. De esta manera los source_id más populares tomaran un valor mayor, luego tomo la desviación estándar de la cantidad para cada device_id

In [31]:
auct['apariciones'] = 1

In [32]:
auction_by_sourceID = auct.groupby(['device_id', 'source_id']).agg({'apariciones':'sum'}).unstack(1).fillna(0).reset_index()
auction_by_sourceID.columns = auction_by_sourceID.columns.droplevel(0)
auction_by_sourceID.columns = ['ref_hash', 'source_id0', 'source_id1', 'source_id2', 'source_id3', 'source_id4', 'source_id5', 'source_id6', 'source_id7', 'source_id8', 'source_id9']
auction_by_sourceID.head()

Unnamed: 0,ref_hash,source_id0,source_id1,source_id2,source_id3,source_id4,source_id5,source_id6,source_id7,source_id8,source_id9
0,69039685746313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0
1,345999128501141,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,360710529886978,0.0,31.0,0.0,10.0,0.0,0.0,0.0,0.0,1.0,0.0
3,365882020742330,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,416301579449694,22.0,29.0,1.0,8.0,1.0,1.0,0.0,0.0,0.0,0.0


In [33]:
auction_by_sourceID['auctions_by_srcID'] = auction_by_sourceID.iloc[:,1:].std(axis = 1)
auction_by_sourceID.head()

Unnamed: 0,ref_hash,source_id0,source_id1,source_id2,source_id3,source_id4,source_id5,source_id6,source_id7,source_id8,source_id9,auctions_by_srcID
0,69039685746313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,1.264911
1,345999128501141,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.632456
2,360710529886978,0.0,31.0,0.0,10.0,0.0,0.0,0.0,0.0,1.0,0.0,9.919677
3,365882020742330,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.421637
4,416301579449694,22.0,29.0,1.0,8.0,1.0,1.0,0.0,0.0,0.0,0.0,10.580905


Agrego el nuevo feature a los datos

In [34]:
data1 = auction_time.merge(auction_by_sourceID, on = 'ref_hash', how = 'left')#data.merge(auction_by_sourceID[['ref_hash', 'auctions_by_srcID']], on = 'ref_hash')


In [35]:
data1.head()

Unnamed: 0,ref_hash,predict_value,source_id0,source_id1,source_id2,source_id3,source_id4,source_id5,source_id6,source_id7,source_id8,source_id9,auctions_by_srcID
0,1000169251625791246,76114.647428,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.110961
1,1000395625957344683,8034.974209,0.0,9.0,0.0,3.0,0.0,1.0,2.0,0.0,0.0,0.0,2.838231
2,1003027494996471685,12171.691046,102.0,3.0,0.0,23.0,2.0,0.0,36.0,0.0,2.0,0.0,32.35841
3,1006670001679961544,64857.60634,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.674949
4,1007573308966476713,18726.239096,0.0,4.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.337494


In [36]:
data1 = data1.fillna(0)

In [37]:
data1.count()

ref_hash             4037
predict_value        4037
source_id0           4037
source_id1           4037
source_id2           4037
source_id3           4037
source_id4           4037
source_id5           4037
source_id6           4037
source_id7           4037
source_id8           4037
source_id9           4037
auctions_by_srcID    4037
dtype: int64

Vuelvo a probar el modelo con el nuevo feature

In [38]:
y = data1['predict_value']
X = data1.drop(['ref_hash', 'predict_value'], axis=1)

RFR = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)
RFR.fit(X, y)  

RFR.feature_importances_

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])

In [39]:
cross_val_predict(RFR, X, y, cv=10)

array([ 53661.3559543 ,  59334.56876642,  53661.3559543 , ...,
       259200.        , 259200.        ,  54790.36010836])

In [40]:
scores = cross_val_score(RFR, X, y, cv=10, scoring='neg_mean_squared_error')

In [41]:
# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

62151.614230338215

Pruebo xgboost

In [42]:
y = data1['predict_value']
X = data1.drop(['ref_hash', 'predict_value'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

mse_scores = -scores
# paso de MSE a RMSE
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




60676.19002600519

El nuevo feature logró bajar el score promedio de RMSE, así que lo mantenemos en el dataset de features.

## Veo la cantidad de veces que aparece cada dispositivo en una subasta según el ref_type

In [43]:
auct['ref_type_id'].value_counts()

1    13493412
7     2343431
Name: ref_type_id, dtype: int64

In [44]:
dfApRef = auct.groupby(['device_id', 'ref_type_id']).agg({'apariciones':'sum'}).unstack(1).fillna(0).reset_index()
dfApRef.columns = dfApRef.columns.droplevel(0)
dfApRef['median_count_ref_type'] = dfApRef.median(axis = 1)
dfApRef.head()

ref_type_id,Unnamed: 1,1,7,median_count_ref_type
0,69039685746313,4.0,0.0,4.0
1,345999128501141,2.0,0.0,2.0
2,360710529886978,42.0,0.0,42.0
3,365882020742330,2.0,0.0,2.0
4,416301579449694,62.0,0.0,62.0


Me interesa ver si cada usuario sólo tiene apariciones para el mismo ref_type, lo chequeo para ver si me servirá o no el feature

In [45]:
dfApRef.columns = ['ref_hash', 'auctions_ref_type1', 'auctions_ref_type7', 'median_count_ref_type']
dfApRef.head()

Unnamed: 0,ref_hash,auctions_ref_type1,auctions_ref_type7,median_count_ref_type
0,69039685746313,4.0,0.0,4.0
1,345999128501141,2.0,0.0,2.0
2,360710529886978,42.0,0.0,42.0
3,365882020742330,2.0,0.0,2.0
4,416301579449694,62.0,0.0,62.0


In [46]:
dfApRef['aparece_en_distinto_ref_type'] = ((dfApRef['auctions_ref_type1'] > 0) & (dfApRef['auctions_ref_type7'] > 0))

In [47]:
dfApRef.head()

Unnamed: 0,ref_hash,auctions_ref_type1,auctions_ref_type7,median_count_ref_type,aparece_en_distinto_ref_type
0,69039685746313,4.0,0.0,4.0,False
1,345999128501141,2.0,0.0,2.0,False
2,360710529886978,42.0,0.0,42.0,False
3,365882020742330,2.0,0.0,2.0,False
4,416301579449694,62.0,0.0,62.0,False


In [48]:
dfApRef['aparece_en_distinto_ref_type'].value_counts()

False    329564
True        215
Name: aparece_en_distinto_ref_type, dtype: int64

Como hay valores para ref_types distintos puedo probar agregar este feature y ver que sucede

In [49]:
data2 = data1.merge(dfApRef[['ref_hash', 'median_count_ref_type']], on = 'ref_hash', how = 'left')
data2.head()

Unnamed: 0,ref_hash,predict_value,source_id0,source_id1,source_id2,source_id3,source_id4,source_id5,source_id6,source_id7,source_id8,source_id9,auctions_by_srcID,median_count_ref_type
0,1000169251625791246,76114.647428,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.110961,13.0
1,1000395625957344683,8034.974209,0.0,9.0,0.0,3.0,0.0,1.0,2.0,0.0,0.0,0.0,2.838231,15.0
2,1003027494996471685,12171.691046,102.0,3.0,0.0,23.0,2.0,0.0,36.0,0.0,2.0,0.0,32.35841,168.0
3,1006670001679961544,64857.60634,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.674949,3.0
4,1007573308966476713,18726.239096,0.0,4.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.337494,7.0


In [50]:
data2['median_count_ref_type'] = data2['median_count_ref_type'].fillna(0)
data2.count()

ref_hash                 4037
predict_value            4037
source_id0               4037
source_id1               4037
source_id2               4037
source_id3               4037
source_id4               4037
source_id5               4037
source_id6               4037
source_id7               4037
source_id8               4037
source_id9               4037
auctions_by_srcID        4037
median_count_ref_type    4037
dtype: int64

In [51]:
y = data2['predict_value']
X = data2.drop(['ref_hash', 'predict_value'], axis=1)

RFR = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)
RFR.fit(X, y)  

RFR.feature_importances_

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.50570276, 0.49429724])

In [52]:
cross_val_predict(RFR, X, y, cv=10)

array([ 56597.4928101 ,  56043.37533035,  53696.27428946, ...,
       259200.        , 259200.        ,  55559.85650578])

In [53]:
scores = cross_val_score(RFR, X, y, cv=5, scoring='neg_mean_squared_error')

In [54]:
# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

62002.58547992337

## XGBoost

In [55]:
y = data2['predict_value']
X = data2.drop(['ref_hash', 'predict_value'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

mse_scores = -scores
# paso de MSE a RMSE
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




60738.87150528346

## Cantidad de apariciones de un dispositivo en las subastas por día

Calculo la cantidad de apariciones para cada dispositivo en las subastas por día, luego tomo la mediana para las apariciones.

In [56]:
auct['fecha'] = auct['date'].dt.date

In [57]:
dfApDay = auct.groupby(['device_id', 'fecha']).agg({'apariciones':'sum'}).unstack(1).fillna(0).reset_index()
dfApDay.columns = dfApDay.columns.droplevel(0)
dfApDay.columns = ['ref_hash', '2019-04-23', '2019-04-24', '2019-04-25']
dfApDay['auctions_by_day'] = dfApDay.median(axis = 1)

In [58]:
dfApDay.head()

Unnamed: 0,ref_hash,2019-04-23,2019-04-24,2019-04-25,auctions_by_day
0,69039685746313,0.0,4.0,0.0,2.0
1,345999128501141,0.0,0.0,2.0,1.0
2,360710529886978,16.0,1.0,25.0,20.5
3,365882020742330,0.0,1.0,1.0,1.0
4,416301579449694,31.0,29.0,2.0,30.0


Agrego el nuevo feature

In [59]:
data3 = data1.merge(dfApDay[['ref_hash', 'auctions_by_day']], on = 'ref_hash', how = 'left')
data3.head()

Unnamed: 0,ref_hash,predict_value,source_id0,source_id1,source_id2,source_id3,source_id4,source_id5,source_id6,source_id7,source_id8,source_id9,auctions_by_srcID,auctions_by_day
0,1000169251625791246,76114.647428,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.110961,5.5
1,1000395625957344683,8034.974209,0.0,9.0,0.0,3.0,0.0,1.0,2.0,0.0,0.0,0.0,2.838231,6.5
2,1003027494996471685,12171.691046,102.0,3.0,0.0,23.0,2.0,0.0,36.0,0.0,2.0,0.0,32.35841,84.0
3,1006670001679961544,64857.60634,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.674949,1.5
4,1007573308966476713,18726.239096,0.0,4.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.337494,3.0


In [60]:
data3['auctions_by_day'] = data3['auctions_by_day'].fillna(0)
data3.nunique()

ref_hash             4037
predict_value        3352
source_id0            193
source_id1            256
source_id2             22
source_id3            137
source_id4             28
source_id5             34
source_id6             59
source_id7             50
source_id8             34
source_id9              3
auctions_by_srcID    1398
auctions_by_day       344
dtype: int64

Ahora puebo el modelo

In [61]:
y = data3['predict_value']
X = data3.drop(['ref_hash', 'predict_value'], axis=1)

RFR = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)
RFR.fit(X, y)  

RFR.feature_importances_

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.58860458, 0.41139542])

In [62]:
cross_val_predict(RFR, X, y, cv=10)

array([ 53661.3559543 ,  59334.56876642,  53661.3559543 , ...,
       259200.        , 259200.        ,  54790.36010836])

In [63]:
scores = cross_val_score(RFR, X, y, cv=5, scoring='neg_mean_squared_error')


In [64]:
# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

62182.89047602184

## Pruebo con Gradient Boosting Regressor

In [65]:
from sklearn.ensemble import GradientBoostingRegressor

GBR = GradientBoostingRegressor(random_state=23, n_estimators=50, min_samples_split=50)
scores = cross_val_score(GBR , X, y, scoring = "neg_mean_squared_error", cv=5)


In [66]:
# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

60510.27469448878

## Ahora pruebo con XGBoost 

In [67]:
XGB = xgb.XGBRegressor()

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \


In [68]:
# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

60494.22932004814

El score con xgboost bajó, así que voy a mantener este feature

## Agrego como feature el tiempo minimo de aparición de un dispositivo durante la ventana anterior a la ventana en la que voy a predecir.

La idea de este feature es estudiar el comportamiento de los dispositivos, sus tiempos de aparición y calcular el mínimo tiempo entre ellos para tener un estimativo del tiempo a predecir.


In [69]:
apariciones_auctions = auct[['date', 'device_id']]
apariciones_auctions.head()

Unnamed: 0,date,device_id
0,2019-04-26 23:52:29.135354,1384623003476985820
1,2019-04-26 23:52:39.367477,3714738743084512188
2,2019-04-26 23:52:54.714361,5697386557321863111
3,2019-04-26 23:53:13.729835,5583037045722622336
4,2019-04-26 23:53:48.577115,6383034009915294411


In [70]:
apariciones_auctions['date'] = pd.to_datetime(apariciones_auctions['date'])
apariciones_auctions['date_inicial'] = dt.datetime(2019,4, 24)
apariciones_auctions['timeToAuction'] = (apariciones_auctions['date'] - apariciones_auctions['date_inicial'])/np.timedelta64(1,'s')
apariciones_auctions.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,date,device_id,date_inicial,timeToAuction
0,2019-04-26 23:52:29.135354,1384623003476985820,2019-04-24,258749.135354
1,2019-04-26 23:52:39.367477,3714738743084512188,2019-04-24,258759.367477
2,2019-04-26 23:52:54.714361,5697386557321863111,2019-04-24,258774.714361
3,2019-04-26 23:53:13.729835,5583037045722622336,2019-04-24,258793.729835
4,2019-04-26 23:53:48.577115,6383034009915294411,2019-04-24,258828.577115


In [71]:
apariciones_auctions = apariciones_auctions.groupby('device_id').agg({'timeToAuction': ['min', 'max']}).reset_index()
apariciones_auctions.columns = apariciones_auctions.columns.droplevel(0)
apariciones_auctions.columns = ['ref_hash', 'timeToAuction_min', 'timeToAuction_max']
apariciones_auctions.head()

Unnamed: 0,ref_hash,timeToAuction_min,timeToAuction_max
0,69039685746313,126258.597103,126704.792132
1,345999128501141,250362.048531,250402.600874
2,360710529886978,46687.256609,256671.753077
3,365882020742330,96627.763854,190225.244451
4,416301579449694,12569.446617,182084.380124


Agrego el nuevo feature 

In [72]:
data4 = data3.merge(apariciones_auctions[['ref_hash', 'timeToAuction_min']], on = 'ref_hash', how = 'left')

In [73]:
data4.head()

Unnamed: 0,ref_hash,predict_value,source_id0,source_id1,source_id2,source_id3,source_id4,source_id5,source_id6,source_id7,source_id8,source_id9,auctions_by_srcID,auctions_by_day,timeToAuction_min
0,1000169251625791246,76114.647428,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.110961,5.5,76114.647428
1,1000395625957344683,8034.974209,0.0,9.0,0.0,3.0,0.0,1.0,2.0,0.0,0.0,0.0,2.838231,6.5,8034.974209
2,1003027494996471685,12171.691046,102.0,3.0,0.0,23.0,2.0,0.0,36.0,0.0,2.0,0.0,32.35841,84.0,12171.691046
3,1006670001679961544,64857.60634,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.674949,1.5,64857.60634
4,1007573308966476713,18726.239096,0.0,4.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.337494,3.0,18726.239096


In [74]:
data4['timeToAuction_min'] = data4['timeToAuction_min'].fillna(72 * 60 * 60)
#data4['timeToAuction_max'] = data4['timeToAuction_max'].fillna(72 * 60 * 60)
data4.count()

ref_hash             4037
predict_value        4037
source_id0           4037
source_id1           4037
source_id2           4037
source_id3           4037
source_id4           4037
source_id5           4037
source_id6           4037
source_id7           4037
source_id8           4037
source_id9           4037
auctions_by_srcID    4037
auctions_by_day      4037
timeToAuction_min    4037
dtype: int64

In [75]:
data4.nunique()

ref_hash             4037
predict_value        3352
source_id0            193
source_id1            256
source_id2             22
source_id3            137
source_id4             28
source_id5             34
source_id6             59
source_id7             50
source_id8             34
source_id9              3
auctions_by_srcID    1398
auctions_by_day       344
timeToAuction_min    3352
dtype: int64

## Pruebo con XGBoost

In [76]:
y = data4['predict_value']
X = data4.drop(['ref_hash', 'predict_value'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




In [77]:
# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

455.9972388369555

Este feature mejoró muchísimo el score, lo mantengo

## Agrego Features sobre los eventos

In [78]:
evts = pd.read_csv("data/events_ventana7.csv", dtype = {"event_id": np.int16, "application_id": np.int16, 'device_countrycode': 'category', 'device_os_version': 'category', 'device_brand': 'category', 'device_model': 'category', 'device_city': 'category', 'session_user_agent': 'category', 'trans_id': 'category', 'user_agent': 'category', 'carrier' : 'category', 'kind': 'category', 'device_os': 'category', 'connection_type': 'category', 'ip_address': 'category', 'device_language': 'category'})

In [79]:
evts.head()

Unnamed: 0,date,event_id,ref_type,ref_hash,application_id,attributed,device_countrycode,device_os_version,device_brand,device_model,...,trans_id,user_agent,event_uuid,carrier,kind,device_os,wifi,connection_type,ip_address,device_language
0,2019-04-25 21:25:34.650,1,1891515180541284343,809939361959643854,210,False,6287817205707153877,,,,...,,,eb50de22-8c0f-42e3-85c2-fb6c54782bd6,,4.017674184041173e+18,,False,,151684593053252001,
1,2019-04-25 21:25:33.165,1,1891515180541284343,2704332589081852700,210,False,6287817205707153877,,,,...,,,fc0e2e76-475d-4d60-bc0e-86070792a44b,,4.017674184041173e+18,,False,,6918006307204159217,
2,2019-04-25 21:25:33.097,1,1891515180541284343,2704332589081852700,210,False,6287817205707153877,,,,...,,,1113aebd-1ac9-4583-a8d0-9da011013ec9,,4.017674184041173e+18,,False,,6918006307204159217,
3,2019-04-25 21:25:34.227,0,1891515180541284343,6376777580200607439,210,False,6287817205707153877,,,6.87116077233974e+18,...,,4.9432788293014746e+17,2a348381-848c-4fe4-bd83-48e6c1f8b4d2,,5.882882097123621e+18,,False,,8378906526277633862,3.3013777759777e+18
4,2019-04-25 21:17:30.501,1,1891515180541284343,2602532777370559745,210,False,6287817205707153877,,,,...,,,69b4057e-78f3-42c0-988b-9ad6d67613e7,,4.017674184041173e+18,,False,,7090887066466907036,


## Aplicaciones populares

Agrego Features tomando en cuenta cuales la popularidad de las applicaciones. La idea es generar un feature que asigne un número a cada dispositivo de acuerdo al id de la applicación.

In [80]:
apps_populares = evts[['application_id', 'ref_hash']].groupby('application_id').count().reset_index()
apps_populares.columns = ['application_id', 'popularidad_app']
apps_populares.head()

Unnamed: 0,application_id,popularidad_app
0,1,6
1,2,124
2,3,1870
3,5,34
4,6,31


In [81]:
apps_populares = evts[['ref_hash', 'application_id']].merge(apps_populares, on = 'application_id')
apps_populares.head()

Unnamed: 0,ref_hash,application_id,popularidad_app
0,809939361959643854,210,664050
1,2704332589081852700,210,664050
2,2704332589081852700,210,664050
3,6376777580200607439,210,664050
4,2602532777370559745,210,664050


In [82]:
apps_counts = evts[['date', 'ref_hash', 'application_id']].groupby(['ref_hash', 'application_id']).count().reset_index()
apps_counts.head()

Unnamed: 0,ref_hash,application_id,date
0,41863526108385,210,57
1,69039685746313,226,16
2,90072729247980,210,3
3,161514654074162,121,2
4,168103949904656,155,3


In [83]:
# Le asigno a cada dispositivo la popularidad de la applicación en la cual generó más eventos
apps_pops_events = apps_counts.groupby(['ref_hash']).agg({'date': 'max'}).reset_index()
apps_pops_events.columns = ['ref_hash', 'popularidad_apps_events']
apps_pops_events.head()

Unnamed: 0,ref_hash,popularidad_apps_events
0,41863526108385,57
1,69039685746313,16
2,90072729247980,3
3,161514654074162,2
4,168103949904656,3


Agrego el nuevo feature a los datos

In [84]:
data5 = data4.merge(apps_pops_events, on = 'ref_hash', how = 'left')
data.head()

Unnamed: 0,ref_hash,predict_value,auctions_count
0,1000169251625791246,76114.647428,13.0
1,1000395625957344683,8034.974209,15.0
2,1003027494996471685,12171.691046,168.0
3,1006670001679961544,64857.60634,3.0
4,1007573308966476713,18726.239096,7.0


In [85]:
data5['popularidad_apps_events'] = data5['popularidad_apps_events'].fillna(0)
data5.count()

ref_hash                   4037
predict_value              4037
source_id0                 4037
source_id1                 4037
source_id2                 4037
source_id3                 4037
source_id4                 4037
source_id5                 4037
source_id6                 4037
source_id7                 4037
source_id8                 4037
source_id9                 4037
auctions_by_srcID          4037
auctions_by_day            4037
timeToAuction_min          4037
popularidad_apps_events    4037
dtype: int64

In [86]:
data5.nunique()

ref_hash                   4037
predict_value              3352
source_id0                  193
source_id1                  256
source_id2                   22
source_id3                  137
source_id4                   28
source_id5                   34
source_id6                   59
source_id7                   50
source_id8                   34
source_id9                    3
auctions_by_srcID          1398
auctions_by_day             344
timeToAuction_min          3352
popularidad_apps_events     131
dtype: int64

## Pruebo el feature con XGBoost

In [87]:
y = data5['predict_value']
X = data5.drop(['ref_hash', 'predict_value'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




458.6872004473165

con este feature el score bajó un poco así que lo mantengo

## Agrego un feature tomando en cuenta la cantidad de eventos en los que participó cada dispositivo

In [88]:
count_events = evts.groupby('ref_hash').agg({'date': 'count'}).reset_index()
count_events.columns = ['ref_hash', 'count_events']
count_events.head()

Unnamed: 0,ref_hash,count_events
0,41863526108385,57
1,69039685746313,16
2,90072729247980,3
3,161514654074162,2
4,168103949904656,3


Agrego el nuevo feature y veo qué sucede con el score

In [89]:
data6 = data5.merge(count_events, on = 'ref_hash', how = 'left')
data6['count_events'] = data6['count_events'].fillna(0)

In [90]:
data6.count()

ref_hash                   4037
predict_value              4037
source_id0                 4037
source_id1                 4037
source_id2                 4037
source_id3                 4037
source_id4                 4037
source_id5                 4037
source_id6                 4037
source_id7                 4037
source_id8                 4037
source_id9                 4037
auctions_by_srcID          4037
auctions_by_day            4037
timeToAuction_min          4037
popularidad_apps_events    4037
count_events               4037
dtype: int64

In [91]:
data6.nunique()

ref_hash                   4037
predict_value              3352
source_id0                  193
source_id1                  256
source_id2                   22
source_id3                  137
source_id4                   28
source_id5                   34
source_id6                   59
source_id7                   50
source_id8                   34
source_id9                    3
auctions_by_srcID          1398
auctions_by_day             344
timeToAuction_min          3352
popularidad_apps_events     131
count_events                141
dtype: int64

## Vuelvo a probar el modelo con XGBoost

In [92]:
y = data6['predict_value']
X = data6.drop(['ref_hash', 'predict_value'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




458.319809906552

se puede ver que este feature empeoró el score, así que lo descartamos

## Agrego el tiempo que tarda cada dispositivo en generar el primer evento

In [93]:
events_time = evts[['date', 'ref_hash']]
events_time['date'] = pd.to_datetime(evts['date'])
events_time['date_inicial'] = dt.datetime(2019,4, 24)
events_time['timeToEvent'] = (events_time['date'] - events_time['date_inicial'])/np.timedelta64(1,'s')
events_time.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,date,ref_hash,date_inicial,timeToEvent
0,2019-04-25 21:25:34.650,809939361959643854,2019-04-24,163534.65
1,2019-04-25 21:25:33.165,2704332589081852700,2019-04-24,163533.165
2,2019-04-25 21:25:33.097,2704332589081852700,2019-04-24,163533.097
3,2019-04-25 21:25:34.227,6376777580200607439,2019-04-24,163534.227
4,2019-04-25 21:17:30.501,2602532777370559745,2019-04-24,163050.501


In [94]:
events_time = events_time.groupby('ref_hash').agg({'timeToEvent': ['min', 'max']}).reset_index()
events_time.columns = events_time.columns.droplevel(0)
events_time.columns = ['ref_hash', 'timeToEvent_min', 'timeToEvent_max']
events_time.head()

Unnamed: 0,ref_hash,timeToEvent_min,timeToEvent_max
0,41863526108385,192058.035,194076.53
1,69039685746313,158837.672,256909.082
2,90072729247980,66653.967,66676.624
3,161514654074162,154212.242,154233.535
4,168103949904656,109295.011,109349.096


Agrego el nuevo feature

In [95]:
data7 = data5.merge(events_time[['ref_hash', 'timeToEvent_min']], on = 'ref_hash', how = 'left')
data7.head()

Unnamed: 0,ref_hash,predict_value,source_id0,source_id1,source_id2,source_id3,source_id4,source_id5,source_id6,source_id7,source_id8,source_id9,auctions_by_srcID,auctions_by_day,timeToAuction_min,popularidad_apps_events,timeToEvent_min
0,1000169251625791246,76114.647428,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.110961,5.5,76114.647428,2.0,36872.309
1,1000395625957344683,8034.974209,0.0,9.0,0.0,3.0,0.0,1.0,2.0,0.0,0.0,0.0,2.838231,6.5,8034.974209,0.0,
2,1003027494996471685,12171.691046,102.0,3.0,0.0,23.0,2.0,0.0,36.0,0.0,2.0,0.0,32.35841,84.0,12171.691046,5.0,73028.525
3,1006670001679961544,64857.60634,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.674949,1.5,64857.60634,0.0,
4,1007573308966476713,18726.239096,0.0,4.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.337494,3.0,18726.239096,0.0,


In [96]:
data7['timeToEvent_min'] = data7['timeToEvent_min'].fillna(72 * 60 * 60)
#data7['timeToEvent_max'] = data7['timeToEvent_max'].fillna(72 * 60 * 60)
data7.count()

ref_hash                   4037
predict_value              4037
source_id0                 4037
source_id1                 4037
source_id2                 4037
source_id3                 4037
source_id4                 4037
source_id5                 4037
source_id6                 4037
source_id7                 4037
source_id8                 4037
source_id9                 4037
auctions_by_srcID          4037
auctions_by_day            4037
timeToAuction_min          4037
popularidad_apps_events    4037
timeToEvent_min            4037
dtype: int64

In [97]:
data7.nunique()

ref_hash                   4037
predict_value              3352
source_id0                  193
source_id1                  256
source_id2                   22
source_id3                  137
source_id4                   28
source_id5                   34
source_id6                   59
source_id7                   50
source_id8                   34
source_id9                    3
auctions_by_srcID          1398
auctions_by_day             344
timeToAuction_min          3352
popularidad_apps_events     131
timeToEvent_min            2064
dtype: int64

Pruebo el modelo con el nuevo feature

In [98]:
y = data7['predict_value']
X = data7.drop(['ref_hash', 'predict_value'], axis=1)

XGB = xgb.XGBRegressor(learning_rate =0.075, n_estimators=95, max_depth=4, min_child_weight=6, 
                         gamma=0.3, subsample=0.8, colsample_bytree=0.8,
                         scale_pos_weight=0.8, seed = 15)
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




1347.5412610517765

Con este feature el score volvió a subir, así que lo descartamos 

# Busco los mejores hiperparámetros para xgboost utilizando gridSearch

# Pruebo los features anteriores para predecir los tiempos de conversiones

In [99]:
inst = pd.read_csv('data/installs_ventana7.csv', dtype = {'application_id': np.int16, 'ref_type': 'category', 'click_hash': 'category', 'device_country_code': 'category', 'device_brand': 'category', 'device_model': 'category', 'kind': 'category', 'device_language': 'category'})

## Installs

Tomo los tiempos de la primera conversión de cada dispositivo dentro de la ventana 2 para entrenar el algoritmo luego

In [100]:
inst['created'] = pd.to_datetime(inst['created'])
inst.head()

Unnamed: 0,created,application_id,ref_type,ref_hash,click_hash,attributed,implicit,device_countrycode,device_brand,device_model,session_user_agent,user_agent,event_uuid,kind,wifi,trans_id,ip_address,device_language
0,2019-04-24 06:23:29.495,1,1494519392962156891,4716708407362582887,,False,True,6287817205707153877,,3.739127126472163e+17,adjust.com,,79837499-2f2a-4605-a663-e322f759424f,app_open,,,4243443387795468703,3.3013777759777e+18
1,2019-04-24 02:06:01.032,1,1494519392962156891,7143568733100935872,,False,False,6287817205707153877,,7.80553892759877e+18,adjust.com,,,,,,4724288679627032761,3.3013777759777e+18
2,2019-04-26 12:17:29.831,1,1494519392962156891,8326131692284822715,,False,True,6287817205707153877,,4.780544676403395e+18,adjust.com,,c7c0f1c7-b5bd-4887-8c15-22faa9eed45a,app_open,,,7914241434760640009,3.3013777759777e+18
3,2019-04-24 16:08:49.392,3,1891515180541284343,7982672190758515108,,False,False,6287817205707153877,,7.04773203135986e+18,adjust.com,,,,,,2331161058149107059,3.3013777759777e+18
4,2019-04-24 01:43:29.697,3,1891515180541284343,2374271566729163309,,False,False,6287817205707153877,,3.017401437185711e+18,adjust.com,,,,,,4368472372578406700,3.3013777759777e+18


## Calculo el tiempo que tarda en convertir cada dispositivo

In [101]:
inst['created_inicial'] = dt.datetime(2019, 4, 24)
inst['created_inicial'] = pd.to_datetime(inst['created_inicial'])
inst['timeToInstall'] = (inst['created'] - inst['created_inicial'])/np.timedelta64(1,'s')
inst['timeToInstall'] = inst['timeToInstall'].transform(lambda x: x if (x >=  0) else (72 * 60 * 60))
inst.head()

Unnamed: 0,created,application_id,ref_type,ref_hash,click_hash,attributed,implicit,device_countrycode,device_brand,device_model,session_user_agent,user_agent,event_uuid,kind,wifi,trans_id,ip_address,device_language,created_inicial,timeToInstall
0,2019-04-24 06:23:29.495,1,1494519392962156891,4716708407362582887,,False,True,6287817205707153877,,3.739127126472163e+17,adjust.com,,79837499-2f2a-4605-a663-e322f759424f,app_open,,,4243443387795468703,3.3013777759777e+18,2019-04-24,23009.495
1,2019-04-24 02:06:01.032,1,1494519392962156891,7143568733100935872,,False,False,6287817205707153877,,7.80553892759877e+18,adjust.com,,,,,,4724288679627032761,3.3013777759777e+18,2019-04-24,7561.032
2,2019-04-26 12:17:29.831,1,1494519392962156891,8326131692284822715,,False,True,6287817205707153877,,4.780544676403395e+18,adjust.com,,c7c0f1c7-b5bd-4887-8c15-22faa9eed45a,app_open,,,7914241434760640009,3.3013777759777e+18,2019-04-24,217049.831
3,2019-04-24 16:08:49.392,3,1891515180541284343,7982672190758515108,,False,False,6287817205707153877,,7.04773203135986e+18,adjust.com,,,,,,2331161058149107059,3.3013777759777e+18,2019-04-24,58129.392
4,2019-04-24 01:43:29.697,3,1891515180541284343,2374271566729163309,,False,False,6287817205707153877,,3.017401437185711e+18,adjust.com,,,,,,4368472372578406700,3.3013777759777e+18,2019-04-24,6209.697


Tomo el tiempo mínimo, en SEGUNDOS, que tardó un dispositivo en realizar una instalación

In [102]:
install_time = inst.groupby('ref_hash').agg({'timeToInstall': 'min'}).reset_index()
install_time.columns = ['ref_hash', 'predict_time_install']
install_time.head()

Unnamed: 0,ref_hash,predict_time_install
0,90072729247980,66650.199
1,342614246084071,253421.836
2,347785260789835,250811.227
3,416301579449694,140951.886
4,420449720220692,147791.04


In [103]:
install_time = target[['ref_hash']].merge(install_time, on = 'ref_hash', how = 'left')
install_time['predict_time_install'] = install_time['predict_time_install'].fillna(72 * 60 * 60)

In [104]:
install_time.nunique()

ref_hash                4037
predict_time_install    1588
dtype: int64

In [105]:
install_time.count()

ref_hash                4037
predict_time_install    4037
dtype: int64

## Ahora leo los datos de la ventana  para crear features sobre estos ids

In [106]:
installs = inst #pd.read_csv('data/installs_ventana6.csv', dtype = {'application_id': np.int16, 'ref_type': 'category', 'click_hash': 'category', 'device_country_code': 'category', 'device_brand': 'category', 'device_model': 'category', 'kind': 'category', 'device_language': 'category'})
installs.head()

Unnamed: 0,created,application_id,ref_type,ref_hash,click_hash,attributed,implicit,device_countrycode,device_brand,device_model,session_user_agent,user_agent,event_uuid,kind,wifi,trans_id,ip_address,device_language,created_inicial,timeToInstall
0,2019-04-24 06:23:29.495,1,1494519392962156891,4716708407362582887,,False,True,6287817205707153877,,3.739127126472163e+17,adjust.com,,79837499-2f2a-4605-a663-e322f759424f,app_open,,,4243443387795468703,3.3013777759777e+18,2019-04-24,23009.495
1,2019-04-24 02:06:01.032,1,1494519392962156891,7143568733100935872,,False,False,6287817205707153877,,7.80553892759877e+18,adjust.com,,,,,,4724288679627032761,3.3013777759777e+18,2019-04-24,7561.032
2,2019-04-26 12:17:29.831,1,1494519392962156891,8326131692284822715,,False,True,6287817205707153877,,4.780544676403395e+18,adjust.com,,c7c0f1c7-b5bd-4887-8c15-22faa9eed45a,app_open,,,7914241434760640009,3.3013777759777e+18,2019-04-24,217049.831
3,2019-04-24 16:08:49.392,3,1891515180541284343,7982672190758515108,,False,False,6287817205707153877,,7.04773203135986e+18,adjust.com,,,,,,2331161058149107059,3.3013777759777e+18,2019-04-24,58129.392
4,2019-04-24 01:43:29.697,3,1891515180541284343,2374271566729163309,,False,False,6287817205707153877,,3.017401437185711e+18,adjust.com,,,,,,4368472372578406700,3.3013777759777e+18,2019-04-24,6209.697


## Agrego un feature sobre la popularidad de las applicaciones

In [107]:
apps_populares_installs = installs.groupby('application_id').agg({'created': 'count'}).reset_index()
apps_populares_installs.columns = ['application_id', 'popularidad_app']
apps_populares_installs = installs[['ref_hash', 'application_id']].merge(apps_populares_installs, on = 'application_id')
apps_populares_installs.head()

Unnamed: 0,ref_hash,application_id,popularidad_app
0,4716708407362582887,1,7
1,7143568733100935872,1,7
2,8326131692284822715,1,7
3,9139746146594102744,1,7
4,2170353875382826823,1,7


In [108]:
apps_counts = installs.groupby(['ref_hash', 'application_id']).agg({'created': 'count'}).reset_index()
# Le asigno a cada dispositivo la popularidad de la applicación en la cual generó más eventos
apps_pops = apps_counts.groupby(['ref_hash']).agg({'created': 'max'}).reset_index()
apps_pops.columns = ['ref_hash', 'popularidad_apps']
apps_pops.head()

Unnamed: 0,ref_hash,popularidad_apps
0,90072729247980,1
1,342614246084071,1
2,347785260789835,1
3,416301579449694,1
4,420449720220692,1


## Creo el set de datos par a entrenar los algoritmos

In [109]:
data_installs = install_time.merge(apps_pops, on = 'ref_hash', how = 'left') 
data_installs.head()

Unnamed: 0,ref_hash,predict_time_install,popularidad_apps
0,1000169251625791246,259200.0,
1,1000395625957344683,259200.0,
2,1003027494996471685,259200.0,
3,1006670001679961544,259200.0,
4,1007573308966476713,259200.0,


In [110]:
data_installs.nunique()

ref_hash                4037
predict_time_install    1588
popularidad_apps           3
dtype: int64

In [111]:
data_installs['popularidad_apps'] = data_installs['popularidad_apps'].fillna(0)
data_installs.count()

ref_hash                4037
predict_time_install    4037
popularidad_apps        4037
dtype: int64

## Pruebo XGBoost para predecir

In [112]:
y = data_installs['predict_time_install']
X = data_installs.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor( max_depth=3, learning_rate=0.1, n_estimators=100,
                       verbosity=1, silent=None, objective='reg:linear', n_jobs=1, gamma=0,
                       min_child_weight=1,  max_delta_step=0, reg_alpha=0, reg_lambda=1, 
                       scale_pos_weight=1, base_score=0.5, random_state=0, importance_type='gain')
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \
  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \


47526.659601424646

## Agrego como feature el tiempo real que tarda un dispositivo en realizar la primera conversión, dentro de la ventana 1

In [113]:
installs['created'] = pd.to_datetime(inst['created'])
installs['created_inicial'] = dt.datetime(2019, 4, 24)
installs['created_inicial'] = pd.to_datetime(installs['created_inicial'])
installs['timeToInstall'] = (installs['created'] - installs['created_inicial'])/np.timedelta64(1,'s')


In [114]:
#Ahora tomo el tiempo mínimo
time_to_install = installs.groupby('ref_hash').agg({'timeToInstall': ['min', 'max']}).reset_index()
time_to_install.columns = time_to_install.columns.droplevel(0)
time_to_install.columns = ['ref_hash', 'timeToInstall_min', 'timeToInstall_max']

In [115]:
time_to_install.head()

Unnamed: 0,ref_hash,timeToInstall_min,timeToInstall_max
0,90072729247980,66650.199,66650.199
1,342614246084071,253421.836,253421.836
2,347785260789835,250811.227,250811.227
3,416301579449694,140951.886,140951.886
4,420449720220692,147791.04,147791.04


Agrego el  nuevo feature y vuelvo a probar el algoritmo

In [116]:
data_installs1 = data_installs.merge(time_to_install[['ref_hash', 'timeToInstall_min']], on = 'ref_hash', how = 'left')
data_installs1.head()

Unnamed: 0,ref_hash,predict_time_install,popularidad_apps,timeToInstall_min
0,1000169251625791246,259200.0,0.0,
1,1000395625957344683,259200.0,0.0,
2,1003027494996471685,259200.0,0.0,
3,1006670001679961544,259200.0,0.0,
4,1007573308966476713,259200.0,0.0,


In [117]:
data_installs1.nunique()

ref_hash                4037
predict_time_install    1588
popularidad_apps           4
timeToInstall_min       1587
dtype: int64

In [118]:
data_installs1['timeToInstall_min'] = data_installs1['timeToInstall_min'].fillna(72 * 60 * 60)
#data_installs1['timeToInstall_max'] = data_installs1['timeToInstall_max'].fillna(72 * 60 * 60)
data_installs1.count()

ref_hash                4037
predict_time_install    4037
popularidad_apps        4037
timeToInstall_min       4037
dtype: int64

Pruebo el modelo con el nuevo feature

In [119]:
y = data_installs1['predict_time_install']
X = data_installs1.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor( max_depth=3, learning_rate=0.1, n_estimators=100,
                       verbosity=1, silent=None, objective='reg:linear', n_jobs=1, gamma=0,
                       min_child_weight=1,  max_delta_step=0, reg_alpha=0, reg_lambda=1, 
                       scale_pos_weight=1, base_score=0.5, random_state=0, importance_type='gain')

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \


359.64936713570694

Se mejoró el score, por lo que mantengo el feature

## ahora agrego los features sobre los eventos

In [120]:
new_data1 = data_installs1.merge(apps_pops_events, on = 'ref_hash', how = 'left')
new_data1.head()

Unnamed: 0,ref_hash,predict_time_install,popularidad_apps,timeToInstall_min,popularidad_apps_events
0,1000169251625791246,259200.0,0.0,259200.0,2.0
1,1000395625957344683,259200.0,0.0,259200.0,
2,1003027494996471685,259200.0,0.0,259200.0,5.0
3,1006670001679961544,259200.0,0.0,259200.0,
4,1007573308966476713,259200.0,0.0,259200.0,


In [121]:
new_data1['popularidad_apps_events'] = new_data1['popularidad_apps_events'].fillna(0)
new_data1.count()

ref_hash                   4037
predict_time_install       4037
popularidad_apps           4037
timeToInstall_min          4037
popularidad_apps_events    4037
dtype: int64

In [122]:
new_data1.nunique()

ref_hash                   4037
predict_time_install       1588
popularidad_apps              4
timeToInstall_min          1588
popularidad_apps_events     131
dtype: int64

Pruebo el nuevo feature con xgboost

In [123]:
y = new_data1['predict_time_install']
X = new_data1.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \


366.51853768046874

el score empeoró por lo que no se tomará en cuenta este feature

Agrego un feature sobre los eventos registrados para cada dispositivo

In [124]:
new_data2 = data_installs1.merge(count_events, on = 'ref_hash', how = 'left')

In [125]:
new_data2['count_events'] = new_data2['count_events'].fillna(0)
new_data2.count()

ref_hash                4037
predict_time_install    4037
popularidad_apps        4037
timeToInstall_min       4037
count_events            4037
dtype: int64

In [126]:
new_data2.nunique()

ref_hash                4037
predict_time_install    1588
popularidad_apps           4
timeToInstall_min       1588
count_events             141
dtype: int64

In [127]:
#new_data['count_events'] = new_data['count_events'].fillna(0.0)
y = new_data2['predict_time_install']
X = new_data2.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor( max_depth=3, learning_rate=0.1, n_estimators=100,
                       verbosity=1, silent=None, objective='reg:linear', n_jobs=1, gamma=0,
                       min_child_weight=1,  max_delta_step=0, reg_alpha=0, reg_lambda=1, 
                       scale_pos_weight=1, base_score=0.5, random_state=0, importance_type='gain')

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \


364.9970301087698

Se logró mejorar el score con este feature, así que lo mantengo

## Agrego los features sobre auctions

In [128]:
#agrego la mediana de subastas registradas por cada dispositivo según el ref_type_id
new_data3 = new_data2.merge(dfApRef[['ref_hash', 'median_count_ref_type']], on = 'ref_hash', how = 'left')

In [129]:
new_data3.nunique()

ref_hash                 4037
predict_time_install     1588
popularidad_apps            4
timeToInstall_min        1588
count_events              141
median_count_ref_type     370
dtype: int64

In [130]:
new_data3['median_count_ref_type'] = new_data3['median_count_ref_type'].fillna(0.0)
new_data3.count()

ref_hash                 4037
predict_time_install     4037
popularidad_apps         4037
timeToInstall_min        4037
count_events             4037
median_count_ref_type    4037
dtype: int64

In [131]:
#new_data['median_count_ref_type'] = new_data['median_count_ref_type'].fillna(0.0)
y = new_data3['predict_time_install']
X = new_data3.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \


370.1630454711305

El feature anterior funcionó bastante bien

In [132]:
#agrego la cantidad de subastas registradas para cada dispositivo
new_data4 = new_data3.merge(auction_count, on = 'ref_hash', how = 'left')
new_data4.head()

Unnamed: 0,ref_hash,predict_time_install,popularidad_apps,timeToInstall_min,count_events,median_count_ref_type,auctions_count
0,1000169251625791246,259200.0,0.0,259200.0,2.0,13.0,13.0
1,1000395625957344683,259200.0,0.0,259200.0,0.0,15.0,15.0
2,1003027494996471685,259200.0,0.0,259200.0,9.0,168.0,168.0
3,1006670001679961544,259200.0,0.0,259200.0,0.0,3.0,3.0
4,1007573308966476713,259200.0,0.0,259200.0,0.0,7.0,7.0


In [133]:
new_data4.nunique()

ref_hash                 4037
predict_time_install     1588
popularidad_apps            4
timeToInstall_min        1588
count_events              141
median_count_ref_type     371
auctions_count            370
dtype: int64

In [134]:
new_data4['auctions_count'] = new_data4['auctions_count'].fillna(0.0)
new_data4.count()

ref_hash                 4037
predict_time_install     4037
popularidad_apps         4037
timeToInstall_min        4037
count_events             4037
median_count_ref_type    4037
auctions_count           4037
dtype: int64

In [135]:
##new_data['auction_count'] = new_data['auctions_count'].fillna(0.0)
y = new_data4['predict_time_install']
X = new_data4.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \


370.1630454711305

El feature anterior no aportó absolutamente nada, por ahora no lo tomo en cuenta

In [204]:
#Agrego el tiempo mínimo de aparición de cada dispositivo en una subasta
new_data5 = new_data3.merge(apariciones_auctions[['ref_hash', 'timeToAuction_min']], on = 'ref_hash', how  = 'left')
new_data5.head()

Unnamed: 0,ref_hash,predict_time_install,popularidad_apps,timeToInstall_min,count_events,median_count_ref_type,timeToAuction_min
0,1000169251625791246,259200.0,0.0,259200.0,2.0,13.0,76114.647428
1,1000395625957344683,259200.0,0.0,259200.0,0.0,15.0,8034.974209
2,1003027494996471685,259200.0,0.0,259200.0,9.0,168.0,12171.691046
3,1006670001679961544,259200.0,0.0,259200.0,0.0,3.0,64857.60634
4,1007573308966476713,259200.0,0.0,259200.0,0.0,7.0,18726.239096


In [205]:
new_data5.nunique()

ref_hash                 4037
predict_time_install     1588
popularidad_apps            4
timeToInstall_min        1588
count_events              141
median_count_ref_type     371
timeToAuction_min        3351
dtype: int64

In [206]:
new_data5[ 'timeToAuction_min'] = new_data5[ 'timeToAuction_min'].fillna(72 * 60 * 60)
#new_data5[ 'timeToAuction_max'] = new_data5[ 'timeToAuction_max'].fillna(72 * 60 * 60)
new_data5.count()

ref_hash                 4037
predict_time_install     4037
popularidad_apps         4037
timeToInstall_min        4037
count_events             4037
median_count_ref_type    4037
timeToAuction_min        4037
dtype: int64

In [139]:

y = new_data5['predict_time_install']
X = new_data5.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




374.8097809763828

este feature empeoró un poco el score

Agrego otro feature, tomando en cuenta el promedio de subastas, por día, en las que participa cada dispositivo

In [140]:
new_data6 = new_data5.merge(dfApDay[['ref_hash', 'auctions_by_day']], on = 'ref_hash', how  = 'left')
new_data6.head()

Unnamed: 0,ref_hash,predict_time_install,popularidad_apps,timeToInstall_min,count_events,median_count_ref_type,timeToAuction_min,timeToAuction_max,auctions_by_day
0,1000169251625791246,259200.0,0.0,259200.0,2.0,13.0,76114.647428,255742.21327,5.5
1,1000395625957344683,259200.0,0.0,259200.0,0.0,15.0,8034.974209,251092.530737,6.5
2,1003027494996471685,259200.0,0.0,259200.0,9.0,168.0,12171.691046,105662.836023,84.0
3,1006670001679961544,259200.0,0.0,259200.0,0.0,3.0,64857.60634,237780.114576,1.5
4,1007573308966476713,259200.0,0.0,259200.0,0.0,7.0,18726.239096,254084.485745,3.0


In [141]:
new_data6.nunique()

ref_hash                 4037
predict_time_install     1588
popularidad_apps            4
timeToInstall_min        1588
count_events              141
median_count_ref_type     371
timeToAuction_min        3352
timeToAuction_max        3351
auctions_by_day           343
dtype: int64

In [142]:
new_data6['auctions_by_day'] = new_data6['auctions_by_day'].fillna(0)

Ahora pruebo el modelo con los nuevos features

In [143]:
#
y = new_data6['predict_time_install']
X = new_data6.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




373.43486742520247

el score mejoró bastante

## Agrego un nuevo feature tomando en cuenta el user_agent desde donde se origina la instalación

In [144]:
installs_by_user_agent = installs[['user_agent', 'ref_hash']].groupby('user_agent').count().reset_index()
installs_by_user_agent.columns = ['user_agent', 'installs_por_user_agent']
installs_by_user_agent = installs_by_user_agent.merge(installs[['ref_hash', 'user_agent']], on = 'user_agent')

In [145]:
installs_by_user_agent.head()

Unnamed: 0,user_agent,installs_por_user_agent,ref_hash
0,%E3%83%93%E3%83%AA%E3%82%AA%E3%83%8D%E3%82%A2/...,1,5465584849989394434
1,%E4%BA%BF%E4%B8%87%E5%AF%8C%E7%BF%81/4.1.1313 ...,1,7105193871395775333
2,%E6%A2%A6%E5%B9%BB%E8%8A%B1%E5%9B%AD/3.3.2 CFN...,1,4924170003127137953
3,5miles/2717 CFNetwork/889.9 Darwin/17.2.0,1,3060582157001165122
4,5miles/2778 CFNetwork/758.5.3 Darwin/15.6.0,1,6100912187384224260


In [146]:
new_data7 = new_data5.merge(installs_by_user_agent[['installs_por_user_agent', 'ref_hash']], on = 'ref_hash', how = 'left')
new_data7.head()

Unnamed: 0,ref_hash,predict_time_install,popularidad_apps,timeToInstall_min,count_events,median_count_ref_type,timeToAuction_min,timeToAuction_max,installs_por_user_agent
0,1000169251625791246,259200.0,0.0,259200.0,2.0,13.0,76114.647428,255742.21327,
1,1000395625957344683,259200.0,0.0,259200.0,0.0,15.0,8034.974209,251092.530737,
2,1003027494996471685,259200.0,0.0,259200.0,9.0,168.0,12171.691046,105662.836023,
3,1006670001679961544,259200.0,0.0,259200.0,0.0,3.0,64857.60634,237780.114576,
4,1007573308966476713,259200.0,0.0,259200.0,0.0,7.0,18726.239096,254084.485745,


In [147]:
new_data7['installs_por_user_agent'] = new_data7['installs_por_user_agent'].fillna(0)

Pruebo el nuevo feature

In [148]:
#new_data[ 'auctions_by_srcID'] = new_data[ 'auctions_by_srcID'].fillna(0.0)
y = new_data7['predict_time_install']
X = new_data7.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




359.4379231558604

## Agrego un nuevo feature tomando en cuenta la session_user_agent

In [149]:
session_user_agent = installs[['ref_hash', 'session_user_agent']].groupby('session_user_agent').count().reset_index()
session_user_agent.head()

Unnamed: 0,session_user_agent,ref_hash
0,5010G Android 6.0,1
1,5041C Android 8.1.0,1
2,5059Z Android 8.1.0,1
3,ANE-AL00 Android 8.0.0,1
4,Apsalar-Postback,5678


In [150]:
session_feature = installs[['ref_hash', 'session_user_agent']].merge(session_user_agent, on = 'session_user_agent', how = 'left')
session_feature = session_feature.drop(columns = 'session_user_agent')
session_feature.columns = ['ref_hash', 'session_user_agent']
session_feature.head()

Unnamed: 0,ref_hash,session_user_agent
0,4716708407362582887,26386.0
1,7143568733100935872,26386.0
2,8326131692284822715,26386.0
3,7982672190758515108,26386.0
4,2374271566729163309,26386.0


Agrego el nuevo feature al set de features

In [151]:
new_data8 = new_data7.merge(session_feature, on = 'ref_hash', how = 'left')

In [152]:
new_data8.nunique()

ref_hash                   4037
predict_time_install       1588
popularidad_apps              4
timeToInstall_min          1588
count_events                141
median_count_ref_type       371
timeToAuction_min          3352
timeToAuction_max          3351
installs_por_user_agent     220
session_user_agent           38
dtype: int64

Pruebo el nuevo feature

In [153]:
y = new_data8['predict_time_install']
X = new_data8.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




409.38559334697237

Este feature empeoró el score, así que lo descarto

# Agrego features sobre los clicks

In [154]:
clks = pd.read_csv("data/clks_ventana7.csv", dtype = {'advertiser_id': np.int8, 'action_id': np.float32, 'source_id': np.int8, 'country_codde': 'category',  'carrier_id': np.float16, 'specs_brand': 'category', 'brand': np.float16, 'ref_type': 'category'})

In [155]:
clks.head()

Unnamed: 0,advertiser_id,action_id,source_id,created,country_code,latitude,longitude,wifi_connection,carrier_id,trans_id,os_minor,agent_device,os_major,specs_brand,brand,timeToClick,touchX,touchY,ref_type,ref_hash
0,1,,0,2019-04-25T13:47:20.817Z,6287817205707153877,1.714547,0.871535,False,9.0,4223LIw4hxGWLIFrxXYEVTWAHltzHao,6.795762e+18,,5.131616e+18,71913840936116953,,4.706,0.91,0.036,1891515180541284343,7727410219160218534
1,1,,1,2019-04-25T16:16:56.642Z,6287817205707153877,1.68608,0.880132,True,1.0,3i6jOkXH7UuqsJxa_T0ZM7RaRfCU8jQ,7.531669e+18,,5.648867e+18,3576558787748411622,,3.62,0.577,4.626,1891515180541284343,1054780916525454915
2,1,,1,2019-04-25T16:58:53.030Z,6287817205707153877,1.714512,0.871062,True,1.0,C9GdZ63zLHJP5mlMAVddVG0oBhZohI4,3.575963e+18,,5.754947e+18,3576558787748411622,,39.523,0.816,0.875,1891515180541284343,4412003964704268712
3,1,,3,2019-04-25T16:30:33.253Z,6287817205707153877,1.747826,0.86429,True,7.0,CMABIH7z_gPK7PkBeEqLnyrDXWtlzZE,3.575963e+18,,5.754947e+18,71913840936116953,,0.919,0.956,0.069,1891515180541284343,3891350874593022574
4,1,,1,2019-04-25T17:02:30.824Z,6287817205707153877,1.730828,0.855571,True,1.0,slT0WO8YGuHwT46rmGvtOxp6rq9aowg,5.310345e+18,,3.581233e+18,3576558787748411622,,128.765,0.613,0.709,1891515180541284343,4541657187949816744


In [156]:
clks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16927 entries, 0 to 16926
Data columns (total 20 columns):
advertiser_id      16927 non-null int8
action_id          4 non-null float32
source_id          16927 non-null int8
created            16927 non-null object
country_code       16927 non-null int64
latitude           16927 non-null float64
longitude          16927 non-null float64
wifi_connection    16927 non-null bool
carrier_id         16650 non-null float16
trans_id           16927 non-null object
os_minor           16920 non-null float64
agent_device       2030 non-null float64
os_major           16920 non-null float64
specs_brand        16927 non-null category
brand              4009 non-null float16
timeToClick        9497 non-null float64
touchX             11343 non-null float64
touchY             11343 non-null float64
ref_type           16927 non-null category
ref_hash           16927 non-null int64
dtypes: bool(1), category(2), float16(2), float32(1), float64(8), int64

## Creo feature sobre el tiempo que tarda cada dispositivo en dar el primer click a una publicidad

In [157]:
clicks_time = clks[['created', 'ref_hash']]
clicks_time['created'] = clks['created'].astype('datetime64[ns]')
clicks_time['date_inicial'] = dt.datetime(2019,4, 24)
clicks_time['time_to_click'] = (clicks_time['created'] - clicks_time['date_inicial'])/np.timedelta64(1,'s')
clicks_time.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,created,ref_hash,date_inicial,time_to_click
0,2019-04-25 13:47:20.817,7727410219160218534,2019-04-24,136040.817
1,2019-04-25 16:16:56.642,1054780916525454915,2019-04-24,145016.642
2,2019-04-25 16:58:53.030,4412003964704268712,2019-04-24,147533.03
3,2019-04-25 16:30:33.253,3891350874593022574,2019-04-24,145833.253
4,2019-04-25 17:02:30.824,4541657187949816744,2019-04-24,147750.824


In [158]:
clicks_time = clicks_time.groupby('ref_hash').agg({'time_to_click': 'min'}).reset_index()
clicks_time.columns = ['ref_hash', 'timeToClick_min']
clicks_time.head()

Unnamed: 0,ref_hash,timeToClick_min
0,7164788605058735,248498.913
1,7429113196145773,150386.344
2,8452408857001723,249410.902
3,8577232270715133,143664.373
4,10261987748713353,22499.898


In [159]:
clicks = clicks_time.merge(target[['ref_hash']], how = 'right')
clicks['timeToClick_min'] = clicks['timeToClick_min'].fillna(72 * 60 * 60)
clicks.count()

ref_hash           4037
timeToClick_min    4037
dtype: int64

## Pruebo el nuevo feature para las predicciones

St

In [160]:
data9 = data7.merge(clicks, on = 'ref_hash', how = 'left')
data9.count()

ref_hash                   4037
predict_value              4037
source_id0                 4037
source_id1                 4037
source_id2                 4037
source_id3                 4037
source_id4                 4037
source_id5                 4037
source_id6                 4037
source_id7                 4037
source_id8                 4037
source_id9                 4037
auctions_by_srcID          4037
auctions_by_day            4037
timeToAuction_min          4037
popularidad_apps_events    4037
timeToEvent_min            4037
timeToClick_min            4037
dtype: int64

In [161]:
y = data9['predict_value']
X = data9.drop(['ref_hash', 'predict_value'], axis=1)

XGB = xgb.XGBRegressor(learning_rate =0.075, n_estimators=95, max_depth=4, min_child_weight=6, 
                         gamma=0.3, subsample=0.8, colsample_bytree=0.8,
                         scale_pos_weight=0.8, seed = 15)
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




1436.6185565950498

Sc

In [162]:
new_data9 = new_data7.merge(clicks, on = 'ref_hash', how = 'left')
new_data9.count()

ref_hash                   4304
predict_time_install       4304
popularidad_apps           4304
timeToInstall_min          4304
count_events               4304
median_count_ref_type      4304
timeToAuction_min          4304
timeToAuction_max          3587
installs_por_user_agent    4304
timeToClick_min            4304
dtype: int64

In [163]:
y = new_data9['predict_time_install']
X = new_data9.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




360.076303850426

Con este feature el score de predicciones Sc mejoró un poco

Pruebo agregando el campo timeToClick

In [164]:
new_data10 = new_data9.merge(clks[['ref_hash', 'timeToClick']], on = 'ref_hash', how = 'left')
new_data10['timeToClick'] = new_data10['timeToClick'].fillna(72 * 60 * 60)
new_data10.count()

ref_hash                   4420
predict_time_install       4420
popularidad_apps           4420
timeToInstall_min          4420
count_events               4420
median_count_ref_type      4420
timeToAuction_min          4420
timeToAuction_max          3703
installs_por_user_agent    4420
timeToClick_min            4420
timeToClick                4420
dtype: int64

In [165]:
y = new_data10['predict_time_install']
X = new_data10.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




356.21312444578905

## Creo un nuevo feature tomando en cuenta el carrier_id

In [166]:
clks.head()

Unnamed: 0,advertiser_id,action_id,source_id,created,country_code,latitude,longitude,wifi_connection,carrier_id,trans_id,os_minor,agent_device,os_major,specs_brand,brand,timeToClick,touchX,touchY,ref_type,ref_hash
0,1,,0,2019-04-25T13:47:20.817Z,6287817205707153877,1.714547,0.871535,False,9.0,4223LIw4hxGWLIFrxXYEVTWAHltzHao,6.795762e+18,,5.131616e+18,71913840936116953,,4.706,0.91,0.036,1891515180541284343,7727410219160218534
1,1,,1,2019-04-25T16:16:56.642Z,6287817205707153877,1.68608,0.880132,True,1.0,3i6jOkXH7UuqsJxa_T0ZM7RaRfCU8jQ,7.531669e+18,,5.648867e+18,3576558787748411622,,3.62,0.577,4.626,1891515180541284343,1054780916525454915
2,1,,1,2019-04-25T16:58:53.030Z,6287817205707153877,1.714512,0.871062,True,1.0,C9GdZ63zLHJP5mlMAVddVG0oBhZohI4,3.575963e+18,,5.754947e+18,3576558787748411622,,39.523,0.816,0.875,1891515180541284343,4412003964704268712
3,1,,3,2019-04-25T16:30:33.253Z,6287817205707153877,1.747826,0.86429,True,7.0,CMABIH7z_gPK7PkBeEqLnyrDXWtlzZE,3.575963e+18,,5.754947e+18,71913840936116953,,0.919,0.956,0.069,1891515180541284343,3891350874593022574
4,1,,1,2019-04-25T17:02:30.824Z,6287817205707153877,1.730828,0.855571,True,1.0,slT0WO8YGuHwT46rmGvtOxp6rq9aowg,5.310345e+18,,3.581233e+18,3576558787748411622,,128.765,0.613,0.709,1891515180541284343,4541657187949816744


In [167]:
clks['carrier_id'].value_counts().head()

1.0     4536
2.0     1627
7.0     1584
13.0    1064
4.0      985
Name: carrier_id, dtype: int64

In [168]:
# Veo cuantos clicks tiene cada dispositivo para cada carrier_id
clicks_carrier_id = clks[['ref_hash', 'carrier_id', 'created']].groupby(['ref_hash', 'carrier_id']).count().reset_index()
clicks_carrier_id.head()

Unnamed: 0,ref_hash,carrier_id,created
0,7164788605058735,7.0,1
1,7429113196145773,11.0,1
2,8452408857001723,2.0,1
3,8577232270715133,7.0,1
4,10261987748713353,7.0,1


In [169]:
# Calculo la "popularidad" de cada carrier_id
carrier_id_pops = clks.groupby('carrier_id').agg({'ref_hash': 'count'}).reset_index()
carrier_id_pops.columns = ['carrier_id', 'popularidad_carrier_id']
carrier_id_pops.head()

Unnamed: 0,carrier_id,popularidad_carrier_id
0,0.0,868
1,1.0,4536
2,2.0,1627
3,3.0,106
4,4.0,985


In [170]:
carriers_populares = clicks_carrier_id.merge(carrier_id_pops, on = 'carrier_id')
carriers_populares.head()

Unnamed: 0,ref_hash,carrier_id,created,popularidad_carrier_id
0,7164788605058735,7.0,1,1584
1,8577232270715133,7.0,1,1584
2,10261987748713353,7.0,1,1584
3,17107379078229231,7.0,1,1584
4,20190713968793307,7.0,1,1584


In [171]:
# calculo la popularidad de cada carrier_id para cada dispositivo
carriers_populares['popularidad_carrier_id'] = carriers_populares['created'] * carriers_populares['popularidad_carrier_id']
carriers_populares = carriers_populares.groupby(['ref_hash']).agg({'popularidad_carrier_id': 'mean'}).reset_index()
carriers_populares.head()

Unnamed: 0,ref_hash,popularidad_carrier_id
0,7164788605058735,1584.0
1,7429113196145773,441.0
2,8452408857001723,1627.0
3,8577232270715133,1584.0
4,10261987748713353,1584.0


## Agrego el nuevo feature a los features de Sc

In [172]:
new_data11 = new_data10.merge(carriers_populares, on = 'ref_hash', how = 'left')
new_data11['popularidad_carrier_id'] = new_data11['popularidad_carrier_id'].fillna(0)
new_data11.head()

Unnamed: 0,ref_hash,predict_time_install,popularidad_apps,timeToInstall_min,count_events,median_count_ref_type,timeToAuction_min,timeToAuction_max,installs_por_user_agent,timeToClick_min,timeToClick,popularidad_carrier_id
0,1000169251625791246,259200.0,0.0,259200.0,2.0,13.0,76114.647428,255742.21327,0.0,259200.0,259200.0,0.0
1,1000395625957344683,259200.0,0.0,259200.0,0.0,15.0,8034.974209,251092.530737,0.0,259200.0,259200.0,0.0
2,1003027494996471685,259200.0,0.0,259200.0,9.0,168.0,12171.691046,105662.836023,0.0,259200.0,259200.0,0.0
3,1006670001679961544,259200.0,0.0,259200.0,0.0,3.0,64857.60634,237780.114576,0.0,259200.0,259200.0,0.0
4,1007573308966476713,259200.0,0.0,259200.0,0.0,7.0,18726.239096,254084.485745,0.0,259200.0,259200.0,0.0


In [173]:
new_data11.nunique()

ref_hash                   4037
predict_time_install       1588
popularidad_apps              4
timeToInstall_min          1588
count_events                141
median_count_ref_type       371
timeToAuction_min          3352
timeToAuction_max          3351
installs_por_user_agent     220
timeToClick_min             175
timeToClick                 140
popularidad_carrier_id       56
dtype: int64

Pruebo el nuevo feature

In [174]:
y = new_data11['predict_time_install']
X = new_data11.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




355.63627964682667

## Agrego feature timeToEvent

In [202]:
## Tomo el set con mejor score hasta ahora
new_data12 = new_data9.merge(events_time[['ref_hash', 'timeToEvent_min']], on = 'ref_hash', how = 'left')
new_data12.head()

Unnamed: 0,ref_hash,predict_time_install,popularidad_apps,timeToInstall_min,count_events,median_count_ref_type,timeToAuction_min,timeToAuction_max,installs_por_user_agent,timeToClick_min,timeToEvent_min
0,1000169251625791246,259200.0,0.0,259200.0,2.0,13.0,76114.647428,255742.21327,0.0,259200.0,36872.309
1,1000395625957344683,259200.0,0.0,259200.0,0.0,15.0,8034.974209,251092.530737,0.0,259200.0,
2,1003027494996471685,259200.0,0.0,259200.0,9.0,168.0,12171.691046,105662.836023,0.0,259200.0,73028.525
3,1006670001679961544,259200.0,0.0,259200.0,0.0,3.0,64857.60634,237780.114576,0.0,259200.0,
4,1007573308966476713,259200.0,0.0,259200.0,0.0,7.0,18726.239096,254084.485745,0.0,259200.0,


In [203]:
new_data12['timeToEvent_min'] = new_data12['timeToEvent_min'].fillna(72 * 60 * 60)
#new_data12['timeToEvent_max'] = new_data12['timeToEvent_max'].fillna(72 * 60 * 60)
new_data12.count()

ref_hash                   4304
predict_time_install       4304
popularidad_apps           4304
timeToInstall_min          4304
count_events               4304
median_count_ref_type      4304
timeToAuction_min          4304
timeToAuction_max          3587
installs_por_user_agent    4304
timeToClick_min            4304
timeToEvent_min            4304
dtype: int64

Pruebo el modelo

In [177]:
y = new_data12['predict_time_install']
X = new_data12.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor()
XGB.fit(X, y)

scores = cross_val_score(XGB , X, y, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




360.7729462289692

In [178]:
data_final = data9.merge(target[['ref_hash']], on = 'ref_hash')
data_final = data_final.drop(columns = ['ref_hash', 'predict_value'])

# Guardo los features de cada ventana

## Creo las predicciones St 

In [179]:
#Leo los datos de la ventana anterior
features = pd.read_csv('features_anteriores_St.csv')
features.head()

Unnamed: 0,ref_hash,predict_value,source_id0,source_id1,source_id2,source_id3,source_id4,source_id5,source_id6,source_id7,source_id8,source_id9,auctions_by_srcID,auctions_by_day,timeToAuction_min,popularidad_apps_events,timeToEvent_min
0,1000169251625791246,76114.647428,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.213594,2.5,46581.029283,3.0,6976.388
1,1000395625957344683,8034.974209,0.0,11.0,0.0,0.0,0.0,2.0,3.0,0.0,0.0,0.0,3.470511,7.0,2467.488253,0.0,259200.0
2,1003027494996471685,12171.691046,102.0,3.0,0.0,23.0,2.0,0.0,36.0,0.0,2.0,0.0,32.35841,84.0,98571.691046,14.0,3666.737
3,1006670001679961544,64857.60634,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.316228,0.5,151257.60634,0.0,259200.0
4,1007573308966476713,18726.239096,0.0,4.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.337494,3.0,32222.420264,0.0,259200.0


In [180]:
target.dtypes

ref_hash    int64
obj         int64
dtype: object

In [181]:
features = features.merge(target[['ref_hash']])
y_train = features['predict_value']
X_train = features.drop(['ref_hash', 'predict_value'], axis=1)

XGB = xgb.XGBRegressor( max_depth=3, learning_rate=0.1, n_estimators=100,
                       verbosity=1, silent=None, objective='reg:linear', n_jobs=1, gamma=0,
                       min_child_weight=1,  max_delta_step=0, reg_alpha=0, reg_lambda=1, 
                       scale_pos_weight=1, base_score=0.5, random_state=0, importance_type='gain')

scores = cross_val_score(XGB , X_train, y_train, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \


47472.292407998604

In [182]:
GBR = GradientBoostingRegressor(alpha =  0.9, 
                                learning_rate =  0.1, 
                                max_depth = 3, 
                                min_samples_split =  2, 
                                n_estimators = 50, 
                                subsample = 0.5)

scores = cross_val_score(GBR , X_train, y_train, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

47883.56561963584

In [183]:
data_final = data7.merge(target[['ref_hash']], on = 'ref_hash')
data_final = data_final.drop(columns = ['ref_hash', 'predict_value'])

In [184]:
data_final.head()

Unnamed: 0,source_id0,source_id1,source_id2,source_id3,source_id4,source_id5,source_id6,source_id7,source_id8,source_id9,auctions_by_srcID,auctions_by_day,timeToAuction_min,popularidad_apps_events,timeToEvent_min
0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.110961,5.5,76114.647428,2.0,36872.309
1,0.0,9.0,0.0,3.0,0.0,1.0,2.0,0.0,0.0,0.0,2.838231,6.5,8034.974209,0.0,259200.0
2,102.0,3.0,0.0,23.0,2.0,0.0,36.0,0.0,2.0,0.0,32.35841,84.0,12171.691046,5.0,73028.525
3,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.674949,1.5,64857.60634,0.0,259200.0
4,0.0,4.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.337494,3.0,18726.239096,0.0,259200.0


In [185]:
GBR = GradientBoostingRegressor(alpha =  0.9, 
                                learning_rate =  0.1, 
                                max_depth = 3, 
                                min_samples_split =  2, 
                                n_estimators = 50, 
                                subsample = 0.5)
GBR.fit(X_train, y_train)
result_st = GBR.predict(data_final)

In [186]:
target_st = target.drop(columns = 'obj')
target_st.head()

Unnamed: 0,ref_hash
0,1000169251625791246
2,1000395625957344683
4,1003027494996471685
6,1006670001679961544
8,1007573308966476713


In [187]:
submit = []
for l in result_st:
    submit.append(l)
target_st['obj'] = submit

In [188]:
target_st['ref_hash'] = target_st['ref_hash'].transform(lambda x: str(x) + '_st')
target_st.head()

Unnamed: 0,ref_hash,obj
0,1000169251625791246_st,55893.547415
2,1000395625957344683_st,60294.776208
4,1003027494996471685_st,20364.270168
6,1006670001679961544_st,135790.482656
8,1007573308966476713_st,93532.425466


## Predicciones Sc

In [189]:
# Cargo los features de la ventana anterior
features_train = pd.read_csv('features_anteriores_Sc.csv')
features_train.head()

Unnamed: 0,ref_hash,predict_time_install,popularidad_apps,timeToInstall_min,count_events,median_count_ref_type,timeToAuction_min,installs_por_user_agent,timeToClick_min,timeToEvent_min
0,1000169251625791246,259200.0,0.0,259200.0,3.0,7.0,46581.029283,0.0,259200.0,6976.388
1,1000395625957344683,259200.0,0.0,259200.0,0.0,16.0,2467.488253,0.0,259200.0,259200.0
2,1003027494996471685,259200.0,2.0,336982.31,22.0,168.0,98571.691046,0.0,259200.0,3666.737
3,1006670001679961544,259200.0,0.0,259200.0,0.0,1.0,151257.60634,0.0,259200.0,259200.0
4,1007573308966476713,259200.0,0.0,259200.0,0.0,7.0,32222.420264,0.0,259200.0,259200.0


In [190]:
features_train.nunique()

ref_hash                   4037
predict_time_install       1588
popularidad_apps              4
timeToInstall_min          1402
count_events                121
median_count_ref_type       364
timeToAuction_min          3134
installs_por_user_agent     215
timeToClick_min             144
timeToEvent_min            1848
dtype: int64

In [191]:
features_train = features_train.merge(target[['ref_hash']])
y_train = features_train['predict_time_install']
X_train = features_train.drop(['ref_hash', 'predict_time_install'], axis=1)

XGB = xgb.XGBRegressor( max_depth=3, learning_rate=0.1, n_estimators=100,
                       verbosity=1, silent=None, objective='reg:linear', n_jobs=1, gamma=0,
                       min_child_weight=1,  max_delta_step=0, reg_alpha=0, reg_lambda=1, 
                       scale_pos_weight=1, base_score=0.5, random_state=0, importance_type='gain')

scores = cross_val_score(XGB , X_train, y_train, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \


44984.71808367378

In [193]:
GBR = GradientBoostingRegressor(alpha =  0.9, 
                                learning_rate =  0.1, 
                                max_depth = 3, 
                                min_samples_split =  2, 
                                n_estimators = 50, 
                                subsample = 0.5)

scores = cross_val_score(GBR , X_train, y_train, scoring = "neg_mean_squared_error", cv=5)

# RMSE
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)
rmse_scores.mean()

44690.17582390061

In [209]:
feature_final = new_data12.merge(target[['ref_hash']], on = 'ref_hash')
feature_final = feature_final.drop_duplicates(subset = ['ref_hash'])
feature_final = feature_final.drop(columns = ['ref_hash', 'predict_time_install', 'timeToAuction_max'])

In [210]:
feature_final.count()

popularidad_apps           4037
timeToInstall_min          4037
count_events               4037
median_count_ref_type      4037
timeToAuction_min          4037
installs_por_user_agent    4037
timeToClick_min            4037
timeToEvent_min            4037
dtype: int64

In [211]:
GBR.fit(X_train, y_train)
result_sc = GBR.predict(feature_final)

In [212]:
len(result_sc)

4037

In [213]:
target_sc = target.drop(columns = 'obj')
target_sc.head()

Unnamed: 0,ref_hash
0,1000169251625791246
2,1000395625957344683
4,1003027494996471685
6,1006670001679961544
8,1007573308966476713


In [214]:
submit = []
for l in result_sc:
    submit.append(l)
target_sc['obj'] = submit

In [215]:
target_sc.head()

Unnamed: 0,ref_hash,obj
0,1000169251625791246,257862.679682
2,1000395625957344683,248769.755824
4,1003027494996471685,247829.625879
6,1006670001679961544,249139.250706
8,1007573308966476713,249514.332235


In [216]:
target_sc['ref_hash'] = target_sc['ref_hash'].transform(lambda x: str(x) + '_sc')
target_sc.head()

Unnamed: 0,ref_hash,obj
0,1000169251625791246_sc,257862.679682
2,1000395625957344683_sc,248769.755824
4,1003027494996471685_sc,247829.625879
6,1006670001679961544_sc,249139.250706
8,1007573308966476713_sc,249514.332235


## Armo el submit final


In [217]:
submit_final = target_st.merge(target_sc, on = 'ref_hash', how = 'outer')
submit_final = submit_final.sort_values(by = 'ref_hash')
submit_final.head()

Unnamed: 0,ref_hash,obj_x,obj_y
4037,1000169251625791246_sc,,257862.679682
0,1000169251625791246_st,55893.547415,
4038,1000395625957344683_sc,,248769.755824
1,1000395625957344683_st,60294.776208,
4039,1003027494996471685_sc,,247829.625879


In [218]:
submit_final['obj_x'] = submit_final['obj_x'].fillna(0)
submit_final['obj_y'] = submit_final['obj_y'].fillna(0)
submit_final['obj'] = submit_final['obj_x'] + submit_final['obj_y']
submit_final = submit_final[['ref_hash', 'obj']]
submit_final.head()

Unnamed: 0,ref_hash,obj
4037,1000169251625791246_sc,257862.679682
0,1000169251625791246_st,55893.547415
4038,1000395625957344683_sc,248769.755824
1,1000395625957344683_st,60294.776208
4039,1003027494996471685_sc,247829.625879


In [219]:
submit_final.count()

ref_hash    8074
obj         8074
dtype: int64

In [220]:
submit_final.to_csv(path_or_buf = 'submit_final2.csv', index = False)