In [1]:
import pandas as pd
import numpy as np

In [2]:
dtypes_dict = {'created':'str',  'ref_type': 'category', 'device_brand': np.float32, \
                'device_countrycode': 'category', 'device_language': 'category', \
                'ref_hash': 'category', 'kind': 'str', 'user_agent': 'str'}

installs = pd.read_csv('data/installs.csv.gzip', dtype=dtypes_dict, low_memory=False)

installs = installs.drop('device_countrycode', axis=1)
installs['created'] = pd.to_datetime(installs['created'], format='%Y-%m-%d %H:%M:%S')

In [3]:
installs = installs.drop(columns=['device_language', 'ip_address', 'event_uuid', 'device_brand', \
                         'device_model', 'click_hash', 'session_user_agent'])

In [4]:
# Normalizacion de valores a binario

installs['wifi'] = installs['wifi'].map({True: 1, False: 0})
installs['wifi'] = installs['wifi'].fillna(-1)

installs['attributed'] = installs['attributed'].map({True: 1, False: 0})
installs['implicit'] = installs['implicit'].map({True: 1, False: 0})

In [5]:
installs['ref_type_android'] = (installs['ref_type'].astype(str).str.contains("1891515180541284343") == True).astype(int)
installs['ref_type_iphone'] = (installs['ref_type'].astype(str).str.contains("1494519392962156891") == True).astype(int)

installs = installs.drop('ref_type', axis=1)

In [6]:
# features de dias

installs['weekday'] = installs['created'].dt.weekday

installs = pd.get_dummies(installs, columns=['weekday'])

# installs = installs.rename(index=str, columns={"weekday_0": "lunes", "weekday_1": "martes", "weekday_2": "miercoles", \
#                              "weekday_3": "jueves", "weekday_4": "viernes", "weekday_5": "sabado", \
#                              "weekday_6": "domingo"})

In [7]:
# feature horarios

bins = [0, 6, 13, 17, 25]
labels = ['morning','afternoon','evening','night']
hours = installs['created'].dt.hour
installs['time'] = pd.cut(hours-5+24 *(hours<5),bins=bins,labels=labels,right=False)

installs = pd.get_dummies(installs, columns=['time'])

In [8]:
# installs kinds to features, creo que no sirve

installs['kind_open'] = (installs['kind'].astype(str).str.lower().str.contains("open|abertura") == True).astype(int)
installs['kind_home'] = (installs['kind'].astype(str).str.lower().str.contains("home") == True).astype(int)
installs['kind_start'] = (installs['kind'].astype(str).str.lower().str.contains("start") == True).astype(int)
installs['kind_view'] = (installs['kind'].astype(str).str.lower().str.contains("view") == True).astype(int)
installs['kind_sign'] = (installs['kind'].astype(str).str.lower().str.contains("sign|registration|account") == True).astype(int)
installs['kind_login'] = (installs['kind'].astype(str).str.lower().str.contains("login|begin") == True).astype(int)
installs['kind_launch'] = (installs['kind'].astype(str).str.lower().str.contains("launch") == True).astype(int)
installs['kind_install'] = (installs['kind'].astype(str).str.lower().str.contains("install") == True).astype(int)
installs['kind_purchase'] = (installs['kind'].astype(str).str.lower().str.contains("purchase|buy") == True).astype(int)
installs['kind_add'] = (installs['kind'].astype(str).str.lower().str.contains("add") == True).astype(int)

In [9]:
# user_agent features

installs['agent_tiktok'] = (installs['user_agent'].astype(str).str.lower().str.contains("tiktok") == True).astype(int)
installs['agent_sindelantal'] = (installs['user_agent'].astype(str).str.lower().str.contains("sindelantal") == True).astype(int)
installs['agent_gardenscapes'] = (installs['user_agent'].astype(str).str.lower().str.contains("gardenscapes") == True).astype(int)
installs['agent_dalvik'] = (installs['user_agent'].astype(str).str.lower().str.contains("dalvik") == True).astype(int)
installs['agent_wish'] = (installs['user_agent'].astype(str).str.lower().str.contains("wish") == True).astype(int)
installs['agent_trivago'] = (installs['user_agent'].astype(str).str.lower().str.contains("trivago") == True).astype(int)
installs['agent_grability'] = (installs['user_agent'].astype(str).str.lower().str.contains("grability") == True).astype(int)
installs['agent_mercadopago'] = (installs['user_agent'].astype(str).str.lower().str.contains("mercadopago") == True).astype(int)

In [10]:
# contar cuantas veces instalo un usuario

installs['total_count_install'] = installs.groupby('ref_hash')['ref_hash'].transform('count')

In [11]:
installs = installs.drop(columns=['kind', 'user_agent', 'trans_id'])

# **Ventanas para train y test**

In [8]:
time_start_1 = '2019-04-18'
time_start_2 = '2019-04-19'
time_start_3 = '2019-04-20'
time_start_4 = '2019-04-21'
time_start_5 = '2019-04-22'

In [12]:
# Ventanas para train y testing

window_1 = installs.loc[installs['created'].between('2019-04-18', '2019-04-21')]
window_2 = installs.loc[installs['created'].between('2019-04-19', '2019-04-22')]
# window_3 = installs.loc[installs['created'].between('2019-04-20', '2019-04-23')]
# window_4 = installs.loc[installs['created'].between('2019-04-21', '2019-04-24')]
# window_5 = installs.loc[installs['created'].between('2019-04-22', '2019-04-25')]

In [13]:
# Ordeno por fechas

window_1 = window_1.sort_values('created')
window_2 = window_2.sort_values('created')
# window_3 = window_3.sort_values('created')
# window_4 = window_4.sort_values('created')
# window_5 = window_5.sort_values('created')

In [14]:
# Contar la cantidad de apariciones que tiene un usuario por ventada

window_1['count'] = window_1.groupby('ref_hash')['ref_hash'].transform('count')
window_2['count'] = window_2.groupby('ref_hash')['ref_hash'].transform('count')
# window_3['count'] = window_3.groupby('ref_hash')['ref_hash'].transform('count')
# window_4['count'] = window_4.groupby('ref_hash')['ref_hash'].transform('count')
# window_5['count'] = window_5.groupby('ref_hash')['ref_hash'].transform('count')

In [15]:
# primera y ultima instalacion por ref_hash

window_1['last_install'] = window_1.groupby('ref_hash')['created'].transform('max')
window_1['first_install'] = window_1.groupby('ref_hash')['created'].transform('min')

window_2['last_install'] = window_2.groupby('ref_hash')['created'].transform('max')
window_2['first_install'] = window_2.groupby('ref_hash')['created'].transform('min')

# window_3['last_install'] = window_3.groupby('ref_hash')['created'].transform('max')
# window_3['first_install'] = window_3.groupby('ref_hash')['created'].transform('min')

# window_4['last_install'] = window_4.groupby('ref_hash')['created'].transform('max')
# window_4['first_install'] = window_4.groupby('ref_hash')['created'].transform('min')

# window_5['last_install'] = window_5.groupby('ref_hash')['created'].transform('max')
# window_5['first_install'] = window_5.groupby('ref_hash')['created'].transform('min')

In [16]:
# diferencia de tiempo entre la primera y la ultima instalacion

window_1['diff_first_installs'] = (window_1['last_install'] - window_1['first_install']).dt.total_seconds()

window_2['diff_first_installs'] = (window_2['last_install'] - window_2['first_install']).dt.total_seconds()

# window_3['diff_first_installs'] = (window_3['last_install'] - window_3['first_install']).dt.total_seconds()

# window_4['diff_first_installs'] = (window_4['last_install'] - window_4['first_install']).dt.total_seconds()

# window_5['diff_first_installs'] = (window_5['last_install'] - window_5['first_install']).dt.total_seconds()

In [17]:
# Promedio de diferencias de tiempo entre todas las instalaciones
window_1['prom_time_install'] = 0
window_1.loc[window_1['count'] > 1, 'prom_time_install'] = ((window_1['last_install'] - window_1['first_install'])/ \
                                 (window_1['count'] -1)).dt.total_seconds()

window_2['prom_time_install'] = 0
window_2.loc[window_2['count'] > 1,'prom_time_install'] = ((window_2['last_install'] - window_2['first_install'])/ \
                                 (window_2['count'] - 1)).dt.total_seconds()

# window_3['prom_time_install'] = 0
# window_3.loc[window_3['count'] > 1,'prom_time_install'] = ((window_3['last_install'] - window_3['first_install'])/ \
#                                  (window_3['count'] - 1)).dt.total_seconds()

# window_4['prom_time_install'] = 0
# window_4.loc[window_4['count'] > 1, 'prom_time_install'] = ((window_4['last_install'] - window_4['first_install'])/ \
#                                  (window_4['count'] - 1)).dt.total_seconds()

# window_5['prom_time_install'] = 0
# window_5.loc[window_5['count'] > 1, 'prom_time_install'] = ((window_5['last_install'] - window_5['first_install'])/ \
#                                  (window_5['count'] -1 )).dt.total_seconds()

In [22]:
window_1.loc[window_1['count'] > 2].head()

Unnamed: 0,created,application_id,ref_hash,attributed,implicit,wifi,ref_type_android,ref_type_iphone,weekday_0,weekday_1,...,agent_wish,agent_trivago,agent_grability,agent_mercadopago,total_count_install,count,last_install,first_install,diff_first_installs,prom_time_install
111872,2019-04-18 00:00:33.627,65,4676578089958170485,0,0,1.0,1,0,0,0,...,0,0,0,0,3,3,2019-04-18 00:01:59.673,2019-04-18 00:00:33.627,86.046,43.023
308764,2019-04-18 00:00:51.554,26,4312630037032150640,0,1,-1.0,0,1,0,0,...,1,0,0,0,3,3,2019-04-18 00:00:52.185,2019-04-18 00:00:51.554,0.631,0.3155
308763,2019-04-18 00:00:51.647,26,4312630037032150640,0,1,-1.0,0,1,0,0,...,1,0,0,0,3,3,2019-04-18 00:00:52.185,2019-04-18 00:00:51.554,0.631,0.3155
308766,2019-04-18 00:00:52.185,26,4312630037032150640,0,0,-1.0,0,1,0,0,...,1,0,0,0,3,3,2019-04-18 00:00:52.185,2019-04-18 00:00:51.554,0.631,0.3155
465430,2019-04-18 00:01:52.515,121,4676578089958170485,0,0,1.0,1,0,0,0,...,0,0,0,0,3,3,2019-04-18 00:01:59.673,2019-04-18 00:00:33.627,86.046,43.023


In [23]:
window_1.isna().sum()

created                0
application_id         0
ref_hash               0
attributed             0
implicit               0
wifi                   0
ref_type_android       0
ref_type_iphone        0
weekday_0              0
weekday_1              0
weekday_2              0
weekday_3              0
weekday_4              0
weekday_5              0
weekday_6              0
time_morning           0
time_afternoon         0
time_evening           0
time_night             0
agent_tiktok           0
agent_sindelantal      0
agent_gardenscapes     0
agent_dalvik           0
agent_wish             0
agent_trivago          0
agent_grability        0
agent_mercadopago      0
total_count_install    0
count                  0
last_install           0
first_install          0
diff_first_installs    0
prom_time_install      0
dtype: int64

In [18]:
window_1 = window_1.drop(columns=['created', 'last_install', 'first_install'])
window_2 = window_2.drop(columns=['created', 'last_install', 'first_install'])

In [18]:
print(len(window_1))
print(len(window_2))
print(len(window_3))
print(len(window_4))
print(len(window_5))

160861
157363
158559
160617
159013


In [19]:
window_1 = window_1.groupby('ref_hash').head(1)
window_2 = window_2.groupby('ref_hash').head(1)
# asi['ref_hash'].value_counts()
# window_2 = window_2.groupby('ref_hash').first()

In [58]:
# window_1 = window_1.drop('ref_hash',axis=1)

In [None]:
join = window_1.

# **XGBOOX**

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import load_boston

In [51]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [66]:
train, test = train_test_split(window_1, test_size=0.2)

In [67]:
train_Y = train['prom_time_install']
train_X = train.drop(columns=['prom_time_install'])
test_Y = test['prom_time_install']
test_X = test.drop(columns=['prom_time_install'])

In [68]:
xg_reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

In [70]:
xg_reg.fit(train_X,train_Y)

preds = xg_reg.predict(test_X)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




In [71]:
rmse = np.sqrt(mean_squared_error(test_Y, preds))
print("RMSE: %f" % (rmse))

RMSE: 10112.565456


In [36]:
# rf = RandomForestRegressor(n_estimators = 2, random_state = 0)
# rf.fit(window_1,window_1)
# window_1_pred = rf.predict(window_2)

In [21]:
# installs = installs['kind'].fillna('unknown_')
# open_app = installs[installs['kind'].str.contains('open')]
# open_app

In [34]:
window_1_pred

array([[ 1.00168373e+17,  3.42000000e+01,  9.50000000e-01,
         0.00000000e+00,  0.00000000e+00,  6.00000000e-01],
       [ 1.00170625e+18,  2.31500000e+01,  1.00000000e+00,
         0.00000000e+00,  0.00000000e+00, -3.50000000e-01],
       [ 1.00180223e+18,  3.41500000e+01,  1.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  6.50000000e-01],
       ...,
       [ 9.98574351e+17,  9.42000000e+01,  1.00000000e+00,
         0.00000000e+00,  1.50000000e-01, -5.50000000e-01],
       [ 9.98576690e+17,  1.28000000e+02,  1.00000000e+00,
         0.00000000e+00,  5.00000000e-02, -8.50000000e-01],
       [ 9.98576690e+17,  1.28000000e+02,  1.00000000e+00,
         0.00000000e+00,  5.00000000e-02, -8.50000000e-01]])

In [None]:
rf.fit(windows[features], y)

### Clicks

In [37]:
clicks = pd.read_csv("data/clicks.csv.gzip")

clicks = clicks.drop(columns=['agent_device', 'os_minor', 'os_major', 'country_code', 'action_id'\
                'brand', 'touchX', 'touchY', 'specs_brand', 'carrier_id', 'longitude', 'latitude'], axis=1)

  interactivity=interactivity, compiler=compiler, result=result)


In [38]:
clicks.head()

Unnamed: 0,advertiser_id,action_id,source_id,created,wifi_connection,trans_id,timeToClick,ref_type,ref_hash
0,1,,2,2019-04-18T05:27:42.197Z,False,9JMAfrb-b9cSEVCJb0P9JfihGthaS7E,2.317,1891515180541284343,1293710398598742392
1,1,,1,2019-04-18T05:27:03.164Z,True,r3xtTRv2lInfiXG8JI3NQsNcBo8GyFQ,7.653,1891515180541284343,1663930990551616564
2,1,,1,2019-04-18T05:42:07.926Z,True,WOnHFqQtY48z_ygKZ-030U_g0TMGVMw,464.796,1891515180541284343,8488038938665586188
3,1,,1,2019-04-18T05:26:04.446Z,True,wQMLLmYqiFhSuha9p9B13PMtcyBW_vM,225.311,1891515180541284343,6488361690105189959
4,1,,1,2019-04-18T05:23:37.764Z,True,GeFoyBzMA7taylMxxjzlNPTU-n4FXFs,84.736,1891515180541284343,1348993302102753419


In [None]:
installs['wifi'] = installs['wifi'].map({True: 1, False: 0})
installs['wifi'] = installs['wifi'].fillna(-1)

installs['ref_type'] = installs['ref_type'].map({'1891515180541284343': 1, '1494519392962156891': 0})

In [74]:
window_1_click = clicks[clicks['created'].between('2019-04-18', '2019-04-21')]

In [75]:
clicks_first = pd.merge(window_1_click, window_1_click.groupby('ref_hash').agg('size').reset_index(), on='ref_hash', how='inner')

In [76]:
clicks_first = clicks_first.drop_duplicates('ref_hash')

In [90]:
installs_ref_hash_2 = pd.merge(installs, clicks, on='ref_hash', how='inner')

In [91]:
installs_ref_hash_2.head()

Unnamed: 0,created_x,application_id,ref_type_x,ref_hash,click_hash,attributed,implicit,kind,wifi,trans_id_x,advertiser_id,action_id,source_id,created_y,wifi_connection,trans_id_y,timeToClick,ref_type_y
