In [2]:
import pandas as pd
import numpy as np
import gc
from tqdm import tqdm
from sklearn.impute import SimpleImputer
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import pickle
from sklearn.linear_model import SGDRegressor
from sklearn import preprocessing
from sklearn.feature_extraction import FeatureHasher

**Armado de features**

In [2]:
objective = 0
window = 1

In [3]:
max_time = 24*3600*3

time_windows = [('2019-04-18 00:00:00.000000', '2019-04-21 00:00:00.000000'), ('2019-04-19 00:00:00.000000', '2019-04-22 00:00:00.000000'), 
                ('2019-04-20 00:00:00.000000', '2019-04-23 00:00:00.000000'), ('2019-04-21 00:00:00.000000', '2019-04-24 00:00:00.000000'), 
                ('2019-04-24 00:00:00.000000', '2019-04-27 00:00:00.000000')]
time_labels = [('2019-04-21 00:00:00.000000', '2019-04-24 00:00:00.000000'), ('2019-04-22 00:00:00.000000', '2019-04-25 00:00:00.000000'), 
               ('2019-04-23 00:00:00.000000', '2019-04-26 00:00:00.000000'), ('2019-04-24 00:00:00.000000', '2019-04-27 00:00:00.000000')]
days = [18, 19, 20, 21, 24]

model = pd.DataFrame()
targets = pd.DataFrame()

In [4]:
def armadoFeatures(window):
    auctions = pd.read_pickle('data/auctions_w'+str(window)+'.pkl')
    gb = auctions.groupby('ref_hash')
    auctions['n_auctions'] = gb['date'].transform('count')
    auctions['last_auction'] = gb['date'].transform('max')
    auctions['first_auction'] = gb['date'].transform('min')
    auctions['diff_auctions'] = (auctions['last_auction'] - auctions['first_auction']).dt.total_seconds()
    auctions['mean_time_auction'] = 0
    auctions.loc[auctions['n_auctions'] > 1, 'mean_time_auction'] = ((auctions['last_auction'] - auctions['first_auction'])/ \
                                     (auctions['n_auctions'] -1)).dt.total_seconds()
    auctions['first_auction_sec'] = (auctions['first_auction'] - pd.Timestamp(time_windows[window-1][0])).dt.total_seconds()
    auctions['last_auction_sec'] = (auctions['last_auction'] - pd.Timestamp(time_windows[window-1][0])).dt.total_seconds()
    auctions['last_auction_sec_to_end'] = (pd.Timestamp(time_windows[window-1][1]) - auctions['last_auction']).dt.total_seconds()
    auctions['ref_type_id_1'] = auctions['ref_type_id'].apply(lambda x: 1 if x==1 else 0)
    auctions['ref_type_id_7'] = auctions['ref_type_id'].apply(lambda x: 1 if x==7 else 0)
    auctions['day'] = (auctions['date'].dt.day) - days[window-1]
    gb = auctions.groupby('ref_hash')
    auctions = auctions.join(gb['source_id'].value_counts().unstack().add_prefix('source_'), on='ref_hash')
    auctions = pd.get_dummies(auctions, columns=['day'])
    gb = auctions.groupby('ref_hash')
    auctions['day_0'] = gb['day_0'].transform('sum')
    auctions['day_1'] = gb['day_1'].transform('sum')
    auctions['day_2'] = gb['day_2'].transform('sum')

    auctions.drop_duplicates(subset='ref_hash', inplace=True)
    auctions.drop(columns=['date', 'ref_type_id', 'last_auction', 'first_auction', 'source_id'], inplace=True)
    auctions = auctions.fillna(0)

    clicks = pd.read_pickle('data/clicks_w'+str(window)+'.pkl')
    clicks.drop(columns=['action_id', 'agent_device'])
    gb = clicks.groupby('ref_hash')
    clicks['n_clicks'] = gb['created'].transform('count')
    clicks['last_click'] = gb['created'].transform('max')
    clicks['first_click'] = gb['created'].transform('min')
    clicks['diff_clicks'] = (clicks['last_click'] - clicks['first_click']).dt.total_seconds()
    clicks['mean_time_click'] = 0
    clicks.loc[clicks['n_clicks'] > 1, 'mean_time_click'] = ((clicks['last_click'] - clicks['first_click'])/ \
                                     (clicks['n_clicks'] -1)).dt.total_seconds()
    clicks['first_click_sec'] = (clicks['first_click'] - pd.Timestamp(time_windows[window-1][0])).dt.total_seconds()
    clicks['last_click_sec'] = (clicks['last_click'] - pd.Timestamp(time_windows[window-1][0])).dt.total_seconds()
    clicks['wifi_connection'] = clicks['wifi_connection'].map({True: 1, False: 0})
    clicks['timeToClick_mean'] = clicks.groupby('ref_hash')['timeToClick'].transform('mean')
    clicks.loc[clicks.touchX == 'Infinity', 'touchX'] = 1
    clicks.loc[clicks.touchY == 'Infinity', 'touchY'] = 10
    clicks["touchX"] = pd.to_numeric(clicks["touchX"])
    clicks["touchY"] = pd.to_numeric(clicks["touchY"])
    clicks['touch_bottom'] = clicks['touchY'].apply(lambda x: 1 if x<=1 else 0)
    clicks['touch_bottom2'] = clicks['touchY'].apply(lambda x: 1 if x>1 and x<=2 else 0)
    top_10_carrier_id = clicks['carrier_id'].value_counts().head(10)
    clicks.loc[(~clicks['carrier_id'].isin(top_10_carrier_id.index))&(clicks['carrier_id'].notnull()), 'carrier_id'] = 'Other'
    gb = clicks.groupby('ref_hash')
    clicks = clicks.join(gb['advertiser_id'].value_counts().unstack().add_prefix('advertiser_id_'), on='ref_hash')
    clicks = clicks.join(gb['source_id'].value_counts().unstack().add_prefix('source_id_'), on='ref_hash')
    clicks['touchX_mean'] = gb['touchX'].transform('mean')
    clicks['touchY_mean'] = gb['touchY'].transform('mean')
    clicks['touchs_in_bottom'] = gb['touch_bottom'].transform('sum')
    clicks['touchs_in_bottom2'] = gb['touch_bottom2'].transform('sum')
    clicks['latitude_mean'] = gb['latitude'].transform('mean')
    clicks['longitude_mean'] = gb['longitude'].transform('mean')
    clicks['timeToClick_mean'] = clicks['timeToClick_mean'].fillna(clicks['timeToClick_mean'].mean())
    clicks['touchX_mean'] = clicks['touchX_mean'].fillna(clicks['touchX_mean'].mean())
    clicks['touchY_mean'] = clicks['touchY_mean'].fillna(clicks['touchY_mean'].mean())

    clicks.drop_duplicates(subset='ref_hash', inplace=True)
    clicks = clicks.drop(columns=['advertiser_id', 'source_id', 'created', 'country_code', 'latitude', 'longitude', 'wifi_connection', 'carrier_id', 'trans_id', 'agent_device',
       'os_minor', 'os_major', 'specs_brand', 'brand', 'timeToClick', 'touchX', 'touchY', 'ref_type', 'last_click', 'first_click', 'touch_bottom', 'touch_bottom2', 'action_id'])

    modelo = pd.merge(auctions, clicks, on='ref_hash', how='outer')
    auctions = 0
    clicks = 0
    gc.collect()
    
    installs = pd.read_pickle('data/installs_w'+str(window)+'.pkl')
    installs = installs.drop(columns=['device_countrycode', 'ip_address', 'event_uuid', 'click_hash', 'device_brand', 'device_model'])
    #installs = installs.drop(columns=['device_language', 'device_countrycode', 'ip_address', 'event_uuid', 'device_brand', 'device_model', 'click_hash', 'session_user_agent'])
    installs.loc[installs['kind'] == 'OPEN', 'kind'] = 'Open'
    installs.loc[installs['kind'] == 'app open', 'kind'] = 'app_open'
    installs.loc[installs['kind'] == 'af app open', 'kind'] = 'af_app_opened'
    installs.loc[installs['kind'] == 'af_app_opend', 'kind'] = 'af_app_opened'
    installs.loc[installs['kind'] == 'Session Begin', 'kind'] = 'sessionbegin'
    installs.loc[installs['kind'] == 'signed in', 'kind'] = 'Sign In'
    top_20_installs_kind = installs['kind'].value_counts().head(20)
    installs.loc[(~installs['kind'].isin(top_20_installs_kind.index))&(installs['kind'].notnull()), 'kind'] = 'Other'
    top_15_installs_app = installs['application_id'].value_counts().head(15)
    installs.loc[(~installs['application_id'].isin(top_15_installs_app.index))&(installs['application_id'].notnull()), 'application_id'] = 'Other'
    installs['created'] = pd.to_datetime(installs['created'], format='%Y-%m-%d %H:%M:%S')
    installs['wifi_installs'] = installs['wifi'].map({True: 1, False: 0})
    installs['attributed'] = installs['attributed'].map({True: 1, False: 0})
    installs['implicit'] = installs['implicit'].map({True: 1, False: 0})
    gb = installs.groupby('ref_hash')
    installs = installs.join(gb['kind'].value_counts().unstack().add_prefix('kind_'), on='ref_hash')
    installs = installs.join(gb['application_id'].value_counts().unstack().add_prefix('application_id_'), on='ref_hash')
    installs['wifi_installs_mean'] = gb['wifi_installs'].transform('mean')
    installs['attributed_installs_mean'] = gb['attributed'].transform('mean')
    installs['n_installs'] = gb['created'].transform('count')
    installs['last_install'] = gb['created'].transform('max')
    installs['first_install'] = gb['created'].transform('min')
    installs['diff_installs'] = (installs['last_install'] - installs['first_install']).dt.total_seconds()
    installs['mean_time_install'] = 0
    installs.loc[installs['n_installs'] > 1, 'mean_time_install'] = ((installs['last_install'] - installs['first_install'])/ \
                                     (installs['n_installs'] -1)).dt.total_seconds()
    installs['first_install_sec'] = (installs['first_install'] - pd.Timestamp(time_windows[window-1][0])).dt.total_seconds()
    installs['last_install_sec'] = (installs['last_install'] - pd.Timestamp(time_windows[window-1][0])).dt.total_seconds()
    
    installs.drop_duplicates(subset='ref_hash', inplace=True)
    installs.drop(columns=['created', 'application_id', 'ref_type', 'attributed',
       'implicit', 'user_agent', 'kind', 'wifi', 'trans_id', 'device_language',
       'wifi_installs', 'last_install', 'first_install', 'session_user_agent'], inplace=True)

    modelo = pd.merge(modelo, installs, on='ref_hash', how='outer')
    
    installs = 0
    gc.collect()
    
    events = pd.read_pickle('data/events_w'+str(window)+'.pkl')
    events.drop(columns=['event_uuid', 'ip_address', 'index', 'device_countrycode', 'trans_id'], inplace=True)

    events['attributed'] = events['attributed'].map({True: 1, False: 0})
    events['wifi'] = events['wifi'].map({True: 1, False: 0})
    top_15_events_id = events['event_id'].value_counts().head(15)
    events.loc[(~events['event_id'].isin(top_15_events_id.index))&(events['event_id'].notnull()), 'event_id'] = 'Other'
    top_20_application_id = events['application_id'].value_counts().head(20)
    events.loc[(~events['application_id'].isin(top_20_application_id.index))&(events['application_id'].notnull()), 'application_id'] = 'Other'
    #top_15_device_os_version = events['device_os_version'].value_counts().head(15)
    #events.loc[(~events['device_os_version'].isin(top_15_device_os_version.index))&(events['device_os_version'].notnull()), 'device_os_version'] = 'Other'
    #top_15_device_brand = events['device_brand'].value_counts().head(15)
    #events.loc[(~events['device_brand'].isin(top_15_device_brand.index))&(events['device_brand'].notnull()), 'device_brand'] = 'Other'
    #top_15_device_model = events['device_model'].value_counts().head(15)
    #events.loc[(~events['device_model'].isin(top_15_device_model.index))&(events['device_model'].notnull()), 'device_model'] = 'Other'
    top_10_session_user_agent = events['session_user_agent'].value_counts().head(10)
    events.loc[(~events['session_user_agent'].isin(top_10_session_user_agent.index))&(events['session_user_agent'].notnull()), 'session_user_agent'] = 'Other'
    #top_10_carrier = events['carrier'].value_counts().head(10)
    #events.loc[(~events['carrier'].isin(top_10_carrier.index))&(events['carrier'].notnull()), 'carrier'] = 'Other'
    top_15_kind = events['kind'].value_counts().head(15)
    events.loc[(~events['kind'].isin(top_15_kind.index))&(events['kind'].notnull()), 'kind'] = 'Other'
    #top_15_device_language = events['device_language'].value_counts().head(15)
    #events.loc[(~events['device_language'].isin(top_15_device_language.index))&(events['device_language'].notnull()), 'device_language'] = 'Other'

    gb = events.groupby('ref_hash')
    events = events.join(gb['event_id'].value_counts().unstack().add_prefix('event_id_'), on='ref_hash')
    events = events.join(gb['application_id'].value_counts().unstack().add_prefix('application_id_'), on='ref_hash')
    #events = events.join(gb['device_os_version'].value_counts().unstack().add_prefix('device_os_version_'), on='ref_hash')
    #events = events.join(gb['device_brand'].value_counts().unstack().add_prefix('device_brand_'), on='ref_hash')
    #events = events.join(gb['device_model'].value_counts().unstack().add_prefix('device_model_'), on='ref_hash')
    #events = events.join(gb['carrier'].value_counts().unstack().add_prefix('carrier_'), on='ref_hash')
    events = events.join(gb['session_user_agent'].value_counts().unstack().add_prefix('session_user_agent_'), on='ref_hash')
    events = events.join(gb['kind'].value_counts().unstack().add_prefix('kind_'), on='ref_hash')
    #events = events.join(gb['device_language'].value_counts().unstack().add_prefix('device_language_'), on='ref_hash')

    events['n_events'] = gb['date'].transform('count')
    events['attributed_events_mean'] = gb['attributed'].transform('mean')
    events['wifi_events_mean'] = gb['wifi'].transform('mean')
    events['first_event'] = gb['date'].transform('min')
    events['last_event'] = gb['date'].transform('max')
    events['diff_events'] = (events['last_event'] - events['first_event']).dt.total_seconds()
    events['mean_time_events'] = 0
    events.loc[events['n_events'] > 1, 'mean_time_events'] = ((events['last_event'] - events['first_event'])/ \
                                     (events['n_events'] -1)).dt.total_seconds()
    events['first_event_sec'] = (events['first_event'] - pd.Timestamp(time_windows[window-1][0])).dt.total_seconds()
    events['last_event_sec'] = (events['last_event'] - pd.Timestamp(time_windows[window-1][0])).dt.total_seconds()
    
    events.drop_duplicates(subset='ref_hash', inplace=True)
    events = events.drop(columns=['date', 'event_id', 'ref_type', 'application_id', 'attributed', 'device_os_version', 'device_brand', 'device_model', 
       'device_city', 'session_user_agent', 'user_agent', 'carrier', 'kind', 'device_os', 'wifi', 'connection_type', 'device_language', 'first_event', 'last_event'])
    
    modelo = pd.merge(modelo, events, on='ref_hash', how='outer')
    events = 0
    gc.collect()
    return modelo

def imputacionValores(df):
    df['last_auction_sec_to_end'].fillna(24*3600*3, inplace=True)

    features = ['diff_auctions', 'mean_time_auction', 'diff_clicks',
                'mean_time_click', 'timeToClick_mean', 'touchX_mean', 
                'touchY_mean', 'latitude_mean', 'longitude_mean',
                'wifi_installs_mean', 'attributed_installs_mean', 'diff_installs',
                'mean_time_install', 'time_appearence', 'time_appearence_install', 
                'attributed_events_mean', 'wifi_events_mean', 'diff_events',
                'mean_time_events']

    for feature in features:
        if feature not in df:
            continue
        else:
            df[feature] = df[feature].fillna(df[feature].mean())

    for feature in df.columns:
        df[feature].fillna(0, inplace=True)
    return df

In [5]:
model = pd.DataFrame()

for i in tqdm(range(1,5)):
    modelo = armadoFeatures(i)

    if window <= 4:
        auctions_label = pd.read_pickle('data/auctions_w'+str(window)+'_label.pkl')
        installs_label = pd.read_pickle('data/installs_w'+str(window)+'_label.pkl')
        modelo = pd.merge(modelo, auctions_label, on='ref_hash', how='outer')
        modelo = pd.merge(modelo, installs_label, on='ref_hash', how='outer')
        modelo['time_appearence'] = modelo['time_appearence'].fillna(max_time)
        modelo['time_appearence_install'] = modelo['time_appearence_install'].fillna(max_time)

    model = pd.concat([model, modelo], sort=False)

    modelo = 0
    gc.collect()
    
#Imputacion valores
model = imputacionValores(model)

100%|███████████████████████████████████████████| 4/4 [20:01<00:00, 299.15s/it]


In [17]:
target_features = ['time_appearence', 'time_appearence_install']
train, test = train_test_split(model.drop(columns=['ref_hash']), test_size=0.2)

train_Y = train[target_features[objective]]
train_X = train.drop(columns=target_features)
test_Y = test[target_features[objective]]
test_X = test.drop(columns=target_features)

In [18]:
lgb_model = lgb.LGBMModel(boosting_type="gbdt", num_leaves=70, max_depth=None, learning_rate=0.025, n_estimators=500, 
                      max_bin=500, subsample_for_bin=50000, objective='regression', min_split_gain=0, min_child_weight=5, 
                      min_child_samples=10, subsample=1, subsample_freq=1, colsample_bytree=1, reg_alpha=0, reg_lambda=0, 
                      seed=0, silent=True, categoricals=categoricals)

In [20]:
lgb_model.fit(train_X, train_Y)

LGBMModel(boosting_type='gbdt',
     categoricals=['ref_type_id', 'source_id', 'carrier_id', 'agent_device', 'os_minor', 'os_major', 'specs_brand', 'brand', 'ref_type_x', 'ref_type_y', 'device_brand', 'device_model', 'session_user_agent', 'device_os'],
     class_weight=None, colsample_bytree=1, importance_type='split',
     learning_rate=0.025, max_bin=500, max_depth=None,
     min_child_samples=10, min_child_weight=5, min_split_gain=0,
     n_estimators=500, n_jobs=-1, num_leaves=70, objective='regression',
     random_state=None, reg_alpha=0, reg_lambda=0, seed=0, silent=True,
     subsample=1, subsample_for_bin=50000, subsample_freq=1)

In [23]:
error_train = np.sqrt(mean_squared_error(lgb_model.predict(train_X), train_Y))
error_train

69173.11044756648

In [24]:
error_test = np.sqrt(mean_squared_error(lgb_model.predict(test_X), test_Y))
error_test

69219.4417460478

In [195]:
data.to_pickle('lgb1_data_final.pkl')

**Predicciones**

In [5]:
data = pd.read_pickle('nuevos_modelos/lgb1_data_final.pkl')
target = pd.read_csv('data/target_competencia_ids.csv')
target['ref_hash'] = target['ref_hash'].apply(lambda x: int(str(x)[:-3]))
target = target.drop_duplicates('ref_hash')

In [6]:
data.shape

(4037, 176)

In [7]:
#Predicciones auctions:
lgb_model_auctions = pickle.load(open('nuevos_modelos/lgb1_all_auctions', 'rb'))

pred_data_auctions = lgb_model_auctions.predict(data.drop(columns=['ref_hash']))
predictions_auctions = pd.DataFrame({'ref_hash': data['ref_hash'],'obj': pred_data_auctions})

predictions_auctions['ref_hash'] = predictions_auctions['ref_hash'].apply(lambda x: str(x) + '_st')

In [8]:
#Predicciones installs:
lgb_model_installs = pickle.load(open('nuevos_modelos/lgb1_all_installs', 'rb'))

pred_data_installs = lgb_model_installs.predict(data.drop(columns=['ref_hash']))
predictions_installs = pd.DataFrame({'ref_hash': data['ref_hash'],'obj': pred_data_installs})

predictions_installs['ref_hash'] = predictions_installs['ref_hash'].apply(lambda x: str(x) + '_sc')

In [9]:
predictions_auctions.loc[round(predictions_auctions['obj']) == 189823, 'obj'] = (189823.284861+predictions_auctions['obj'].mean())/2

In [10]:
predictions = pd.concat([predictions_auctions, predictions_installs])

In [11]:
predictions.head(10)

Unnamed: 0,ref_hash,obj
0,8027109759910869730_st,161952.365447
1,3805512975348983658_st,108786.033585
2,706875581985023190_st,64673.774385
3,9201763056911976665_st,27848.106768
4,2070001883938629880_st,46166.708285
5,2956299000597738624_st,2726.349251
6,5051062186658844309_st,41323.710753
7,3729857814892336524_st,43936.008958
8,8048087799114816623_st,122001.951295
9,7988921706433140919_st,37892.576905


In [188]:
predictions.to_csv('predicciones/predicciones19.csv', index=False)