In [1]:
import pandas as pd
import numpy as np
import gc
from tqdm import tqdm
from sklearn.impute import SimpleImputer
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import pickle
from sklearn.linear_model import SGDRegressor
from sklearn import preprocessing
from sklearn.feature_extraction import FeatureHasher

**Armado de features**

In [2]:
objective = 0
window = 1

In [3]:
max_time = 24*3600*3

time_windows = [('2019-04-18 00:00:00.000000', '2019-04-21 00:00:00.000000'), ('2019-04-19 00:00:00.000000', '2019-04-22 00:00:00.000000'), 
                ('2019-04-20 00:00:00.000000', '2019-04-23 00:00:00.000000'), ('2019-04-21 00:00:00.000000', '2019-04-24 00:00:00.000000'), 
                ('2019-04-24 00:00:00.000000', '2019-04-27 00:00:00.000000')]
time_labels = [('2019-04-21 00:00:00.000000', '2019-04-24 00:00:00.000000'), ('2019-04-22 00:00:00.000000', '2019-04-25 00:00:00.000000'), 
               ('2019-04-23 00:00:00.000000', '2019-04-26 00:00:00.000000'), ('2019-04-24 00:00:00.000000', '2019-04-27 00:00:00.000000')]
days = [18, 19, 20, 21, 24]

model = pd.DataFrame()
targets = pd.DataFrame()

In [14]:
def armadoFeatures(window):
    auctions = pd.read_pickle('data/auctions_w'+str(window)+'.pkl')
    auctions['ref_type_id'] = auctions['ref_type_id'].astype(np.int8)

    gb = auctions.groupby('ref_hash')
    auctions['n_auctions'] = gb['date'].transform('count').astype(np.int16)
    auctions['last_auction'] = gb['date'].transform('max')
    auctions['first_auction'] = gb['date'].transform('min')
    auctions['diff_auctions'] = (auctions['last_auction'] - auctions['first_auction']).dt.total_seconds().astype(np.int32)
    auctions['mean_time_auction'] = 0
    auctions.loc[auctions['n_auctions'] > 1, 'mean_time_auction'] = ((auctions['last_auction'] - auctions['first_auction'])/ \
                                     (auctions['n_auctions'] -1)).dt.total_seconds()
    auctions['first_auction_sec'] = (auctions['first_auction'] - pd.Timestamp(time_windows[window-1][0])).dt.total_seconds().astype(np.int32)
    auctions['last_auction_sec'] = (auctions['last_auction'] - pd.Timestamp(time_windows[window-1][0])).dt.total_seconds().astype(np.int32)
    auctions['last_auction_sec_to_end'] = (pd.Timestamp(time_windows[window-1][1]) - auctions['last_auction']).dt.total_seconds().astype(np.int32)
    auctions['day'] = ((auctions['date'].dt.day) - days[window-1]).astype(np.int8)
    auctions = pd.get_dummies(auctions, columns=['day'])
    gb = auctions.groupby('ref_hash')
    auctions['day_0'] = gb['day_0'].transform('sum').astype(np.int16)
    auctions['day_1'] = gb['day_1'].transform('sum').astype(np.int16)
    auctions['day_2'] = gb['day_2'].transform('sum').astype(np.int16)
    auctions = encodeCategoricals(auctions, 'source_id', [1, 0, 3, 7, 6, 8, 5, 4, 2, 9])

    auctions.drop_duplicates(subset='ref_hash', inplace=True)
    auctions.drop(columns=['date', 'last_auction', 'first_auction', 'source_id'], inplace=True)
    auctions = auctions.fillna(0)

    clicks = pd.read_pickle('data/clicks_w'+str(window)+'.pkl')
    clicks.drop(columns=['action_id', 'agent_device', 'country_code', 'trans_id'], inplace=True)
    clicks['advertiser_id'] = clicks['advertiser_id'].astype(np.int8)
    clicks['source_id'] = clicks['source_id'].astype(np.int8)
    clicks['latitude'] = clicks['latitude'].astype(np.float32)
    clicks['longitude'] = clicks['longitude'].astype(np.float32)
    clicks['wifi_connection'] = clicks['wifi_connection'].map({True: 1, False: 0}).astype(np.int8)
    clicks['carrier_id'] = ((clicks['carrier_id'].fillna(-1))).astype(np.int16)
    clicks['os_major'] = (clicks['os_major'].fillna(0)/10e15).astype(np.int16)
    clicks['os_minor'] = (clicks['os_minor'].fillna(0)/10e15).astype(np.int16)
    clicks['specs_brand'] = clicks['specs_brand'].map({71913840936116953: 0, 3576558787748411622: 1, 784329784168794382: 2})
    clicks['brand'] = (clicks['brand'].fillna(-1)).astype(np.int16)
    clicks['ref_type'] = clicks['ref_type'].map({1891515180541284343: 1, 1494519392962156891: 0}).astype(np.int8)
    gb = clicks.groupby('ref_hash')
    clicks['n_clicks'] = gb['created'].transform('count')
    clicks['last_click'] = gb['created'].transform('max')
    clicks['first_click'] = gb['created'].transform('min')
    clicks['timeToClick_mean'] = gb['timeToClick'].transform('mean')
    clicks['carrier_id'] = gb['carrier_id'].transform(lambda x: x.mode().iloc[0])
    clicks['os_minor'] = gb['os_minor'].transform(lambda x: x.mode().iloc[0])
    clicks['os_major'] = gb['os_major'].transform(lambda x: x.mode().iloc[0])
    clicks['diff_clicks'] = (clicks['last_click'] - clicks['first_click']).dt.total_seconds().astype(np.int32)
    clicks['mean_time_click'] = 0
    clicks.loc[clicks['n_clicks'] > 1, 'mean_time_click'] = ((clicks['last_click'] - clicks['first_click'])/ \
                                     (clicks['n_clicks'] -1)).dt.total_seconds()
    clicks['first_click_sec'] = (clicks['first_click'] - pd.Timestamp(time_windows[window-1][0])).dt.total_seconds().astype(np.int32)
    clicks['last_click_sec'] = (clicks['last_click'] - pd.Timestamp(time_windows[window-1][0])).dt.total_seconds().astype(np.int32)
    clicks['timeToClick_mean'] = clicks.groupby('ref_hash')['timeToClick'].transform('mean')
    clicks.loc[clicks.touchX == 'Infinity', 'touchX'] = 1
    clicks.loc[clicks.touchY == 'Infinity', 'touchY'] = 10
    clicks["touchX"] = pd.to_numeric(clicks["touchX"])
    clicks["touchY"] = pd.to_numeric(clicks["touchY"])
    clicks['touch_bottom'] = clicks['touchY'].apply(lambda x: 1 if x<=1 else 0).astype(np.int8)
    clicks['touch_bottom2'] = clicks['touchY'].apply(lambda x: 1 if x>1 and x<=2 else 0).astype(np.int8)

    gb = clicks.groupby('ref_hash')
    clicks['touchX_mean'] = gb['touchX'].transform('mean')
    clicks['touchY_mean'] = gb['touchY'].transform('mean')
    clicks['touchs_in_bottom'] = gb['touch_bottom'].transform('sum').astype(np.int16)
    clicks['touchs_in_bottom2'] = gb['touch_bottom2'].transform('sum').astype(np.int16)
    clicks['latitude_mean'] = gb['latitude'].transform('mean')
    clicks['longitude_mean'] = gb['longitude'].transform('mean')
    clicks['timeToClick_mean'] = clicks['timeToClick_mean'].fillna(clicks['timeToClick_mean'].mean())
    clicks['touchX_mean'] = clicks['touchX_mean'].fillna(clicks['touchX_mean'].mean())
    clicks['touchY_mean'] = clicks['touchY_mean'].fillna(clicks['touchY_mean'].mean())
    clicks['last_click_sec_to_end'] = (pd.Timestamp(time_windows[window-1][1]) - clicks['last_click']).dt.total_seconds().astype(np.int32)
    clicks = encodeCategoricals(clicks, 'advertiser_id', [2, 1, 0, 3, 4])
    clicks = encodeCategoricals(clicks, 'source_id', [1, 0, 3, 2, 6, 4, 5, 7, 8])
    
    clicks.drop_duplicates(subset='ref_hash', inplace=True)
    clicks = clicks.drop(columns=['advertiser_id', 'source_id', 'created', 'latitude', 'longitude', 'wifi_connection', 
                                  'timeToClick', 'touchX', 'touchY','last_click', 'first_click', 'touch_bottom', 'touch_bottom2'])

    modelo = pd.merge(auctions, clicks, on='ref_hash', how='outer')
    auctions = 0
    clicks = 0
    gc.collect()
    
    installs = pd.read_pickle('data/installs_w'+str(window)+'.pkl')
    installs = installs.drop(columns=['device_countrycode', 'ip_address', 'event_uuid', 'click_hash', 'trans_id'])
    installs['ref_type'] = installs['ref_type'].map({1891515180541284343: 1, 1494519392962156891: 0}).astype(np.int8)
    installs['kind'] = installs['kind'].str.lower()
    installs.loc[installs['kind'] == 'OPEN', 'kind'] = 'open'
    installs.loc[installs['kind'] == 'app_open', 'kind'] = 'open'
    installs.loc[installs['kind'] == 'app open', 'kind'] = 'open'
    installs.loc[installs['kind'] == 'af app open', 'kind'] = 'open'
    installs.loc[installs['kind'] == 'af_app_opened', 'kind'] = 'open'
    installs.loc[installs['kind'] == 'af_app_opend', 'kind'] = 'open'
    installs.loc[installs['kind'] == 'Session Begin', 'kind'] = 'sessionbegin'
    installs.loc[installs['kind'] == 'signed in', 'kind'] = 'sign in'
    installs.loc[installs['kind'] == 'opened_app', 'kind'] = 'open'
    installs.loc[installs['kind'] == 'appopened', 'kind'] = 'open'
    installs.loc[installs['kind'] == 'event_open_app', 'kind'] = 'open'
    installs.loc[installs['kind'] == 'app_opened', 'kind'] = 'open'
    installs.loc[installs['kind'] == 'app_launch', 'kind'] = 'open'
    installs.loc[installs['kind'] == 'app launch', 'kind'] = 'open'
    installs.loc[installs['kind'] == 'app launched', 'kind'] = 'open'
    installs.loc[installs['kind'] == 'application open', 'kind'] = 'open'
    installs.loc[installs['kind'] == 'abertura do app', 'kind'] = 'open'
    installs.loc[installs['kind'] == 'pax_app_open', 'kind'] = 'open'
    installs.loc[installs['kind'] == 'login', 'kind'] = 'sign_in'
    installs.loc[installs['kind'] == 'sessionbegin', 'kind'] = 'sign_in'
    installs.loc[installs['kind'] == 'sign in', 'kind'] = 'sign_in'
    installs.loc[installs['kind'] == 'session begin', 'kind'] = 'sign_in'
    installs.loc[installs['kind'] == 'login_success', 'kind'] = 'sign_in'
    installs.loc[installs['kind'] == 'list view', 'kind'] = 'list_view'
    installs.loc[installs['kind'] == 'af_view_list', 'kind'] = 'list_view'
    installs.loc[installs['kind'] == 'af_list_view', 'kind'] = 'list_view'
    installs.loc[installs['kind'] == 'product view', 'kind'] = 'product_view'
    installs.loc[installs['kind'] == 'af_view_product', 'kind'] = 'product_view'
    installs.loc[installs['kind'] == 'af_content_view', 'kind'] = 'content_view'
    installs.loc[installs['kind'] == 'content view', 'kind'] = 'content_view'
    installs.loc[installs['kind'] == 'pin_verified - registration', 'kind'] = 'registration'
    installs.loc[installs['kind'] == 'userregistered', 'kind'] = 'registration'
    installs.loc[installs['kind'] == 'complete_registration', 'kind'] = 'registration'
    installs.loc[installs['kind'] == 'new registration', 'kind'] = 'registration'
    installs.loc[installs['kind'] == 'registration_complete', 'kind'] = 'registration'
    installs.loc[installs['kind'] == 'completed registration', 'kind'] = 'registration'
    installs.loc[installs['kind'] == 'create_account', 'kind'] = 'registration'
    installs.loc[installs['kind'] == 'sign up', 'kind'] = 'registration'
    installs.loc[installs['kind'] == 'mobilesignup', 'kind'] = 'registration'
    installs.loc[installs['kind'] == 'signed_up', 'kind'] = 'registration'
    installs.loc[installs['kind'] == 'privacypolicy_agree_split', 'kind'] = 'terms_agree_split'
    installs.loc[installs['kind'] == 'payment method add', 'kind'] = 'add_payment'
    installs = encodeCategoricals(installs, 'kind', ['open', 'sign_in', 'list_view', 'product_view',
           'account_summary_first_step', 'registration', 'reattribution',
           'add_to_cart', 'pre_checkout', 'install', 'content_view',
           'app first start', 'startsessionplayback', 'add_payment',
           'terms_agree_split', 'reinstall', 'app_alive', 'trackapplaunch',
           'adjust_uninstall', 'terms_open_split', 'purchase', 'af_add_to_wishlist'])
    installs = encodeCategoricals(installs, 'application_id', [121,  36, 210,  65,  14,  68, 122, 155,  21, 226, 128,  75,  70,
             94, 117, 302, 158,  26, 187, 339, 133,  77,  19,  49, 194, 145, 143, 263, 327, 214])
    
    installs['wifi_installs'] = installs['wifi'].map({True: 1, False: 0}).astype(np.int8)
    installs['attributed'] = installs['attributed'].map({True: 1, False: 0}).astype(np.int8)
    installs['implicit'] = installs['implicit'].map({True: 1, False: 0}).astype(np.int8)
    gb = installs.groupby('ref_hash')
    installs['wifi_installs_mean'] = gb['wifi_installs'].transform('mean')
    installs['sum_wifi_on_installs'] = gb['wifi_installs'].transform('sum').astype(np.int16)
    installs['attributed_installs_mean'] = gb['attributed'].transform('mean')
    installs['sum_attributed_installs'] = gb['attributed'].transform('sum').astype(np.int16)
    installs['n_installs'] = gb['created'].transform('count').astype(np.int16)
    installs['last_install'] = gb['created'].transform('max')
    installs['first_install'] = gb['created'].transform('min')
    installs['device_language'] = gb['device_language'].transform(lambda x: x.mode().iloc[0])
    installs['diff_installs'] = (installs['last_install'] - installs['first_install']).dt.total_seconds().astype(np.int32)
    installs['last_install_sec_to_end'] = (pd.Timestamp(time_windows[window-1][1]) - installs['last_install']).dt.total_seconds().astype(np.int32)
    installs['mean_time_install'] = 0
    installs.loc[installs['n_installs'] > 1, 'mean_time_install'] = ((installs['last_install'] - installs['first_install'])/ \
                                     (installs['n_installs'] -1)).dt.total_seconds()
    installs['first_install_sec'] = (installs['first_install'] - pd.Timestamp(time_windows[window-1][0])).dt.total_seconds().astype(np.int32)
    installs['last_install_sec'] = (installs['last_install'] - pd.Timestamp(time_windows[window-1][0])).dt.total_seconds().astype(np.int32)
    
    installs['device_brand'] = installs['device_brand'].fillna(-1)
    installs['device_model'] = installs['device_brand'].fillna(-1)
    
    installs.drop_duplicates(subset='ref_hash', inplace=True)
    installs.drop(columns=['created', 'ref_type', 'user_agent', 'kind', 'wifi_installs', 'last_install', 
                           'first_install', 'session_user_agent', 'application_id', 'attributed', 'implicit', 'wifi'], inplace=True)

    modelo = pd.merge(modelo, installs, on='ref_hash', how='outer')
    
    installs = 0
    gc.collect()
    
    events = pd.read_pickle('data/events_w'+str(window)+'.pkl')
    events.drop(columns=['event_uuid', 'ip_address', 'index', 'device_countrycode', 'trans_id', 'carrier', 'device_city'], inplace=True)

    events['attributed'] = events['attributed'].map({True: 1, False: 0})
    events['wifi'] = events['wifi'].map({True: 1, False: 0})
    events['ref_type'] = events['ref_type'].map({1891515180541284343: 1, False: 1494519392962156891})
    
    gb = events.groupby('ref_hash')

    events['n_events'] = gb['date'].transform('count')
    events['attributed_events_mean'] = gb['attributed'].transform('mean')
    events['wifi_events_mean'] = gb['wifi'].transform('mean')
    events['first_event'] = gb['date'].transform('min')
    events['last_event'] = gb['date'].transform('max')
    events['diff_events'] = (events['last_event'] - events['first_event']).dt.total_seconds()
    events['mean_time_events'] = 0
    events.loc[events['n_events'] > 1, 'mean_time_events'] = ((events['last_event'] - events['first_event'])/ \
                                     (events['n_events'] -1)).dt.total_seconds()
    events['first_event_sec'] = (events['first_event'] - pd.Timestamp(time_windows[window-1][0])).dt.total_seconds()
    events['last_event_sec'] = (events['last_event'] - pd.Timestamp(time_windows[window-1][0])).dt.total_seconds()
    events['last_event_sec_to_end'] = (pd.Timestamp(time_windows[window-1][1]) - events['last_event']).dt.total_seconds().astype(np.int32)
    
    events['device_os_version'] = round(events['device_os_version']/10e14)
    events['device_os_version'].fillna(-1, inplace=True)
    events['device_os_version'] = events['device_os_version'].astype(np.int16)
    events['device_os'] = events['device_os'].fillna(-1)
    events['device_os'] = events['device_os'].map({7.531669329342817e+18: 0, 6.94182462626038e+18: 1, -1:-1}).astype(np.int8)
    events['device_brand'] = events['device_brand'].fillna(-1)
    ranking = events['device_brand'].value_counts()
    events['freq_device_brand'] = events['device_brand'].transform(lambda x: ranking[x])
    events['device_model'] = events['device_model'].fillna(-1)
    ranking = events['device_model'].value_counts()
    events['freq_device_model'] = events['device_model'].transform(lambda x: ranking[x])
    events['kind'] = round(events['kind']/10e13)
    events['kind'] = events['kind'].fillna(-1)
    ranking = events['kind'].value_counts()
    events['freq_kind'] = events['kind'].transform(lambda x: ranking[x])
    events = encodeCategoricals(events, 'kind', [40177.0, 55008.0, 60812.0, 61683.0, 61238.0, 58829.0, 48532.0,
                   9978.0, 90668.0, 68977.0, 53587.0, 50591.0, 41719.0, 86347.0,
                  25155.0, 70053.0, 33880.0, 40746.0, 88368.0, 91530.0, 83705.0,
                  24514.0, 41749.0, 23767.0, 85698.0, 32091.0, 21823.0, 43053.0,
                  10788.0,  1037.0, 48868.0, 76110.0, 54878.0, 77210.0, 72842.0,
                  10146.0, 38120.0, 46479.0, 12870.0, 73384.0])
    events['event_id'] = events['event_id'].fillna(-1)
    ranking = events['event_id'].value_counts()
    events['freq_event_id'] = events['event_id'].transform(lambda x: ranking[x])
    events = encodeCategoricals(events, 'event_id', [  1,  15,  23,   2, 115,   0,  13,   7, 245, 101, 116, 157,   3,
            120, 100, 287,  14, 211, 104, 118, 158, 363, 106, 108, 341, 246,
             39, 159,  24, 171])

    events['session_user_agent'] = round(events['session_user_agent']/10e12)
    events['session_user_agent'] = events['session_user_agent'].fillna(-1)

    events = encodeCategoricals(events, 'connection_type', ['Cable/DSL', 'Cellular'])
    
    events.drop_duplicates(subset='ref_hash', inplace=True)
    events = events.drop(columns=['date', 'event_id', 'application_id', 'attributed', 'device_brand', 'device_model', 
                                  'user_agent', 'wifi', 'connection_type', 'device_language', 'first_event', 'last_event'])
    
    modelo = pd.merge(modelo, events, on='ref_hash', how='outer')
    events = 0
    gc.collect()
    
    modelo['n_appearences'] = modelo['n_auctions'] + modelo['n_clicks'] + modelo['n_events'] + modelo['n_installs']
    
    return modelo

def imputacionValores(df):
    df['last_auction_sec_to_end'].fillna(24*3600*3, inplace=True)
    df['last_event_sec_to_end'].fillna(24*3600*3, inplace=True)
    df['last_install_sec_to_end'].fillna(24*3600*3, inplace=True)
    df['last_click_sec_to_end'].fillna(24*3600*3, inplace=True)
    df['carrier_id'] = ((df['carrier_id'].fillna(-1))).astype(np.int16)
    df['os_major'] = (df['os_major'].fillna(0)/10e15).astype(np.int16)
    df['os_minor'] = (df['os_minor'].fillna(0)/10e15).astype(np.int16)
    df['brand'] = (df['brand'].fillna(-1)).astype(np.int16)
    df['device_brand'] = df['device_brand'].fillna(-1)
    df['device_model'] = df['device_brand'].fillna(-1)
    df['device_os_version'].fillna(-1, inplace=True)
    df['device_os'] = df['device_os'].fillna(-1)
    df['device_brand'] = df['device_brand'].fillna(-1)
    df['device_model'] = df['device_model'].fillna(-1)
    df['kind'] = df['kind'].fillna(-1)

    features = ['diff_auctions', 'mean_time_auction', 'diff_clicks',
                'mean_time_click', 'timeToClick_mean', 'touchX_mean', 
                'touchY_mean', 'latitude_mean', 'longitude_mean',
                'wifi_installs_mean', 'attributed_installs_mean', 'diff_installs',
                'mean_time_install', 'time_appearence', 'time_appearence_install', 
                'attributed_events_mean', 'wifi_events_mean', 'diff_events',
                'mean_time_events']

    for feature in features:
        if feature not in df:
            continue
        else:
            df[feature] = df[feature].fillna(df[feature].mean())

    for feature in df.columns:
        df[feature].fillna(0, inplace=True)
    return df

def encodeCategoricals(df, column_name, values):
    string_values = list(map(lambda x: str(x), values))
    gb = df.groupby('ref_hash')
    new_df = gb[column_name].value_counts().unstack()
    remove = []
    for column in list(new_df.columns):
        if column not in values:
            remove.append(column)
    new_df.drop(columns = remove, inplace=True)

    for value in values:
        if value not in list(new_df.columns):
            new_df[value] = 0
    new_df.columns = values
    new_df = new_df.add_prefix(column_name+'_')
    df = df.join(new_df, on='ref_hash')
    df.fillna(0, inplace=True)
    return df

In [5]:
model = pd.DataFrame()

for i in tqdm(range(1,5)):
    modelo = armadoFeatures(i)

    if window <= 4:
        auctions_label = pd.read_pickle('data/auctions_w'+str(window)+'_label.pkl')
        installs_label = pd.read_pickle('data/installs_w'+str(window)+'_label.pkl')
        modelo = pd.merge(modelo, auctions_label, on='ref_hash', how='outer')
        modelo = pd.merge(modelo, installs_label, on='ref_hash', how='outer')
        modelo['time_appearence'] = modelo['time_appearence'].fillna(max_time)
        modelo['time_appearence_install'] = modelo['time_appearence_install'].fillna(max_time)

    model = pd.concat([model, modelo], sort=False)

    modelo = 0
    gc.collect()
    
#Imputacion valores
model['n_appearences'] = model['n_auctions'] + model['n_clicks'] + model['n_events'] + model['n_installs']
model = imputacionValores(model)

100%|███████████████████████████████████████████| 4/4 [20:23<00:00, 306.24s/it]


KeyError: 'device_os_version'

In [12]:
model.to_pickle('nuevos_modelos/lgb2_data.pkl')

KeyboardInterrupt: 

In [None]:
lista = ['kind_open', 'kind_sign_in', 'kind_list_view', 'kind_product_view',
       'kind_account_summary_first_step', 'kind_registration',
       'kind_reattribution', 'kind_add_to_cart', 'kind_pre_checkout',
       'kind_install', 'kind_content_view', 'kind_app first start',
       'kind_startsessionplayback', 'kind_add_payment',
       'kind_terms_agree_split', 'kind_reinstall', 'kind_app_alive',
       'kind_trackapplaunch', 'kind_adjust_uninstall', 'kind_terms_open_split',
       'kind_purchase', 'kind_af_add_to_wishlist']
for columna in lista:
    modelo[columna] = modelo[columna].astype(np.int8)

In [16]:
categoricals = ['ref_type_id', 'carrier_id', 'agent_device', 'os_minor', 'os_major','specs_brand', 'brand', 'ref_type_x', 'ref_type_y', 
                'device_brand', 'device_model', 'session_user_agent', 'device_os']

In [18]:
objective = 0
target_features = ['time_appearence', 'time_appearence_install']
train, test = train_test_split(model.drop(columns=['ref_hash']), test_size=0.2)

train_Y = train[target_features[objective]]
train_X = train.drop(columns=target_features)
test_Y = test[target_features[objective]]
test_X = test.drop(columns=target_features)

In [19]:
lgb_model = lgb.LGBMModel(boosting_type="gbdt", num_leaves=70, max_depth=None, learning_rate=0.025, n_estimators=500, 
                      max_bin=500, subsample_for_bin=50000, objective='regression', min_split_gain=0, min_child_weight=5, 
                      min_child_samples=10, subsample=1, subsample_freq=1, colsample_bytree=1, reg_alpha=0, reg_lambda=0, 
                      seed=0, silent=True, categoricals=categoricals)

In [20]:
lgb_model.fit(train_X, train_Y)

LGBMModel(boosting_type='gbdt',
     categoricals=['ref_type_id', 'source_id', 'carrier_id', 'agent_device', 'os_minor', 'os_major', 'specs_brand', 'brand', 'ref_type_x', 'ref_type_y', 'device_brand', 'device_model', 'session_user_agent', 'device_os'],
     class_weight=None, colsample_bytree=1, importance_type='split',
     learning_rate=0.025, max_bin=500, max_depth=None,
     min_child_samples=10, min_child_weight=5, min_split_gain=0,
     n_estimators=500, n_jobs=-1, num_leaves=70, objective='regression',
     random_state=None, reg_alpha=0, reg_lambda=0, seed=0, silent=True,
     subsample=1, subsample_for_bin=50000, subsample_freq=1)

In [21]:
error_train = np.sqrt(mean_squared_error(lgb_model.predict(train_X), train_Y))
error_train

75412.19253679502

In [22]:
error_test = np.sqrt(mean_squared_error(lgb_model.predict(test_X), test_Y))
error_test

75680.06446032545

In [25]:
with open('logs/models.txt', mode='a') as log:
    log.write('LGBM; {}; {}; {}; {}; {} \n'.format(target_features[objective], lgb_model.get_params(), list(train.columns), error_train, error_test))

In [23]:
pickle.dump(lgb_model, open('nuevos_modelos/lgb2_all_auctions', 'wb'))

**Predicciones**

In [27]:
ventana5 = armadoFeatures(5)
target = pd.read_csv('data/target_competencia_ids.csv')
target['ref_hash'] = target['ref_hash'].apply(lambda x: int(str(x)[:-3]))
target = target.drop_duplicates('ref_hash')

In [28]:
data = pd.merge(ventana5, target, on='ref_hash', how='right')
data = imputacionValores(data)
data.drop(columns=['obj'], inplace=True)

In [47]:
#Predicciones auctions:
lgb_model_auctions = pickle.load(open('nuevos_modelos/lgb2_all_auctions', 'rb'))

lgb_model_auctions = lgb_model

pred_data_auctions = lgb_model_auctions.predict(data.drop(columns=['ref_hash']))
predictions_auctions = pd.DataFrame({'ref_hash': data['ref_hash'],'obj': pred_data_auctions})

predictions_auctions['ref_hash'] = predictions_auctions['ref_hash'].apply(lambda x: str(x) + '_st')

In [48]:
#Predicciones installs:
lgb_model_installs = pickle.load(open('nuevos_modelos/lgb2_all_installs', 'rb'))

pred_data_installs = lgb_model_installs.predict(data.drop(columns=['ref_hash']))
predictions_installs = pd.DataFrame({'ref_hash': data['ref_hash'],'obj': pred_data_installs})

predictions_installs['ref_hash'] = predictions_installs['ref_hash'].apply(lambda x: str(x) + '_sc')

In [65]:
predictions = pd.concat([predictions_auctions, predictions_installs])

In [128]:
predictions2.loc[predictions2['obj'] < 0, 'obj'] = 0
predictions2.loc[predictions2['obj'] > 24*3*3600, 'obj'] = 24*3*3600

In [133]:
predictions2.to_csv('predicciones/predicciones22.csv', index=False)