In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from scipy import stats
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from xgboost.sklearn import XGBRegressor
from sklearn.datasets import make_regression
import sklearn
from sklearn.model_selection import GridSearchCV
np.random.seed(10)
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from scipy.stats import randint as sp_randint
from scipy.stats import randint as sp_uniform

In [None]:
# Identifiers of the training windows; each one names a sub-directory of windows/.
windows = ["18_20", "19_21", "20_22", "21_23"]

# One entry per window, keyed by the window identifier.
features, label_auc, label_inst = {}, {}, {}
label_clas_auc, label_clas_inst = {}, {}

for window in windows:
    # Every CSV is indexed by the ref_hash column.
    features[window] = pd.read_csv("windows/{}/features.csv".format(window), index_col="ref_hash")
    label_auc[window] = pd.read_csv("windows/{}/labels_auc.csv".format(window), index_col="ref_hash")
    label_inst[window] = pd.read_csv("windows/{}/labels_inst.csv".format(window), index_col="ref_hash")

    # Binary version of each label: 1 where the raw label equals 259200
    # (= 3 * 24 * 3600, presumably "three days in seconds" -- TODO confirm), else 0.
    label_clas_auc[window] = (label_auc[window]["label_auc"] == 259200).astype(int).to_frame()
    label_clas_inst[window] = (label_inst[window]["label_inst"] == 259200).astype(int).to_frame()

# Features of the window that has no labels loaded here (the one to predict on).
features_to_predict = pd.read_csv("windows/24_26/features.csv", index_col="ref_hash")

## Selección de features

In [None]:
def select_features(df, feature_list):
    """Return a new DataFrame holding only the columns named in ``feature_list``.

    The columns come back in the order given by ``feature_list``. Because the
    selection is done with ``reindex``, a name that is not a column of ``df``
    does not raise: it produces a column filled with NaN.
    """
    return df.reindex(feature_list, axis="columns")

In [None]:
# Feature columns used with the label_auc target. The order of the names is the
# column order handed to the models. Names kept as comments are switched off.
features_list_class_auc = [
    "appearances_in_auctions",
    "user_appeared_last_day",
    "time_to_reappear",
    # "amount_of_clicks",
    # "has_installed",
    # "user_clicked_last_day",
    # "user_installed_last_day",
    # "amount_of_installs",
    # "cant_max_day",
    # "cant_min_day",
    # "mean_time_to_click",
    # "max_time_click",
    # "min_time_click",
    "mean_auctions_per_day",
    # "mean_events_per_day",
    # "mean_clicks_per_day",
    "amount_auctions_in_last_hour",
    "amount_auctions_in_last_2_hours",
    "amount_auctions_in_last_5_hours",
    "amount_auctions_in_last_12_hours",
    # "amount_auctions_in_last_24_hours",
    "amount_events_in_last_hour",
    "amount_events_in_last_2_hours",
    "amount_auctions_in_first_hour",
    "amount_auctions_in_first_3_hours",
    "amount_auctions_in_first_5_hours",
    "amount_auctions_in_first_12_hours",
    "amount_events_in_first_hour",
    "amount_events_in_first_5_hours",
    "amount_events_in_first_12_hours",
    # "amount_clicks_in_last_2_hours",
    # "amount_clicks_in_last_4_hours",
    # "device_os",
    # "std_time_to_click",
    "std_time_auctions",
    # "max_time_install",
    # "min_time_install",
    # "mean_time_install",
    # "std_time_install",
    # "max_time_events",
    # "min_time_events",
    # "mean_time_events",
    "std_time_events",
    # "installs_per_events",
    # "installs_per_clicks",
    # "events_x_app_210",
    # "events_x_app_122",
    # "events_x_app_65",
    # "events_x_app_121",
    # "events_x_app_26",
    # "most_installed_apps_used",
    # "cant_apps_used",
    # "cant_events_atributed",
    # "has_events_atributed",
    # "has_events_ids_with_installs",
    # "has_events_ids_without_installs",
    "cant_events_0_4",
    "cant_events_4_8",
    "cant_events_8_12",
    "cant_events_12_16",
    "cant_events_16_20",
    "cant_events_20_24",
    "cant_auctions_0_4",
    "cant_auctions_4_8",
    "cant_auctions_8_12",
    "cant_auctions_12_16",
    "cant_auctions_16_20",
    "cant_auctions_20_24",
    # "implicit",
    # "latitude",
    # "longitude",
    # "clicked_in_last_5_minutes",
    # "clicked_with_wifi_in_last_3_hours",
    "hour_install",
    "hour_events",
    "hour_clicks",
    "hour_auctions",
    "amount_auctions_in_last_half_hour",
    "amount_auctions_in_last_15_minutes",
    "mean_appearances_events",
    "std_appearances_events",
    "std_appearances_auctions",
    "cant_devices",
    "mean_appearances_auctions",
    # "mean_of_first_auction",
    # "mean_of_first_event",
    "time_to_first_appearance_in_auctions",
    "mean_auctions_last_24hs",
    "mean_auctions_last_48hs",
    "std_auctions_last_24hs",
    "std_auctions_last_48hs",
]

In [None]:
# Feature columns used with the label_inst target. The order of the names is the
# column order handed to the models. Names kept as comments are switched off.
#
# Every entry ends with a comma on purpose: two adjacent string literals without
# a comma are silently concatenated by Python into a single (non-existent)
# column name, which select_features would then turn into an all-NaN column.
features_list_class_inst = [
    "appearances_in_auctions",
    "user_appeared_last_day",
    "time_to_reappear",
    # "amount_of_clicks",
    "has_installed",
    "user_clicked_last_day",
    "user_installed_last_day",
    "amount_of_installs",
    # "cant_max_day",
    # "cant_min_day",
    # "mean_time_to_click",
    # "max_time_click",
    # "min_time_click",
    "mean_auctions_per_day",
    "mean_events_per_day",
    # "mean_clicks_per_day",
    "amount_auctions_in_last_hour",
    "amount_auctions_in_last_2_hours",
    "amount_auctions_in_last_5_hours",
    "amount_auctions_in_last_12_hours",
    "amount_auctions_in_last_24_hours",
    "amount_events_in_last_hour",
    "amount_events_in_last_2_hours",
    "amount_auctions_in_first_hour",
    "amount_auctions_in_first_3_hours",
    "amount_auctions_in_first_5_hours",
    "amount_auctions_in_first_12_hours",
    "amount_events_in_first_hour",
    "amount_events_in_first_5_hours",
    "amount_events_in_first_12_hours",
    # "amount_clicks_in_last_2_hours",
    # "amount_clicks_in_last_4_hours",
    # "device_os",
    # "std_time_to_click",
    "std_time_auctions",
    # "max_time_install",
    # "min_time_install",
    # "mean_time_install",
    # "std_time_install",
    # "max_time_events",
    # "min_time_events",
    "mean_time_events",
    "std_time_events",
    # "installs_per_events",
    # "installs_per_clicks",
    "events_x_app_210",
    "events_x_app_122",
    "events_x_app_65",
    "events_x_app_121",
    "events_x_app_26",
    "most_installed_apps_used",
    "cant_apps_used",
    "cant_events_atributed",
    "has_events_atributed",
    # "has_events_ids_with_installs",
    # "has_events_ids_without_installs",
    "cant_events_0_4",
    "cant_events_4_8",
    "cant_events_8_12",
    "cant_events_12_16",
    "cant_events_16_20",
    "cant_events_20_24",
    "cant_auctions_0_4",
    "cant_auctions_4_8",
    "cant_auctions_8_12",
    "cant_auctions_12_16",
    "cant_auctions_16_20",
    "cant_auctions_20_24",
    # "implicit",
    # "latitude",
    # "longitude",
    # "clicked_in_last_5_minutes",
    "clicked_with_wifi_in_last_3_hours",
    "hour_install",
    "hour_events",
    "hour_clicks",
    "hour_auctions",
    "cant_installs_0_4",
    "cant_installs_4_8",
    "cant_installs_8_12",
    "cant_installs_12_16",
    "cant_installs_16_20",
    "cant_installs_20_24",
    "mean_appearances_clicks",
    "mean_appearances_installs",  # the comma here was missing in the original list
    "amount_auctions_in_last_half_hour",
    "amount_auctions_in_last_15_minutes",
    "cant_devices",
    "mean_appearances_auctions",
    "mean_appearances_events",
    "std_appearances_events",
    "std_appearances_auctions",
    # "mean_of_first_auction",
    # "mean_of_first_event",
    "time_to_first_appearance_in_auctions",
    "mean_auctions_last_24hs",
    "mean_auctions_last_48hs",
    "std_auctions_last_24hs",
    "std_auctions_last_48hs",
]

## Funciones útiles para el entrenamiento y el balanceo

In [None]:
def get_df_to_train(windows, features, labels, feature_list):
    """Stack the selected features and labels of several windows into one frame.

    Args:
        windows: iterable of window keys to include.
        features: mapping from window key to that window's feature DataFrame.
        labels: mapping from window key to that window's label DataFrame.
        feature_list: names of the feature columns to keep (see select_features).

    Returns:
        A single DataFrame with the rows of every window concatenated in the
        order of ``windows``. Within a window only rows whose index appears in
        both the features and the labels are kept (inner join). The original
        index is discarded and replaced by a fresh 0..n-1 index.
    """
    per_window = [
        select_features(features[key], feature_list).join(labels[key], how="inner")
        for key in windows
    ]
    return pd.concat(per_window).reset_index(drop=True)

In [None]:
def get_train_test_split(df_full, label_name, test_size=0.3):
    """Split ``df_full`` into train/test predictors and targets.

    Args:
        df_full: DataFrame containing the feature columns and the label column.
        label_name: name of the column to use as the target.
        test_size: value forwarded to ``train_test_split`` (default 0.3).

    Returns:
        Whatever ``train_test_split`` returns for ``(X, y)``, i.e.
        ``X_train, X_test, y_train, y_test``. The split uses the fixed
        ``random_state=7`` so it is the same on every call.
    """
    target = df_full[label_name]
    predictors = df_full.drop(columns=label_name)
    return train_test_split(predictors, target, test_size=test_size, random_state=7)

# Parameter Tuning para XGBoost

In [None]:
# Training frame for the label_auc target: all windows stacked together.
df_full = get_df_to_train(windows, features, label_auc, features_list_class_auc)
# Work on a 10% subsample (presumably to keep the grid searches below affordable).
# random_state is fixed so that re-running this cell picks the same rows instead
# of depending on how much of NumPy's global random stream was already consumed.
df_full = df_full.sample(n=int(len(df_full) * 0.1), random_state=10)
# Drop the rows whose label equals 259200 (= 3 * 24 * 3600), the same value the
# loading cell uses to build the binary labels.
df_full = df_full[df_full["label_auc"] != 259200]

#### Max_depth and min_child_weight tuning

In [None]:
# Stage 1: coarse grid over tree depth and minimum child weight.
param_test1 = {
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 6, 2),
}
# `iid` is intentionally not passed: the argument was deprecated in
# scikit-learn 0.22 and removed in 0.24, where passing it raises TypeError.
# Since 0.22 the search behaves as the old iid=False did.
gsearch1 = GridSearchCV(
    estimator=XGBRegressor(
        learning_rate=0.1, n_estimators=140, max_depth=5,
        min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
        objective='reg:squarederror', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test1,
    scoring='neg_mean_squared_error',  # negated MSE: higher is better
    n_jobs=4,
    cv=5)
gsearch1.fit(df_full[features_list_class_auc], df_full['label_auc'])

In [None]:
# best_score_ is the mean cross-validated neg_mean_squared_error of the best
# candidate; negating it gives the MSE and its square root the RMSE.
np.sqrt(-gsearch1.best_score_)

In [None]:
# Parameter combination from param_test1 that obtained the best score.
gsearch1.best_params_

> #### Refinamos la búsqueda entre valores acotados

In [None]:
# Stage 2: finer grid over the same two parameters, restricted to a narrow range.
param_test2 = {
    'max_depth': [3, 4, 5],
    'min_child_weight': [3, 4, 5],
}
# `iid` is intentionally not passed: the argument was deprecated in
# scikit-learn 0.22 and removed in 0.24, where passing it raises TypeError.
# Since 0.22 the search behaves as the old iid=False did.
gsearch2 = GridSearchCV(
    estimator=XGBRegressor(
        learning_rate=0.01, n_estimators=1000, max_depth=5,
        min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
        objective='reg:squarederror', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test2,
    scoring='neg_mean_squared_error',  # negated MSE: higher is better
    n_jobs=4,
    cv=5)
gsearch2.fit(df_full[features_list_class_auc], df_full['label_auc'])

In [None]:
gsearch2.best_params_  # the best parameters are max_depth 4 and min_child_weight 3

In [None]:
# best_score_ is the mean cross-validated neg_mean_squared_error of the best
# candidate; negating it gives the MSE and its square root the RMSE.
np.sqrt(-gsearch2.best_score_)

#### Learning rate and n_estimators tuning

In [None]:
# Stage 3: number of boosting rounds against learning rate.
param_test3 = {
    'n_estimators': [100, 200, 500, 1000],
    'learning_rate': [0.1, 0.05, 0.01],
}
# max_depth is fixed at 4 (it was 3 here): the refined depth search above
# records max_depth 4 / min_child_weight 3 as the best pair, and every later
# tuning stage also fixes max_depth=4, so this stage now tunes the same model.
#
# `iid` is intentionally not passed: the argument was deprecated in
# scikit-learn 0.22 and removed in 0.24, where passing it raises TypeError.
# Since 0.22 the search behaves as the old iid=False did.
gsearch3 = GridSearchCV(
    estimator=XGBRegressor(
        learning_rate=0.1, n_estimators=140, max_depth=4,
        min_child_weight=3, gamma=0, subsample=0.8, colsample_bytree=0.8,
        objective='reg:squarederror', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test3,
    scoring='neg_mean_squared_error',  # negated MSE: higher is better
    n_jobs=4,
    cv=5)
gsearch3.fit(df_full[features_list_class_auc], df_full['label_auc'])

In [None]:
# Parameter combination from param_test3 that obtained the best score.
gsearch3.best_params_

In [None]:
# best_score_ is the mean cross-validated neg_mean_squared_error of the best
# candidate; negating it gives the MSE and its square root the RMSE.
np.sqrt(-gsearch3.best_score_)

#### Gamma tuning

In [None]:
# Stage 4: gamma, tried at 0.0, 0.1, 0.2, 0.3 and 0.4.
param_test4 = {
    'gamma': [i / 10.0 for i in range(0, 5)],
}
# `iid` is intentionally not passed: the argument was deprecated in
# scikit-learn 0.22 and removed in 0.24, where passing it raises TypeError.
# Since 0.22 the search behaves as the old iid=False did.
gsearch4 = GridSearchCV(
    estimator=XGBRegressor(
        learning_rate=0.1, n_estimators=200, max_depth=4,
        min_child_weight=3, gamma=0, subsample=0.8, colsample_bytree=0.8,
        objective='reg:squarederror', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test4,
    scoring='neg_mean_squared_error',  # negated MSE: higher is better
    n_jobs=4,
    cv=5)
gsearch4.fit(df_full[features_list_class_auc], df_full['label_auc'])

In [None]:
# Value of gamma from param_test4 that obtained the best score.
gsearch4.best_params_

In [None]:
# best_score_ is the mean cross-validated neg_mean_squared_error of the best
# candidate; negating it gives the MSE and its square root the RMSE.
np.sqrt(-gsearch4.best_score_)

#### Subsample and colsample_bytree tuning

In [None]:
# Stage 5: row and column subsampling ratios, each tried at 0.6, 0.7, 0.8 and 0.9.
param_test5 = {
    'subsample': [i / 10.0 for i in range(6, 10)],
    'colsample_bytree': [i / 10.0 for i in range(6, 10)],
}
# `iid` is intentionally not passed: the argument was deprecated in
# scikit-learn 0.22 and removed in 0.24, where passing it raises TypeError.
# Since 0.22 the search behaves as the old iid=False did.
gsearch5 = GridSearchCV(
    estimator=XGBRegressor(
        learning_rate=0.1, n_estimators=200, max_depth=4,
        min_child_weight=3, gamma=0, subsample=0.8, colsample_bytree=0.8,
        objective='reg:squarederror', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test5,
    scoring='neg_mean_squared_error',  # negated MSE: higher is better
    n_jobs=4,
    cv=5)
gsearch5.fit(df_full[features_list_class_auc], df_full['label_auc'])

In [None]:
# Parameter combination from param_test5 that obtained the best score.
gsearch5.best_params_

In [None]:
# best_score_ is the mean cross-validated neg_mean_squared_error of the best
# candidate; negating it gives the MSE and its square root the RMSE.
np.sqrt(-gsearch5.best_score_)

#### Regularization tuning

In [None]:
# Stage 6: L1 regularisation strength (reg_alpha).
param_test6 = {
    'reg_alpha': [0, 0.001, 0.005, 0.01, 0.05],
}
# `iid` is intentionally not passed: the argument was deprecated in
# scikit-learn 0.22 and removed in 0.24, where passing it raises TypeError.
# Since 0.22 the search behaves as the old iid=False did.
gsearch6 = GridSearchCV(
    estimator=XGBRegressor(
        learning_rate=0.1, n_estimators=200, max_depth=4,
        min_child_weight=3, gamma=0, subsample=0.9, colsample_bytree=0.7,
        objective='reg:squarederror', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test6,
    scoring='neg_mean_squared_error',  # negated MSE: higher is better
    n_jobs=4,
    cv=5)
gsearch6.fit(df_full[features_list_class_auc], df_full['label_auc'])

In [None]:
# Value of reg_alpha from param_test6 that obtained the best score.
gsearch6.best_params_

In [None]:
# best_score_ is the mean cross-validated neg_mean_squared_error of the best
# candidate; negating it gives the MSE and its square root the RMSE.
np.sqrt(-gsearch6.best_score_)