In [235]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from scipy import stats
import seaborn as sns
from sklearn.metrics import mean_squared_error

# Module-level trackers, both initialised to zero.
# NOTE(review): presumably the previous and the best score across runs;
# none of the visible cells reads them -- confirm they are still needed.
resultado_anterior = maximo = 0

# Carga de datos

In [2]:
# Load the auctions log; both id columns are read as unsigned 8-bit integers.
auctions = pd.read_csv(
    "auctions.csv",
    dtype={"ref_type_id": np.uint8, "source_id": np.uint8},
)
# errors="coerce" turns unparseable timestamps into NaT instead of raising.
auctions["date"] = pd.to_datetime(auctions["date"], errors="coerce")

In [None]:
# Compact dtypes intended for clicks.csv.
# NOTE(review): this mapping is defined but never passed to read_csv below,
# so the file is loaded with pandas' inferred dtypes -- confirm whether that
# is deliberate before wiring it in.
clicks_dtypes = {
    'advertiser': np.int8,
    'action_id': np.float16,
    'source_id': np.int8,
    'country_code': 'category',
    'latitude': np.float16,
    'longitude': np.float16,
    'carrier_id': np.float16,
    'brand': np.float16,
    'touchX': np.float16,
    'touchY': np.float16,
}
clicks = pd.read_csv('clicks.csv')
# errors="coerce" turns unparseable timestamps into NaT instead of raising.
clicks["created"] = pd.to_datetime(clicks["created"], errors="coerce")

In [None]:
# Load the events log and parse its timestamp column.
events = pd.read_csv('events.csv')
# errors="coerce" turns unparseable timestamps into NaT instead of raising.
events["date"] = pd.to_datetime(events["date"], errors="coerce")

In [20]:
# Load the installs log; low_memory=False makes pandas read the file in one
# pass instead of chunk by chunk.
installs = pd.read_csv('installs.csv', low_memory=False)
# errors="coerce" turns unparseable timestamps into NaT instead of raising.
installs["created"] = pd.to_datetime(installs["created"], errors="coerce")

# Obtención de Features

### Creación del df para predecir

In [None]:
# Column-less frame with one row per distinct auction device, indexed by
# 'ref_hash'; feature columns are joined onto it afterwards.
df_features = pd.DataFrame({'ref_hash': auctions["device_id"].unique()}).set_index('ref_hash')

In [18]:
def agregar_feature(df_features, df_to_join, nombre, fill=0):
    """Left-join one feature column onto the feature table.

    Parameters
    ----------
    df_features : pandas.DataFrame
        Feature table indexed by ``ref_hash``.
    df_to_join : pandas.DataFrame
        Frame holding a ``ref_hash`` column and the column ``nombre``.
    nombre : str
        Name of the column in ``df_to_join`` to add.
    fill : scalar, default 0
        Value given to rows of ``df_features`` with no match in ``df_to_join``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra column; ``df_features`` is not modified.
    """
    feature_column = df_to_join.set_index('ref_hash')[[nombre]]
    joined = df_features.join(feature_column, how="left")
    # Fill only the column that was just added: filling the whole frame would
    # overwrite missing values of unrelated features with this feature's fill.
    joined[nombre] = joined[nombre].fillna(fill)
    return joined

## Features de installs

In [None]:
# Flag every row of installs so the column can be joined as a feature.
installs["tiene_installs"] = 1
# A device may appear on several installs rows; joining those rows directly
# would repeat that device in df_features, so keep one row per ref_hash.
df_features = agregar_feature(
    df_features,
    installs[["ref_hash", "tiene_installs"]].drop_duplicates("ref_hash"),
    "tiene_installs",
    0,
)

# Funciones de entrenamiento

Función que persiste el resultado de cada modelo.

In [None]:
def guardar_submit(params, result):
    """Append one record describing a model run to ``historial_submits.txt``.

    The record is written on its own line as ``timestamp|params|result``.

    Parameters
    ----------
    params : object
        Hyper-parameters of the run; stored through ``str()`` so dicts work
        as well as plain strings.
    result : object
        Score obtained by the run; stored through ``str()``.
    """
    from datetime import datetime  # local import keeps this cell self-contained

    # Real wall-clock time of the run (previously the literal text "time").
    tiempo = datetime.now().isoformat(timespec="seconds")
    registro = "|".join([tiempo, str(params), str(result)])
    with open("historial_submits.txt", "a") as f:
        f.write("\n" + registro)

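Función que entrena un modelo y guarda su resultado (por ahora el entrenamiento está deshabilitado y solo se registra el intento).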

In [None]:
def entrenar_modelo(params):
    """Record a model run for ``params`` in the submit history.

    Training is not wired in yet (the ``entrenar(params)`` call is still
    commented out), so the stored result is always ``None``.
    """
    # Placeholder until the call to entrenar(params) is enabled.
    result = None
    guardar_submit(params, result)
    

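Función que ajusta `n_estimators` mediante validación cruzada (`xgb.cv` con early stopping) y luego entrena el modelo con ese valor.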

In [None]:
def modelfit(model, features, labels, cv_folds=5, early_stopping_rounds=50):
    """Choose the number of boosting rounds by cross-validation, then fit.

    Runs ``xgb.cv`` with the model's own booster parameters and the AUC
    metric, sets ``n_estimators`` to the number of rows in the CV result and
    fits ``model`` on the full ``features``/``labels``.

    Parameters
    ----------
    model : xgboost scikit-learn wrapper
        Estimator to tune and fit; it is modified in place.
    features, labels : pandas objects
        Training data; their ``.values`` feed the DMatrix used for CV.
    cv_folds : int, default 5
        Number of cross-validation folds.
    early_stopping_rounds : int, default 50
        Passed to ``xgb.cv``.

    Returns
    -------
    None
    """
    booster_params = model.get_xgb_params()
    train_matrix = xgb.DMatrix(features.values, label=labels.values)
    # The configured n_estimators acts as the upper bound for CV.
    max_rounds = model.get_params()['n_estimators']
    cv_history = xgb.cv(
        booster_params,
        train_matrix,
        num_boost_round=max_rounds,
        nfold=cv_folds,
        metrics='auc',
        early_stopping_rounds=early_stopping_rounds,
    )
    # One row of CV history per retained boosting round.
    rounds_kept = cv_history.shape[0]
    model.set_params(n_estimators=rounds_kept)
    model.fit(features, labels, eval_metric='auc')

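Función que entrena un modelo con un conjunto fijo de hiperparámetros y lo evalúa con AUC sobre un 20% de los datos (la búsqueda de hiperparámetros tipo grid search todavía no está implementada aquí).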

In [None]:
def entrenar(df_features, labels):
    """Train an XGBoost model with fixed hyper-parameters and score it.

    The features are inner-joined with ``labels`` on the index, split 80/20,
    tuned and fitted through ``modelfit`` and evaluated on the held-out part.

    Parameters
    ----------
    df_features : pandas.DataFrame
        Feature table.
    labels : pandas.DataFrame or named pandas.Series
        Target values; after the join the last column is taken as the label,
        so this assumes ``labels`` contributes exactly one column.

    Returns
    -------
    float
        ROC AUC on the 20% test split (previously computed but discarded).
    """
    # NOTE(review): a regressor with a logistic objective -- presumably so
    # that predict() yields scores usable by roc_auc_score; confirm.
    # Disabled DART options kept for reference:
    #   booster='dart', sample_type='weighted', rate_drop=0.1, skip_dropout=0.5
    modelo = xgb.XGBRegressor(
        learning_rate=0.01,
        n_estimators=1000,
        max_depth=4,
        min_child_weight=8,
        gamma=0.22,
        subsample=1,
        colsample_bytree=1,
        objective='binary:logistic',
        nthread=-1,
        scale_pos_weight=18.8,
        random_state=272,
    )

    # Keep only rows that have both features and a label.
    df_features = df_features.join(labels, how='inner')
    X, y = df_features.iloc[:, :-1], df_features.iloc[:, -1:]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

    modelfit(modelo, X_train, y_train, early_stopping_rounds=30)

    predsa = modelo.predict(X_test)
    resultado = roc_auc_score(y_test, predsa)
    return resultado

# Selección de features

# Entrenamiento

## Pruebas

In [91]:
def create_window(df, initial_day, final_day, date_feature_name, id_name, label_name,
                  year=2019, month=4):
    """Build the first-appearance label for every id inside a day window.

    Parameters
    ----------
    df : pandas.DataFrame
        Event log with a datetime column and an id column.
    initial_day, final_day : int
        First and last day of the month (both inclusive) of the window.
    date_feature_name : str
        Name of the datetime column in ``df``.
    id_name : str
        Name of the id column to group by.
    label_name : str
        Name of the output column holding the label.
    year, month : int, default 2019 and 4
        Calendar month the days refer to; the defaults reproduce the
        previously hard-coded "2019-04".

    Returns
    -------
    pandas.DataFrame
        One row per id seen in the window with columns ``id_name``,
        ``date_feature_name`` (earliest timestamp in the window) and
        ``label_name`` (seconds from the window start to that timestamp).
    """
    window_start = pd.Timestamp(year=int(year), month=int(month), day=int(initial_day))
    # Exclusive upper bound: midnight after the last day of the window.
    window_end = (pd.Timestamp(year=int(year), month=int(month), day=int(final_day))
                  + pd.Timedelta(days=1))

    # Compare full timestamps rather than the day of the month alone, so rows
    # from other months or years can never leak into the window.
    dates = df[date_feature_name]
    in_window = (dates >= window_start) & (dates < window_end)

    window = df.loc[in_window].groupby(id_name).agg({date_feature_name: 'min'}).reset_index()
    window[label_name] = (window[date_feature_name] - window_start).dt.total_seconds()
    return window

In [164]:
def add_feature(df_features, df_to_join, index_df_features, index_df_to_join, feature_name, fill):
    """Left-merge one feature column onto ``df_features``.

    Parameters
    ----------
    df_features : pandas.DataFrame
        Frame to extend; matched on its column ``index_df_features``.
    df_to_join : pandas.DataFrame
        Frame holding the key column ``index_df_to_join`` and ``feature_name``.
    index_df_features : str
        Key column name in ``df_features``.
    index_df_to_join : str
        Key column name in ``df_to_join``.
    feature_name : str
        Column of ``df_to_join`` to add.
    fill : scalar
        Value given to rows with no match in ``df_to_join``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the feature added; ``df_features`` is not modified.
    """
    feature_by_key = df_to_join[[index_df_to_join, feature_name]].set_index(index_df_to_join)
    merged = df_features.merge(feature_by_key, how='left',
                               left_on=index_df_features, right_on=index_df_to_join)
    # Fill only the columns introduced by the merge: filling the whole frame
    # would overwrite missing values of unrelated columns with this fill.
    added = [col for col in merged.columns if col not in df_features.columns]
    if added:
        merged[added] = merged[added].fillna(fill)
    return merged

In [203]:
# First appearance of each device in the auctions of days 18-20, expressed
# as seconds elapsed since the start of day 18.
window_18to20 = create_window(
    df=auctions,
    initial_day=18,
    final_day=20,
    date_feature_name="date",
    id_name="device_id",
    label_name="seconds_to_appear",
)

In [212]:
# X will be the DataFrame holding all the features
# (device ids must NOT end up in X!).
# Dropping the last two columns instead of window_18to20.iloc[:,:-1] also
# leaves out the date series, because DMatrix only accepts numeric values.
X = window_18to20.iloc[:, :-2]
# y is the label.
y = window_18to20[["seconds_to_appear"]]

In [214]:
# One row per distinct device found in installs, flagged with has_installs = 1.
df_has_installs = pd.DataFrame({'ref_hash': installs['ref_hash'].unique()}).assign(has_installs=1)

In [None]:
# Preview the first five rows of the flag table.
df_has_installs.head(5)

In [216]:
# Attach the has_installs flag to the training frame; devices without any
# install get 0.
X = add_feature(
    df_features=X,
    df_to_join=df_has_installs,
    index_df_features='device_id',
    index_df_to_join='ref_hash',
    feature_name='has_installs',
    fill=0,
)

In [218]:
# Keep only the numeric feature column (this drops the device id).
X = X.loc[:, ['has_installs']]

In [219]:
# Preview the first five rows of the training features.
X.head(5)

Unnamed: 0,has_installs
0,1.0
1,1.0
2,1.0
3,0.0
4,1.0


In [220]:
# Actually, the label that belongs here is the one of the following window!
data_dmatrix = xgb.DMatrix(data=X, label=y)

In [221]:
# Small baseline regressor trained with a squared-error objective.
xg_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=10,
)

In [222]:
# X is the training data and y is the training label.
xg_reg.fit(X, y)

XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.3, gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=10,
       n_jobs=1, nthread=None, objective='reg:squarederror',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=None, subsample=1, verbosity=1)

In [223]:
# First appearance of each device in the auctions of days 21-23, expressed
# as seconds elapsed since the start of day 21.
window_21to23 = create_window(
    df=auctions,
    initial_day=21,
    final_day=23,
    date_feature_name="date",
    id_name="device_id",
    label_name="seconds_to_appear",
)

In [226]:
# Everything except the last two columns forms the test frame; the last
# column holds the true label of this window.
X_test = window_21to23.iloc[:, :-2]
X_real_label = window_21to23.iloc[:, -1]

In [228]:
# Attach the has_installs flag to the test frame; devices without any
# install get 0.
X_test = add_feature(
    df_features=X_test,
    df_to_join=df_has_installs,
    index_df_features='device_id',
    index_df_to_join='ref_hash',
    feature_name='has_installs',
    fill=0,
)

In [230]:
# Keep only the numeric feature column (this drops the device id).
X_test = X_test.loc[:, ['has_installs']]

In [232]:
# Predict the label of the 21-23 window with the model fitted above.
predictions = xg_reg.predict(X_test)

In [236]:
# Root-mean-squared error between the true labels and the predictions.
rmse = np.sqrt(mean_squared_error(y_true=X_real_label, y_pred=predictions))
print("RMSE: %f" % rmse)

RMSE: 79255.870375
