In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from scipy import stats
import seaborn as sns

# Module-level trackers, both starting at zero. They are not referenced in the
# visible cells; presumably they hold the previous and best scores across
# training runs (TODO confirm against the cells that use them).
resultado_anterior = maximo = 0

# Carga de datos

In [None]:
# Load the auctions log. `low_memory` is a `read_csv` option; `pd.to_datetime`
# does not accept it and raises TypeError, so it is passed to the reader instead.
auctions = pd.read_csv('auctions.csv', low_memory=False)
# Unparseable dates become NaT instead of raising (errors="coerce").
auctions["date"] = pd.to_datetime(auctions["date"], errors="coerce")

In [None]:
# Compact dtypes intended for the clicks columns.
# NOTE(review): this mapping is defined but not passed to `read_csv`; applying it
# needs verification first (e.g. integer dtypes cannot hold missing values).
clicks_dtypes = {'advertiser': np.int8, 'action_id': np.float16, 'source_id': np.int8, 'country_code': 'category',
                 'latitude': np.float16, 'longitude': np.float16, 'carrier_id': np.float16, 'brand': np.float16,
                 'touchX': np.float16, 'touchY': np.float16}
# `low_memory` is a `read_csv` option; `pd.to_datetime` does not accept it and
# raises TypeError, so it is passed to the reader instead.
clicks = pd.read_csv('clicks.csv', low_memory=False)
# Date parsing: unparseable values become NaT instead of raising.
clicks["created"] = pd.to_datetime(clicks["created"], errors="coerce")

In [None]:
# Load the events log, with `low_memory=False` passed to the CSV reader.
events = pd.read_csv('events.csv', low_memory=False)
# Parse the timestamp column; values that cannot be parsed become NaT.
events["date"] = events["date"].pipe(pd.to_datetime, errors="coerce")

In [None]:
# Load the installs log. `low_memory` is a `read_csv` option; `pd.to_datetime`
# does not accept it and raises TypeError, so it is passed to the reader instead.
installs = pd.read_csv('installs.csv', low_memory=False)
# Unparseable dates become NaT instead of raising (errors="coerce").
installs["created"] = pd.to_datetime(installs["created"], errors="coerce")

# Obtención de Features

### Creación del df para predecir

In [None]:
# Base feature table: one row per distinct auction device id, stored as the
# index under the name 'ref_hash' and with no feature columns yet.
df_features = (
    pd.DataFrame({'ref_hash': auctions["device_id"].unique()})
    .set_index('ref_hash')
)

In [None]:
def agregar_feature(df_features, df_to_join, nombre, fill=0):
    """Left-join a single feature column onto the feature table.

    Args:
        df_features: DataFrame whose index holds the join key (the ``ref_hash``
            values); it is not modified.
        df_to_join: DataFrame with a ``ref_hash`` column and a column ``nombre``.
        nombre: Name of the column to bring over from ``df_to_join``.
        fill: Value assigned to rows of ``df_features`` that have no match.

    Returns:
        A new DataFrame with the same rows as ``df_features`` plus ``nombre``.
    """
    columna = df_to_join.set_index('ref_hash')[[nombre]]
    # Keep the first row per key: repeated keys on the right side would make the
    # left join emit one output row per match and duplicate feature rows.
    columna = columna[~columna.index.duplicated(keep='first')]
    resultado = df_features.join(columna, how="left")
    # Fill only the new column so NaNs in previously added features are untouched.
    resultado[nombre] = resultado[nombre].fillna(value=fill)
    return resultado

## Features de installs

In [None]:
# Flag column: every row of `installs` represents an install, so it is set to 1.
installs["tiene_installs"] = 1
# Join one row per ref_hash: if a device appears in several install rows, joining
# the raw frame would duplicate that device in df_features. Devices without any
# install get 0.
df_features = agregar_feature(df_features, installs.drop_duplicates(subset="ref_hash"), "tiene_installs", 0)

# Funciones de entrenamiento

Función que persiste el resultado de cada modelo.

In [None]:
def guardar_submit(params, result):
    """Append one run record to ``historial_submits.txt``.

    Each record is written on its own line as ``timestamp|params|result``.

    Args:
        params: Hyperparameters of the run; any object, stored via ``str()``.
        result: Score obtained by the run (may be ``None``); stored via ``str()``.
    """
    from datetime import datetime

    # Real timestamp instead of the literal placeholder "time".
    tiempo = datetime.now().isoformat(timespec="seconds")
    with open("historial_submits.txt", "a") as f:
        # f-string formatting accepts non-string params (e.g. a dict) and also
        # writes the result, which the record previously left out.
        f.write(f"\n{tiempo}|{params}|{result}")

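Función de entrenamiento: por ahora no entrena ningún modelo (la llamada a `entrenar` está deshabilitada) y solo registra los parámetros recibidos mediante `guardar_submit`.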

In [None]:
def entrenar_modelo(params):
    """Record a training run for ``params`` through ``guardar_submit``.

    The training step itself is disabled (the ``entrenar(params)`` call is
    commented out in the original), so the recorded result is always ``None``.
    """
    sin_resultado = None  # placeholder until entrenar(params) is wired in
    guardar_submit(params, sin_resultado)
    

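Función que elige `n_estimators` mediante validación cruzada (`xgb.cv`) con early stopping y luego entrena el modelo con esa cantidad de árboles.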

In [None]:
def modelfit(model, features, labels, cv_folds=5, early_stopping_rounds=50):
    """Pick ``n_estimators`` by cross-validation, then fit ``model`` in place.

    Runs ``xgb.cv`` with the model's own booster parameters and AUC as metric,
    using the model's current ``n_estimators`` as the maximum number of rounds.
    The number of rows in the CV result becomes the new ``n_estimators`` before
    the final fit. Nothing is returned; ``model`` is mutated.

    Args:
        model: XGBoost scikit-learn style estimator (must expose
            ``get_xgb_params``, ``get_params``, ``set_params`` and ``fit``).
        features: Training features; ``.values`` is fed to the DMatrix.
        labels: Training targets; ``.values`` is fed to the DMatrix.
        cv_folds: Number of cross-validation folds.
        early_stopping_rounds: Early-stopping patience passed to ``xgb.cv``.
    """
    max_rondas = model.get_params()['n_estimators']
    datos_cv = xgb.DMatrix(features.values, label=labels.values)
    historial_cv = xgb.cv(
        model.get_xgb_params(),
        datos_cv,
        num_boost_round=max_rondas,
        nfold=cv_folds,
        metrics='auc',
        early_stopping_rounds=early_stopping_rounds,
    )
    # One row per boosting round reported by the cross-validation run.
    rondas_elegidas = len(historial_cv)
    model.set_params(n_estimators=rondas_elegidas)
    model.fit(features, labels, eval_metric='auc')

Función que entrena un modelo XGBoost con hiperparámetros fijos y calcula el AUC sobre un 20% de los datos reservado para validación.

In [None]:
def entrenar(df_features, labels):
    """Train a fixed-hyperparameter XGBoost model and score it on a hold-out split.

    The features are inner-joined with ``labels`` on the index; the last column
    of the joined frame is taken as the target. 80% of the rows are used to fit
    the model (via ``modelfit``) and the remaining 20% to compute ROC AUC.

    Args:
        df_features: Feature table; its index is the join key.
        labels: Target values joinable onto ``df_features`` (a named Series or a
            single-column DataFrame sharing the same index).

    Returns:
        The ROC AUC obtained on the 20% hold-out split.
    """
    # NOTE(review): a regressor wrapper is combined with a logistic objective,
    # presumably so that predict() yields scores usable for AUC — confirm.
    modelo = xgb.XGBRegressor(
        #booster='dart',
        learning_rate=0.01,
        n_estimators=1000,
        max_depth=4,
        min_child_weight=8,
        gamma=0.22,
        subsample=1,
        colsample_bytree=1,
        objective='binary:logistic',
        nthread=-1,
        scale_pos_weight=18.8,
        #sample_type='weighted',
        #rate_drop=0.1,
        #skip_dropout=0.5,
        random_state=272
    )

    # Only rows present in both the features and the labels are kept.
    df_features = df_features.join(labels, how='inner')
    X, y = df_features.iloc[:, :-1], df_features.iloc[:, -1:]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

    modelfit(modelo, X_train, y_train, early_stopping_rounds=30)

    predsa = modelo.predict(X_test)
    resultado = roc_auc_score(y_test, predsa)
    # The score used to be computed and then dropped; return it to the caller.
    return resultado

# Selección de features

# Entrenamiento