In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from scipy import stats
import seaborn as sns
from sklearn.metrics import mean_squared_error

# Module-level trackers, both initialised to 0. Nothing in the visible cells
# reads or updates them -- presumably the previous score and the best score
# seen so far (TODO confirm against the training cells).
resultado_anterior = 0
maximo = 0

# Carga de datos

In [2]:
# Load the auctions log, storing the two id columns as unsigned 8-bit integers.
auctions = pd.read_csv("auctions.csv", dtype={"ref_type_id": np.uint8, "source_id": np.uint8})
# Parse "date" into datetimes; values that cannot be parsed become NaT.
auctions["date"] =  pd.to_datetime(auctions["date"], errors = "coerce")

In [None]:
# Narrow dtypes requested for the clicks columns when the CSV is parsed.
clicks_dtypes = {'advertiser': np.int8, 'action_id': np.float16, 'source_id': np.int8, 'country_code': 'category',
                 'latitude': np.float16, 'longitude': np.float16, 'carrier_id': np.float16, 'brand': np.float16,
                 'touchX': np.float16, 'touchY': np.float16}
# read_csv takes the per-column mapping through ``dtype`` (singular);
# ``dtypes`` is not a valid keyword and raises TypeError.
clicks = pd.read_csv('clicks.csv', dtype = clicks_dtypes, low_memory=False)
# Parse "created" into datetimes; values that cannot be parsed become NaT.
clicks["created"] =  pd.to_datetime(clicks["created"], errors = "coerce")

In [None]:
# Load the events log; low_memory=False makes pandas read the file in one pass.
events = pd.read_csv('events.csv', low_memory=False)
# Parse "date" into datetimes; values that cannot be parsed become NaT.
events["date"] =  pd.to_datetime(events["date"], errors = "coerce")

In [3]:
# Load the installs log; low_memory=False makes pandas read the file in one pass.
installs = pd.read_csv('installs.csv', low_memory = False)
# Parse "created" into datetimes; values that cannot be parsed become NaT.
installs["created"] =  pd.to_datetime(installs["created"], errors = "coerce") 

# Obtención de ventanas para entrenar y validar

In [6]:
def create_window(df, initial_date, final_date, date_feature_name):
    """Return the rows of ``df`` dated within ``[initial_date, final_date)``.

    The upper bound is exclusive, so ``final_date`` must be 00:00 of the day
    after the last day that should belong to the window.

    Parameters
    ----------
    df : DataFrame holding a datetime column named ``date_feature_name``.
    initial_date, final_date : anything ``pd.to_datetime`` accepts.
    date_feature_name : name of the column used for filtering.

    Returns
    -------
    The selected rows of ``df`` (obtained with ``df.loc``), original index kept.
    """
    window_start = pd.to_datetime(initial_date)
    window_end = pd.to_datetime(final_date)
    dates = df[date_feature_name]
    # Rows whose date is NaT fail both comparisons and are therefore left out.
    in_window = (dates >= window_start) & (dates < window_end)
    return df.loc[in_window]

In [6]:
# Three-day sliding windows over the auctions log (the end date is exclusive).
auctions_18_20 =  create_window(auctions, "2019-04-18", "2019-04-21", "date")
auctions_19_21 =  create_window(auctions, "2019-04-19", "2019-04-22", "date")
auctions_20_22 =  create_window(auctions, "2019-04-20", "2019-04-23", "date")
auctions_21_23 =  create_window(auctions, "2019-04-21", "2019-04-24", "date")
auctions_22_24 =  create_window(auctions, "2019-04-22", "2019-04-25", "date")
# The 23-25 and 24-26 windows are used as label sources for the 20-22 and
# 21-23 windows in the label cell; without them that cell hits a NameError on
# a fresh kernel. They mirror the installs windows of the same dates.
auctions_23_25 =  create_window(auctions, "2019-04-23", "2019-04-26", "date")
auctions_24_26 =  create_window(auctions, "2019-04-24", "2019-04-27", "date")

In [None]:
# Three-day sliding windows over the events log (the end date is exclusive).
events_18_20 = create_window(events, initial_date="2019-04-18", final_date="2019-04-21", date_feature_name="date")
events_19_21 = create_window(events, initial_date="2019-04-19", final_date="2019-04-22", date_feature_name="date")
events_20_22 = create_window(events, initial_date="2019-04-20", final_date="2019-04-23", date_feature_name="date")
events_21_23 = create_window(events, initial_date="2019-04-21", final_date="2019-04-24", date_feature_name="date")
events_22_24 = create_window(events, initial_date="2019-04-22", final_date="2019-04-25", date_feature_name="date")

In [None]:
# Three-day sliding windows over the clicks log (the end date is exclusive).
clicks_18_20 = create_window(clicks, initial_date="2019-04-18", final_date="2019-04-21", date_feature_name="created")
clicks_19_21 = create_window(clicks, initial_date="2019-04-19", final_date="2019-04-22", date_feature_name="created")
clicks_20_22 = create_window(clicks, initial_date="2019-04-20", final_date="2019-04-23", date_feature_name="created")
clicks_21_23 = create_window(clicks, initial_date="2019-04-21", final_date="2019-04-24", date_feature_name="created")
clicks_22_24 = create_window(clicks, initial_date="2019-04-22", final_date="2019-04-25", date_feature_name="created")

In [7]:
# Three-day sliding windows over the installs log (the end date is exclusive).
# The last two windows (23-25, 24-26) serve as label sources for earlier windows.
installs_18_20 = create_window(installs, initial_date="2019-04-18", final_date="2019-04-21", date_feature_name="created")
installs_19_21 = create_window(installs, initial_date="2019-04-19", final_date="2019-04-22", date_feature_name="created")
installs_20_22 = create_window(installs, initial_date="2019-04-20", final_date="2019-04-23", date_feature_name="created")
installs_21_23 = create_window(installs, initial_date="2019-04-21", final_date="2019-04-24", date_feature_name="created")
installs_22_24 = create_window(installs, initial_date="2019-04-22", final_date="2019-04-25", date_feature_name="created")
installs_23_25 = create_window(installs, initial_date="2019-04-23", final_date="2019-04-26", date_feature_name="created")
installs_24_26 = create_window(installs, initial_date="2019-04-24", final_date="2019-04-27", date_feature_name="created")

# Obtención de labels

In [4]:
def calculate_label(reference_df, df_labels, id_feature_name, reference_date, label_name, date_feature_name):
    """Build a label: seconds from ``reference_date`` to each id's first record in ``df_labels``.

    Parameters
    ----------
    reference_df : frame whose ``id_feature_name`` column lists the ids to label.
    df_labels : frame searched for the earliest ``date_feature_name`` per id.
    id_feature_name : name of the id column, present in both frames.
    reference_date : origin of the time measurement (anything ``pd.to_datetime`` accepts).
    label_name : name given to the resulting label column.
    date_feature_name : datetime column of ``df_labels``.

    Returns
    -------
    DataFrame with the ``id_feature_name`` column of ``reference_df`` (rows are
    not de-duplicated) and ``label_name``. Ids with no record in ``df_labels``
    get 259200, the number of seconds in three days.
    """
    censored_seconds = 259200  # 3 days * 24 h * 3600 s
    origin = pd.to_datetime(reference_date)

    # Earliest record per id inside the label frame.
    first_seen = (
        df_labels
        .groupby(id_feature_name)
        .agg({date_feature_name: 'min'})
        .reset_index()
    )
    elapsed = first_seen[date_feature_name] - origin
    first_seen[label_name] = elapsed.dt.total_seconds()
    first_seen = first_seen.drop(columns=date_feature_name)

    # A left merge keeps every reference row; ids without a match come out as NaN.
    ids_only = reference_df[[id_feature_name]]
    labelled = ids_only.merge(first_seen, how='left', on=id_feature_name)
    return labelled.fillna(censored_seconds)

Labels de subastas. Segundos hasta que el usuario vuelva a aparecer en una subasta

In [None]:
# Seconds are measured from 00:00 of the first day of the *label* window (the
# second frame passed), so that labels fall in [0, 259200) and the 259200 fill
# value that calculate_label assigns to devices that never reappear is the
# upper bound. Measuring from the start of the training window instead would
# shift every real label to >= 259200 and make "never reappears" the minimum.
labels_auc_18_20 = calculate_label(auctions_18_20, auctions_21_23, "device_id", "2019-04-21", "secs_to_appear", "date")
labels_auc_19_21 = calculate_label(auctions_19_21, auctions_22_24, "device_id", "2019-04-22", "secs_to_appear", "date")
labels_auc_20_22 = calculate_label(auctions_20_22, auctions_23_25, "device_id", "2019-04-23", "secs_to_appear", "date")
labels_auc_21_23 = calculate_label(auctions_21_23, auctions_24_26, "device_id", "2019-04-24", "secs_to_appear", "date")
# TODO: the 22-24 window cannot be labelled yet -- its label window (25-27)
# needs data for the 27th, which is not available.
#labels_auc_22_24 = calculate_label(auctions_22_24, auctions_25_27, "device_id", "2019-04-25", "secs_to_appear", "date")

Labels de installs. Segundos hasta que el usuario vuelva a convertir

In [11]:
# Seconds are measured from 00:00 of the first day of the *label* window (the
# second frame passed), so that labels fall in [0, 259200) and the 259200 fill
# value that calculate_label assigns to users with no later install is the
# upper bound. Measuring from the start of the training window instead would
# shift every real label to >= 259200 and make "never converts" the minimum.
labels_inst_18_20 = calculate_label(installs_18_20, installs_21_23, "ref_hash",  "2019-04-21", "seconds_to_install", "created")
labels_inst_19_21 = calculate_label(installs_19_21, installs_22_24, "ref_hash",  "2019-04-22", "seconds_to_install", "created")
labels_inst_20_22 = calculate_label(installs_20_22, installs_23_25, "ref_hash",  "2019-04-23", "seconds_to_install", "created")
labels_inst_21_23 = calculate_label(installs_21_23, installs_24_26, "ref_hash",  "2019-04-24", "seconds_to_install", "created")
# TODO: the 22-24 window needs a 25-27 label window, which is not built in this notebook.
#labels_inst_22_24 = calculate_label(installs_22_24, installs_25_27, "ref_hash",  "2019-04-25", "seconds_to_install", "created")

In [30]:
# Show the first three label rows as a quick sanity check.
labels_inst_18_20.head(3)

Unnamed: 0,ref_hash,seconds_to_install
0,5230323462636548010,259200.0
1,5097163995161606833,259200.0
2,6328027616411983332,259200.0


# Creación de estructura de los dataframes para entrenar

In [45]:
def initialize_trainning_df(df, index_name):
    """Create the empty skeleton of a training frame, indexed by unique id.

    Pass the window frame to which features will be added later.

    Parameters
    ----------
    df : window frame containing the id column.
    index_name : name of the id column in ``df``.

    Returns
    -------
    DataFrame with no columns whose index, named ``ref_hash``, holds the
    unique values of ``df[index_name]`` in order of first appearance.
    """
    unique_ids = df[index_name].unique()
    skeleton = pd.DataFrame({'ref_hash': unique_ids})
    return skeleton.set_index('ref_hash')

In [47]:
# Build the skeleton for the 18-20 auctions window (one index entry per unique
# device_id) and display its first rows.
example = initialize_trainning_df(auctions_18_20, "device_id")
example.head()

1109595589636746168
5896614299191635403
4172466725848941608
2616279795187318849
8034952072073026056


# Funciones de entrenamiento

In [None]:
def guardar_submit(params, result):
    """Append one record to ``historial_submits.txt`` in the working directory.

    Each record goes on a new line as ``timestamp|params|result``. ``params``
    and ``result`` are converted with ``str`` formatting, so dicts, numbers or
    ``None`` are accepted as well as strings.

    Parameters
    ----------
    params : parameters used for the run.
    result : score obtained for the run.
    """
    # Local import: datetime is not among the notebook's top-level imports.
    from datetime import datetime

    # Real wall-clock time of the call (previously the literal string "time").
    tiempo = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    with open("historial_submits.txt", "a") as f:
        # The result now fills the field after the last separator, which used
        # to be left empty.
        f.write("\n{}|{}|{}".format(tiempo, params, result))

In [None]:
def entrenar_modelo(params):
    """Record a submit entry for ``params``; training itself is not wired in yet.

    The result handed to ``guardar_submit`` is always ``None`` for now.
    """
    # Placeholder kept from the original code, where the call was commented
    # out as: entrenar(params)
    placeholder_result = None
    guardar_submit(params, placeholder_result)

In [None]:
def modelfit(model, features, labels, cv_folds=5, early_stopping_rounds=50):
    """Choose ``n_estimators`` through ``xgb.cv`` and then fit ``model`` in place.

    Parameters
    ----------
    model : xgboost scikit-learn style estimator; it is mutated and nothing is returned.
    features, labels : training data; ``.values`` of each is used for the DMatrix.
    cv_folds : number of folds handed to ``xgb.cv``.
    early_stopping_rounds : early-stopping patience handed to ``xgb.cv``.
    """
    booster_params = model.get_xgb_params()
    cv_data = xgb.DMatrix(features.values, label=labels.values)
    max_rounds = model.get_params()['n_estimators']

    cv_history = xgb.cv(
        booster_params,
        cv_data,
        num_boost_round=max_rounds,
        nfold=cv_folds,
        metrics='auc',
        early_stopping_rounds=early_stopping_rounds,
    )

    # The number of rows in the CV history becomes the final number of trees.
    rounds_kept = cv_history.shape[0]
    model.set_params(n_estimators=rounds_kept)
    model.fit(features, labels, eval_metric='auc')

In [None]:
def entrenar(df_features, labels):
    """Train an XGBoost model on ``df_features`` and return its hold-out ROC AUC.

    ``labels`` is inner-joined onto ``df_features`` (``DataFrame.join``, i.e.
    on the index), the last column of the joined frame is taken as the target,
    and the rows are split 80/20 with a fixed random state. The model is fitted
    on the training part through ``modelfit`` and scored on the held-out part.

    Parameters
    ----------
    df_features : feature frame.
    labels : frame or named series joined as the last (target) column.

    Returns
    -------
    The ``roc_auc_score`` of the predictions on the 20% hold-out split.
    The score used to be computed and then discarded.
    """
    # NOTE(review): an XGBRegressor is configured with a binary:logistic
    # objective, scale_pos_weight and an AUC metric, which only suits a 0/1
    # target -- confirm this matches the labels being passed in.
    modelo = xgb.XGBRegressor(
        #booster='dart',
        learning_rate =0.01,
        n_estimators=1000,
        max_depth=4,
        min_child_weight=8,
        gamma=0.22,
        subsample=1,
        colsample_bytree=1,
        objective= 'binary:logistic',
        nthread=-1,
        scale_pos_weight=18.8,
        #sample_type='weighted',
        #rate_drop=0.1,
        #skip_dropout=0.5,
        random_state=272
    )

    # Keep only the rows present in both frames; the target ends up last.
    dataset = df_features.join(labels, how = 'inner')
    X, y = dataset.iloc[:,:-1], dataset.iloc[:,-1:]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

    modelfit(modelo, X_train, y_train, early_stopping_rounds=30)

    predsa = modelo.predict(X_test)
    resultado = roc_auc_score(y_test, predsa)
    return resultado

# Entrenamiento

# Submit to Kaggle

Las predicciones tendrán seteadas como índice los ref_hash para no perder la referencia

In [29]:
def create_submit_df(auctions_predictions, installs_predictions, target):
    return resul