In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from scipy import stats
import seaborn as sns
from sklearn.metrics import mean_squared_error

# Carga de los df de features para entrenar

In [2]:
features_18_20 = pd.read_csv("windows/18_20/features.csv") 
features_19_21 = pd.read_csv("windows/19_21/features.csv")
features_20_22 = pd.read_csv("windows/20_22/features.csv")
features_21_23 = pd.read_csv("windows/21_23/features.csv")
features_24_26 = pd.read_csv("windows/18_20/features.csv") 

In [3]:
label_auc_18_20 = pd.read_csv("windows/18_20/labels_auc.csv")
label_auc_19_21 = pd.read_csv("windows/19_21/labels_auc.csv")
label_auc_20_22 = pd.read_csv("windows/20_22/labels_auc.csv")
label_auc_21_23 = pd.read_csv("windows/21_23/labels_auc.csv")

In [4]:
label_inst_18_20 = pd.read_csv("windows/18_20/labels_inst.csv")
label_inst_19_21 = pd.read_csv("windows/19_21/labels_inst.csv")
label_inst_20_22 = pd.read_csv("windows/20_22/labels_inst.csv")
label_inst_21_23 = pd.read_csv("windows/21_23/labels_inst.csv")

# Funciones de entrenamiento

In [5]:
def guardar_submit(params, result):
    tiempo = "time"
    with open("historial_submits.txt","a+") as f:
        f.write("\n"+tiempo+"|"+params+"|")

In [6]:
def entrenar_modelo(params):
    result = None#entrenar(params)
    guardar_submit(params, result)

In [7]:
def modelfit(model, features, labels, cv_folds=5, early_stopping_rounds=50):
    parametros = model.get_xgb_params()
    dmatrix = xgb.DMatrix(features.values, label=labels.values)
    cvresult = xgb.cv(parametros, dmatrix, num_boost_round=model.get_params()['n_estimators'],
                      nfold=cv_folds,metrics='rmse', #verbose_eval=True,
                      early_stopping_rounds=early_stopping_rounds)
    model.set_params(n_estimators=cvresult.shape[0])
    model.fit(features, labels,eval_metric='rmse')

In [8]:
def entrenar(modelo, df_features, labels):
    df_features = df_features.merge(labels, how="left", left_on="ref_hash", right_on="ref_hash")
    df_features.set_index("ref_hash", inplace=True)
    X, y = df_features.iloc[:,:-1], df_features.iloc[:,-1:]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

    modelo.fit(X_train, y_train, eval_metric='rmse')
    #modelfit(modelo, X_train, y_train, early_stopping_rounds=30)

    prediction = modelo.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, prediction))
    print("RMSE: %f" % (rmse))
    return prediction

# Entrenamiento

In [9]:
model_auc = xgb.XGBRegressor(
    #booster='dart',
    learning_rate =0.01,
    n_estimators=1000,
    max_depth=4,
    min_child_weight=8,
    gamma=0.22,
    subsample=1,
    colsample_bytree=1,
    #objective='reg:linear',
    nthread=-1,
    scale_pos_weight=18.8,
    #sample_type='weighted',
    #rate_drop=0.1,
    #skip_dropout=0.5,
    random_state=272
)

prediction_auc = entrenar(model_auc, features_18_20, label_auc_18_20)

RMSE: 97244.100427


In [10]:
model_inst = xgb.XGBRegressor(
    #booster='dart',
    learning_rate =0.01,
    n_estimators=1000,
    max_depth=4,
    min_child_weight=8,
    gamma=0.22,
    subsample=1,
    colsample_bytree=1,
    #objective='reg:linear',
    nthread=-1,
    scale_pos_weight=18.8,
    #sample_type='weighted',
    #rate_drop=0.1,
    #skip_dropout=0.5,
    random_state=272
)

prediction_inst = entrenar(model_inst, features_18_20, label_inst_18_20)

RMSE: 57162.476822


# Predecir

In [12]:
model_auc.predict(features_24_26.set_index("ref_hash"))


array([ 90608.74,  90608.74,  90608.74, ..., 163545.81, 163545.81,
       163545.81], dtype=float32)

# Submit to Kaggle

Las predicciones tendrán seteadas como índice los ref_hash para no perder la referencia
No es necesario filtrar los ref_hash y quedarnos solo con los target en las predicciones que obtenemos ya que de eso
se encarga la función 

In [None]:
example_submit = labels_inst_18_20.set_index('ref_hash')
example_submit.head(2)

In [None]:
targets = pd.read_csv('target_final_competencia_revamped.csv')
targets.head(1)

In [None]:
def create_submit_df(auctions_predictions, installs_predictions, target):
    
    target = target.set_index('ref_hash')
    
    auc = auctions_predictions.reset_index()
    auc.columns = ['ref_hash','obj']
    auc['ref_hash'] = auc['ref_hash'].astype(str) + "_sc"
    auc = auc.set_index('ref_hash')
    
    ins = installs_predictions.reset_index()
    ins.columns = ['ref_hash','obj']
    ins['ref_hash'] = ins['ref_hash'].astype(str) + "_st"
    ins = ins.set_index('ref_hash')
    
    frames = [ins,auc]
    submit_result = pd.concat(frames).reset_index()
    target_list = target.reset_index('ref_hash')['ref_hash'].tolist()
    return submit_result.loc[submit_result['ref_hash'].isin(target_list)].sort_values(by = 'ref_hash')

In [None]:
kaggle_sub = create_submit_df(labels_inst_18_20.set_index('ref_hash'), labels_inst_18_20.set_index('ref_hash'), targets)

In [None]:
kaggle_sub.head(2)