In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from scipy import stats
import seaborn as sns
from sklearn.metrics import mean_squared_error

# Carga de los df de features para entrenar y predecir

In [None]:
# One feature table per window directory. Windows 18_20 .. 21_23 are the ones
# the training cells fit on; 24_26 is the window the prediction cells score.
(
    features_18_20,
    features_19_21,
    features_20_22,
    features_21_23,
    features_24_26,
) = map(
    pd.read_csv,
    (
        "windows/18_20/features.csv",
        "windows/19_21/features.csv",
        "windows/20_22/features.csv",
        "windows/21_23/features.csv",
        "windows/24_26/features.csv",
    ),
)

In [None]:
# Auction labels, one file per training window (same window names as the features).
(
    label_auc_18_20,
    label_auc_19_21,
    label_auc_20_22,
    label_auc_21_23,
) = map(
    pd.read_csv,
    (
        "windows/18_20/labels_auc.csv",
        "windows/19_21/labels_auc.csv",
        "windows/20_22/labels_auc.csv",
        "windows/21_23/labels_auc.csv",
    ),
)

In [None]:
# Install labels, one file per training window (same window names as the features).
(
    label_inst_18_20,
    label_inst_19_21,
    label_inst_20_22,
    label_inst_21_23,
) = map(
    pd.read_csv,
    (
        "windows/18_20/labels_inst.csv",
        "windows/19_21/labels_inst.csv",
        "windows/20_22/labels_inst.csv",
        "windows/21_23/labels_inst.csv",
    ),
)

# Funciones de entrenamiento

In [None]:
def guardar_submit(params, result):
    """Append one submission record to ``historial_submits.txt``.

    The record is written as ``\\n<timestamp>|<params>|<result>`` so the history
    file keeps one submission per line.

    Parameters
    ----------
    params : object
        Parameters used for the submission; stored via ``str(params)`` so dicts
        and other non-string values can be logged too.
    result : object or None
        Outcome of the run. ``None`` leaves the last field empty, which keeps
        the ``<timestamp>|<params>|`` layout of lines written without a result.
    """
    # Function-scope import so the cell does not depend on the import cell.
    from datetime import datetime

    # Real wall-clock timestamp instead of the previous "time" placeholder.
    tiempo = datetime.now().isoformat(timespec="seconds")
    resultado = "" if result is None else str(result)
    with open("historial_submits.txt", "a+") as f:
        f.write("\n" + tiempo + "|" + str(params) + "|" + resultado)

In [None]:
def entrenar_modelo(params):
    """Log a submission for ``params`` through ``guardar_submit``.

    Training is not wired in: the logged result is always ``None`` because the
    ``entrenar(params)`` call is disabled.
    """
    # TODO: pass the outcome of entrenar(...) here once training is hooked up.
    guardar_submit(params, None)

In [None]:
def modelfit(model, features, labels, cv_folds=5, early_stopping_rounds=50):
    """Choose the number of boosting rounds by cross-validation, then fit ``model``.

    Runs ``xgb.cv`` with the model's own booster parameters (RMSE metric, early
    stopping), sets ``n_estimators`` to the number of rows in the CV result, and
    fits the model on the full ``features``/``labels``.

    Parameters
    ----------
    model : estimator
        Must expose ``get_xgb_params``, ``get_params``, ``set_params`` and
        ``fit`` (e.g. ``xgb.XGBRegressor``). It is modified in place.
    features, labels : pandas objects
        Training data; their ``.values`` are used to build the ``xgb.DMatrix``.
    cv_folds : int, default 5
        Number of cross-validation folds.
    early_stopping_rounds : int, default 50
        Passed to ``xgb.cv`` as its early-stopping patience.

    Returns
    -------
    None
    """
    parametros = model.get_xgb_params()
    dmatrix = xgb.DMatrix(features.values, label=labels.values)
    # The current n_estimators is the upper bound on rounds tried by the CV run.
    # (verbose_eval=True can be added here to log per-round scores.)
    cvresult = xgb.cv(
        parametros,
        dmatrix,
        num_boost_round=model.get_params()['n_estimators'],
        nfold=cv_folds,
        metrics='rmse',
        early_stopping_rounds=early_stopping_rounds,
    )
    # Use the length of the CV history as the final number of trees.
    model.set_params(n_estimators=cvresult.shape[0])
    # eval_metric is not passed to fit(): that keyword is deprecated in the
    # XGBoost sklearn API and rejected by recent releases, and without an
    # eval_set there is nothing for it to evaluate.
    model.fit(features, labels)

In [None]:
def entrenar(modelo, df_features, labels):
    """Fit ``modelo`` on an 80/20 split of one window and print the hold-out RMSE.

    ``labels`` is left-merged onto ``df_features`` by ``ref_hash``. After moving
    ``ref_hash`` to the index, the last column of the merged frame is used as
    the target and all remaining columns as features.

    Parameters
    ----------
    modelo : estimator
        Object with ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    df_features : pandas.DataFrame
        Feature table containing a ``ref_hash`` column. Not modified.
    labels : pandas.DataFrame
        Table with ``ref_hash`` plus the label; the label has to end up as the
        last column of the merged frame.

    Returns
    -------
    Predictions of ``modelo`` for the held-out 20% of rows.

    Notes
    -----
    Because the merge is a left join, feature rows with no matching label keep
    a NaN target.
    """
    datos = (
        df_features
        .merge(labels, how="left", on="ref_hash")
        .set_index("ref_hash")
    )
    X, y = datos.iloc[:, :-1], datos.iloc[:, -1:]
    # Fixed random_state so every run evaluates on the same hold-out rows.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

    # eval_metric is not passed to fit(): that keyword is deprecated in the
    # XGBoost sklearn API and rejected by recent releases, and without an
    # eval_set it has no effect on training.
    modelo.fit(X_train, y_train)
    # Alternative kept for reference: tune n_estimators by CV before fitting.
    #modelfit(modelo, X_train, y_train, early_stopping_rounds=30)

    prediction = modelo.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, prediction))
    print("RMSE: %f" % (rmse))
    return prediction

# Entrenamiento

In [None]:
# Regressor used for the auction labels (label_auc_*).
# The DART options (booster='dart', sample_type='weighted', rate_drop=0.1,
# skip_dropout=0.5) are left disabled.
model_auc = xgb.XGBRegressor(
    objective='reg:squarederror',
    # Boosting schedule
    learning_rate=0.01,
    n_estimators=1000,
    # Tree shape / split constraints
    max_depth=4,
    min_child_weight=8,
    gamma=0.22,
    # Row / column sampling (1 == use everything)
    subsample=1,
    colsample_bytree=1,
    # NOTE(review): scale_pos_weight is documented as a class-imbalance control;
    # confirm it is intended together with a regression objective.
    scale_pos_weight=18.8,
    nthread=-1,
    random_state=272,
)

In [None]:
# Fit and evaluate on each labelled window in turn; every call prints that
# window's hold-out RMSE.
# NOTE(review): entrenar() calls modelo.fit() on the same estimator each time, so
# presumably only the last window's fit (21_23) remains in model_auc afterwards —
# confirm this is intended rather than training on all windows together.
entrenar(model_auc, features_18_20, label_auc_18_20)
entrenar(model_auc, features_19_21, label_auc_19_21)
entrenar(model_auc, features_20_22, label_auc_20_22)
entrenar(model_auc, features_21_23, label_auc_21_23)

In [None]:
# Regressor used for the install labels (label_inst_*); same hyper-parameters
# as model_auc.
# The DART options (booster='dart', sample_type='weighted', rate_drop=0.1,
# skip_dropout=0.5) are left disabled.
model_inst = xgb.XGBRegressor(
    objective='reg:squarederror',
    # Boosting schedule
    learning_rate=0.01,
    n_estimators=1000,
    # Tree shape / split constraints
    max_depth=4,
    min_child_weight=8,
    gamma=0.22,
    # Row / column sampling (1 == use everything)
    subsample=1,
    colsample_bytree=1,
    # NOTE(review): scale_pos_weight is documented as a class-imbalance control;
    # confirm it is intended together with a regression objective.
    scale_pos_weight=18.8,
    nthread=-1,
    random_state=272,
)

In [None]:
# Fit and evaluate on each labelled window in turn; every call prints that
# window's hold-out RMSE.
# NOTE(review): entrenar() calls modelo.fit() on the same estimator each time, so
# presumably only the last window's fit (21_23) remains in model_inst afterwards —
# confirm this is intended rather than training on all windows together.
entrenar(model_inst, features_18_20, label_inst_18_20)
entrenar(model_inst, features_19_21, label_inst_19_21)
entrenar(model_inst, features_20_22, label_inst_20_22)
entrenar(model_inst, features_21_23, label_inst_21_23)

# Prueba para ver si mejoran las predicciones

In [45]:
# Keep only rows whose install label differs from 259200 (= 3 * 24 * 60 * 60,
# i.e. three days in seconds) and index them by ref_hash.
# NOTE(review): 259200 presumably marks devices with no install inside the
# window — confirm against how labels_inst.csv is generated.
label_inst_test = (
    label_inst_21_23[label_inst_21_23['label_inst'].ne(259200)]
    .set_index('ref_hash')
)

In [48]:
# The inner join is used only to keep the feature rows whose ref_hash survived
# the label filter; the joined label column is dropped again right away.
features_inst_test = (
    features_21_23
    .set_index('ref_hash')
    .join(label_inst_test, how='inner')
    .drop(columns=["label_inst"])
)

In [51]:
# entrenar() merges on a ref_hash *column*, so move it back out of the index.
label_inst_test = label_inst_test.reset_index()
features_inst_test = features_inst_test.reset_index()

In [52]:
# Refit on the filtered 21_23 window and print its hold-out RMSE.
# NOTE(review): this call fits model_inst again, so presumably this is the fit
# the later prediction cell uses — confirm.
entrenar(model_inst, features_inst_test, label_inst_test)

RMSE: 77003.258756


array([122809.25, 128975.59, 135625.6 , ..., 135044.47, 126761.62,
       133669.7 ], dtype=float32)

In [54]:
# Keep only rows whose auction label differs from 259200 (= 3 * 24 * 60 * 60,
# i.e. three days in seconds) and index them by ref_hash.
# NOTE(review): 259200 presumably marks devices with no auction inside the
# window — confirm against how labels_auc.csv is generated.
label_auc_test = (
    label_auc_21_23[label_auc_21_23['label_auc'].ne(259200)]
    .set_index('ref_hash')
)

In [55]:
# The inner join is used only to keep the feature rows whose ref_hash survived
# the label filter; the joined label column is dropped again right away.
features_auc_test = (
    features_21_23
    .set_index('ref_hash')
    .join(label_auc_test, how='inner')
    .drop(columns=["label_auc"])
)

In [56]:
# entrenar() merges on a ref_hash *column*, so move it back out of the index.
label_auc_test = label_auc_test.reset_index()
features_auc_test = features_auc_test.reset_index()

In [57]:
# Refit on the filtered 21_23 window and print its hold-out RMSE.
# NOTE(review): this call fits model_auc again, so presumably this is the fit
# the later prediction cell uses — confirm.
entrenar(model_auc, features_auc_test, label_auc_test)

RMSE: 64935.301571


array([ 37582.746,  11663.997, 115039.23 , ...,  73054.76 ,  86526.43 ,
       104295.086], dtype=float32)

# Predecir

In [58]:
# Score the 24_26 window with the auction model (ref_hash is moved to the index
# so it is not fed to the model as a feature) and pair each prediction with its
# ref_hash under the column name 'obj'.
pred_auctions = model_auc.predict(features_24_26.set_index("ref_hash"))
df_preds_auctions = features_24_26[['ref_hash']].assign(obj=pred_auctions)

In [59]:
# Score the 24_26 window with the install model (ref_hash is moved to the index
# so it is not fed to the model as a feature) and pair each prediction with its
# ref_hash under the column name 'obj'.
pred_installs = model_inst.predict(features_24_26.set_index("ref_hash"))
df_preds_installs = features_24_26[['ref_hash']].assign(obj=pred_installs)

# Submit to Kaggle

In [60]:
def export_df(df, name):
    """Write ``df`` as CSV to the path ``name``, leaving the index out of the file."""
    df.to_csv(path_or_buf=name, index=False)

Las predicciones deben tener los `ref_hash` como índice para no perder la referencia.
No es necesario filtrar las predicciones para quedarnos solo con los `ref_hash` del target, ya que de eso
se encarga la función `create_submit_df`.

In [61]:
# Ids requested by the competition; create_submit_df() reads its 'ref_hash'
# column to keep only the matching prediction rows.
target = pd.read_csv("target_competencia_ids.csv")

In [62]:
def create_submit_df(auctions_predictions, installs_predictions, target):
    """Build the submission frame from the two prediction frames.

    Each prediction frame must be indexed by ref_hash and hold a single value
    column. Auction ids get the suffix ``_sc`` and install ids the suffix
    ``_st``; both sets are stacked (installs first), restricted to the ids
    listed in ``target['ref_hash']`` and sorted by id.

    NOTE(review): the suffix-to-model mapping (auctions -> ``_sc``,
    installs -> ``_st``) is kept exactly as written; confirm it matches the id
    convention of the competition's target file.

    Parameters
    ----------
    auctions_predictions, installs_predictions : pandas.DataFrame
        One value column, indexed by ref_hash. Not modified.
    target : pandas.DataFrame
        Must contain a ``ref_hash`` column with the suffixed ids to keep.

    Returns
    -------
    pandas.DataFrame
        Columns ``ref_hash`` and ``obj``; the row labels are the positions the
        rows had in the stacked frame before filtering and sorting.
    """
    def _with_suffix(predictions, suffix):
        # Move the id out of the index, normalise the column names and tag the id.
        tagged = predictions.reset_index()
        tagged.columns = ['ref_hash', 'obj']
        tagged['ref_hash'] = tagged['ref_hash'].astype(str) + suffix
        return tagged.set_index('ref_hash')

    wanted_ids = target['ref_hash'].tolist()

    auctions_tagged = _with_suffix(auctions_predictions, "_sc")
    installs_tagged = _with_suffix(installs_predictions, "_st")

    stacked = pd.concat([installs_tagged, auctions_tagged]).reset_index()
    keep = stacked['ref_hash'].isin(wanted_ids)
    return stacked.loc[keep].sort_values(by='ref_hash')

In [63]:
# create_submit_df() expects both prediction frames indexed by ref_hash.
kaggle_sub = create_submit_df(
    auctions_predictions=df_preds_auctions.set_index('ref_hash'),
    installs_predictions=df_preds_installs.set_index('ref_hash'),
    target=target,
)

In [64]:
# Echo the submission frame for a visual check.
kaggle_sub

Unnamed: 0,ref_hash,obj
703214,1000169251625791246_sc,54561.507812
242939,1000169251625791246_st,129908.804688
463356,1000395625957344683_sc,54885.867188
3081,1000395625957344683_st,129908.804688
467117,1003027494996471685_sc,78734.179688
6842,1003027494996471685_st,133725.859375
601704,1006670001679961544_sc,79591.859375
141429,1006670001679961544_st,127395.609375
494980,1007573308966476713_sc,64137.042969
34705,1007573308966476713_st,134368.671875


In [None]:
# Number of rows that will be written to the submission file.
len(kaggle_sub)

In [65]:
# Write the submission to submit.csv (export_df omits the index column).
export_df(kaggle_sub, "submit.csv")