In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from scipy import stats
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

# Carga de los df de features para entrenar y predecir

In [2]:
# Per-window feature tables. Windows 18_20 .. 21_23 are paired with label
# files in the following cells; 24_26 has no labels loaded in this notebook
# and is the window the final predictions are computed on.
(
    features_18_20,
    features_19_21,
    features_20_22,
    features_21_23,
    features_24_26,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "windows/18_20/features.csv",
        "windows/19_21/features.csv",
        "windows/20_22/features.csv",
        "windows/21_23/features.csv",
        "windows/24_26/features.csv",
    )
)

In [3]:
# Auction labels, one file per training window.
(
    label_auc_18_20,
    label_auc_19_21,
    label_auc_20_22,
    label_auc_21_23,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "windows/18_20/labels_auc.csv",
        "windows/19_21/labels_auc.csv",
        "windows/20_22/labels_auc.csv",
        "windows/21_23/labels_auc.csv",
    )
)

In [4]:
# Install labels, one file per training window.
(
    label_inst_18_20,
    label_inst_19_21,
    label_inst_20_22,
    label_inst_21_23,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "windows/18_20/labels_inst.csv",
        "windows/19_21/labels_inst.csv",
        "windows/20_22/labels_inst.csv",
        "windows/21_23/labels_inst.csv",
    )
)

Clasificador

In [5]:
# Binary classification target for auctions: 1 when the label equals 259200
# (3 days expressed in seconds), 0 otherwise.
# The comparison is cast to int so the classes are the integers 0/1, matching
# how the install labels of the first window are built and how the balancing
# code looks the classes up (value_counts()[0] / [1], == 1).
labels_clas_auc_18_20 = pd.DataFrame({
    'ref_hash': label_auc_18_20['ref_hash'],
    'label_auc': (label_auc_18_20['label_auc'] == 259200).astype(int),
})
labels_clas_auc_19_21 = pd.DataFrame({
    'ref_hash': label_auc_19_21['ref_hash'],
    'label_auc': (label_auc_19_21['label_auc'] == 259200).astype(int),
})
labels_clas_auc_20_22 = pd.DataFrame({
    'ref_hash': label_auc_20_22['ref_hash'],
    'label_auc': (label_auc_20_22['label_auc'] == 259200).astype(int),
})
labels_clas_auc_21_23 = pd.DataFrame({
    'ref_hash': label_auc_21_23['ref_hash'],
    'label_auc': (label_auc_21_23['label_auc'] == 259200).astype(int),
})

In [6]:
# Binary classification target for installs: 1 when the label equals 259200
# (3 days expressed in seconds), 0 otherwise.
# Every window is cast to int. Previously only the first one was, so stacking
# the four windows mixed integer and boolean labels in the same column.
labels_clas_inst_18_20 = pd.DataFrame({
    'ref_hash': label_inst_18_20['ref_hash'],
    'label_inst': (label_inst_18_20['label_inst'] == 259200).astype(int),
})
labels_clas_inst_19_21 = pd.DataFrame({
    'ref_hash': label_inst_19_21['ref_hash'],
    'label_inst': (label_inst_19_21['label_inst'] == 259200).astype(int),
})
labels_clas_inst_20_22 = pd.DataFrame({
    'ref_hash': label_inst_20_22['ref_hash'],
    'label_inst': (label_inst_20_22['label_inst'] == 259200).astype(int),
})
labels_clas_inst_21_23 = pd.DataFrame({
    'ref_hash': label_inst_21_23['ref_hash'],
    'label_inst': (label_inst_21_23['label_inst'] == 259200).astype(int),
})

# Funciones de entrenamiento

In [7]:
def guardar_submit(params, result):
    """Append one submission record to ``historial_submits.txt``.

    The record is written on a new line as ``<timestamp>|<params>|<result>``.

    Parameters
    ----------
    params : object
        Description of the model parameters used; converted with ``str``.
    result : object
        Score obtained by the submission; converted with ``str``.
    """
    from datetime import datetime

    # Real wall-clock timestamp instead of the placeholder literal "time".
    tiempo = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    with open("historial_submits.txt", "a+") as f:
        # str() lets callers pass dicts/floats; the result fills the last field,
        # which used to be left empty after the trailing "|".
        f.write("\n" + tiempo + "|" + str(params) + "|" + str(result))

In [8]:
def entrenar(modelo, df_features, labels):
    """Fit ``modelo`` on an 80/20 split and report the RMSE on the held-out part.

    Parameters
    ----------
    modelo : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    df_features : pandas.DataFrame
        Feature table containing a ``ref_hash`` column.
    labels : pandas.DataFrame
        Table with ``ref_hash`` plus the target; after the merge the target
        must be the last column.

    Returns
    -------
    array-like
        Predictions of ``modelo`` for the held-out 20% of the rows.

    Notes
    -----
    The merge is a left join on ``ref_hash``: a ``ref_hash`` that occurs more
    than once in both inputs is expanded into every feature/label pairing, and
    feature rows without a label get a missing target.
    """
    datos = df_features.merge(labels, how="left", on="ref_hash").set_index("ref_hash")
    # The target is the last column; keep it 1-D, which is what estimators expect.
    X, y = datos.iloc[:, :-1], datos.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

    # `eval_metric` is not passed to fit(): without an eval_set it has no effect,
    # newer xgboost releases reject it there, and other estimators never accepted it.
    modelo.fit(X_train, y_train)

    prediction = modelo.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, prediction))
    print("RMSE: %f" % (rmse))
    return prediction

# Modelos

In [19]:
# Hyper-parameters of the auctions regressor.
# Currently disabled options: booster='dart' with its settings
# (sample_type='weighted', rate_drop=0.1, skip_dropout=0.5) and an explicit
# objective='reg:squarederror'.
model_auc_params = dict(
    learning_rate=0.01,
    n_estimators=100,
    max_depth=4,
    min_child_weight=3,
    gamma=0.22,
    subsample=1,
    colsample_bytree=1,
    nthread=-1,
    scale_pos_weight=18.8,
    random_state=272,
    verbosity=2,
)
model_auc = xgb.XGBRegressor(**model_auc_params)

In [16]:
# Hyper-parameters of the installs regressor.
# Currently disabled options: booster='dart' with its settings
# (sample_type='weighted', rate_drop=0.1, skip_dropout=0.5) and an explicit
# objective='reg:squarederror'.
model_inst_params = dict(
    learning_rate=0.01,
    n_estimators=500,
    max_depth=5,
    min_child_weight=8,
    gamma=0.22,
    subsample=1,
    colsample_bytree=1,
    nthread=-1,
    scale_pos_weight=18.8,
    random_state=272,
    verbosity=2,
)
model_inst = xgb.XGBRegressor(**model_inst_params)

## Clasificador [Work in progress] 

In [None]:
# Attach the binary auction label to each window's features (inner join on ref_hash).
win_1 = features_18_20.merge(labels_clas_auc_18_20, on="ref_hash")
win_2 = features_19_21.merge(labels_clas_auc_19_21, on="ref_hash")
win_3 = features_20_22.merge(labels_clas_auc_20_22, on="ref_hash")
win_4 = features_21_23.merge(labels_clas_auc_21_23, on="ref_hash")

# Stack the windows; ref_hash is an identifier, not a feature, so it is dropped.
df_full = pd.concat([win_1, win_2, win_3, win_4], ignore_index=True).drop(columns='ref_hash')

# Class balancing: randomly undersample whichever class is larger until both
# have the same size. Counting through a boolean mask works for 0/1 and for
# True/False labels, and taking the majority explicitly avoids asking
# np.random.choice for a negative number of rows when class 0 is the larger one.
is_one = df_full["label_auc"] == 1
cant_unos = int(is_one.sum())
cant_ceros = int((~is_one).sum())
a_borrar = abs(cant_unos - cant_ceros)
majority_mask = is_one if cant_unos >= cant_ceros else ~is_one
np.random.seed(10)
index_to_drop = np.random.choice(df_full.index[majority_mask], a_borrar, replace=False)
df_full = df_full.drop(index=index_to_drop)

y = df_full["label_auc"]
X_data = df_full.drop('label_auc', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X_data, y, test_size=0.25, random_state=7)

# Default-parameter classifier, scored by accuracy on the held-out 25%.
model_class = xgb.XGBClassifier()
train_model_class = model_class.fit(X_train, y_train)
pred_model_class = train_model_class.predict(X_test)
print("Accuracy for model 1: %.2f" % (accuracy_score(y_test, pred_model_class) * 100))

In [None]:
# Attach the binary install label to each window's features (inner join on ref_hash).
win_1 = features_18_20.merge(labels_clas_inst_18_20, on="ref_hash")
win_2 = features_19_21.merge(labels_clas_inst_19_21, on="ref_hash")
win_3 = features_20_22.merge(labels_clas_inst_20_22, on="ref_hash")
win_4 = features_21_23.merge(labels_clas_inst_21_23, on="ref_hash")

# Stack the windows under a fresh 0..n-1 index and drop the identifier column.
df_full = pd.concat([win_1, win_2, win_3, win_4], ignore_index=True).drop(columns='ref_hash')

In [None]:
# Balanceo
# Class balancing: randomly undersample whichever class is larger until both
# have the same size. Counting through a boolean mask works for 0/1 and for
# True/False labels, and taking the majority explicitly avoids asking
# np.random.choice for a negative number of rows when class 0 is the larger one.
is_one = df_full["label_inst"] == 1
cant_unos = int(is_one.sum())
cant_ceros = int((~is_one).sum())
a_borrar = abs(cant_unos - cant_ceros)
majority_mask = is_one if cant_unos >= cant_ceros else ~is_one
np.random.seed(10)
index_to_drop = np.random.choice(df_full.index[majority_mask], a_borrar, replace=False)
df_full = df_full.drop(index=index_to_drop)

In [None]:
# Separate the target from the features and hold out 25% for evaluation.
# `X_trian` (sic) is the name the training cell reads, so it is kept as is.
X_data = df_full.drop(columns='label_inst')
y = df_full['label_inst']

X_trian, X_test, y_train, y_test = train_test_split(
    X_data,
    y,
    random_state=7,
    test_size=0.25,
)

In [None]:
# Default-parameter classifier for the install label, scored by accuracy (in %)
# on the held-out split.
model_class = xgb.XGBClassifier()
train_model_class = model_class.fit(X_trian, y_train)
pred_model_class = train_model_class.predict(X_test)
accuracy_pct = 100 * accuracy_score(y_test, pred_model_class)
print("Accuracy for model 1: %.2f" % accuracy_pct)

In [None]:
# How many held-out rows the classifier assigned to each class.
pd.Series(pred_model_class).value_counts()

In [None]:
# Actual class counts in the held-out split, for comparison with the predicted ones.
y_test.value_counts()

In [None]:
# Class counts in the full frame (after the balancing cell has run, if it was executed).
df_full["label_inst"].value_counts()

# Selección de la cantidad de datos en los DataFrame

In [None]:
def select_max_time_rows(df_features, df_labels, label_name, percent, random_state=None):
    """Return the features of a random ``percent``% of the maximum-time rows.

    A row is a "maximum-time" row when its label equals 259200 (3 days in
    seconds). ``percent``% of those label rows are sampled and inner-joined
    with ``df_features`` on ``ref_hash``.

    NOTE(review): the original cell was unfinished (it did not parse and
    returned nothing); returning the joined sample is a best-effort reading
    of its intent -- confirm before relying on it.

    Parameters
    ----------
    df_features : pandas.DataFrame
        Feature table with a ``ref_hash`` column.
    df_labels : pandas.DataFrame
        Label table with ``ref_hash`` and ``label_name`` columns.
    label_name : str
        Name of the label column in ``df_labels``.
    percent : float
        Share of the maximum-time rows to keep, between 0 and 100.
    random_state : int or None, optional
        Seed forwarded to ``DataFrame.sample`` for reproducible sampling.

    Returns
    -------
    pandas.DataFrame
        Feature columns plus ``label_name`` for the sampled rows.

    Raises
    ------
    ValueError
        If ``percent`` is outside ``[0, 100]``.
    """
    if not 0 <= percent <= 100:
        raise ValueError("percent must be between 0 and 100")
    # Labels are numeric, so compare against the integer, not the string "259200".
    df_max_time = df_labels[df_labels[label_name] == 259200]
    # DataFrame.sample needs an integer row count.
    num_samples = int(percent * len(df_max_time) / 100)
    df_max_time_sample = df_max_time.sample(num_samples, random_state=random_state)
    return df_features.merge(df_max_time_sample, on="ref_hash", how="inner")
    

In [27]:
# Boolean mask of the 18_20 rows whose time_to_reappear is 0 -- the rows that
# filter_max_times removes from the features.
features_18_20.time_to_reappear == 0

SyntaxError: invalid syntax (<ipython-input-27-a1253c4c6af7>, line 1)

# Prueba filtrando los tiempos máximos (mejor submit hasta ahora)

In [12]:
def filter_max_times(df_features, df_labels, label_name):
    """Join features with labels, keeping only rows below the maximum time.

    Label rows whose ``label_name`` equals 259200 and feature rows whose
    ``time_to_reappear`` equals 0 are discarded; the remainder is inner-joined
    on ``ref_hash``. The inputs are not modified.

    Author's note: this filtering appears to improve the auction predictions
    considerably.

    Returns
    -------
    pandas.DataFrame
        ``ref_hash``, the feature columns and the label columns, in that order.
    """
    below_max = df_labels[label_name] != 259200
    reappeared = df_features['time_to_reappear'] != 0
    labels_by_hash = df_labels[below_max].set_index('ref_hash')
    features_by_hash = df_features[reappeared].set_index('ref_hash')
    return features_by_hash.join(labels_by_hash, how='inner').reset_index()

In [13]:
# One filtered training frame per window (see filter_max_times), then stacked.
frames_auc = [
    filter_max_times(window_features, window_labels, "label_auc")
    for window_features, window_labels in (
        (features_18_20, label_auc_18_20),
        (features_19_21, label_auc_19_21),
        (features_20_22, label_auc_20_22),
        (features_21_23, label_auc_21_23),
    )
]
train_auc_18_20, train_auc_19_21, train_auc_20_22, train_auc_21_23 = frames_auc

trainning_auc_data = pd.concat(frames_auc)

In [20]:
# Train the auctions regressor. The last column (label_auc, appended by the join
# in filter_max_times) is sliced off the features and passed separately, because
# entrenar merges the labels back in on ref_hash.
# NOTE(review): trainning_auc_data stacks four windows, so a ref_hash present in
# more than one window appears several times on both sides of that merge, which
# would pair every feature row with every label row of that hash -- TODO confirm
# whether hashes repeat across windows.
entrenar(model_auc, trainning_auc_data.iloc[:,:-1], trainning_auc_data[['ref_hash', 'label_auc']])

[19:26:06] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[19:26:13] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[19:26:20] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[19:26:28] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[19:26:35] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[19:26:42] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[19:26:49] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[19:26:56] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[19:27:03] INFO: src/tree/updater_prune.cc:74: tree prun

array([32510.14 , 23793.73 , 39436.43 , ..., 40872.117, 47490.62 ,
       29048.717], dtype=float32)

In [28]:
# One filtered training frame per window (see filter_max_times), then stacked.
frames_inst = [
    filter_max_times(window_features, window_labels, "label_inst")
    for window_features, window_labels in (
        (features_18_20, label_inst_18_20),
        (features_19_21, label_inst_19_21),
        (features_20_22, label_inst_20_22),
        (features_21_23, label_inst_21_23),
    )
]
train_inst_18_20, train_inst_19_21, train_inst_20_22, train_inst_21_23 = frames_inst

trainning_inst_data = pd.concat(frames_inst)

In [29]:
# Train the installs regressor. The last column (label_inst, appended by the join
# in filter_max_times) is sliced off the features and passed separately, because
# entrenar merges the labels back in on ref_hash.
# NOTE(review): trainning_inst_data stacks four windows, so a ref_hash present in
# more than one window appears several times on both sides of that merge, which
# would pair every feature row with every label row of that hash -- TODO confirm
# whether hashes repeat across windows.
entrenar(model_inst, trainning_inst_data.iloc[:,:-1], trainning_inst_data[['ref_hash', 'label_inst']])

[19:43:25] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=5
[19:43:26] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=5
[19:43:27] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=5
[19:43:28] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=5
[19:43:29] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=5
[19:43:30] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=5
[19:43:31] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=5
[19:43:32] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 0 pruned nodes, max_depth=5
[19:43:33] INFO: src/tree/updater_prune.cc:74: tree prun

array([137484.7  , 127836.625, 137383.16 , ..., 115443.766, 128654.99 ,
       137383.16 ], dtype=float32)

# Ensamble de XGBoost con Random Forest

In [None]:
# Installs target: equal-weight average of a random forest and a boosted-tree model.
inst_train = trainning_inst_data.set_index('ref_hash')
# Every column but the last is used as a feature; this assumes label_inst is the
# last column of the training frame -- TODO confirm against the label files.
X, y = inst_train.iloc[:, :-1], inst_train[['label_inst']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

random_forest_inst = RandomForestRegressor(
    n_estimators=100,
    min_samples_split=10,
    random_state=100,
)
# ravel() turns the single-column frame into the 1-D target scikit-learn expects.
random_forest_inst.fit(X_train, y_train.values.ravel())

params = {'objective': 'reg:squarederror',
          'eta': 0.3,
          'max_depth': 5,
          'min_child_weight': 3,
          # 'verbosity': 0 replaces the deprecated 'silent': 1 (same quiet training),
          # in line with the `verbosity` argument used for the XGBRegressor models.
          'verbosity': 0,
          'subsample': 0.7,
          'colsample_bytree': 0.7,
          'seed': 1}

num_trees = 250
xgboost_inst = xgb.train(params, xgb.DMatrix(X_train, label=y_train), num_trees)

# Blend: plain mean of both models' predictions on the held-out 30%.
prediction_inst = (random_forest_inst.predict(X_test) +
                   xgboost_inst.predict(xgb.DMatrix(X_test))) / 2

rmse = np.sqrt(mean_squared_error(y_test, prediction_inst))
print("RMSE: %f" % (rmse))

In [None]:
# Auctions target: equal-weight average of a random forest and a boosted-tree model,
# trained on a subsample of at most 200000 rows.
auc_train = trainning_auc_data.set_index('ref_hash')
# The sample is seeded so the cell gives the same result on every run, and capped
# at the frame length so it does not fail when fewer than 200000 rows are available.
auc_train = auc_train.sample(n=min(200000, len(auc_train)), random_state=100)
# Every column but the last is used as a feature; this assumes label_auc is the
# last column of the training frame -- TODO confirm against the label files.
X, y = auc_train.iloc[:, :-1], auc_train[['label_auc']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

random_forest_auc = RandomForestRegressor(
    n_estimators=100,
    min_samples_split=10,
    random_state=100,
)
# ravel() turns the single-column frame into the 1-D target scikit-learn expects.
random_forest_auc.fit(X_train, y_train.values.ravel())

params = {'objective': 'reg:squarederror',
          'eta': 0.3,
          'max_depth': 5,
          'min_child_weight': 3,
          # 'verbosity': 0 replaces the deprecated 'silent': 1 (same quiet training),
          # in line with the `verbosity` argument used for the XGBRegressor models.
          'verbosity': 0,
          'subsample': 0.7,
          'colsample_bytree': 0.7,
          'seed': 1}

num_trees = 250
xgboost_auc = xgb.train(params, xgb.DMatrix(X_train, label=y_train), num_trees)

# Blend: plain mean of both models' predictions on the held-out 30%.
prediction_auc = (random_forest_auc.predict(X_test) +
                  xgboost_auc.predict(xgb.DMatrix(X_test))) / 2

rmse = np.sqrt(mean_squared_error(y_test, prediction_auc))
print("RMSE: %f" % (rmse))

In [None]:
# Ensemble prediction for the 24_26 window: mean of the forest and the booster.
ensemble_inputs_auc = features_24_26.set_index('ref_hash')
forest_part_auc = random_forest_auc.predict(ensemble_inputs_auc)
booster_part_auc = xgboost_auc.predict(xgb.DMatrix(features_24_26.set_index('ref_hash')))
pred_auctions = (forest_part_auc + booster_part_auc) / 2

In [None]:
# Ensemble prediction for the 24_26 window: mean of the forest and the booster.
ensemble_inputs_inst = features_24_26.set_index('ref_hash')
forest_part_inst = random_forest_inst.predict(ensemble_inputs_inst)
booster_part_inst = xgboost_inst.predict(xgb.DMatrix(features_24_26.set_index('ref_hash')))
pred_installs = (forest_part_inst + booster_part_inst) / 2

# Predecir

In [33]:
# Score the 24_26 window with the auctions regressor and persist the result.
auction_inputs = features_24_26.set_index("ref_hash")
pred_auctions = model_auc.predict(auction_inputs)
df_preds_auctions = pd.DataFrame({
    'ref_hash': features_24_26['ref_hash'],
    'obj': pred_auctions,
})
df_preds_auctions.to_csv("auctions_predictions.csv", index=False)

In [34]:
# Score the 24_26 window with the installs regressor and persist the result.
install_inputs = features_24_26.set_index("ref_hash")
pred_installs = model_inst.predict(install_inputs)
df_preds_installs = pd.DataFrame({
    'ref_hash': features_24_26['ref_hash'],
    'obj': pred_installs,
})
df_preds_installs.to_csv("installs_predictions.csv", index=False)

# Feature Importance

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor

> ### Feature Importance Auctions Model

In [None]:
# Rank the features of the 21_23 window by random-forest importance for label_auc.
# Features and labels are paired by ref_hash (inner join) instead of by row
# position, so a label file in a different order or with different rows cannot
# silently mis-assign targets.
labelled_auc = features_21_23.set_index('ref_hash').join(
    label_auc_21_23.set_index('ref_hash')['label_auc'], how='inner')
X = labelled_auc.drop(columns='label_auc')
Y = labelled_auc['label_auc']
names = X.columns.tolist()
# Seeded so the ranking is the same on every run.
rf = RandomForestRegressor(random_state=0)
rf.fit(X, Y)
print("Features sorted by their score:")
print(sorted(((round(score, 4), name) for score, name in zip(rf.feature_importances_, names)),
             reverse=True))

> ### Feature Importance Installs Model

In [None]:
# Rank the features of the 21_23 window by random-forest importance for label_inst.
# Features and labels are paired by ref_hash (inner join) instead of by row
# position, so a label file in a different order or with different rows cannot
# silently mis-assign targets.
labelled_inst = features_21_23.set_index('ref_hash').join(
    label_inst_21_23.set_index('ref_hash')['label_inst'], how='inner')
X = labelled_inst.drop(columns='label_inst')
Y = labelled_inst['label_inst']
names = X.columns.tolist()
# Seeded so the ranking is the same on every run.
rf = RandomForestRegressor(random_state=0)
rf.fit(X, Y)
print("Features sorted by their score:")
print(sorted(((round(score, 4), name) for score, name in zip(rf.feature_importances_, names)),
             reverse=True))

# Submit to Kaggle

In [35]:
def export_df(df, name):
    """Write ``df`` to the CSV file ``name``, leaving the index out of the file."""
    df.to_csv(path_or_buf=name, index=False)

Las predicciones tendrán los `ref_hash` como índice para no perder la referencia.
No es necesario filtrar los `ref_hash` para quedarnos solo con los del target en las predicciones que obtenemos, ya que de eso
se encarga la función `create_submit_df`.

In [37]:
# Competition target ids; create_submit_df keeps only predictions whose ref_hash is listed here.
target = pd.read_csv("target_competencia_ids.csv")

In [38]:
def create_submit_df(auctions_predictions, installs_predictions, target):
    """Build the submission table from both prediction frames.

    Each prediction frame is expected to be indexed by ``ref_hash`` and to hold
    a single value column. Auction ids get the suffix ``_sc`` and install ids
    the suffix ``_st``; both sets are stacked (installs first), restricted to
    the ids listed in ``target['ref_hash']`` and sorted by ``ref_hash``.

    Returns
    -------
    pandas.DataFrame
        Columns ``ref_hash`` and ``obj``.
    """
    def _with_suffix(predictions, suffix):
        # Flatten the index into a column and tag every id with the suffix.
        flat = predictions.reset_index()
        flat.columns = ['ref_hash', 'obj']
        flat['ref_hash'] = flat['ref_hash'].astype(str) + suffix
        return flat

    stacked = pd.concat(
        [_with_suffix(installs_predictions, "_st"), _with_suffix(auctions_predictions, "_sc")],
        ignore_index=True,
    )
    wanted_ids = target['ref_hash'].tolist()
    keep = stacked['ref_hash'].isin(wanted_ids)
    return stacked[keep].sort_values(by='ref_hash')

In [39]:
# create_submit_df flattens the index itself, so both prediction frames are passed indexed by ref_hash.
kaggle_sub = create_submit_df(df_preds_auctions.set_index('ref_hash'), df_preds_installs.set_index('ref_hash'), target)

In [40]:
# Write the submission table to submit.csv (no index column).
export_df(kaggle_sub, "submit.csv")