In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from scipy import stats
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
np.random.seed(10)

# Carga de los df de features para entrenar y predecir

In [None]:
# Folder names of the training windows found under windows/.
windows = ["18_20", "19_21", "20_22", "21_23"]

# Per-window tables, keyed by window name and indexed by ref_hash.
features, label_auc, label_inst = {}, {}, {}
# Binary versions of the labels: 1 when the raw label equals 259200 (= 3 * 24 * 3600), else 0.
label_clas_auc, label_clas_inst = {}, {}

for window in windows:
    features[window] = pd.read_csv("windows/{}/features.csv".format(window), index_col="ref_hash")

    raw_auc = pd.read_csv("windows/{}/labels_auc.csv".format(window), index_col="ref_hash")
    raw_inst = pd.read_csv("windows/{}/labels_inst.csv".format(window), index_col="ref_hash")
    label_auc[window] = raw_auc
    label_inst[window] = raw_inst

    # Single-column frames that keep the ref_hash index of the raw labels.
    label_clas_auc[window] = (raw_auc['label_auc'] == 259200).astype(int).to_frame()
    label_clas_inst[window] = (raw_inst['label_inst'] == 259200).astype(int).to_frame()

# Features of the window the final predictions are made for.
features_to_predict = pd.read_csv("windows/24_26/features.csv", index_col="ref_hash")

# Funciones de entrenamiento

In [None]:
def guardar_submit(params, result):
    """Append one submission record to ``historial_submits.txt``.

    Each record is written on its own line as ``<timestamp>|<params>|<result>``.

    Args:
        params: Description of the parameters used for the submission. Any
            object is accepted; it is rendered with ``str.format``.
        result: Score (or any other outcome) obtained by the submission.
    """
    # Local import so the function does not depend on the notebook's import cell.
    from datetime import datetime

    # Real wall-clock time instead of the former "time" placeholder.
    tiempo = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    with open("historial_submits.txt", "a") as f:
        # The leading newline keeps the original "one record per appended line" layout.
        f.write("\n{}|{}|{}".format(tiempo, params, result))

In [None]:
def entrenar(modelo, df_features, labels):
    """Fit ``modelo`` on an 80/20 split and print the hold-out RMSE.

    ``df_features`` and ``labels`` are left-merged on ``ref_hash``, which may be
    a column or an index level in either frame. The last column of the merged
    frame is used as the target and every other column as a feature.

    Args:
        modelo: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        df_features: Feature table keyed by ``ref_hash``.
        labels: Label table keyed by ``ref_hash``.

    Returns:
        The predictions of the fitted model on the 20% hold-out rows.
    """
    merged = df_features.merge(labels, how="left", on="ref_hash")
    # When ref_hash is an index level in both inputs, merge keeps it as the
    # index, so there is no column left to promote (set_index would raise).
    if "ref_hash" in merged.columns:
        merged = merged.set_index("ref_hash")

    X, y = merged.iloc[:, :-1], merged.iloc[:, -1:]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

    # No eval_metric here: without an eval_set it has no effect on the fitted
    # model, and passing it fails for scikit-learn estimators and newer XGBoost.
    modelo.fit(X_train, y_train)

    prediction = modelo.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, prediction))
    print("RMSE: %f" % (rmse))
    return prediction

## Selección de features

In [None]:
def select_features(df, feature_list):
    """Return a new frame holding exactly the columns in ``feature_list``, in that order.

    Columns named in ``feature_list`` but absent from ``df`` are created and
    filled with NaN; ``df`` itself is not modified.
    """
    selected = df.reindex(feature_list, axis="columns")
    return selected

In [None]:
# Feature columns used by the auction models (classifier and regressor).
features_list_class_auc = [
    "appearances_in_auctions",
    "user_appeared_last_day",
    "amount_of_clicks",
    "has_installed",
    "user_clicked_last_day",
    "user_installed_last_day",
    "amount_of_installs",
    "mean_time_to_click",
    "amount_auctions_in_last_hour",
    "amount_auctions_in_last_2_hours",
    "amount_auctions_in_last_5_hours",
    "amount_auctions_in_last_12_hours",
    "amount_auctions_in_last_24_hours",
    "amount_auctions_in_first_hour",
    "amount_auctions_in_first_3_hours",
    "amount_auctions_in_first_5_hours",
    "amount_auctions_in_first_12_hours",
    "amount_events_in_first_hour",
    "amount_events_in_first_5_hours",
    "amount_events_in_first_12_hours",
    "amount_clicks_in_last_2_hours",
    "amount_clicks_in_last_4_hours",
    "device_os",
    "time_to_reappear",
    "installs_per_clicks",
    "events_x_app_210",
    "events_x_app_122",
    "events_x_app_65",
    "events_x_app_121",
    "events_x_app_26",
    "most_installed_apps_used",
    "cant_apps_used",
    "std_time_to_click",
    "max_time_install",
    "min_time_install",
    "mean_time_install",
    "std_time_install",
    "max_time_events",
    "mean_time_events",
    "std_time_events",
    "installs_per_events",
    "cant_events_atributed",
    "has_events_atributed",
    "cant_events_0_4",
    "cant_events_4_8",
    "cant_events_8_12",
    "cant_events_12_16",
    "cant_events_16_20",
    "cant_events_20_24",
    "cant_auctions_0_4",
    "cant_auctions_4_8",
    "cant_auctions_8_12",
    "cant_auctions_12_16",
    "cant_auctions_16_20",
    "cant_auctions_20_24",
    "has_events_ids_with_installs",
    "has_events_ids_without_installs",
]

In [None]:
# Feature columns used by the installs models (classifier and regressor).
# The list used to be a verbatim 57-item copy of features_list_class_auc; it is
# now derived from it so the two cannot drift apart by accident. It is a
# separate list object, so it can still be customised independently, e.g.
#   features_list_class_inst.remove('device_os')
features_list_class_inst = list(features_list_class_auc)

# Modelos

## Clasificador

In [None]:
def get_df_to_train(windows, features, labels, feature_list):
    """Build one training frame out of several windows.

    For every name in ``windows`` the selected feature columns of
    ``features[name]`` are inner-joined (on the index) with ``labels[name]``.
    The per-window frames are stacked and the index is replaced by a fresh
    0..n-1 range, so the original index values are not kept.
    """
    per_window = [
        select_features(features[name], feature_list).join(labels[name], how="inner")
        for name in windows
    ]
    return pd.concat(per_window, ignore_index=True)

In [None]:
def balance(df_full, label_name, factor, label_value = 1):
    """Undersample the rows whose label equals ``label_value``.

    The surplus is ``count(label == label_value) - count(label != label_value)``.
    A ``factor`` share of that surplus is dropped at random:

    * ``factor = 1``: fully balanced (both groups end up with the same size).
    * ``factor = 0``: nothing is dropped.

    Args:
        df_full: Frame that contains the label column.
        label_name: Name of the label column.
        factor: Share of the surplus to remove.
        label_value: The over-represented label value to thin out.

    Returns:
        A new frame without the dropped rows. When ``label_value`` is absent or
        is not the majority value there is nothing to remove and an unchanged
        copy is returned (these cases used to raise KeyError / ValueError).
    """
    is_target = df_full[label_name] == label_value
    cant_values = int(is_target.sum())
    cant_no_values = len(df_full) - cant_values
    a_borrar = int((cant_values - cant_no_values) * factor)

    if cant_values == 0 or a_borrar < 0:
        return df_full.copy()

    # Uses the global NumPy RNG on purpose so np.random.seed keeps the draw reproducible.
    index_to_drop = np.random.choice(df_full.index[is_target], a_borrar, replace=False)
    return df_full.drop(index=index_to_drop)

In [None]:
def get_train_test_split(df_full, label_name, test_size=0.3):
    """Split ``df_full`` into train and test parts with a fixed seed.

    Args:
        df_full: Frame holding the feature columns and the label column.
        label_name: Name of the label column; it becomes ``y``.
        test_size: Passed through to ``train_test_split``.

    Returns:
        ``X_train, X_test, y_train, y_test`` as returned by ``train_test_split``
        (``random_state=7``).
    """
    target = df_full[label_name]
    predictors = df_full.drop(columns=label_name)
    return train_test_split(predictors, target, test_size=test_size, random_state=7)

### Clasificador installs

In [None]:
# Installs classifier: predicts the binary label_inst target built in the loading cell.
features_list = features_list_class_inst
# Stack every training window, keeping the selected feature columns plus the label.
df_full = get_df_to_train(windows, features, label_clas_inst, features_list)

# factor=1: drop label-1 rows until both groups have the same size
# (assumes label 1 is the majority value - TODO confirm).
df_full = balance(df_full, "label_inst", 1, 1)

# Default test_size of get_train_test_split (0.3) with its fixed seed.
X_train, X_test, y_train, y_test = get_train_test_split(df_full, "label_inst")

# XGBClassifier with default hyper-parameters; accuracy is reported on the hold-out rows.
model_class_inst = xgb.XGBClassifier()
train_model_class = model_class_inst.fit(X_train, y_train)
pred_model_class = train_model_class.predict(X_test)
print("Accuracy for model installs: %.2f" % (accuracy_score(y_test, pred_model_class) * 100))
print("Using features: "+str(features_list))

In [None]:
# Distribution of the predicted classes on the hold-out rows.
pd.Series(pred_model_class).value_counts()

In [None]:
# Distribution of the true classes on the hold-out rows, to compare with the predictions.
y_test.value_counts()

In [None]:
# Class counts of the balanced installs frame (train and test rows together).
df_full["label_inst"].value_counts()

### Clasificador auctions

In [None]:
# Auctions classifier: same pipeline as the installs classifier, on the binary label_auc target.
# NOTE: df_full, X_train/X_test, y_train/y_test, train_model_class and pred_model_class
# are overwritten here, so the inspection cells above reflect whichever classifier ran last.
df_full = get_df_to_train(windows, features, label_clas_auc, features_list_class_auc)

# factor=1: drop label-1 rows until both groups have the same size
# (assumes label 1 is the majority value - TODO confirm).
df_full = balance(df_full, "label_auc", 1, 1)

X_train, X_test, y_train, y_test = get_train_test_split(df_full, "label_auc")

# XGBClassifier with default hyper-parameters; accuracy is reported on the hold-out rows.
model_class_auc = xgb.XGBClassifier()
train_model_class = model_class_auc.fit(X_train, y_train)
pred_model_class = train_model_class.predict(X_test)
print("Accuracy for model auctions: %.2f" % (accuracy_score(y_test, pred_model_class) * 100))
print("Using features: "+str(features_list_class_auc))

## Modelos de predicción de tiempos

### Modelo auction

In [None]:
# Regression frame for the auction model: the raw label_auc values are the target.
df_full = get_df_to_train(windows, features, label_auc, features_list_class_auc)

# Keep a random half of the rows. No random_state is passed, so the draw depends on
# the global RNG state (seeded in the import cell) and on the cells run before this one.
df_full = df_full.sample(int(len(df_full)*0.5))

# Drop 70% of the surplus of rows whose label is 259200 over all remaining rows.
df_full = balance(df_full, "label_auc", 0.7, 259200)

X_train, X_test, y_train, y_test = get_train_test_split(df_full, "label_auc")

In [None]:
# Regressor for the auction target; these values match the run recorded in the notes cell below.
model_auc = xgb.XGBRegressor(
    learning_rate = 0.1,
    n_estimators=500,
#    min_child_weight= 8,
    max_depth=3,
    gamma=0.1,
    nthread=-1
)

In [None]:
%%time
# eval_metric is no longer passed to fit(): without an eval_set it never affected the
# fitted model, and the fit-time argument is deprecated since XGBoost 1.6 and rejected
# by recent releases. The hold-out RMSE is computed explicitly below.
model_auc.fit(X_train, y_train)

prediction = model_auc.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, prediction))
print("RMSE: %f" % (rmse))

In [None]:
# Recorded run of the auction regressor (kept as comments so the cell can be executed):
#   RMSE: 78968.633457
#   SAMPLE = 0.5
#   BALANCE = 0.7
#
#   learning_rate = 0.1,
#   n_estimators=500,
#   # min_child_weight= 8,
#   max_depth=3,
#   gamma=0.1,
#   nthread=-1

### Modelo installs

In [None]:
# Regression frame for the installs model: the raw label_inst values are the target.
df_full = get_df_to_train(windows, features, label_inst, features_list_class_inst)

# Keep a random half of the rows. No random_state is passed, so the draw depends on
# the global RNG state (seeded in the import cell) and on the cells run before this one.
df_full = df_full.sample(int(len(df_full)*0.5))

# Drop 90% of the surplus of rows whose label is 259200 over all remaining rows.
df_full = balance(df_full, "label_inst", 0.9, 259200)

X_train, X_test, y_train, y_test = get_train_test_split(df_full, "label_inst")

In [None]:
# Regressor for the installs target; unlike model_auc, max_depth is left at the library default.
model_inst = xgb.XGBRegressor(
    learning_rate = 0.1,
    n_estimators=500,
    gamma=0.1,
    nthread=-1
)

In [None]:
%%time
# eval_metric is no longer passed to fit(): without an eval_set it never affected the
# fitted model, and the fit-time argument is deprecated since XGBoost 1.6 and rejected
# by recent releases. The hold-out RMSE is computed explicitly below.
model_inst.fit(X_train, y_train)

prediction = model_inst.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, prediction))
print("RMSE: %f" % (rmse))

In [None]:
# Recorded run of the installs regressor (kept as comments so the cell can be executed):
#   RMSE: 51037.791545
#   SAMPLE 0.5
#   BALANCE 0
#
#   learning_rate = 0.1,
#   n_estimators=500,
#   gamma=0.1,
#   nthread=-1
#
# NOTE(review): the data-preparation cell for this model currently calls balance() with
# factor 0.9, not 0 - confirm which setting produced the RMSE recorded here.

# Predecir

### Predicción auctions

In [None]:
# Predict with exactly the columns (and column order) model_auc was trained on;
# features_to_predict is the raw CSV and may carry columns outside the selected list.
X_pred_auctions = select_features(features_to_predict, features_list_class_auc)
pred_auctions = model_auc.predict(X_pred_auctions)
df_preds_auctions = pd.DataFrame({'ref_hash' : features_to_predict.index, 'obj' : pred_auctions})
df_preds_auctions.to_csv("auctions_predictions.csv", index=False)  # persist the results

### Predicción installs

In [None]:
# Predict with exactly the columns (and column order) model_inst was trained on;
# features_to_predict is the raw CSV and may carry columns outside the selected list.
X_pred_installs = select_features(features_to_predict, features_list_class_inst)
pred_installs = model_inst.predict(X_pred_installs)
df_preds_installs = pd.DataFrame({'ref_hash' : features_to_predict.index, 'obj' : pred_installs})
df_preds_installs.to_csv("installs_predictions.csv", index=False)  # persist the results

In [None]:
# Two-stage installs prediction: the classifier can mark rows that skip the regressor
# and receive the fixed value 259200 instead.
# Both models were trained on the features_list_class_inst columns, so select them first.
X_pred_inst = select_features(features_to_predict, features_list_class_inst)
pred_clas_inst = model_class_inst.predict(X_pred_inst)

df_preds_installs = pd.DataFrame({"ref_hash":features_to_predict.index,"obj":pred_clas_inst})
# ref_hash values excluded from the regression. The filter is left disabled (empty list),
# as in the original cell; to enable it use:
#   df_preds_installs.loc[df_preds_installs["obj"] == 1]["ref_hash"].unique()
ref_not_to_predict = []

# Fixed: this line referenced the undefined name `ref_to_predict` (NameError).
df_to_predict = X_pred_inst.drop(index=ref_not_to_predict)

pred_installs = model_inst.predict(df_to_predict.reset_index(drop=True))

df_pred_value = pd.DataFrame({"ref_hash":df_to_predict.index,"value":pred_installs})

# Replace the class prediction with the regressed value; rows without a regressed
# value come out of the left merge as NaN and are filled with 259200.
df_preds_installs = df_preds_installs.merge(df_pred_value,left_on="ref_hash",right_on="ref_hash",how="left")
df_preds_installs["obj"] = df_preds_installs["value"]
del df_preds_installs["value"]
df_preds_installs = df_preds_installs.fillna(259200)

df_preds_installs.to_csv("installs_predictions.csv", index=False)  # persist the results

# Submit to Kaggle

In [None]:
def export_df(df, name):
    """Write ``df`` to the CSV file ``name`` without the index column."""
    csv_options = {"index": False}
    df.to_csv(name, **csv_options)

Las predicciones tienen seteado el `ref_hash` como índice para no perder la referencia.
No es necesario filtrar los `ref_hash` para quedarnos solo con los del target en las predicciones obtenidas, ya que de eso
se encarga la función `create_submit_df`.

In [None]:
# Read target_competencia_ids.csv; create_submit_df uses its ref_hash column to filter the submission rows.
target = pd.read_csv("target_competencia_ids.csv")

In [None]:
def create_submit_df(auctions_predictions, installs_predictions, target):
    """Combine both prediction frames into the submission frame.

    Args:
        auctions_predictions: Single-column frame indexed by ref_hash; its ids
            get the suffix ``_sc``.
        installs_predictions: Single-column frame indexed by ref_hash; its ids
            get the suffix ``_st``.
        target: Frame with a ``ref_hash`` column listing the suffixed ids to keep.

    Returns:
        Frame with the columns ``ref_hash`` and ``obj``, restricted to the ids
        present in ``target`` and sorted by ``ref_hash``.
    """
    def _with_suffix(predictions, suffix):
        # Move the index into a column and append the suffix to every id.
        tagged = predictions.reset_index()
        tagged.columns = ['ref_hash', 'obj']
        tagged['ref_hash'] = tagged['ref_hash'].astype(str) + suffix
        return tagged

    wanted_ids = target['ref_hash'].tolist()

    auc = _with_suffix(auctions_predictions, "_sc")
    ins = _with_suffix(installs_predictions, "_st")

    # Installs rows first, then auctions rows, numbered 0..n-1.
    submit_result = pd.concat([ins, auc], ignore_index=True)
    keep = submit_result['ref_hash'].isin(wanted_ids)
    return submit_result[keep].sort_values(by='ref_hash')

In [None]:
# create_submit_df expects both prediction frames indexed by ref_hash.
auctions_by_ref = df_preds_auctions.set_index('ref_hash')
installs_by_ref = df_preds_installs.set_index('ref_hash')
kaggle_sub = create_submit_df(auctions_by_ref, installs_by_ref, target)

In [None]:
# Write the final submission to submit.csv (export_df omits the DataFrame index).
export_df(kaggle_sub, "submit.csv")