In [7]:
from pathlib import Path
from tqdm import tqdm
from typing import *

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np


# Repository root: two directory levels above the notebook's working directory.
ROOT_DIR = Path.cwd().parent.parent

In [8]:
# Load the training dataset and renumber the rows 0..n-1.
dataset_path = ROOT_DIR / "data/datasets/train_26_05.parquet"
df = pd.read_parquet(dataset_path).reset_index(drop=True)

# One key per pump event: ticker concatenated with the pump timestamp.
# NOTE(review): relies on both columns supporting `+` (e.g. both strings) — confirm dtypes.
df["pump_hash"] = df["pumped_ticker"] + df["pump_time"]
df.shape

(86, 138)

In [None]:
from sklearn.model_selection import TimeSeriesSplit, StratifiedKFold, KFold
from sklearn.metrics import f1_score, precision_recall_curve, auc
from functools import partial

import xgboost as xgb
import optuna
import gc


# NOTE(review): `df_train`, `df_test` and `num_cols` are not defined in any cell shown
# in this notebook; they must come from a cell that is missing here — restore it before
# relying on Restart & Run All.
# Wrap the train / test feature matrices and their "is_pumped" labels for XGBoost.
dtrain, dtest = (
    xgb.DMatrix(data=frame[num_cols], label=frame["is_pumped"])
    for frame in (df_train, df_test)
)

In [None]:
def xgboost_objective(
    trial: optuna.Trial, df: pd.DataFrame, reg_cols: List[str], target: str, fold: TimeSeriesSplit
) -> float:
    """Optuna objective: mean cross-validated PR-AUC of an XGBoost binary classifier.

    For every split produced by ``fold`` a booster is trained on the training part
    (at most 500 rounds, early stopping after 20 rounds) and scored on the validation
    part by the area under its precision-recall curve. Each fold score is reported
    to the trial so that the study's pruner can stop unpromising trials early.

    Args:
        trial: Optuna trial used to sample ``learning_rate``, ``max_depth`` and
            ``scale_pos_weight`` and to report intermediate scores.
        df: Frame holding both the feature columns and the target column.
        reg_cols: Names of the feature columns fed to the model.
        target: Name of the binary label column.
        fold: Splitter whose ``split`` yields (train_idx, val_idx) positional indices.

    Returns:
        Mean of the per-fold PR-AUC values.

    Raises:
        optuna.TrialPruned: When the pruner decides to stop the trial after a fold.
    """
    xgb_params = {
        "objective": "binary:logistic",
        "eval_metric": ["logloss"],
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", 1, 300)
    }

    auc_scores = []

    for step, (train_idx, val_idx) in enumerate(fold.split(df[reg_cols], df[target])):
        # split data to train and validation sets
        df_train, df_val = df.iloc[train_idx], df.iloc[val_idx]
        dtrain = xgb.DMatrix(df_train[reg_cols], label=df_train[target])
        dval = xgb.DMatrix(df_val[reg_cols], label=df_val[target])

        # Fit with early stopping; "val" is listed last in `evals` so that it is the
        # set monitored for stopping.
        model = xgb.train(
            xgb_params, dtrain=dtrain,
            evals=[(dtrain, "train"), (dval, "val")],
            num_boost_round=500, early_stopping_rounds=20,
            verbose_eval=False
        )

        # NOTE(review): depending on the XGBoost version, predict() may use every
        # trained round rather than only the best iteration — confirm.
        y_proba = model.predict(dval)

        # Score against the column named by `target` (not a hard-coded label name).
        # Scores are passed positionally because the `probas_pred` keyword was
        # deprecated in scikit-learn 1.5 in favour of `y_score`.
        precision, recall, _ = precision_recall_curve(df_val[target], y_proba)
        auc_score = auc(recall, precision)
        auc_scores.append(auc_score)

        # Let the pruner compare this fold's score with earlier trials.
        trial.report(auc_score, step)

        if trial.should_prune():
            raise optuna.TrialPruned()

        # Release the per-fold booster and matrices before the next fold.
        del model, dtrain, dval
        gc.collect()

    return float(np.mean(auc_scores))

In [None]:
# Ordered 5-fold splitter: each validation fold comes after the rows it is trained on.
# NOTE(review): only meaningful if df_train is sorted chronologically — confirm.
fold = TimeSeriesSplit(n_splits=5)

study_xgboost = optuna.create_study(
    direction="maximize",
    # TPE is Optuna's default sampler; seeding it makes the sequence of suggested
    # hyper-parameters repeatable across kernel restarts.
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(n_startup_trials=5)
)

study_xgboost.optimize(
    partial(xgboost_objective, df=df_train, reg_cols=num_cols, target="is_pumped", fold=fold),
    n_trials=10
)

In [None]:
# Fixed booster settings followed by the best hyper-parameters found by the study.
params = {
    "objective": "binary:logistic",
    "eval_metric": ["logloss"],
    **study_xgboost.best_params,
}

In [None]:
# Fit the final booster on the full training matrix for a fixed 500 rounds.
# No evaluation set is passed here, so no early stopping takes place.
model = xgb.train(params=params, dtrain=dtrain, num_boost_round=500)

In [None]:
# Positive-class scores on the test set, then hard labels at a 0.1 cut-off.
y_proba = model.predict(dtest)
y_pred = np.greater_equal(y_proba, 0.1).astype(int)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Per-class precision / recall / F1 of the thresholded predictions on the test set.
report = classification_report(df_test["is_pumped"], y_pred)
print(report)

In [None]:
# Confusion matrix of the thresholded test predictions, drawn as a heat map.
cm = confusion_matrix(df_test["is_pumped"], y_pred)

cm_display = ConfusionMatrixDisplay(confusion_matrix=cm)
cm_display.plot()
plt.show()

In [None]:
from sklearn.metrics import precision_recall_curve, auc, PrecisionRecallDisplay

# Precision-recall curve of the model on the test set, overlaid with iso-F1 guides.
# The scores are passed positionally: the `probas_pred` keyword was deprecated in
# scikit-learn 1.5 in favour of `y_score`, so the positional form works on both
# older and newer releases.
precision, recall, thresholds = precision_recall_curve(df_test["is_pumped"], y_proba)

fig, ax = plt.subplots(figsize=(20, 10))

PrecisionRecallDisplay(
    precision=precision, recall=recall
).plot(ax=ax)

# Iso-F1 guides: for a fixed F1 value f, precision = f * recall / (2 * recall - f).
f_scores = np.linspace(0.1, 0.8, num=10)

for f_score in f_scores:
    x = np.linspace(0.01, 1)
    y = f_score * x / (2 * x - f_score)
    # Only the branch with non-negative precision is drawn.
    ax.plot(x[y >= 0], y[y >= 0], color="blue", alpha=0.2)
    # y[45] is the guide's height at index 45 of the 50-point recall grid (~0.92).
    ax.annotate("f1={0:0.1f}".format(f_score), xy=(0.9, y[45] + 0.02))

ax.set_title(f"AUC score: {round(auc(recall, precision), 5)}")
plt.show()

In [None]:
# TOP-K metric: share of pump events (groups of rows sharing a pump_hash) for which
# at least one of the K highest-scored rows has is_pumped set.

# Attach the scores to a copy so that df_test itself is not modified by this cell.
scored_test = df_test.assign(proba=y_proba)

# Rank every pump's rows once (best score first); the loop over K only slices them.
ranked_labels = [
    df_pump.sort_values(by="proba", ascending=False)["is_pumped"]
    for _, df_pump in scored_test.groupby("pump_hash")
]

top_k_vals = [
    sum(bool(labels.iloc[:K].any()) for labels in ranked_labels) / len(ranked_labels)
    for K in [1, 5, 10, 20, 40, 60]
]

top_k_vals

In [None]:
# Feature scores of the booster (importance_type="gain"), keyed by feature name.
gain_by_feature = model.get_score(importance_type="gain")

# Keep the 30 highest-scoring features, largest first.
df_res = (
    pd.DataFrame(list(gain_by_feature.items()), columns=["features", "value"])
    .sort_values(by="value", ascending=False)
    .head(30)
)

fig, ax = plt.subplots(figsize=(16, 10))
sns.barplot(data=df_res, x="value", y="features", ax=ax)

plt.show()

In [None]:
from catboost import Pool, CatBoostClassifier


def catboost_objective(
    trial: optuna.Trial, df: pd.DataFrame, reg_cols: List[str], target: str, fold: TimeSeriesSplit
) -> float:
    """Optuna objective: mean cross-validated PR-AUC of a CatBoost classifier.

    For every split produced by ``fold`` a classifier is fitted on the training part
    (at most 500 iterations, early stopping after 20 rounds on the validation part)
    and scored on the validation part by the area under its precision-recall curve.
    Each fold score is reported to the trial so the study's pruner can stop it early.

    Args:
        trial: Optuna trial used to sample ``learning_rate``, ``max_depth`` and
            ``scale_pos_weight`` (used as the positive-class weight) and to report
            intermediate scores.
        df: Frame holding both the feature columns and the target column.
        reg_cols: Names of the feature columns; must include ``"num_prev_pumps"``,
            which is declared as a categorical feature.
        target: Name of the binary label column.
        fold: Splitter whose ``split`` yields (train_idx, val_idx) positional indices.

    Returns:
        Mean of the per-fold PR-AUC values.

    Raises:
        optuna.TrialPruned: When the pruner decides to stop the trial after a fold.
    """
    params = {
        "objective": "Logloss",
        "eval_metric": "Logloss",
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        "class_weights": [1, trial.suggest_float("scale_pos_weight", 1, 300)],
    }

    auc_scores = []

    for step, (train_idx, val_idx) in enumerate(fold.split(df[reg_cols], df[target])):
        # split data to train and validation sets
        df_train, df_val = df.iloc[train_idx], df.iloc[val_idx]
        train = Pool(data=df_train[reg_cols], label=df_train[target], cat_features=["num_prev_pumps"])
        val = Pool(data=df_val[reg_cols], label=df_val[target], cat_features=["num_prev_pumps"])

        # Fit the model with early stopping on the validation pool.
        # NOTE(review): training is pinned to GPU device "0"; this fails on
        # machines without a usable GPU.
        model = CatBoostClassifier(
            **params,
            task_type="GPU",
            devices="0",
            iterations=500,
            early_stopping_rounds=20,
            verbose=False
        )

        model.fit(
            train, eval_set=val
        )

        # Probability of the positive class (second column).
        y_proba = model.predict_proba(val)[:, 1]

        # Score against the column named by `target` (not a hard-coded label name).
        # Scores are passed positionally because the `probas_pred` keyword was
        # deprecated in scikit-learn 1.5 in favour of `y_score`.
        precision, recall, _ = precision_recall_curve(df_val[target], y_proba)
        auc_score = auc(recall, precision)
        auc_scores.append(auc_score)

        # Let the pruner compare this fold's score with earlier trials.
        trial.report(auc_score, step)

        if trial.should_prune():
            raise optuna.TrialPruned()

        # Release the per-fold model and pools before the next fold.
        del model, train, val
        gc.collect()

    return float(np.mean(auc_scores))

In [None]:
study_catboost = optuna.create_study(
    direction="maximize",
    # TPE is Optuna's default sampler; seeding it makes the hyper-parameter search
    # repeatable for identical objective values.
    # NOTE(review): GPU training itself may still introduce run-to-run noise — confirm.
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(n_startup_trials=5)
)

study_catboost.optimize(
    partial(catboost_objective, df=df_train, reg_cols=num_cols, target="is_pumped", fold=fold),
    n_trials=20
)

In [None]:
# Final train / test pools. "num_prev_pumps" is declared categorical here exactly as
# in catboost_objective, so the final model sees the same feature treatment that the
# hyper-parameters were tuned with.
train = Pool(data=df_train[num_cols], label=df_train["is_pumped"], cat_features=["num_prev_pumps"])
test = Pool(data=df_test[num_cols], label=df_test["is_pumped"], cat_features=["num_prev_pumps"])

In [None]:
# Start from the tuned values; the sampled "scale_pos_weight" is not passed through
# under that name but becomes the weight of the positive class in `class_weights`.
tuned = dict(study_catboost.best_params)
positive_weight = tuned.pop("scale_pos_weight")

cb_params = {
    "objective": "Logloss",
    "task_type": "GPU",
    **tuned,
    "class_weights": [1, positive_weight],
}

# Final classifier: fixed 300 iterations, fitted on the full training pool.
model = CatBoostClassifier(**cb_params, iterations=300)
model.fit(train)

In [None]:
# Class probabilities on the test pool (column 1 = positive class), then boolean
# predictions at a 0.3 cut-off.
y_proba = model.predict_proba(test)
y_pred = np.greater_equal(y_proba[:, 1], 0.3)

In [None]:
# Per-class precision / recall / F1 of the thresholded predictions on the test set.
report = classification_report(df_test["is_pumped"], y_pred)
print(report)

In [None]:
# Confusion matrix of the thresholded test predictions, drawn as a heat map.
cm = confusion_matrix(df_test["is_pumped"], y_pred)

cm_display = ConfusionMatrixDisplay(confusion_matrix=cm)
cm_display.plot()
plt.show()

In [None]:
# Precision-recall curve of the model on the test set, overlaid with iso-F1 guides.
# The positive-class scores are passed positionally: the `probas_pred` keyword was
# deprecated in scikit-learn 1.5 in favour of `y_score`, so the positional form works
# on both older and newer releases.
precision, recall, thresholds = precision_recall_curve(df_test["is_pumped"], y_proba[:, 1])

fig, ax = plt.subplots(figsize=(20, 10))

PrecisionRecallDisplay(
    precision=precision, recall=recall
).plot(ax=ax)

# Iso-F1 guides: for a fixed F1 value f, precision = f * recall / (2 * recall - f).
f_scores = np.linspace(0.1, 0.8, num=10)

for f_score in f_scores:
    x = np.linspace(0.01, 1)
    y = f_score * x / (2 * x - f_score)
    # Only the branch with non-negative precision is drawn.
    ax.plot(x[y >= 0], y[y >= 0], color="blue", alpha=0.2)
    # y[45] is the guide's height at index 45 of the 50-point recall grid (~0.92).
    ax.annotate("f1={0:0.1f}".format(f_score), xy=(0.9, y[45] + 0.02))

ax.set_title(f"AUC score: {round(auc(recall, precision), 5)}")
plt.show()

In [None]:
# TOP-K metric: share of pump events (groups of rows sharing a pump_hash) for which
# at least one of the K highest-scored rows has is_pumped set.

# Attach the positive-class scores to a copy so that df_test itself is not modified.
scored_test = df_test.assign(proba=y_proba[:, 1])

# Rank every pump's rows once (best score first); the loop over K only slices them.
ranked_labels = [
    df_pump.sort_values(by="proba", ascending=False)["is_pumped"]
    for _, df_pump in scored_test.groupby("pump_hash")
]

top_k_vals = [
    sum(bool(labels.iloc[:K].any()) for labels in ranked_labels) / len(ranked_labels)
    for K in [1, 5, 10, 20, 40, 60]
]

top_k_vals

In [None]:
# Pair every feature name with the fitted model's importance, largest first.
df_fi = pd.DataFrame(
    {"features": num_cols, "feature_importance": model.feature_importances_}
).sort_values(by="feature_importance", ascending=False)

# Left panel: the 50 highest-ranked features; right panel: the 50 lowest-ranked.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))

for panel, rows in ((ax1, df_fi.iloc[:50]), (ax2, df_fi.iloc[-50:])):
    sns.barplot(data=rows, x="feature_importance", y="features", ax=panel)

plt.tight_layout()
plt.show()