In [2]:
import wandb

In [3]:
# Authenticate with Weights & Biases up front; the training cell further down
# opens one run per hyperparameter configuration via wandb.init.
wandb.login()

### XGBoost

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import xgboost as xgb
import pandas as pd
import numpy as np

In [16]:
# Load the BTC-USD price history.
df = pd.read_csv("../datasets/BTC-USD.csv")
# Define features and target.
look_back = 20
# Every column except "Date" is used as a model input.
feature_df = df.drop(columns="Date")
# Materialise the arrays once instead of on every loop iteration.
feature_values = feature_df.values
close = df["Close"].values
if len(df) <= look_back:
    raise ValueError(
        f"Need more than {look_back} rows to build a sample, got {len(df)}"
    )
features = []
targets = []
# `i` is the last observed day. The label is the direction of the close from
# day i to day i+1, so the final usable i is len(df) - 2.
for i in range(look_back, len(df) - 1):
    # Window of the `look_back` most recent days *including* day i: its close is
    # the baseline of the label and is known at prediction time, so leaving it
    # out would feed the model inputs that are one day stale.
    features.append(feature_values[i - look_back + 1:i + 1])
    price_delta = close[i + 1] - close[i]
    # 0 = price fell, 1 = price rose or stayed flat.
    targets.append(0 if price_delta < 0 else 1)
# Convert to numpy arrays for use with XGBoost.
features = np.array(features)
targets = np.array(targets)
# Flatten each (look_back, n_columns) window into a single feature row.
features_flattened = features.reshape(features.shape[0], -1)
# Chronological train/dev/test partition. Shuffling is disabled, so the
# held-out rows are always the trailing rows of the dataset.
holdout_rows, test_rows = 14, 7
# First cut: everything but the last `holdout_rows` rows goes to training.
X_train, X_dev_test, y_train, y_dev_test = train_test_split(
    features_flattened,
    targets,
    test_size=holdout_rows,
    shuffle=False,
)
# Second cut: the held-out tail is divided into dev and test.
X_dev, X_test, y_dev, y_test = train_test_split(
    X_dev_test,
    y_dev_test,
    test_size=test_rows,
    shuffle=False,
)
# Wrap each split (features plus labels) in an XGBoost DMatrix.
d_train, d_dev, d_test = (
    xgb.DMatrix(X_part, label=y_part)
    for X_part, y_part in (
        (X_train, y_train),
        (X_dev, y_dev),
        (X_test, y_test),
    )
)
# Sanity check: row count of each split.
print(d_train.num_row(), d_dev.num_row(), d_test.num_row())

1216 7 7


### Training

In [6]:
from itertools import product
from random import shuffle
import tqdm
from random import Random

# Candidate values per hyperparameter; the search space is their Cartesian
# product.
hp = [
    [50, 100, 150, 200],  # num_boost_round
    [0.01, 0.05, 0.1, 0.15, 0.2],  # learning_rate
    [3, 4, 5, 6, 7, 8, 9, 10],  # max_depth
    [0.5, 0.6, 0.7, 0.8, 0.9, 1],  # subsample
    [0.5, 0.6, 0.7, 0.8, 0.9, 1],  # colsample_bytree
]
# Number of configurations that will actually be trained (a random subset of
# the full grid).
n_rounds = 100
# Fixed seed so a kernel restart samples the same configurations again.
search_seed = 42
hps = list(product(*hp))
# Shuffle with a dedicated, seeded generator rather than the global one, so the
# order does not depend on whatever else has consumed random numbers.
Random(search_seed).shuffle(hps)
print(len(hps))

In [7]:
import os
# Set the standard OpenMP thread-count variable to 2 for this process,
# presumably to limit how many CPU threads XGBoost training uses.
# NOTE(review): xgboost is imported in an earlier cell (In [1]); if its OpenMP
# runtime reads this variable only at load time, setting it here may be too
# late. Confirm it takes effect, or pass "nthread" in the training params.
os.environ['OMP_NUM_THREADS'] = '2'

In [8]:
# Random search over the shuffled grid: train one booster per sampled
# configuration, log its train/dev F1 to Weights & Biases, and remember the
# booster with the highest dev F1.
best_dev_f1 = float("-inf")
best_model = None
for num_boost_round, learning_rate, max_depth, subsample, colsample_bytree in tqdm.tqdm(
    hps[:n_rounds], desc="Grid search (random sampling)"
):
    # configure the run
    tree_params = {
        "learning_rate": learning_rate,
        "max_depth": max_depth,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
    }
    # The labels are 0/1 classes, so use the binary classification objective;
    # predictions are probabilities of class 1.
    training_params = {"objective": "binary:logistic", **tree_params}
    # num_boost_round is an argument of xgb.train rather than a booster
    # parameter, so it only appears in the logged config.
    run_config = {"num_boost_round": num_boost_round, **tree_params}
    run = wandb.init(project="cp-xgb-ndt", config=run_config)
    try:
        # train
        model = xgb.train(
            params=training_params,
            dtrain=d_train,
            num_boost_round=num_boost_round,
        )
        # Threshold the predicted probabilities at 0.5 and score with F1.
        y_pred_dev = model.predict(d_dev)
        y_pred_train = model.predict(d_train)
        f1_dev = f1_score(y_dev, (y_pred_dev > 0.5).astype(int))
        f1_train = f1_score(y_train, (y_pred_train > 0.5).astype(int))
        run.log({"train_f1": f1_train, "dev_f1": f1_dev})
    finally:
        # Always close the run, even if training or scoring raised, so the next
        # iteration starts from a clean state.
        run.finish()
    # Strict ">" keeps the earliest configuration on ties.
    if f1_dev > best_dev_f1:
        best_dev_f1, best_model = f1_dev, model
# Leave `model` bound to the best booster by dev F1 rather than to whichever
# configuration happened to be trained last, so later cells that use `model`
# get the selected one.
if best_model is not None:
    model = best_model

In [None]:
# Write the booster currently bound to `model` to a JSON file in the working
# directory.
model_path = "xgb_next_day_trend.json"
model.save_model(model_path)