In [None]:
# Feature engineering and model selection
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, precision_recall_curve, auc, roc_curve

# Classification models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Ridge
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

# Suppress all warnings
import warnings
warnings.filterwarnings('ignore')

import vectorbt as vbt
import matplotlib.pyplot as plt
import joblib

# Custom imports to prevent clutter
from helper import *
from custom_transformers import *

In [None]:
%%time
# Load the data.
# construct_dataset_for_ml is not defined or imported by name in this notebook;
# presumably it arrives through one of the star-imports (helper / custom_transformers) — verify.
# Later cells expect the result to expose `symbol_id` and `triple_barrier_label` columns.
df = construct_dataset_for_ml()

In [None]:
# Total in-memory footprint of the frame, in gigabytes (deep=True also counts object payloads)
dataset_gb = df.memory_usage(deep=True).sum() / 1e9

print('dataset shape:', df.shape)
print(f'database size: {dataset_gb} GB')

# Preview of the first rows
df.head()

In [None]:
# Identifier columns that must stay categorical; every other column is treated as numeric.
categorical_cols = ['symbol_id']

# Exact-name membership test. The former `col not in ('symbol_id')` compared against a
# plain string (parentheses without a trailing comma do not make a tuple), which is a
# substring check and silently skipped any column whose name is a fragment of
# 'symbol_id' (for example 'id' or 'symbol').
numeric_cols = [col for col in df.columns if col not in categorical_cols]

# Downcast to the smallest float dtype that can hold the values, to save memory
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, downcast = 'float')
# The label was converted to float together with the other numeric columns above;
# convert it back to the smallest integer dtype that fits.
df['triple_barrier_label'] = pd.to_numeric(df['triple_barrier_label'], downcast = 'integer')
df[categorical_cols] = df[categorical_cols].astype('category')

print(f'database size: {df.memory_usage(deep = True).sum() / 1e9} GB')

In [None]:
def clean_columns(X):
    """Strip the prefixes added by the one-hot ColumnTransformer from column names.

    Every occurrence of 'one_hot__', 'remainder__' and 'symbol_id_' is removed
    from each column name, in that order.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose column names should be cleaned. It is left untouched.

    Returns
    -------
    pandas.DataFrame
        A shallow copy of X (no data duplication) carrying the cleaned column names.
    """
    # Shallow copy: a new frame object over the same data, so renaming here does
    # not change the column names of the frame the caller passed in.
    cleaned = X.copy(deep = False)

    columns = cleaned.columns
    for fragment in ('one_hot__', 'remainder__', 'symbol_id_'):
        # regex = False: the fragments are literal text, not patterns, and the
        # default of this flag differs between pandas versions.
        columns = columns.str.replace(fragment, '', regex = False)

    cleaned.columns = columns
    return cleaned

In [None]:
# Base window lengths, expressed in rows.
# NOTE(review): the 2 * 24 factor presumably means two bars per hour (30-minute data) — confirm.
bars_per_day = 2 * 24
bars_per_week = bars_per_day * 7
bars_per_month = bars_per_day * 30

# Windows for the rolling min-max and z-score scalers
window_sizes_scaling = [bars_per_day, bars_per_week, bars_per_month]

# Windows for the returns-based features (adds a single-row window)
window_sizes_returns = [1, bars_per_day, bars_per_week, bars_per_month]

# Ordered feature-engineering steps; each entry is (step name, transformer instance).
# NOTE(review): lag features are added after the NaN fill — check whether LagFeatures
# re-introduces missing values in the leading rows.
feature_engineering_steps = [
    # Returns-based features
    ('returns_features', ReturnsFeatures(window_sizes_returns)),
    # Rolling min-max scaled features
    ('rolling_min_max_scaler', RollingMinMaxScaler(window_sizes_scaling)),
    # Rolling z-score scaled features
    ('rolling_z_score_scaler', RollingZScoreScaler(window_sizes_scaling)),
    # Price-based features (currently disabled)
    # ('price_features', PriceFeatures()),
    # Further feature engineering steps go here
    # Clean NaN/infinity values
    ('fill_nan', FillNaN()),
    # Lagged copies of the features
    ('lag_features', LagFeatures(lags=[1, 2, 3])),
    # Time-based features (currently disabled)
    # ('time_features', TimeFeatures()),
]

# Pipeline for feature engineering
feature_engineering_pipeline = Pipeline(feature_engineering_steps)

# One-hot encode symbol_id as dense columns; every other column passes through unchanged
symbol_one_hot = ColumnTransformer(
    transformers=[('one_hot', OneHotEncoder(sparse_output=False), ['symbol_id'])],
    remainder='passthrough',
)

# Encoding followed by removal of the transformer-generated column-name prefixes
data_cleaning_pipeline = Pipeline([
    ('one_hot_encoding', symbol_one_hot),
    ('clean_column_names', FunctionTransformer(clean_columns)),
])

# Have every step emit pandas DataFrames (set_output returns the pipeline itself)
data_cleaning_pipeline = data_cleaning_pipeline.set_output(transform='pandas')

In [None]:
# y_train does not exist yet at this point of the notebook: it is first created by the
# per-symbol train/test split in a later cell, so evaluating it here raises NameError
# under Restart & Run All. Inspect y_train after that split has been executed.

In [None]:
%%time

# Share of each symbol's rows (taken from the start, in row order) used for training.
# NOTE(review): this is only a time-based split if rows are in time order within each symbol — confirm.
train_pct = 0.8

# Per-symbol pieces, concatenated into the final frames/series after the loop
X_train_parts, X_test_parts, y_train_parts, y_test_parts = [], [], [], []

symbol_ids = df.symbol_id.unique()
n = len(symbol_ids)

for i, symbol_id in enumerate(symbol_ids, start = 1):
    print(f'Processing symbol_id: {symbol_id} ({i}/{n})')

    # All rows belonging to the current symbol
    token = df[df.symbol_id == symbol_id]

    labels = token['triple_barrier_label']
    features = feature_engineering_pipeline.fit_transform(token.drop(['triple_barrier_label'], axis = 1))

    # The split below is positional, so features and labels must line up row for row.
    # Fail here rather than train on shifted labels if a pipeline step changed the row count.
    if len(features) != len(labels):
        raise ValueError(
            f'symbol_id {symbol_id}: feature pipeline returned {len(features)} rows '
            f'for {len(labels)} labels; cannot split them positionally'
        )

    train_size = int(train_pct * len(features))

    # Explicit positional slicing: first train_size rows train, the rest test
    X_train_parts.append(features.iloc[:train_size])
    X_test_parts.append(features.iloc[train_size:])
    y_train_parts.append(labels.iloc[:train_size])
    y_test_parts.append(labels.iloc[train_size:])

X_train = pd.concat(X_train_parts)
X_test = pd.concat(X_test_parts)
y_train = pd.concat(y_train_parts)
y_test = pd.concat(y_test_parts)

In [None]:
# In-memory footprint of each split, in gigabytes
train_gb = X_train.memory_usage(deep=True).sum() / 1e9
test_gb = X_test.memory_usage(deep=True).sum() / 1e9

print(f'X_train size: {train_gb} GB')
print(f'X_test size: {test_gb} GB')
print()

# (rows, columns) of each split
print('X_train shape: ', X_train.shape)
print('X_test shape: ', X_test.shape)

In [None]:
# Identifier columns that must stay categorical; every other column is treated as numeric.
categorical_cols = ['symbol_id']

# Exact-name membership test. The former `col not in ('symbol_id')` compared against a
# plain string (parentheses without a trailing comma do not make a tuple), which is a
# substring check and silently skipped any column whose name is a fragment of
# 'symbol_id' (for example 'id' or 'symbol').
numeric_cols = [col for col in X_train.columns if col not in categorical_cols]

# Downcast data to save memory
X_train[numeric_cols] = X_train[numeric_cols].apply(pd.to_numeric, downcast = 'float')
X_train[categorical_cols] = X_train[categorical_cols].astype('category')

# Same column lists are applied to the test split (taken from X_train.columns above)
X_test[numeric_cols] = X_test[numeric_cols].apply(pd.to_numeric, downcast = 'float')
X_test[categorical_cols] = X_test[categorical_cols].astype('category')

print(f'X_train size: {X_train.memory_usage(deep = True).sum() / 1e9} GB')
print(f'X_test size: {X_test.memory_usage(deep = True).sum() / 1e9} GB')

In [None]:
%%time

# One-hot encode the symbol_id column.
# The encoder is fitted on the training split only; the test split is pushed through
# that same fitted pipeline with transform(). Calling fit_transform() on X_test, as
# before, re-learned the categories from the test data, so the test columns were not
# guaranteed to match the columns the model is trained on.
# NOTE(review): OneHotEncoder is created with its default unknown-category handling, so a
# symbol_id present only in X_test would now raise instead of producing mismatched columns.
X_train = data_cleaning_pipeline.fit_transform(X_train)
X_test = data_cleaning_pipeline.transform(X_test)

print(f'X_train size: {X_train.memory_usage(deep = True).sum() / 1e9} GB')
print(f'X_test size: {X_test.memory_usage(deep = True).sum() / 1e9} GB')

X_train.shape, X_test.shape

In [None]:
# Largest index value present in the training split.
# Index.max() is computed by pandas directly; the builtin max() walked the index
# one element at a time in Python, which is slow on a frame of this size.
end = X_train.index.max()

# Rows of the original dataset whose index does not exceed that value
df[df.index <= end]

In [None]:
# Relative frequency of each label value in the two splits
train_label_share = y_train.value_counts(normalize=True)
test_label_share = y_test.value_counts(normalize=True)

# The empty string yields the blank separator line between the two reports
print('Y train distribution:', train_label_share, '', sep='\n')
print('Y test distribution:', test_label_share, sep='\n')

In [None]:
# Make sure no raw symbol_id column reaches the model.
# The one-hot ColumnTransformer lists symbol_id as its input column, so it is not part of
# the passthrough remainder and is normally already gone here; a plain drop then raises
# KeyError (also when this cell is run a second time). errors = 'ignore' turns the drop
# into a no-op when the column is absent.
X_train = X_train.drop(columns = ['symbol_id'], errors = 'ignore')
X_test = X_test.drop(columns = ['symbol_id'], errors = 'ignore')

In [None]:
%%time

# Random forest settings, kept in one place so they are easy to inspect and tweak
rf_params = dict(
    bootstrap=False,       # no bootstrap resampling of the training rows
    random_state=9 + 10,   # fixed seed for reproducible fits
    n_jobs=-1,             # use all available cores
    verbose=True,          # print progress while fitting / predicting
)

rf = RandomForestClassifier(**rf_params)

# Fit on the training split; the fitted estimator is the cell's displayed value
rf.fit(X_train, y_train)

In [None]:
%%time

# Hard class predictions for the training and the test split
y_pred_train_rf, y_pred_test_rf = rf.predict(X_train), rf.predict(X_test)

# Test-split probability of the class stored in column 1 of predict_proba
# NOTE(review): assumes a binary label whose positive class sits at index 1 of rf.classes_ — confirm.
y_pred_proba_test_rf = rf.predict_proba(X_test)[:, 1]

# Classification reports; the empty string produces the blank line between them
print('RF Train:', classification_report(y_train, y_pred_train_rf), '', sep='\n')
print('RF Test:', classification_report(y_test, y_pred_test_rf), sep='\n')

# Curve points on the test split: precision/recall first, then ROC
precision_rf, recall_rf, _ = precision_recall_curve(y_test, y_pred_proba_test_rf)
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_proba_test_rf)

# Areas under the precision-recall curve and under the ROC curve
auc_rf, roc_auc_rf = auc(recall_rf, precision_rf), auc(fpr_rf, tpr_rf)

print(f'Random Forest ROC AUC: {roc_auc_rf}')
print(f'Random Forest Precision-Recall AUC: {auc_rf}')
print()

In [None]:
# Precision-recall curve (left) and ROC curve (right) on one figure
curves_fig, (pr_ax, roc_ax) = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: precision vs. recall with the area under the curve shaded
pr_ax.plot(recall_rf, precision_rf, label='RF', linestyle='dashed', color='b')
pr_ax.fill_between(recall_rf, precision_rf, alpha=0.2, color='b')
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title(f'Precision-Recall Curve (AUC = {auc_rf:.2f})')
pr_ax.legend()

# Right panel: ROC curve against the diagonal reference line
roc_ax.plot(fpr_rf, tpr_rf, label='RF', marker='.')
roc_ax.plot([0, 1], [0, 1], linestyle='--', label='No Skill')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title(f'ROC Curve (AUC = {roc_auc_rf:.2f})')
roc_ax.legend()

curves_fig.tight_layout()
plt.show()

In [None]:
# Number of features shown in the chart
top_n = 30

# Importances labelled by feature name, sorted ascending, keeping the top_n largest
feature_importances = (
    pd.Series(rf.feature_importances_, index=X_train.columns)
    .sort_values()
    .tail(top_n)
)

# Horizontal bar chart of the retained importances
importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
feature_importances.plot(kind='barh', ax=importance_ax)
importance_ax.set_title(f'Top {top_n} Most Important Features')
plt.show()

In [None]:
# Two named entries, each holding a two-element list of numbers
optimize_dict = dict(
    prediction_threshold=[0.6, 2],
    trade_size_multiplier=[0.2, 2],
)


def k(**kwargs):
    """Print the keyword arguments it receives, collected into a dict."""
    print(kwargs)


# Unpack the dict so that each key is passed as a keyword argument
k(**optimize_dict)

In [None]:
# Show the training feature frame as the cell's output (state after the cells above have run)
X_train