In [1]:
# Standard library
import os
import warnings

# Suppress all warnings
warnings.filterwarnings('ignore')

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Feature engineering and model selection
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report

# Classification models
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# Custom imports to prevent clutter
from helper import *
from custom_transformers import *

In [2]:
def get_ml_dataset():
    """Return the machine-learning dataset, loading the CSV cache when it exists.

    If the cache file is present it is read with its first column as the index
    and returned as-is. Otherwise the dataset is built with
    ``construct_dataset_for_ml()``, every column except ``symbol_id`` and
    ``time_period_end`` is downcast to a smaller float dtype, ``symbol_id`` is
    converted to a categorical column, and the memory footprint before and
    after the conversion is printed.

    Returns:
        pandas.DataFrame: the loaded or freshly constructed dataset.
    """
    dataset_path = '/Users/louisspencer/Desktop/Trading-Bot/data/ml_dataset.csv'

    # Fast path: reuse the cached copy on disk
    if os.path.exists(dataset_path):
        return pd.read_csv(dataset_path, index_col = 0)

    # NOTE(review): this function never writes the cache file itself;
    # presumably another step creates it - confirm.
    dataset = construct_dataset_for_ml()

    print('dataset shape:', dataset.shape)
    print(f'database size before downcasting: {dataset.memory_usage(deep = True).sum() / 1e9} GB')

    id_cols = ['symbol_id']
    value_cols = [name for name in dataset.columns if name not in ('symbol_id', 'time_period_end')]

    # Shrink the frame: smaller floats for values, categories for identifiers
    dataset[value_cols] = dataset[value_cols].apply(pd.to_numeric, downcast = 'float')
    dataset[id_cols] = dataset[id_cols].astype('category')

    print(f'database size after downcasting: {dataset.memory_usage(deep = True).sum() / 1e9} GB')

    return dataset

df = get_ml_dataset()

In [3]:
def clean_columns(X):
    """Strip the ColumnTransformer / one-hot name fragments from the column names.

    Every occurrence of ``'one_hot__'``, ``'remainder__'`` and ``'symbol_id_'``
    is removed from each column name, in that order. The fragments are matched
    literally (``regex = False``) so the result does not depend on the pandas
    version's default for ``str.replace``.

    Args:
        X (pandas.DataFrame): frame whose columns should be renamed.

    Returns:
        pandas.DataFrame: a shallow copy of ``X`` (data is shared, not copied)
        carrying the cleaned column names. The caller's frame keeps its
        original column names.
    """
    # Shallow copy: avoids renaming the caller's frame in place without
    # duplicating the underlying data.
    cleaned = X.copy(deep = False)

    names = cleaned.columns
    for fragment in ('one_hot__', 'remainder__', 'symbol_id_'):
        names = names.str.replace(fragment, '', regex = False)

    cleaned.columns = names
    return cleaned

In [4]:
# Window sizes used by the feature-engineering pipeline in this cell.
# NOTE(review): the factors (2 * 12, 2 * 24, 2 * 24 * 7, 2 * 24 * 30) evaluate to
# 24, 48, 336 and 1440 rows; presumably the data is in 30-minute bars, making these
# 12 hours, 1 day, 1 week and 30 days - TODO confirm against the data source.

# Specify window sizes for rolling min-max and z-score scaling
# (only passed to RollingZScoreScaler in this cell)
window_sizes_scaling = [2 * 12, 2 * 24, 2 * 24 * 7, 2 * 24 * 30]

# Specify window sizes for returns-based features
# (also passed to CorrelationFeatures in this cell)
window_sizes_returns = [1, 2 * 24, 2 * 24 * 7, 2 * 24 * 30]

# Specify the window size for regression-based features
window_sizes_regression = [2 * 24, 2 * 24 * 7, 2 * 24 * 30]

# Windows and maximum holding times passed to TripleBarrierLabelFeatures; later cells
# build column names of the form 'triple_barrier_label_w{w}_h{h}' from every pair
windows_triple_barrier_label = [2 * 12, 2 * 24, 2 * 24 * 7, 2 * 24 * 30]
max_holding_times_triple_barrier_label = [2 * 12, 2 * 24, 2 * 24 * 7, 2 * 24 * 30]

# Feature-engineering pipeline: the steps run in the order listed below
feature_engineering_pipeline = Pipeline(steps = [
    # Regression-based features
    ('regression_features', RegressionFeatures(window_sizes_regression)),
    # Returns-based features
    ('returns_features', ReturnsFeatures(window_sizes_returns)),
    # Correlation-based features (reuses the returns window sizes)
    ('correlation_features', CorrelationFeatures(window_sizes_returns)),
    # Triple-barrier label features
    (
        'triple_barrier_label_features',
        TripleBarrierLabelFeatures(
            windows_triple_barrier_label,
            max_holding_times_triple_barrier_label
        )
    ),
    # Price-based features
    ('price_features', PriceFeatures()),
    # Rolling z-score scaled features
    ('rolling_z_score_scaler', RollingZScoreScaler(window_sizes_scaling)),
    # Final step: clean NaN/infinity values from the dataset
    ('fill_nan', FillNaN()),
])

# Data-cleaning pipeline: one-hot encode symbol_id (all other columns pass through),
# then tidy the generated column names. Output is configured to be pandas DataFrames.
data_cleaning_pipeline = Pipeline(steps = [
    (
        'one_hot_encoding',
        ColumnTransformer(
            transformers = [('one_hot', OneHotEncoder(sparse_output = False), ['symbol_id'])],
            remainder = 'passthrough'
        )
    ),
    ('clean_column_names', FunctionTransformer(clean_columns)),
]).set_output(transform = 'pandas')

In [5]:
def get_ml_features():
    """Return the engineered feature matrix, using the CSV cache when it exists.

    If the cache file is missing, the rows of each symbol in ``df`` are queried
    from ``market_data.ml_dataset`` (ordered by ``time_period_end``), pushed
    through ``feature_engineering_pipeline`` one symbol at a time, concatenated
    and written to the cache file. Otherwise the cache is read with its first
    column as the index.

    In both cases the index is converted with ``pd.to_datetime`` before
    returning, so downstream date comparisons behave the same whether the
    features were just computed or loaded from disk.

    Returns:
        pandas.DataFrame: the feature matrix.
    """
    features_path = '/Users/louisspencer/Desktop/Trading-Bot/data/ml_features.csv'

    if os.path.exists(features_path):
        # X = QUERY(
        #     """
        #     SELECT *
        #     FROM market_data.ml_features
        #     ORDER BY symbol_id, time_period_end
        #     """
        # ).set_index('time_period_end')

        X = pd.read_csv(features_path, index_col = 0)
    else:
        symbol_ids = df.symbol_id.unique()
        n = len(symbol_ids)
        per_symbol_features = []

        for i, symbol_id in enumerate(symbol_ids, start = 1):
            print(f'Processing symbol_id: {symbol_id} ({i}/{n})')

            # NOTE(review): symbol_id is interpolated into the SQL text; it comes from
            # our own dataset here, but switch to a parameterized query if QUERY
            # supports one.
            token = QUERY(
                f"""
                SELECT *
                FROM market_data.ml_dataset
                WHERE symbol_id = '{symbol_id}'
                ORDER BY time_period_end
                """
            )

            # The pipeline is re-fit for every symbol, so features never mix symbols
            per_symbol_features.append(feature_engineering_pipeline.fit_transform(token))

        X = pd.concat(per_symbol_features)
        X.to_csv(features_path, index = True)

    # Same index type for both branches (no-op when it is already datetime)
    X.index = pd.to_datetime(X.index)

    return X

X = get_ml_features()

Processing symbol_id: AAVE_USDT_KUCOIN (1/67)
Processing symbol_id: AGIX_USDT_OKEX (2/67)
Processing symbol_id: ALGO_USDT_BINANCE (3/67)
Processing symbol_id: ALGO_USD_COINBASE (4/67)
Processing symbol_id: ARKM_USDT_BINANCE (5/67)
Processing symbol_id: ATOM_USDT_OKEX (6/67)
Processing symbol_id: BCH_USDT_OKEX (7/67)
Processing symbol_id: BCH_USD_COINBASE (8/67)
Processing symbol_id: BNB_USDT_BINANCE (9/67)
Processing symbol_id: BNB_USDT_OKEX (10/67)
Processing symbol_id: BTC_USD_COINBASE (11/67)
Processing symbol_id: BTC_USD_BITSTAMP (12/67)
Processing symbol_id: BTT_USDT_BINANCE (13/67)
Processing symbol_id: CRV_USDT_OKEX (14/67)
Processing symbol_id: DAI_USD_COINBASE (15/67)
Processing symbol_id: DOGE_USDT_BINANCE (16/67)
Processing symbol_id: DOT_USDT_KUCOIN (17/67)


In [None]:
# Raw label columns: one per (window, max holding time) pair
triple_barrier_label_cols = [
    f'triple_barrier_label_w{w}_h{h}' 
    for w in windows_triple_barrier_label 
    for h in max_holding_times_triple_barrier_label
]

# Raw trade-return columns: one per (window, max holding time) pair
trade_returns_cols = [
    f'trade_returns_w{w}_h{h}' 
    for w in windows_triple_barrier_label 
    for h in max_holding_times_triple_barrier_label
]

# Any other column derived from the labels / trade returns (for example a rolling
# z-score of them, if the scaler produced one) would leak the target into the
# features, so those are dropped as well. No-op when no such column exists.
label_derived_cols = [
    c for c in X.columns
    if c.startswith(('triple_barrier_label_', 'trade_returns_'))
]

non_z_score_cols = [c for c in X.columns if '_rz_' not in c]

# 'returns_1' is read from X_train *after* the drop (it is used as the sample weight
# when the Random Forests are fitted), so it must survive even though it is not a
# z-scored column. Dropping it made those later lookups fail with a KeyError.
cols_kept_for_sample_weights = {'returns_1'}

# dict.fromkeys de-duplicates while preserving order ('symbol_id' and the raw
# label columns would otherwise be listed more than once)
cols_to_drop = list(dict.fromkeys(
    c
    for c in (
        triple_barrier_label_cols
        + trade_returns_cols
        + label_derived_cols
        + non_z_score_cols
        + ['symbol_id']
    )
    if c not in cols_kept_for_sample_weights
))

In [None]:
# Chronological split: everything up to and including 2023-06-01 is training data
X_train = X[X.index <= '2023-06-01']
X_test = X[X.index > '2023-06-01']

# Subsample to keep training tractable. The sample size is capped at the number of
# available rows (sampling without replacement raises otherwise) and the draw is
# seeded with the same seed as the models so the split is reproducible.
X_train = X_train.sample(n = min(1_000_000, len(X_train)), replace = False, random_state = 9 + 10)
X_test = X_test.sample(n = min(500_000, len(X_test)), replace = False, random_state = 9 + 10)

# The target must be extracted before the label columns are dropped
y_train = X_train['triple_barrier_label_w336_h336']
y_test = X_test['triple_barrier_label_w336_h336']

X_train = X_train.drop(columns = cols_to_drop)
X_test = X_test.drop(columns = cols_to_drop)

In [None]:
# Memory footprint (in GB) and shape of both splits before downcasting
train_size_gb = X_train.memory_usage(deep = True).sum() / 1e9
test_size_gb = X_test.memory_usage(deep = True).sum() / 1e9

print(f'X_train size before downcasting: {train_size_gb} GB')
print(f'X_test size before downcasting: {test_size_gb} GB')
print()
print('X_train shape: ', X_train.shape)
print('X_test shape: ', X_test.shape)

In [None]:
# Every column except the identifier / timestamp columns is downcast
numeric_cols = [name for name in X_train.columns if name not in ('symbol_id', 'time_period_end')]

# Convert to a smaller float dtype to save memory
X_train[numeric_cols] = X_train[numeric_cols].apply(lambda column: pd.to_numeric(column, downcast = 'float'))
X_test[numeric_cols] = X_test[numeric_cols].apply(lambda column: pd.to_numeric(column, downcast = 'float'))

train_size_gb = X_train.memory_usage(deep = True).sum() / 1e9
test_size_gb = X_test.memory_usage(deep = True).sum() / 1e9

print(f'X_train size after downcasting: {train_size_gb} GB')
print(f'X_test size after downcasting: {test_size_gb} GB')

In [None]:
%%time

# One-hot encode the symbol_id column
# X_train = data_cleaning_pipeline.fit_transform(X_train)
# X_test = data_cleaning_pipeline.fit_transform(X_test)

# print(f'X_train size: {X_train.memory_usage(deep = True).sum() / 1e9} GB')
# print(f'X_test size: {X_test.memory_usage(deep = True).sum() / 1e9} GB')

# X_train.shape, X_test.shape

In [None]:
# Class balance of the target in each split (relative frequencies)
print(
    'Y train distribution:',
    y_train.value_counts(normalize = True),
    '',
    'Y test distribution:',
    y_test.value_counts(normalize = True),
    sep = '\n'
)

In [None]:
# (rows, columns) of the training and test feature matrices
(X_train.shape, X_test.shape)

In [None]:
%%time

rf = RandomForestClassifier(
    max_depth = 10,
    bootstrap = True,
    random_state = 9 + 10,
    n_jobs = -1,
    verbose = True,
    class_weight = 'balanced'
)

rf.fit(
    X_train,
    y_train, 
    sample_weight = X_train['returns_1'].abs().values
)

In [None]:
# Silence the per-call joblib progress output during prediction
rf.verbose = False

# Hard class predictions for both splits
y_pred_train_rf = rf.predict(X_train)
y_pred_test_rf = rf.predict(X_test)

# Per-class predicted probabilities for both splits (columns follow rf.classes_)
y_pred_proba_train_rf = rf.predict_proba(X_train)
y_pred_proba_test_rf = rf.predict_proba(X_test)

# Classification reports: training set first, then test set
print(
    'RF Train:',
    classification_report(y_train, y_pred_train_rf),
    '',
    'RF Test:',
    classification_report(y_test, y_pred_test_rf),
    sep = '\n'
)

In [None]:
# Baseline model: predict one of the three classes uniformly at random
print('Baseline model:')
print('Randomly predicting a class:')
print()

# Seeded generator (same seed as the models) so the baseline report is reproducible
baseline_rng = np.random.default_rng(9 + 10)
y_pred_baseline = baseline_rng.choice([-1, 0, 1], size = len(y_test))

# Classification report for the test set only
print(classification_report(y_test, y_pred_baseline))

In [None]:
# Horizontal bar plot of the top N features by Random Forest importance
top_n = 50

# Ascending sort + tail keeps the N largest, with the largest drawn at the top
feature_importances = (
    pd.Series(rf.feature_importances_, index = X_train.columns)
    .sort_values()
    .tail(top_n)
)

plt.figure(figsize = (10, 7))
feature_importances.plot.barh()
plt.title(f'Top {top_n} Most Important Features')
plt.show()

In [None]:
# Prepare data for training the meta model

# Meta label: 1 where the base model's prediction equals the true label, else 0.
# NOTE(review): the training meta labels come from predictions on the same rows the
# base model was fitted on - confirm this in-sample labelling is intended.
meta_labels_train = (y_pred_train_rf == y_train).astype(int)
meta_labels_test = (y_pred_test_rf == y_test).astype(int)

# Work on copies so the base-model feature matrices stay untouched
X_train_meta = X_train.copy()
X_test_meta = X_test.copy()

# Base-model class predictions become a meta feature
X_train_meta['rf_predictions'] = y_pred_train_rf
X_test_meta['rf_predictions'] = y_pred_test_rf

# Positions of each class inside the predict_proba output
rf_class_order = rf.classes_.tolist()
class_0_index = rf_class_order.index(0)
class_1_index = rf_class_order.index(1)
class_neg_1_index = rf_class_order.index(-1)

# Base-model class probabilities become meta features (train first, then test)
for meta_frame, class_probabilities in (
    (X_train_meta, y_pred_proba_train_rf),
    (X_test_meta, y_pred_proba_test_rf),
):
    meta_frame['rf_proba_0'] = class_probabilities[:, class_0_index]
    meta_frame['rf_proba_1'] = class_probabilities[:, class_1_index]
    meta_frame['rf_proba_-1'] = class_probabilities[:, class_neg_1_index]

In [None]:
# Meta-labeling with Random Forest

# Second-stage Random Forest that predicts whether the base model was correct
rf_meta = RandomForestClassifier(
    class_weight = 'balanced',
    bootstrap = True,
    n_jobs = -1,
    random_state = 9 + 10,
    verbose = True
)

# Train it on the augmented features against the meta labels
rf_meta.fit(X_train_meta, meta_labels_train)

In [None]:
# Meta-labeling with Neural Network

# Second-stage multi-layer perceptron (three hidden layers) trained on the meta labels
nn_meta = MLPClassifier(
    hidden_layer_sizes = (500, 250, 125),
    activation = 'relu',
    solver = 'adam',
    learning_rate = 'adaptive',
    random_state = 9 + 10,
    verbose = True
)

# Train it on the augmented features against the meta labels
nn_meta.fit(X_train_meta, meta_labels_train)

In [None]:
# Distribution of meta labels (relative frequencies) in each split
print(
    'Meta labels train distribution:',
    meta_labels_train.value_counts(normalize = True),
    '',
    'Meta labels test distribution:',
    meta_labels_test.value_counts(normalize = True),
    sep = '\n'
)

In [None]:
# Random Forest Meta Model Performance

# Silence the per-call joblib progress output during prediction
rf_meta.verbose = False

# Meta-model predictions for both splits
y_pred_train_rf_meta = rf_meta.predict(X_train_meta)
y_pred_test_rf_meta = rf_meta.predict(X_test_meta)

# Classification reports: training set first, then test set
print(
    'RF Meta Train:',
    classification_report(meta_labels_train, y_pred_train_rf_meta),
    '',
    'RF Meta Test:',
    classification_report(meta_labels_test, y_pred_test_rf_meta),
    sep = '\n'
)

In [None]:
# Neural Network Meta Model Performance

# Turn off the training-progress output flag on the fitted model
nn_meta.verbose = False

# Meta-model predictions for both splits
y_pred_train_nn_meta = nn_meta.predict(X_train_meta)
y_pred_test_nn_meta = nn_meta.predict(X_test_meta)

# Classification reports: training set first, then test set
print(
    'NN Meta Train:',
    classification_report(meta_labels_train, y_pred_train_nn_meta),
    '',
    'NN Meta Test:',
    classification_report(meta_labels_test, y_pred_test_nn_meta),
    sep = '\n'
)

In [None]:
# Horizontal bar plot of the top N features by meta-model (Random Forest) importance
top_n = 40

# Ascending sort + tail keeps the N largest, with the largest drawn at the top
feature_importances = (
    pd.Series(rf_meta.feature_importances_, index = X_train_meta.columns)
    .sort_values()
    .tail(top_n)
)

plt.figure(figsize = (10, 7))
feature_importances.plot.barh()
plt.title(f'Top {top_n} Most Important Features')
plt.show()

In [None]:
# Compare the base model on the full test set with the base model restricted to
# the test rows that the meta model approves
print('Base model:')
print(classification_report(y_test, y_pred_test_rf))
print()

# Keep only the rows where the meta model predicts that the base model is right
meta_approved = y_pred_test_rf_meta == 1
X_test_filtered = X_test[meta_approved]
y_test_filtered = y_test[meta_approved]

# Base-model predictions on the approved rows
y_pred_test_rf_filtered = rf.predict(X_test_filtered)

# Report for the approved subset
print('Base model on filtered test set:')
print(classification_report(y_test_filtered, y_pred_test_rf_filtered))

In [None]:
def get_model_performance():
    """Fit one Random Forest per symbol split and collect test metrics for class 1.

    For every symbol in ``X`` the split date is the 75th percentile of that
    symbol's timestamps. The model is trained on the rows of *all* symbols up to
    that date and evaluated on the symbol's own rows after it. In both sets only
    rows with ``price_close_rz_48 >= 2`` are kept. Training rows are weighted by
    the absolute value of ``returns_1``.

    NOTE(review): training uses every symbol's history, not only the symbol being
    evaluated - confirm that this is the intended design.

    Side effects:
        Converts the index of the global ``X`` with ``pd.to_datetime`` and prints
        progress, class distributions and a classification report per symbol.

    Returns:
        dict: ``{symbol_id: {'precision', 'recall', 'f1-score', 'precision_diff'}}``
        for the positive class (label 1), where ``precision_diff`` is the precision
        minus the share of positive labels in the test set. Symbols that cannot be
        evaluated (no rows left after filtering, or no class-1 entry in the report)
        are skipped and therefore absent from the dict.
    """
    target_col = 'triple_barrier_label_w336_h336'
    performance_dict = {}

    # Loop-invariant work: done once instead of once per symbol
    X.index = pd.to_datetime(X.index)
    symbol_ids = X.symbol_id.sort_values().unique()
    n_symbols = len(symbol_ids)

    for i, symbol_id in enumerate(symbol_ids, start = 1):
        print(f'Processing symbol_id: {symbol_id} ({i}/{n_symbols})')
        print()

        # Date below which 75% of this symbol's observations fall
        is_symbol = X['symbol_id'] == symbol_id
        date_75 = X[is_symbol].index.to_series().quantile(0.75)

        # Train on all symbols up to the split date, test on this symbol afterwards
        X_train = X[X.index <= date_75]
        X_test = X[is_symbol & (X.index > date_75)]

        # Keep only rows where price_close_rz_48 is at least 2
        X_train = X_train[X_train['price_close_rz_48'] >= 2]
        X_test = X_test[X_test['price_close_rz_48'] >= 2]

        # Nothing to fit or to score: fitting/predicting on zero rows would raise
        if X_train.empty or X_test.empty:
            print(f'Skipping {symbol_id}: no rows left after filtering')
            print()
            continue

        y_train = X_train[target_col]
        y_test = X_test[target_col]

        # Sample weights must be read BEFORE the drop: 'returns_1' is not a z-scored
        # column, so cols_to_drop may remove it from the feature matrix.
        sample_weight = X_train['returns_1'].abs().values

        X_train = X_train.drop(columns = cols_to_drop)
        X_test = X_test.drop(columns = cols_to_drop)

        # Print distribution of the target variable
        print('Y train distribution:')
        print(y_train.value_counts(normalize = True))
        print()
        print('Y test distribution:')
        print(y_test.value_counts(normalize = True))
        print()

        rf = RandomForestClassifier(
            max_features = 0.2,
            bootstrap = True,
            random_state = 9 + 10,
            n_jobs = -1
        )
        rf.fit(X_train, y_train, sample_weight = sample_weight)

        # Classification reports for the test set
        y_pred_test_rf = rf.predict(X_test)
        report = classification_report(y_test, y_pred_test_rf, output_dict = True)

        # Distribution of predictions
        print('Predictions distribution:')
        print(pd.Series(y_pred_test_rf).value_counts())
        print()

        print('RF Test:')
        print(classification_report(y_test, y_pred_test_rf))
        print()

        # The report is keyed by str(label): '1.0' for float labels, '1' for int labels.
        # The entry is missing when class 1 occurs neither in y_test nor in the predictions.
        positive_report = report.get('1.0', report.get('1'))
        if positive_report is None:
            print(f'Skipping {symbol_id}: class 1 is absent from the test labels and predictions')
            print()
            continue

        # Difference between precision for the positive class and the proportion of positive class in the test set
        # If the difference is positive, the model is better than the baseline of predicting the positive class
        positive_share = y_test.value_counts(normalize = True).get(1, 0.0)
        precision_diff = positive_report['precision'] - positive_share

        performance_dict[symbol_id] = {
            'precision': positive_report['precision'],
            'recall': positive_report['recall'],
            'f1-score': positive_report['f1-score'],
            'precision_diff': precision_diff
        }

    return performance_dict

performance_dict = get_model_performance()

In [None]:
# One row per symbol, one column per metric, best precision_diff first
performance_df = (
    pd.DataFrame(performance_dict)
    .T
    .rename_axis('symbol_id')
    .sort_values('precision_diff', ascending = False)
)

In [None]:
# Horizontal bars of precision_diff per symbol, smallest at the bottom.
# NOTE(review): precision_diff is a difference of two proportions, while the title
# calls it a percentage improvement - confirm the wording.
ax = performance_df['precision_diff'].sort_values().plot.barh(figsize = (20, 15))
ax.set_title('Percentage Improvement Over Baseline Model for Predicting Profitable Trades')

In [None]:
# Bootstrap performance_df to estimate the sampling distributions of the performance metrics
n_bootstraps = 100_000

def _bootstrap_means(values, n_resamples, rng, batch_size = 10_000):
    """Return ``n_resamples`` means of same-size resamples (with replacement) of ``values``.

    Resampling is done with vectorised index draws in batches of ``batch_size``
    replicates to bound memory use. NaN values are ignored when averaging, like
    ``pandas.Series.mean``. An empty input yields NaN for every replicate.
    """
    values = np.asarray(values, dtype = float)
    means = np.full(n_resamples, np.nan)

    if values.size == 0:
        return means

    for start in range(0, n_resamples, batch_size):
        stop = min(start + batch_size, n_resamples)
        # One row of indices per bootstrap replicate
        picks = rng.integers(0, values.size, size = (stop - start, values.size))
        means[start:stop] = np.nanmean(values[picks], axis = 1)

    return means

# Seeded generator (same seed as the models) so the intervals are reproducible
bootstrap_rng = np.random.default_rng(9 + 10)

# Each metric is resampled independently; results are kept as plain lists
bootstrap_precision_diff = _bootstrap_means(performance_df['precision_diff'], n_bootstraps, bootstrap_rng).tolist()
bootstrap_precision = _bootstrap_means(performance_df['precision'], n_bootstraps, bootstrap_rng).tolist()
bootstrap_recall = _bootstrap_means(performance_df['recall'], n_bootstraps, bootstrap_rng).tolist()
bootstrap_f1_score = _bootstrap_means(performance_df['f1-score'], n_bootstraps, bootstrap_rng).tolist()

In [None]:
# NOTE(review): looks like a leftover scratch check - it only displays the type of
# np.inf and nothing else in the notebook uses the result; consider removing the cell.
type(np.inf)

In [None]:
import seaborn as sns

def plot_bootstrapped_model_performance():
    """Draw a 2 x 2 grid with the bootstrapped distribution of each performance metric.

    Each panel shows a 100-bin histogram (with KDE) of the bootstrapped means of
    one metric, a red marker at the mean of that metric in ``performance_df``
    and dashed red lines at the 2.5th and 97.5th percentiles of the bootstrapped
    means (the 95% interval).

    Reads the module-level ``bootstrap_*`` sequences, ``performance_df`` and
    ``n_bootstraps``. Returns nothing.
    """
    # (column in performance_df, bootstrapped means, display name)
    panels = [
        ('precision_diff', bootstrap_precision_diff, 'Precision Difference'),
        ('precision', bootstrap_precision, 'Precision'),
        ('recall', bootstrap_recall, 'Recall'),
        ('f1-score', bootstrap_f1_score, 'F1-Score'),
    ]

    fig, axes = plt.subplots(2, 2, figsize = (14, 10))

    for ax, (column, samples, metric_name) in zip(axes.ravel(), panels):
        sns.histplot(samples, bins = 100, kde = True, ax = ax)
        ax.set_title(f'Distribution of {n_bootstraps:,} Bootstrapped Avg. OOS {metric_name}s')
        ax.set_xlabel(metric_name)
        ax.set_ylabel('Frequency')

        # Single marker at the observed mean; y = 50 is only a fixed display height
        ax.scatter(performance_df[column].mean(), 50, color = 'r', s = 100, label = f'Mean {metric_name}')

        # Bounds of the 95% confidence interval
        lower, upper = np.percentile(samples, [2.5, 97.5])
        ax.axvline(lower, color = 'r', linestyle = '--', label = '2.5th percentile')
        ax.axvline(upper, color = 'r', linestyle = '--', label = '97.5th percentile')
        ax.legend()

    fig.tight_layout()

plot_bootstrapped_model_performance()

In [None]:
# 95% interval and mean of the bootstrapped means, one pair of lines per metric
bootstrap_summaries = (
    ('Precision Difference', bootstrap_precision_diff),
    ('Precision', bootstrap_precision),
    ('Recall', bootstrap_recall),
    ('F1-Score', bootstrap_f1_score),
)

for position, (metric_name, samples) in enumerate(bootstrap_summaries):
    # Blank line between metrics, but not before the first one
    if position > 0:
        print()
    print(f'95% Confidence Interval for {metric_name}:')
    print(np.percentile(samples, [2.5, 97.5]))
    print()
    print(f'Average {metric_name}:')
    print(np.mean(samples))