<a href="https://colab.research.google.com/github/Billy-Drunkenstein/MAFN/blob/main/Spring%202025/Machine%20Learning%20for%20Finance/Individual%20Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CSI 300 Binary Return Classification

In [17]:
import gym
from gym import Env
from gym.spaces import Discrete, Box
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import random
from tqdm import tqdm
from scipy.stats import norm
import plotly.graph_objects as go
from itertools import product
from sklearn.svm import SVC

## This project explores the predictability of Fundamental, Price, and Macro-Economic data for the returns of CSI 300 Index. The target variable is simplifiked as the future return of OPEN

In [3]:
data = pd.read_csv('000300.SH.csv', index_col = 0, parse_dates = True)
data.head()

Unnamed: 0,OPEN,HIGH,LOW,CLOSE,VWAP,VOLUME,AMT,TURN,TOTAL_SHARES,FREE_FLOAT_SHARES,MKT_CAP_ARD,MKT_FREESHARES,PE_TTM,VAL_PB_WGT,DIVIDENDYIELD2
2005-04-08,984.665,1003.699,979.529,1003.445,6.199041,1476253000.0,9151350000.0,1.3953,439004000000.0,104709000000.0,2580696000000.0,627479300000.0,15.605,2.0622,1.8341
2005-04-11,1003.879,1008.735,992.773,995.421,6.548813,1593607000.0,10436230000.0,1.5062,439004000000.0,104709000000.0,2558621000000.0,622298800000.0,15.4991,2.0377,1.8482
2005-04-12,993.711,993.711,978.204,978.697,6.336242,1022619000.0,6479563000.0,0.9665,439252200000.0,104709000000.0,2518147000000.0,611657100000.0,15.1283,2.0008,1.9026
2005-04-13,987.95,1006.499,987.95,1000.9,6.240541,1607169000.0,10029600000.0,1.519,439252200000.0,104709000000.0,2572078000000.0,625562200000.0,15.5734,2.0453,1.8531
2005-04-14,1004.64,1006.416,985.578,986.975,6.035532,1294571000.0,7813425000.0,1.2224,439594800000.0,104811400000.0,2548100000000.0,616985100000.0,15.4431,2.0256,1.8396


## Feature Expansion: Compressing time series into standardized filtration vector
###

In [4]:
WINDOWS = {"3y": 756, "1y": 252, "6m": 126, "3m": 63, "1m": 21, "2w": 10}

new_columns = {}
for col in data.columns:
    for window_name, window in WINDOWS.items():
        new_columns[f"{col}_{window_name}_MEAN"] = data[col].rolling(window=window).mean()
        new_columns[f"{col}_{window_name}_STD"] = data[col].rolling(window=window).std()
        new_columns[f"{col}_{window_name}_SKEW"] = data[col].rolling(window=window).skew()

    for lag in range(1, 3):
        new_columns[f"{col}_LAG_{lag}"] = data[col].shift(lag)

data = pd.concat([data, pd.DataFrame(new_columns)], axis = 1)
data.dropna(inplace = True)

In [5]:
data.head()

Unnamed: 0,OPEN,HIGH,LOW,CLOSE,VWAP,VOLUME,AMT,TURN,TOTAL_SHARES,FREE_FLOAT_SHARES,...,DIVIDENDYIELD2_3m_STD,DIVIDENDYIELD2_3m_SKEW,DIVIDENDYIELD2_1m_MEAN,DIVIDENDYIELD2_1m_STD,DIVIDENDYIELD2_1m_SKEW,DIVIDENDYIELD2_2w_MEAN,DIVIDENDYIELD2_2w_STD,DIVIDENDYIELD2_2w_SKEW,DIVIDENDYIELD2_LAG_1,DIVIDENDYIELD2_LAG_2
2008-05-20,3911.096,3946.441,3698.554,3710.818,16.263919,3970263000.0,64572030000.0,1.2424,1605192000000.0,282996700000.0,...,0.067731,-0.317431,0.619567,0.033052,0.080456,0.62938,0.029158,0.258369,0.6445,0.6466
2008-05-21,3668.181,3801.067,3591.566,3783.049,15.868166,4359616000.0,69179110000.0,1.3618,1606971000000.0,283567700000.0,...,0.067008,-0.307293,0.620014,0.033683,0.114955,0.63629,0.029658,-0.17477,0.6822,0.6445
2008-05-22,3732.641,3783.849,3704.157,3711.444,16.268236,4074272000.0,66281210000.0,1.2715,1611218000000.0,283889400000.0,...,0.066529,-0.298757,0.620771,0.034783,0.176265,0.64474,0.02733,-0.383638,0.6693,0.6822
2008-05-23,3697.89,3741.06,3629.009,3675.147,15.708758,3344043000.0,52530760000.0,1.0413,1612052000000.0,284925500000.0,...,0.06629,-0.273744,0.622424,0.03713,0.283645,0.653,0.026453,-0.574637,0.6757,0.6693
2008-05-26,3643.012,3643.012,3555.762,3559.217,16.154744,2818019000.0,45524370000.0,0.8768,1612599000000.0,285315700000.0,...,0.066673,-0.247759,0.625871,0.040813,0.312168,0.663,0.022846,0.293368,0.6876,0.6757


# Target Variable

In [6]:
Pred_Window = 20

shifted_close = data['OPEN'].shift(Pred_Window)
max_return = data['OPEN'].rolling(window = Pred_Window, min_periods = Pred_Window).max() / shifted_close - 1
data['MAX_RETURN'] = max_return.shift(-Pred_Window - 1)

data.dropna(inplace = True)

In [7]:
data['TARGET'] = data['MAX_RETURN'] > 0.03    # Arbitrary Return Threshold

data[['OPEN', 'MAX_RETURN', 'TARGET']].head(10)

Unnamed: 0,OPEN,MAX_RETURN,TARGET
2008-05-20,3911.096,0.017573,False
2008-05-21,3668.181,-0.00931,False
2008-05-22,3732.641,-0.008571,False
2008-05-23,3697.89,0.006364,False
2008-05-26,3643.012,0.032735,True
2008-05-27,3549.986,0.023139,False
2008-05-28,3583.284,-0.011353,False
2008-05-29,3666.196,0.010868,False
2008-05-30,3585.605,0.00676,False
2008-06-02,3600.234,-0.006354,False


In [8]:
mu, std = norm.fit(data['MAX_RETURN'])

hist = go.Histogram(
    x=data['MAX_RETURN'],
    histnorm='probability density',
    nbinsx=100,
    name='Histogram',
    opacity=0.6
)

x = np.linspace(data['MAX_RETURN'].min(), data['MAX_RETURN'].max(), 500)
pdf = norm.pdf(x, mu, std)
curve = go.Scatter(
    x=x,
    y=pdf,
    mode='lines',
    name=f'Normal Fit'
)

fig = go.Figure(data=[hist, curve])
fig.update_layout(
    title='MAX RETURN Distribution',
    bargap=0.05
)
fig.show()

# Classification

## The dataset is split into train-validation-test portions. The models will be trained and hyperparameter-tuned on train-validation set, and a final best-performing parameter set will be evaluated on the test set.

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = data.drop(columns=['MAX_RETURN', 'TARGET'])
y = data['TARGET']

X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, shuffle=False)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression

In [10]:
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

In [11]:
best_f1 = 0
best_model = None
best_C = None

for C in tqdm(np.logspace(-3, 2, 10)):
    clf = LogisticRegression(C=C, class_weight='balanced', max_iter=5000, solver='liblinear')
    clf.fit(X_train_scaled, y_train)
    y_val_pred = clf.predict(X_val_scaled)
    report = classification_report(y_val, y_val_pred, output_dict=True, zero_division=0)
    f1 = report['True']['f1-score']  # focus on minority class

    if f1 > best_f1:
        best_f1 = f1
        best_model = clf
        best_C = C

print(f"Best C: {best_C:.4f}, Validation F1 (True): {best_f1:.4f}")

100%|██████████| 10/10 [00:06<00:00,  1.45it/s]

Best C: 0.0129, Validation F1 (True): 0.1547





In [12]:
y_test_pred = best_model.predict(X_test_scaled)
print("\nFinal Test Evaluation:")
print(classification_report(y_test, y_test_pred))


Final Test Evaluation:
              precision    recall  f1-score   support

       False       0.67      0.98      0.80       531
        True       0.44      0.03      0.06       262

    accuracy                           0.67       793
   macro avg       0.56      0.51      0.43       793
weighted avg       0.60      0.67      0.55       793



# Random Forest

In [13]:
best_f1 = -1
best_rf = None
best_params = {}

# Define parameter grid
n_estimators_list = [50, 100, 200]
max_depth_list = [3, 5, 10, None]

for n in tqdm(n_estimators_list):
    for d in max_depth_list:
        rf = RandomForestClassifier(
            n_estimators=n,
            max_depth=d,
            class_weight='balanced',
            random_state=69,
            n_jobs=-1
        )
        rf.fit(X_train_scaled, y_train)
        y_val_pred = rf.predict(X_val_scaled)
        report = classification_report(y_val, y_val_pred, output_dict=True, zero_division=0)
        f1 = report['True']['f1-score']

        if f1 > best_f1:
            best_f1 = f1
            best_rf = rf
            best_params = {'n_estimators': n, 'max_depth': d}

print(f"Best Params: {best_params}, Validation F1 (True): {best_f1:.4f}")

100%|██████████| 3/3 [00:05<00:00,  1.98s/it]

Best Params: {'n_estimators': 200, 'max_depth': None}, Validation F1 (True): 0.4746





In [14]:
y_test_pred = best_rf.predict(X_test_scaled)
print("\nFinal Test Evaluation:")
print(classification_report(y_test, y_test_pred, zero_division=0))


Final Test Evaluation:
              precision    recall  f1-score   support

       False       0.67      0.65      0.66       531
        True       0.33      0.36      0.35       262

    accuracy                           0.55       793
   macro avg       0.50      0.50      0.50       793
weighted avg       0.56      0.55      0.56       793



# XGBoost

In [22]:
best_f1 = -1
best_xgb = None
best_params = {}

n_estimators_list = [100, 200, 300]
max_depth_list = [3, 5, 7, 10]
learning_rate_list = [0.01, 0.05, 0.1, 0.2]

for n, d, lr in tqdm(list(product(n_estimators_list, max_depth_list, learning_rate_list))):
    xgb = XGBClassifier(
        tree_method='gpu_hist',
        predictor='gpu_predictor',
        n_estimators=n,
        max_depth=d,
        learning_rate=lr,
        scale_pos_weight=(y_train == False).sum() / (y_train == True).sum(),
        use_label_encoder=False,
        eval_metric='logloss',
        verbosity=0,
        random_state=42
    )
    xgb.fit(X_train_scaled, y_train)
    y_val_pred = xgb.predict(X_val_scaled)
    report = classification_report(y_val, y_val_pred, output_dict=True, zero_division=0)
    f1 = report['True']['f1-score']

    if f1 > best_f1:
        best_f1 = f1
        best_xgb = xgb
        best_params = {'n_estimators': n, 'max_depth': d, 'learning_rate': lr}

print(f"Best Params: {best_params}, Validation F1 (True): {best_f1:.4f}")

100%|██████████| 48/48 [00:23<00:00,  2.03it/s]

Best Params: {'n_estimators': 300, 'max_depth': 10, 'learning_rate': 0.01}, Validation F1 (True): 0.7195





In [16]:
y_test_pred = best_xgb.predict(X_test_scaled)
print("\nFinal Test Evaluation:")
print(classification_report(y_test, y_test_pred, zero_division=0))


Final Test Evaluation:
              precision    recall  f1-score   support

       False       0.83      0.31      0.45       531
        True       0.38      0.87      0.53       262

    accuracy                           0.49       793
   macro avg       0.61      0.59      0.49       793
weighted avg       0.68      0.49      0.48       793



# SVM

In [18]:
best_f1 = -1
best_svm = None
best_params = {}

C_list = np.logspace(-2, 2, 5)
kernel_list = ['linear', 'rbf']

for C in tqdm(C_list):
    for kernel in kernel_list:
        clf = SVC(C=C, kernel=kernel, class_weight='balanced', probability=False)
        clf.fit(X_train_scaled, y_train)
        y_val_pred = clf.predict(X_val_scaled)
        report = classification_report(y_val, y_val_pred, output_dict=True, zero_division=0)
        f1 = report['True']['f1-score']

        if f1 > best_f1:
            best_f1 = f1
            best_svm = clf
            best_params = {'C': C, 'kernel': kernel}

print(f"Best Params: {best_params}, Validation F1 (True): {best_f1:.4f}")

100%|██████████| 5/5 [01:03<00:00, 12.66s/it]

Best Params: {'C': np.float64(100.0), 'kernel': 'rbf'}, Validation F1 (True): 0.6495





In [19]:
y_test_pred = best_svm.predict(X_test_scaled)
print("\nFinal Test Evaluation (SVM - CPU):")
print(classification_report(y_test, y_test_pred, zero_division=0))


Final Test Evaluation (SVM - CPU):
              precision    recall  f1-score   support

       False       0.00      0.00      0.00       531
        True       0.33      1.00      0.50       262

    accuracy                           0.33       793
   macro avg       0.17      0.50      0.25       793
weighted avg       0.11      0.33      0.16       793



# Backtest with XGB

In [42]:
feature_importances = best_xgb.feature_importances_
features = X_train.columns

importance_df = pd.DataFrame({
    'Feature': features,
    'Importance': feature_importances
})

top_20 = importance_df.sort_values(by='Importance', ascending=False).head(20)

fig = go.Figure(go.Bar(
    x=top_20['Importance'][::-1],
    y=top_20['Feature'][::-1],
    orientation='h',
    marker=dict(color='rgba(58, 71, 80, 0.6)', line=dict(width=1.5, color='rgba(58, 71, 80, 1.0)'))
))

fig.update_layout(
    title='Top 20 Feature Importances (XGBoost)',
    xaxis_title='Importance Score',
    yaxis_title='Feature',
    template='plotly_white',
    height=600
)

fig.show()

## Backtest 1: Sigle Day Signal

In [25]:
y_pred = best_xgb.predict(X_test_scaled)

results_df = pd.DataFrame({
    'PREDICTION': y_pred,
    'OPEN': data.loc[X_test.index, 'OPEN']
}, index=X_test.index)

In [28]:
transaction = 0.001

balance = 1
position = 0
value_series = []

for i in tqdm(range(len(results_df)- 1)):
    date = results_df.index[i]
    next_date = results_df.index[i + 1]

    signal = results_df.loc[date, 'PREDICTION']
    price = results_df.loc[next_date, 'OPEN']

    if signal and position == 0:
        effective_price = price * (1 + transaction)
        position = balance / effective_price
        balance = 0

    elif not signal and position > 0:
        effective_price = price * (1 - transaction)
        balance = position * effective_price
        position = 0

    current_value = balance if position == 0 else position * price
    value_series.append(current_value)

value_series.append(value_series[-1])
results_df['Value'] = value_series

100%|██████████| 792/792 [00:00<00:00, 19035.52it/s]


In [29]:
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=results_df.index,
    y=results_df['Value'],
    mode='lines',
    name='Portfolio Value',
    line=dict(width=2)
))

fig.update_layout(
    title='Portfolio Value Over Time',
    xaxis_title='Date',
    yaxis_title='Portfolio Value ($)',
    hovermode='x unified',
    template='plotly_white',
    height=500
)

fig.show()

## Backtest 2: Consecutive Days Signal

In [34]:
transaction = 0.001
balance = 1
position = 0
value_series_confirm2 = []

for i in tqdm(range(1, len(results_df) - 1)):
    date = results_df.index[i]
    prev_date = results_df.index[i - 1]
    next_date = results_df.index[i + 1]

    prev_signal = results_df.loc[prev_date, 'PREDICTION']
    curr_signal = results_df.loc[date, 'PREDICTION']
    price = results_df.loc[next_date, 'OPEN']

    if prev_signal and curr_signal and position == 0:
        effective_price = price * (1 + transaction)
        position = balance / effective_price
        balance = 0

    elif not prev_signal and not curr_signal and position > 0:
        effective_price = price * (1 - transaction)
        balance = position * effective_price
        position = 0

    current_value = balance if position == 0 else position * price
    value_series_confirm2.append(current_value)

value_series_confirm2 = [value_series_confirm2[0]] + value_series_confirm2 + [value_series_confirm2[-1]]
results_df['Value_Confirm2'] = value_series_confirm2

100%|██████████| 791/791 [00:00<00:00, 19949.46it/s]


In [35]:
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=results_df.index,
    y=results_df['Value_Confirm2'],
    mode='lines',
    name='Confirm2 Portfolio Value',
    line=dict(width=2)
))

fig.update_layout(
    title='Portfolio Value Over Time (Confirm2 Strategy)',
    xaxis_title='Date',
    yaxis_title='Portfolio Value ($)',
    hovermode='x unified',
    template='plotly_white',
    height=500
)

fig.show()

## Backtest 3: Composite Signal

In [36]:
balance = 1
position = 0
transaction = 0.001
value_series_dip_open = []

for i in range(len(results_df) - 1):
    date = results_df.index[i]
    next_date = results_df.index[i + 1]

    signal = results_df.loc[date, 'PREDICTION']
    open_today = results_df.loc[date, 'OPEN']
    open_next = results_df.loc[next_date, 'OPEN']

    if signal and open_next < open_today and position == 0:
        effective_price = open_next * (1 + transaction)
        position = balance / effective_price
        balance = 0

    elif not signal and position > 0:
        effective_price = open_next * (1 - transaction)
        balance = position * effective_price
        position = 0

    current_value = balance if position == 0 else position * open_next
    value_series_dip_open.append(current_value)

value_series_dip_open.append(value_series_dip_open[-1])
results_df['Value_BuyDipOpen'] = value_series_dip_open

In [37]:
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=results_df.index,
    y=results_df['Value_BuyDipOpen'],
    mode='lines',
    name='Buy-on-Dip Strategy',
    line=dict(width=2)
))

fig.update_layout(
    title='Portfolio Value Over Time (Buy on Dip Strategy)',
    xaxis_title='Date',
    yaxis_title='Portfolio Value ($)',
    hovermode='x unified',
    template='plotly_white',
    height=500
)

fig.show()

## This final backtest method incorporates some mean reversion to the signal.
## Here are the resulting strategy performance

In [40]:
returns_bdo = results_df['Value_BuyDipOpen'].pct_change().dropna()

final_value = results_df['Value_BuyDipOpen'].iloc[-1]
total_return = (final_value - 1) * 100
mean_daily_return = returns_bdo.mean()
std_daily_return = returns_bdo.std()
sharpe_ratio = (mean_daily_return / std_daily_return) * np.sqrt(252)
max_value = results_df['Value_BuyDipOpen'].max()
min_value = results_df['Value_BuyDipOpen'].min()
max_drawdown = (results_df['Value_BuyDipOpen'] / results_df['Value_BuyDipOpen'].cummax() - 1).min() * 100

print(f"- Final Portfolio Value: ${final_value:,.2f}")
print(f"- Total Return: {total_return:.2f}%")
print(f"- Mean Daily Return: {mean_daily_return:.4f}")
print(f"- Std Dev Daily Return: {std_daily_return:.4f}")
print(f"- Sharpe Ratio: {sharpe_ratio:.2f}")
print(f"- Max Value: ${max_value:,.2f}")
print(f"- Min Value: ${min_value:,.2f}")
print(f"- Max Drawdown: {max_drawdown:.2f}%")

- Final Portfolio Value: $1.05
- Total Return: 4.88%
- Mean Daily Return: 0.0001
- Std Dev Daily Return: 0.0119
- Sharpe Ratio: 0.17
- Max Value: $1.19
- Min Value: $0.84
- Max Drawdown: -21.14%
