In [None]:
!pip -q install torch scikit-learn pandas matplotlib

import os, json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

import torch
import torch.nn as nn
import torch.optim as optim

# Make sure every output location used later in the notebook exists
# (model checkpoint, metrics JSON, saved figures).
for _out_dir in ("artifacts", "metrics", "reports/figs"):
    os.makedirs(_out_dir, exist_ok=True)

# Run on the GPU when one is visible to PyTorch, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cpu')

In [None]:
# Read the three input tables: the merged OHLCV + sentiment frame, the
# price-derived features, and the sentiment-derived features.
merged, price, sent = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/sample_data/merged_OHLCV_Sentiment.csv",
        "/content/sample_data/price_features.csv",
        "/content/sample_data/sent_features.csv",
    )
)

# Quick look at the first rows of each table.
for _frame in (merged, price, sent):
    print(_frame.head(3))


  Ticker        Date     Open     High      Low    Close    Volume  Sentiment  \
0    QQQ  2015-01-02  95.7833  96.1811  94.5548  95.0215  33920142   0.569404   
1    QQQ  2015-01-05  94.6011  94.7069  93.3638  93.6168  39552414   0.715140   
2    QQQ  2015-01-06  93.7599  93.9255  91.9571  92.3698  71694249   0.394994   

   label_up_next  
0            0.0  
1            0.0  
2            1.0  
  Ticker        Date     Open     High      Low    Close    Volume  Sentiment  \
0    QQQ  2015-02-09  94.8216  95.3607  94.6590  94.8883  24482889   0.417528   
1    QQQ  2015-02-10  95.4283  96.5017  95.2843  96.3616  25655624   0.533132   
2    QQQ  2015-02-11  96.4822  96.9470  96.2842  96.7254  21940092   0.363622   

   label_up_next  pct_change      SMA5     SMA10     SMA20     EMA12  \
0              1   -0.230419 -1.456049 -1.462920 -1.464976 -1.463099   
1              1    0.898939 -1.453897 -1.460680 -1.463897 -1.461027   
2              1    0.195355 -1.451002 -1.457759 -1.462657

In [None]:
# Columns in price_features that are not new features; they are removed so
# that only the join keys and the engineered price columns remain.
price_drop = ["Open","High","Low","Close","Volume","Sentiment","label_up_next"]

# errors="ignore" skips any listed column that is absent from the frame.
price_new = price.drop(columns=price_drop, errors="ignore")

# Expected to be: Ticker, Date, pct_change, SMA5, SMA10, SMA20, EMA12, EMA26, ...
print(price_new.columns)

# Sentiment features: keep Ticker, Date and the engineered sentiment columns;
# discard the raw Sentiment column and any duplicate label if they are present.
sent_new = sent.drop(columns=["Sentiment","label_up_next"], errors="ignore")

print(sent_new.columns)


Index(['Ticker', 'Date', 'pct_change', 'SMA5', 'SMA10', 'SMA20', 'EMA12',
       'EMA26', 'MACD', 'RSI_14', 'ATR_14'],
      dtype='object')
Index(['Ticker', 'Date', 'sent_smooth', 'sent_z_20d', 'sent_high', 'sent_low'], dtype='object')


In [None]:
# Inner-join the base table with the engineered price and sentiment features
# on (Ticker, Date); only rows present in all three tables are kept.
join_keys = ["Ticker","Date"]
df = merged.merge(price_new, on=join_keys, how="inner")
df = df.merge(sent_new, on=join_keys, how="inner")

# Order rows by date (then ticker), discard rows containing any NaN, and
# renumber the index from 0.
df = df.sort_values(["Date","Ticker"])
df = df.dropna()
df = df.reset_index(drop=True)
df.head(5)


Unnamed: 0,Ticker,Date,Open,High,Low,Close,Volume,Sentiment,label_up_next,pct_change,...,SMA20,EMA12,EMA26,MACD,RSI_14,ATR_14,sent_smooth,sent_z_20d,sent_high,sent_low
0,SPY,2015-01-02,172.834,173.274,171.016,172.069,144993270,0.569404,0.0,-42.82561,...,2.445059,2.1135,2.350964,-3.482272,-3.991972,10.413595,0.569404,0.0,0,0
1,SPY,2015-01-05,171.006,171.148,168.662,168.957,202423620,0.71514,0.0,-1.113534,...,2.278005,1.656946,2.113499,-6.878608,-3.995193,9.650765,0.642272,0.0,0,0
2,SPY,2015-01-06,169.254,169.791,166.556,167.378,249677041,0.394994,1.0,-0.590233,...,2.109149,1.268639,1.89266,-9.477717,-3.996934,8.938449,0.559846,0.0,0,0
3,SPY,2015-01-07,168.718,169.791,168.249,169.435,149593226,-0.297627,1.0,0.705123,...,1.940464,0.942667,1.689435,-11.385909,-3.956161,8.258022,0.270836,0.0,0,0
4,SPY,2015-01-08,170.862,172.652,170.842,172.469,175671912,-0.242862,0.0,1.041431,...,1.772946,0.670674,1.503117,-12.72118,-3.89229,7.644772,-0.048498,0.0,0,0


In [None]:
target_col = "label_up_next"

# Identifier columns and the label itself are excluded from the feature matrix.
ignore_cols = ["Date", "Ticker", target_col]

# Features as a float32 DataFrame; target as a float32 column vector (n, 1).
feature_frame = df.drop(columns=ignore_cols)
X = feature_frame.astype("float32")
y = df[target_col].to_numpy(dtype="float32").reshape(-1, 1)

(len(df), X.shape, y.shape)


(4721, (4721, 19), (4721, 1))

In [None]:
# Positional 70/30 split with no shuffling: the first 70% of rows become the
# training set and the remaining rows the test set.
split_idx = int(0.7 * len(df))

X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

(X_train.shape, X_test.shape)


((3304, 19), (1417, 19))

In [None]:
# Fit the standardiser on the training features only, then apply the same
# transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled  = scaler.transform(X_test)


def _as_device_tensor(array):
    """Copy an array into a float32 tensor placed on `device`."""
    return torch.tensor(array, dtype=torch.float32).to(device)


# Training tensors (Xt, Yt) and test-split tensors (Xv, Yv).
Xt, Yt = _as_device_tensor(X_train_scaled), _as_device_tensor(y_train)
Xv, Yv = _as_device_tensor(X_test_scaled), _as_device_tensor(y_test)


In [None]:
class FusionMLP(nn.Module):
    """Small feed-forward binary classifier over a flat feature vector.

    Layer stack:
        Linear -> ReLU -> Dropout -> Linear -> ReLU -> Dropout -> Linear -> Sigmoid

    Because the last layer is a Sigmoid, the forward pass returns one value in
    [0, 1] per input row (a probability), which is what ``nn.BCELoss`` expects.

    Args:
        in_dim: Number of input features per row.
        hidden: Width of the first hidden layer.
        hidden2: Width of the second hidden layer (default 32, the value that
            used to be hard-coded).
        dropout: Dropout probability applied after each hidden activation
            (default 0.2, the value that used to be hard-coded).
    """

    def __init__(self, in_dim, hidden=64, hidden2=32, dropout=0.2):
        super().__init__()
        # The Sequential layout (and therefore the state_dict keys net.0, net.3,
        # net.6) is kept as-is so existing checkpoints still load.
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, hidden2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden2, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Return the predicted probability for each row of ``x``."""
        return self.net(x)

# Seed PyTorch's RNG before the model is built so that weight initialisation
# (and the dropout masks drawn afterwards) repeat from run to run on the same
# hardware/software setup. Without this every execution gives different numbers.
SEED = 42
torch.manual_seed(SEED)

model = FusionMLP(in_dim=X_train.shape[1]).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# BCELoss takes probabilities, which FusionMLP's final Sigmoid layer produces.
criterion = nn.BCELoss()


In [None]:
# ---- Training configuration ----
num_epochs = 50
patience = 5          # epochs without validation improvement tolerated before stopping
val_fraction = 0.15   # trailing share of the training rows reserved for early stopping
best_val_loss = float("inf")
wait = 0

train_losses = []
val_losses = []

# Early stopping and checkpoint selection must not look at Xv/Yv: those
# tensors are what the evaluation cell scores, so choosing the best epoch on
# them would make the reported test metrics optimistically biased.
# Instead, hold out the trailing slice of the training tensors as a validation
# set. The rows were sorted by Date before the positional train/test split, so
# (assuming Xt is still in that order) this slice is the most recent part of
# the training period.
n_hold = max(1, int(len(Xt) * val_fraction))
n_fit = len(Xt) - n_hold
if n_fit < 1:
    raise ValueError("Need at least 2 training rows to hold out a validation slice.")

X_fit, Y_fit = Xt[:n_fit], Yt[:n_fit]
X_hold, Y_hold = Xt[n_fit:], Yt[n_fit:]

for epoch in range(num_epochs):
    # One full-batch gradient step on the fitting portion of the training data.
    model.train()
    optimizer.zero_grad()
    y_pred = model(X_fit)
    loss = criterion(y_pred, Y_fit)
    loss.backward()
    optimizer.step()
    train_loss = loss.item()

    # Validation loss on the held-out training slice (dropout disabled, no grads).
    model.eval()
    with torch.no_grad():
        val_loss = criterion(model(X_hold), Y_hold).item()

    train_losses.append(train_loss)
    val_losses.append(val_loss)

    print(f"Epoch {epoch+1}: train_loss={train_loss:.4f}, val_loss={val_loss:.4f}")

    # Early stopping: checkpoint on every improvement, stop after `patience`
    # consecutive epochs without one.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        wait = 0
        torch.save(model.state_dict(), "artifacts/fusion_model.pth")
    else:
        wait += 1
        if wait >= patience:
            print("Early stopping.")
            break


Epoch 1: train_loss=0.7083, val_loss=0.7030
Epoch 2: train_loss=0.7056, val_loss=0.6993
Epoch 3: train_loss=0.7027, val_loss=0.6964
Epoch 4: train_loss=0.7001, val_loss=0.6940
Epoch 5: train_loss=0.6976, val_loss=0.6923
Epoch 6: train_loss=0.6963, val_loss=0.6912
Epoch 7: train_loss=0.6940, val_loss=0.6907
Epoch 8: train_loss=0.6922, val_loss=0.6907
Epoch 9: train_loss=0.6917, val_loss=0.6911
Epoch 10: train_loss=0.6897, val_loss=0.6921
Epoch 11: train_loss=0.6893, val_loss=0.6936
Epoch 12: train_loss=0.6871, val_loss=0.6958
Epoch 13: train_loss=0.6882, val_loss=0.6984
Early stopping.


In [None]:
# Per-epoch training and validation loss, written to disk as a PNG.
fig, ax = plt.subplots()
for _series, _name in ((train_losses, "Train"), (val_losses, "Val")):
    ax.plot(_series, label=_name)

ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.set_title("Fusion MLP Learning Curve")
ax.legend()

fig.savefig("reports/figs/learning_curve.png", dpi=150)
plt.close(fig)


In [None]:
# Restore the checkpoint the training loop wrote at its lowest validation loss.
# map_location=device remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU runtime still loads on a CPU-only one.
best_state = torch.load("artifacts/fusion_model.pth", map_location=device)
model.load_state_dict(best_state)
model.eval()

# Predicted probabilities for the evaluation tensors (no gradients needed).
with torch.no_grad():
    probs = model(Xv).cpu().numpy().ravel()

# Threshold at 0.5 to obtain hard 0/1 predictions.
pred_labels = (probs >= 0.5).astype(int)
true_labels = Yv.cpu().numpy().ravel().astype(int)

# zero_division=0 makes a metric 0 instead of warning when its denominator
# is empty (e.g. no positive predictions).
acc  = accuracy_score(true_labels, pred_labels)
prec = precision_score(true_labels, pred_labels, zero_division=0)
rec  = recall_score(true_labels, pred_labels, zero_division=0)
f1   = f1_score(true_labels, pred_labels, zero_division=0)

# Cast to plain floats so the values are JSON-serialisable.
metrics = {
    "accuracy":  float(acc),
    "precision": float(prec),
    "recall":    float(rec),
    "f1_score":  float(f1)
}

print(metrics)

with open("metrics/fusion.json", "w") as f:
    json.dump(metrics, f, indent=2)


{'accuracy': 0.5187014820042343, 'precision': 0.5222052067381318, 'recall': 0.9216216216216216, 'f1_score': 0.6666666666666666}


In [None]:
# Pin the label set so the matrix is always 2x2. Without `labels`, a run where
# only one class appears in the true and predicted labels yields a 1x1 matrix,
# and the fixed 2x2 annotation loop below would raise an IndexError.
cm = confusion_matrix(true_labels, pred_labels, labels=[0, 1])

fig, ax = plt.subplots()
im = ax.imshow(cm)

# Rows are true classes, columns are predicted classes.
ax.set_xticks([0,1]); ax.set_yticks([0,1])
ax.set_xticklabels(["Pred 0","Pred 1"])
ax.set_yticklabels(["True 0","True 1"])

# Write the count inside each cell (x = column index j, y = row index i).
for i in range(2):
    for j in range(2):
        ax.text(j, i, cm[i, j], ha="center", va="center")

ax.set_title("Confusion Matrix - Fusion Model")
fig.savefig("reports/figs/confusion_matrix.png", dpi=150)
plt.close(fig)
