In [1]:
import pandas as pd
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
import pickle

# Load cleaned news data
df_news = pd.read_csv("cleaned_news_data.csv")

# Fill missing values with empty strings
df_news.fillna("", inplace=True)

# Combine title, body, and source
df_news["combined_text"] = df_news["title"] + " [SEP] " + df_news["body"] + " [SEP] " + df_news["source"]

# Load DistilBERT model & tokenizer
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased").to(device)

print(f"Model loaded on {device}")

# Function to compute embeddings
def get_news_embeddings(texts, batch_size=16):
    embeddings = []

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]

        inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True, max_length=256)
        inputs = {key: val.to(device) for key, val in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        embeddings.extend(batch_embeddings)

        print(f"Processed {i + len(batch)}/{len(texts)} articles")

    return np.array(embeddings)

# Generate embeddings for combined text
news_texts = df_news["combined_text"].tolist()
news_embeddings = get_news_embeddings(news_texts, batch_size=16)

# Save embeddings as a DataFrame
df_news_embeddings = pd.DataFrame(news_embeddings, columns=[f"embed_{i}" for i in range(news_embeddings.shape[1])])
df_news_embeddings["published_at"] = df_news["published_at"]

# Save embeddings
with open("final_news_embeddings.pkl", "wb") as f:
    pickle.dump(df_news_embeddings, f)

df_news_embeddings.to_csv("news_embeddings.csv", index=False)

print("News embeddings generated")


  from .autonotebook import tqdm as notebook_tqdm


Model loaded on mps
Processed 16/5851 articles
Processed 32/5851 articles
Processed 48/5851 articles
Processed 64/5851 articles
Processed 80/5851 articles
Processed 96/5851 articles
Processed 112/5851 articles
Processed 128/5851 articles
Processed 144/5851 articles
Processed 160/5851 articles
Processed 176/5851 articles
Processed 192/5851 articles
Processed 208/5851 articles
Processed 224/5851 articles
Processed 240/5851 articles
Processed 256/5851 articles
Processed 272/5851 articles
Processed 288/5851 articles
Processed 304/5851 articles
Processed 320/5851 articles
Processed 336/5851 articles
Processed 352/5851 articles
Processed 368/5851 articles
Processed 384/5851 articles
Processed 400/5851 articles
Processed 416/5851 articles
Processed 432/5851 articles
Processed 448/5851 articles
Processed 464/5851 articles
Processed 480/5851 articles
Processed 496/5851 articles
Processed 512/5851 articles
Processed 528/5851 articles
Processed 544/5851 articles
Processed 560/5851 articles
Proces

In [2]:
# Load financial data
df_financial = pd.read_csv("enhanced_financial_data.csv", parse_dates=["Date"])

# Load enhanced news embeddings
df_news_final = pd.read_csv("news_embeddings.csv", parse_dates=["published_at"])

# Merge datasets on Date
df_merged = pd.merge(df_financial, df_news_final, left_on="Date", right_on="published_at", how="left")

# Fill missing news embeddings with zeros
embedding_columns = [col for col in df_news_final.columns if "embed_" in col]
df_merged[embedding_columns] = df_merged[embedding_columns].fillna(0)

# Drop duplicate published_at column
df_merged.drop(columns=["published_at"], inplace=True)

# Save final dataset
df_merged.to_csv("merged_final_data.csv", index=False)

print(f"Final dataset merged and saved! Shape: {df_merged.shape}")


Final dataset merged and saved! Shape: (5549, 794)


In [3]:
import pandas as pd

# Load merged dataset
df = pd.read_csv("merged_final_data.csv")

# Exclude non-numeric columns
numeric_df = df.select_dtypes(include=["number"])

# Compute correlation matrix
corr_matrix = numeric_df.corr().abs()

# Find highly correlated features (correlation > 0.95)
high_corr_features = set()
for i in range(len(corr_matrix.columns)):
    for j in range(i):
        if corr_matrix.iloc[i, j] > 0.95:  # Threshold of 0.95
            colname = corr_matrix.columns[i]
            high_corr_features.add(colname)

# Drop highly correlated features
df_reduced = df.drop(columns=high_corr_features)

# Save reduced dataset
df_reduced.to_csv("feature_selected_data.csv", index=False)

print(f"Removed {len(high_corr_features)} highly correlated features! New shape: {df_reduced.shape}")


Removed 18 highly correlated features! New shape: (5549, 776)


In [4]:
import pandas as pd
import numpy as np

# Load merged dataset
df = pd.read_csv("merged_final_data.csv", parse_dates=["Date"])

# Identify news embedding columns
news_columns = [col for col in df.columns if "embed_" in col]

# Apply a rolling window (taking the average of past 3 days' news embeddings)
df[news_columns] = df[news_columns].rolling(window=3, min_periods=1).mean()

# Save the dataset with rolling news embeddings
df.to_csv("rolling_news_data.csv", index=False)

print(f"News embeddings now use a rolling window! Shape: {df.shape}")


News embeddings now use a rolling window! Shape: (5549, 794)


In [5]:
from sklearn.preprocessing import StandardScaler

# Load reduced dataset
df = pd.read_csv("rolling_news_data.csv", parse_dates=["Date"])

# Define feature columns (excluding Date & Target)
feature_cols = [col for col in df.columns if col not in ["Date", "Close"]]

# Apply StandardScaler
scaler = StandardScaler()
df[feature_cols] = scaler.fit_transform(df[feature_cols])

# Save scaled dataset
df.to_csv("scaled_final_data.csv", index=False)

print(" Scaled dataset saved")


 Scaled dataset saved


In [6]:
import pandas as pd
import numpy as np
import torch

# Load scaled dataset
df = pd.read_csv("scaled_final_data.csv", parse_dates=["Date"])

# Define feature columns (excluding 'Close' which we are predicting)
feature_cols = [col for col in df.columns if col not in ["Date", "Close"]]

# Convert to numpy array
X_data = df[feature_cols].values
y_data = df["Close"].values

# Function to create sequences
def create_sequences(data, target, seq_length=10):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:i+seq_length])
        y.append(target[i+seq_length])
    return np.array(X), np.array(y)

# Create sequences
seq_length = 10
X_seq, y_seq = create_sequences(X_data, y_data, seq_length)

# Split into train & test sets (80% train, 20% test)
train_size = int(0.8 * len(X_seq))
X_train, X_test = X_seq[:train_size], X_seq[train_size:]
y_train, y_test = y_seq[:train_size], y_seq[train_size:]

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)

X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)

print(f"Data formatted for LSTM. Training size: {X_train.shape}, Test size: {X_test.shape}")


Data formatted for LSTM. Training size: (4431, 10, 792), Test size: (1108, 10, 792)


In [22]:
import torch.nn as nn
import torch.optim as optim

# Define LSTM Model
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim=64, num_layers=2):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        return self.fc(lstm_out[:, -1, :])  # Take last output of sequence

# Initialize model
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = LSTMModel(X_train.shape[2]).to(device)

print(f"Model initialized on {device}")

Model initialized on mps


In [23]:
import torch.optim as optim
import pickle
from torch.utils.data import DataLoader, TensorDataset

# Prepare DataLoader
batch_size = 64  # Increase batch size for stability
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Define Loss & Optimizer
criterion = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-5)  # Use AdamW for better stability

# Learning rate scheduler (ReduceLROnPlateau)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=50, verbose=True)

# Training Configuration
epochs = 1000
best_loss = float("inf")

for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        y_pred = model(X_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()

        # Apply gradient clipping to stabilize training
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        
        running_loss += loss.item()
    
    epoch_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.6f}")
    
    # Adjust learning rate based on validation loss
    scheduler.step(epoch_loss)
    
    # Save the best model
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        torch.save(model.state_dict(), "lstm_model_best.pth")
        torch.save(optimizer.state_dict(), "lstm_optimizer_best.pth")
        print(f"Model saved at epoch {epoch+1} with best loss {best_loss:.6f}")

print("LSTM Training Complete")



Epoch 1/1000, Loss: 14628.092160
Model saved at epoch 1 with best loss 14628.092160
Epoch 2/1000, Loss: 13217.156864
Model saved at epoch 2 with best loss 13217.156864
Epoch 3/1000, Loss: 12152.826353
Model saved at epoch 3 with best loss 12152.826353
Epoch 4/1000, Loss: 11182.695201
Model saved at epoch 4 with best loss 11182.695201
Epoch 5/1000, Loss: 10250.584989
Model saved at epoch 5 with best loss 10250.584989
Epoch 6/1000, Loss: 9375.984194
Model saved at epoch 6 with best loss 9375.984194
Epoch 7/1000, Loss: 8499.723877
Model saved at epoch 7 with best loss 8499.723877
Epoch 8/1000, Loss: 7698.846687
Model saved at epoch 8 with best loss 7698.846687
Epoch 9/1000, Loss: 6950.830050
Model saved at epoch 9 with best loss 6950.830050
Epoch 10/1000, Loss: 6225.411816
Model saved at epoch 10 with best loss 6225.411816
Epoch 11/1000, Loss: 5573.560575
Model saved at epoch 11 with best loss 5573.560575
Epoch 12/1000, Loss: 4922.075014
Model saved at epoch 12 with best loss 4922.075014


**Bi-LSTM**

In [17]:
import torch.nn as nn
import torch.optim as optim

# Define Optimized BiLSTM Model with Batch Norm
class FinalBiLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim=128, num_layers=3, dropout=0.2):
        super(FinalBiLSTM, self).__init__()
        self.lstm = nn.LSTM(
            input_dim, hidden_dim, num_layers=num_layers, batch_first=True, dropout=dropout, bidirectional=True
        )
        self.batch_norm = nn.BatchNorm1d(hidden_dim * 2)  # Normalization layer
        self.fc = nn.Linear(hidden_dim * 2, 1)  # Output layer

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        lstm_out = self.batch_norm(lstm_out[:, -1, :])  # Apply batch norm
        return self.fc(lstm_out)

# Initialize Model
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
model = FinalBiLSTM(X_train.shape[2]).to(device)

print(f"BiLSTM Model initialized on {device}")




BiLSTM Model initialized on mps


In [18]:
from torch.utils.data import DataLoader, TensorDataset

# Prepare DataLoader
batch_size = 128
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Define Loss & Optimizer
criterion = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-5)

# Cyclic Learning Rate Scheduler
scheduler = optim.lr_scheduler.CyclicLR(optimizer, base_lr=1e-5, max_lr=0.001, step_size_up=100, mode='triangular2')

# Train Model
epochs = 1500
best_loss = float("inf")

for epoch in range(epochs):
    model.train()
    running_loss = 0.0

    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        y_pred = model(X_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()

        # Apply gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()

        running_loss += loss.item()

    epoch_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.6f}")

    # Adjust learning rate dynamically
    scheduler.step()

    # Save the best model
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        torch.save(model.state_dict(), "final_bilstm_best.pth")
        torch.save(optimizer.state_dict(), "final_bilstm_optimizer.pth")
        print(f"Model saved at epoch {epoch+1} with best loss {best_loss:.6f}")

print("BiLSTM Training Complete")




Epoch 1/1500, Loss: 15787.088783
Model saved at epoch 1 with best loss 15787.088783
Epoch 2/1500, Loss: 15769.774498
Model saved at epoch 2 with best loss 15769.774498
Epoch 3/1500, Loss: 15745.184487
Model saved at epoch 3 with best loss 15745.184487
Epoch 4/1500, Loss: 15692.920619
Model saved at epoch 4 with best loss 15692.920619
Epoch 5/1500, Loss: 15651.046512
Model saved at epoch 5 with best loss 15651.046512
Epoch 6/1500, Loss: 15574.870898
Model saved at epoch 6 with best loss 15574.870898
Epoch 7/1500, Loss: 15524.589481
Model saved at epoch 7 with best loss 15524.589481
Epoch 8/1500, Loss: 15495.778404
Model saved at epoch 8 with best loss 15495.778404
Epoch 9/1500, Loss: 15450.155190
Model saved at epoch 9 with best loss 15450.155190
Epoch 10/1500, Loss: 15421.930887
Model saved at epoch 10 with best loss 15421.930887
Epoch 11/1500, Loss: 15378.305273
Model saved at epoch 11 with best loss 15378.305273
Epoch 12/1500, Loss: 15342.418471
Model saved at epoch 12 with best loss

In [24]:
from scipy.stats import spearmanr
from sklearn.metrics import mean_squared_error, r2_score

# Load trained LSTM model
model_lstm = LSTMModel(X_train.shape[2]).to(device)
model_lstm.load_state_dict(torch.load("lstm_model_best.pth"))
model_lstm.eval()

# Load trained BiLSTM model
model_bilstm = BiLSTMModel(X_train.shape[2]).to(device)
model_bilstm.load_state_dict(torch.load("bilstm_model_best.pth"))
model_bilstm.eval()

# Evaluate models on test set
models = {"LSTM": model_lstm, "BiLSTM": model_bilstm}

for model_name, model in models.items():
    with torch.no_grad():
        y_pred_tensor = model(X_test_tensor.to(device))
    
    y_pred = y_pred_tensor.cpu().numpy().flatten()
    y_true = y_test_tensor.cpu().numpy().flatten()
    
    # Compute Metrics
    mse = mean_squared_error(y_true, y_pred)
    spearman_corr, _ = spearmanr(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f"\n{model_name} Evaluation:")
    print(f"MSE: {mse:.4f}")
    print(f"Spearman Correlation: {spearman_corr:.4f}")
    print(f"R² Score: {r2:.4f}")



LSTM Evaluation:
MSE: 3884.3511
Spearman Correlation: -0.1540
R² Score: -3.4527

BiLSTM Evaluation:
MSE: 3785.4490
Spearman Correlation: 0.3296
R² Score: -3.3393


  model_lstm.load_state_dict(torch.load("lstm_model_best.pth"))
  model_bilstm.load_state_dict(torch.load("bilstm_model_best.pth"))
