In [426]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim

In [427]:
# Load the dataset
df = pd.read_csv('articles_with_sp500.csv')

df.head()

Unnamed: 0,title,pubDate,SP500_Open,SP500_Close,Day_Before_Close,Two_Days_Later_Close,Week_Later_Close,SP500_MA_3,SP500_MA_7
0,recession now... or stagflation forever,2024-09-13 16:45:00,5603.339844,5626.02002,5595.759766,5626.02002,5713.640137,,
1,"mortgage rates are dropping, but homes are not...",2024-09-13 16:40:00,5603.339844,5626.02002,5595.759766,5626.02002,5713.640137,,
2,hospitality stocks out-innovate challenges as ...,2024-09-13 16:34:47,5603.339844,5626.02002,5595.759766,5626.02002,5713.640137,5626.02002,
3,ratings agency fitch says extended strike at b...,2024-09-13 16:30:33,5603.339844,5626.02002,5595.759766,5626.02002,5713.640137,5626.02002,
4,"google parent company in bear territory, down ...",2024-09-13 16:28:47,5603.339844,5626.02002,5595.759766,5626.02002,5713.640137,5626.02002,


In [428]:
# Clean the data
df.dropna(inplace=True)

# Convert pubDate to datetime
df['pubDate'] = pd.to_datetime(df['pubDate'])

# Convert headlines to numerical representation using TF-IDF
vectorizer = TfidfVectorizer(min_df=2, max_df=0.9)
X_text = vectorizer.fit_transform(df['title'])

# Extract the numerical features that will be the output targets
numerical_features = df[['SP500_Open', 'SP500_Close', 'Day_Before_Close', 'Two_Days_Later_Close', 'Week_Later_Close', 
                         'SP500_MA_3', 'SP500_MA_7']].values

# Define the target variables (numerical features)
target = numerical_features

In [429]:
X_train, X_test, y_train, y_test = train_test_split(X_text.toarray(), target, test_size=0.2, random_state=42)

In [430]:
scaler = StandardScaler()
y_train_scaled = scaler.fit_transform(y_train)
y_test_scaled = scaler.transform(y_test)

In [431]:
import torch
import torch.nn as nn

class MarketPredictionModel(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(MarketPredictionModel, self).__init__()
        
        # Define the 12 layers
        self.fc1 = nn.Linear(input_dim, 8192)  # First layer
        self.fc2 = nn.Linear(8192, 4096)
        self.fc3 = nn.Linear(4096, 4096)
        self.fc4 = nn.Linear(4096, 4096)
        self.fc5 = nn.Linear(4096, 2048)
        self.fc6 = nn.Linear(2048, 1024)
        self.fc7 = nn.Linear(1024, 512)
        self.fc8 = nn.Linear(512, 256)
        self.fc9 = nn.Linear(256, 128)
        self.fc10 = nn.Linear(128, 64)
        self.fc11 = nn.Linear(64, 32)
        self.fc12 = nn.Linear(32, output_dim)  # Final output layer

    def forward(self, x):
        # Apply ReLU activation to each layer
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = torch.relu(self.fc4(x))
        x = torch.relu(self.fc5(x))
        x = torch.relu(self.fc6(x))
        x = torch.relu(self.fc7(x))
        x = torch.relu(self.fc8(x))
        x = torch.relu(self.fc9(x))
        x = torch.relu(self.fc10(x))
        x = torch.relu(self.fc11(x))
        x = self.fc12(x)  # Output layer (no activation for regression)
        return x


# Instantiate the model
input_dim = X_train.shape[1]  # Number of features (TF-IDF features)
output_dim = y_train_scaled.shape[1]  # Number of target variables (numerical features)
model = MarketPredictionModel(input_dim, output_dim)

In [432]:
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

In [433]:
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_scaled, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_scaled, dtype=torch.float32)

In [434]:
# Training the model
num_epochs = 90
for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    optimizer.zero_grad()  # Zero the gradients
    outputs = model(X_train_tensor)  # Forward pass
    loss = criterion(outputs, y_train_tensor)  # Compute the loss
    loss.backward()  # Backward pass
    optimizer.step()  # Update the weights

    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}")

Epoch [10/90], Loss: 1.0091
Epoch [20/90], Loss: 1.0060
Epoch [30/90], Loss: 1.0014
Epoch [40/90], Loss: 0.9973
Epoch [50/90], Loss: 0.9660
Epoch [60/90], Loss: 0.7769
Epoch [70/90], Loss: 0.6662
Epoch [80/90], Loss: 0.6386
Epoch [90/90], Loss: 0.6301


In [435]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Evaluate the model
model.eval()  # Set the model to evaluation mode

with torch.no_grad():
    # Get predictions from the model
    predictions_scaled = model(X_test_tensor)
    
    # Convert predictions to numpy for inverse scaling
    predictions_scaled_numpy = predictions_scaled.numpy()

    # Inverse scale the predictions to the original scale (for all 7 features)
    predictions_unscaled = scaler.inverse_transform(predictions_scaled_numpy)

    # Calculate the overall Mean Squared Error (MSE)
    mse_overall = mean_squared_error(y_test, predictions_unscaled)
    print(f"Test MSE (Overall): {mse_overall}")

    # Calculate Mean Squared Error (MSE) for each feature
    feature_names = ['SP500_Open', 'SP500_Close', 'Day_Before_Close', 
                     'Two_Days_Later_Close', 'Week_Later_Close', 
                     'SP500_MA_3', 'SP500_MA_7']

    for feature_idx, feature_name in enumerate(feature_names):
        mse_feature = mean_squared_error(y_test[:, feature_idx], predictions_unscaled[:, feature_idx])
        print(f"MSE for {feature_name}: {mse_feature}")


Test MSE (Overall): 11187.676657010223
MSE for SP500_Open: 11234.079008303548
MSE for SP500_Close: 10855.17092314485
MSE for Day_Before_Close: 11212.116222051452
MSE for Two_Days_Later_Close: 11116.872180562585
MSE for Week_Later_Close: 11302.53296149022
MSE for SP500_MA_3: 11424.655097327064
MSE for SP500_MA_7: 11168.310206191838


In [436]:

# Optionally, print sample-wise predictions and actual values for each feature
for i in range(3):
    print(f"Sample {i}")
    for feature_idx, feature_name in enumerate(feature_names):
        print(f"    {feature_name} - Predicted: {predictions_unscaled[i, feature_idx]}, Actual: {y_test[i, feature_idx]}")
    print("-" * 50)

Sample 0
    SP500_Open - Predicted: 5716.19677734375, Actual: 5641.68017578125
    SP500_Close - Predicted: 5716.40234375, Actual: 5618.259765625
    Day_Before_Close - Predicted: 5707.28125, Actual: 5634.580078125
    Two_Days_Later_Close - Predicted: 5728.59765625, Actual: 5702.5498046875
    Week_Later_Close - Predicted: 5762.2607421875, Actual: 5732.93017578125
    SP500_MA_3 - Predicted: 5713.86181640625, Actual: 5618.259765625
    SP500_MA_7 - Predicted: 5709.4013671875, Actual: 5618.259765625
--------------------------------------------------
Sample 1
    SP500_Open - Predicted: 5816.47412109375, Actual: 5912.7900390625
    SP500_Close - Predicted: 5842.4755859375, Actual: 5870.6201171875
    Day_Before_Close - Predicted: 5824.21875, Actual: 5870.6201171875
    Two_Days_Later_Close - Predicted: 5827.7783203125, Actual: 5916.97998046875
    Week_Later_Close - Predicted: 5863.02734375, Actual: 5969.33984375
    SP500_MA_3 - Predicted: 5821.95458984375, Actual: 5870.6201171875
   

In [437]:
import numpy as np
import pandas as pd

# Assuming you have the following variables:
# predictions_unscaled (your inverse scaled predictions)
# y_test (actual values)

# Step 1: Calculate absolute errors for each feature
abs_errors = np.abs(predictions_unscaled - y_test)

# Step 2: Calculate the maximum absolute error for each sample (across all features)
max_abs_errors = np.max(abs_errors, axis=1)

# Step 3: Sort samples by the maximum absolute error
sorted_indices = np.argsort(max_abs_errors)[::-1]  # Sort in descending order

# Step 4: Display the top N samples with the largest errors
top_n = 5  # Set the number of top samples you want to view
print(f"Top {top_n} samples with the largest errors:")

for i in range(top_n):
    idx = sorted_indices[i]
    print(f"Sample {i+1} (Index {idx}) - Max Error: {max_abs_errors[idx]:.2f}")
    for feature_idx, feature_name in enumerate(['SP500_Open', 'SP500_Close', 'Day_Before_Close', 
                                                 'Two_Days_Later_Close', 'Week_Later_Close', 
                                                 'SP500_MA_3', 'SP500_MA_7']):
        print(f"    {feature_name} - Predicted: {predictions_unscaled[idx, feature_idx]}, Actual: {y_test[idx, feature_idx]}")
    print("-" * 50)


Top 5 samples with the largest errors:
Sample 1 (Index 561) - Max Error: 296.29
    SP500_Open - Predicted: 5709.17529296875, Actual: 5976.759765625
    SP500_Close - Predicted: 5707.5361328125, Actual: 5995.5400390625
    Day_Before_Close - Predicted: 5699.24853515625, Actual: 5995.5400390625
    Two_Days_Later_Close - Predicted: 5721.69482421875, Actual: 6001.35009765625
    Week_Later_Close - Predicted: 5754.73779296875, Actual: 5870.6201171875
    SP500_MA_3 - Predicted: 5706.5966796875, Actual: 5995.5400390625
    SP500_MA_7 - Predicted: 5700.966796875, Actual: 5995.5400390625
--------------------------------------------------
Sample 2 (Index 592) - Max Error: 293.46
    SP500_Open - Predicted: 5711.65380859375, Actual: 5976.759765625
    SP500_Close - Predicted: 5710.666015625, Actual: 5995.5400390625
    Day_Before_Close - Predicted: 5702.083984375, Actual: 5995.5400390625
    Two_Days_Later_Close - Predicted: 5724.1318359375, Actual: 6001.35009765625
    Week_Later_Close - Pred