In [444]:
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [445]:
# Read the headline / S&P 500 table from disk into a DataFrame.
df = pd.read_csv('articles_with_sp500.csv')

# Preview the first 200 rows (the notebook front end abbreviates the display).
df.head(n=200)

Unnamed: 0,title,pubDate,SP500_Open,SP500_Close,Day_Before_Close,Two_Days_Later_Close,Week_Later_Close,SP500_MA_3,SP500_MA_7
0,recession now... or stagflation forever,2024-09-13 16:45:00,5603.339844,5626.020020,5595.759766,5626.020020,5713.640137,,
1,"mortgage rates are dropping, but homes are not...",2024-09-13 16:40:00,5603.339844,5626.020020,5595.759766,5626.020020,5713.640137,,
2,hospitality stocks out-innovate challenges as ...,2024-09-13 16:34:47,5603.339844,5626.020020,5595.759766,5626.020020,5713.640137,5626.020020,
3,ratings agency fitch says extended strike at b...,2024-09-13 16:30:33,5603.339844,5626.020020,5595.759766,5626.020020,5713.640137,5626.020020,
4,"google parent company in bear territory, down ...",2024-09-13 16:28:47,5603.339844,5626.020020,5595.759766,5626.020020,5713.640137,5626.020020,
...,...,...,...,...,...,...,...,...,...
195,"ing gives update on climate action approach, a...",2024-09-19 06:00:00,5702.629883,5713.640137,5618.259766,5702.549805,5722.259766,5681.846680,5645.511300
196,the central bank bonanza continues with the bo...,2024-09-19 05:46:38,5702.629883,5713.640137,5618.259766,5702.549805,5722.259766,5713.640137,5659.137068
197,final result of onni bidco's voluntary recomme...,2024-09-19 05:35:00,5702.629883,5713.640137,5618.259766,5702.549805,5722.259766,5713.640137,5672.762835
198,sampo plc’s share buybacks 18 september 2024,2024-09-19 05:30:00,5702.629883,5713.640137,5618.259766,5702.549805,5722.259766,5713.640137,5686.388602


In [446]:
# Clean the data: discard every row that has a missing value in any column.
# NOTE(review): this also removes rows whose only gap is in a column the model never
# reads (the preview shows NaN in the SP500_MA_* columns) — confirm that is intended.
df.dropna(inplace=True)

# Parse pubDate into datetimes and keep a handle on the column for later reporting.
df['pubDate'] = pd.to_datetime(df['pubDate'])
pub_dates = df['pubDate']

# Encode each headline as a TF-IDF vector; terms present in more than 95% of the
# titles are ignored (max_df=0.95).
# NOTE(review): the vectorizer is fitted on all rows here, i.e. before the train/test
# split performed in a later cell, so the vocabulary and IDF weights also see test rows.
vectorizer = TfidfVectorizer(min_df=1, max_df=0.95)
X_text = vectorizer.fit_transform(df['title'])

# Shrink SP500_Open by dividing it by 8000.
X_open_scaled = df['SP500_Open'] / 8000

# Feature matrix: column 0 is the scaled open price, the remaining columns are the
# dense TF-IDF values.
open_price_column = X_open_scaled.values.reshape(-1, 1)
tfidf_dense = X_text.toarray()
X_combined = np.hstack((open_price_column, tfidf_dense))

# Regression targets: same-day close and the close two days later.
numerical_features = df[['SP500_Close', 'Two_Days_Later_Close']].values
target = numerical_features

In [447]:
# Hold out 20% of the rows for testing. The shuffle seed is pinned so that the
# partition (and every metric computed from it) is the same on each run.
# NOTE(review): the data preview shows rows published on the same day sharing identical
# SP500_* values, so a row-wise random split puts the same target values into both
# partitions; a split by date would give a more honest test score — TODO consider.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test, pub_dates_train, pub_dates_test = train_test_split(
    X_combined, target, pub_dates, test_size=0.2, random_state=SPLIT_SEED)

In [448]:
# Standardise the targets using statistics estimated on the training targets only,
# then apply that same transformation to both partitions.
scaler = StandardScaler().fit(y_train)
y_train_scaled = scaler.transform(y_train)
y_test_scaled = scaler.transform(y_test)

In [449]:
class MarketPredictionModel(nn.Module):
    """Fully connected regression network with ReLU between linear layers.

    The layers are registered as ``fc1`` ... ``fcN`` (N = ``len(hidden_dims) + 1``),
    in that order, so with the default ``hidden_dims`` the module exposes the
    attributes ``fc1`` through ``fc12``.

    Args:
        input_dim: Number of input features per sample.
        output_dim: Number of regression outputs per sample.
        hidden_dims: Widths of the hidden layers, first to last. Defaults to
            ``DEFAULT_HIDDEN_DIMS`` (eleven hidden layers). An empty sequence
            yields a single linear layer from ``input_dim`` to ``output_dim``.
    """

    # Hidden widths used when the caller does not supply ``hidden_dims``.
    DEFAULT_HIDDEN_DIMS = (8192, 4096, 4096, 4096, 4096, 2048, 2048, 1024, 512, 256, 128)

    def __init__(self, input_dim, output_dim, hidden_dims=DEFAULT_HIDDEN_DIMS):
        super(MarketPredictionModel, self).__init__()

        widths = [input_dim, *hidden_dims, output_dim]
        self._num_layers = len(widths) - 1

        # Register the layers one by one under the names fc1, fc2, ...; assigning an
        # nn.Module attribute registers it as a sub-module.
        for layer_no, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:]), start=1):
            setattr(self, f"fc{layer_no}", nn.Linear(fan_in, fan_out))

    def forward(self, x):
        """Run ``x`` through every layer; ReLU follows each layer except the last.

        Args:
            x: Input tensor whose last dimension equals ``input_dim``.

        Returns:
            Tensor whose last dimension equals ``output_dim`` (no output activation,
            as this is a regression head).
        """
        for layer_no in range(1, self._num_layers):
            x = torch.relu(getattr(self, f"fc{layer_no}")(x))
        return getattr(self, f"fc{self._num_layers}")(x)


# Instantiate the model
# Seed PyTorch's RNG first so the randomly initialised weights are the same on every run.
MODEL_SEED = 42
torch.manual_seed(MODEL_SEED)

input_dim = X_train.shape[1]  # Number of feature columns (scaled open price + TF-IDF terms)
output_dim = y_train_scaled.shape[1]  # Number of target variables (numerical features)
model = MarketPredictionModel(input_dim, output_dim)

In [450]:
# Show (number of training rows, number of feature columns).
print(np.shape(X_train))

(3503, 7860)


In [451]:
# Mean squared error on the standardised targets, minimised with Adam.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [452]:
def _as_float32_tensor(array):
    """Copy ``array`` into a new ``torch.float32`` tensor."""
    return torch.tensor(array, dtype=torch.float32)


# Features and standardised targets for both partitions, as float32 tensors.
X_train_tensor = _as_float32_tensor(X_train)
y_train_tensor = _as_float32_tensor(y_train_scaled)
X_test_tensor = _as_float32_tensor(X_test)
y_test_tensor = _as_float32_tensor(y_test_scaled)

In [453]:
# Training the model: one optimiser step per epoch on the entire training set.
num_epochs = 90
model.train()  # training mode stays on for the whole loop
for epoch in range(num_epochs):
    optimizer.zero_grad()  # clear gradients left over from the previous step
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

    # Report the loss on every 10th epoch only.
    if (epoch + 1) % 10:
        continue
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}")

Epoch [10/90], Loss: 0.9993
Epoch [20/90], Loss: 1.0004
Epoch [30/90], Loss: 0.7978
Epoch [40/90], Loss: 0.5554
Epoch [50/90], Loss: 0.1752
Epoch [60/90], Loss: 0.1049
Epoch [70/90], Loss: 0.0878
Epoch [80/90], Loss: 0.0809
Epoch [90/90], Loss: 0.0790


In [454]:
# Evaluate the model
model.eval()  # switch to evaluation mode

# Only the forward pass needs gradient tracking disabled.
with torch.no_grad():
    predictions_scaled = model(X_test_tensor)

# Back to NumPy, then undo the target standardisation to recover index points.
predictions_scaled_numpy = predictions_scaled.numpy()
predictions_unscaled = scaler.inverse_transform(predictions_scaled_numpy)

# MSE over both targets together.
mse_overall = mean_squared_error(y_test, predictions_unscaled)
print(f"Test MSE (Overall): {mse_overall}")

# MSE for each target column separately.
feature_names = ['SP500_Close', 'Two_Days_Later_Close']
for feature_name, actual_column, predicted_column in zip(feature_names, y_test.T, predictions_unscaled.T):
    mse_feature = mean_squared_error(actual_column, predicted_column)
    print(f"MSE for {feature_name}: {mse_feature}")


Test MSE (Overall): 10977.688344238555
MSE for SP500_Close: 11161.927814785748
MSE for Two_Days_Later_Close: 10793.448873691363


In [455]:
# Spot check: predicted vs. actual values for the first three test samples.
for i in range(3):
    report_lines = [f"Sample {i}"]
    for feature_idx, feature_name in enumerate(feature_names):
        report_lines.append(f"    {feature_name} - Predicted: {predictions_unscaled[i, feature_idx]}, Actual: {y_test[i, feature_idx]}")
    report_lines.append("-" * 50)
    print("\n".join(report_lines))

Sample 0
    SP500_Close - Predicted: 5780.58203125, Actual: 5813.669921875
    Two_Days_Later_Close - Predicted: 5784.189453125, Actual: 5728.7998046875
--------------------------------------------------
Sample 1
    SP500_Close - Predicted: 5735.3212890625, Actual: 5732.93017578125
    Two_Days_Later_Close - Predicted: 5739.921875, Actual: 5745.3701171875
--------------------------------------------------
Sample 2
    SP500_Close - Predicted: 5817.56640625, Actual: 5712.68994140625
    Two_Days_Later_Close - Predicted: 5821.28369140625, Actual: 5929.0400390625
--------------------------------------------------


In [456]:
# Step 1: Absolute error of every prediction, per target column
abs_errors = np.abs(predictions_unscaled - y_test)

# Step 2: Worst (largest) absolute error of each sample across its targets
max_abs_errors = np.max(abs_errors, axis=1)

# Step 3: Sample positions ordered from largest to smallest worst-case error
sorted_indices = np.argsort(max_abs_errors)[::-1]

# Step 4: Display the samples with the largest errors.
# The count is clamped to the number of test samples so a small test set cannot
# trigger an IndexError.
top_n = min(5, len(sorted_indices))
print(f"Top {top_n} samples with the largest errors:")

for rank, idx in enumerate(sorted_indices[:top_n], start=1):
    print(f"Sample {rank} (Index {idx}) - Max Error: {max_abs_errors[idx]:.2f}")
    # Reuse the feature_names list defined in the evaluation cell instead of repeating it.
    for feature_idx, feature_name in enumerate(feature_names):
        print(f"    {feature_name} - Predicted: {predictions_unscaled[idx, feature_idx]}, Actual: {y_test[idx, feature_idx]}")
    print("-" * 50)


Top 5 samples with the largest errors:
Sample 1 (Index 321) - Max Error: 290.57
    SP500_Close - Predicted: 5901.9091796875, Actual: 5633.08984375
    Two_Days_Later_Close - Predicted: 5908.83349609375, Actual: 5618.259765625
--------------------------------------------------
Sample 2 (Index 67) - Max Error: 288.56
    SP500_Close - Predicted: 5899.88134765625, Actual: 5633.08984375
    Two_Days_Later_Close - Predicted: 5906.82177734375, Actual: 5618.259765625
--------------------------------------------------
Sample 3 (Index 391) - Max Error: 285.53
    SP500_Close - Predicted: 5699.84814453125, Actual: 5985.3798828125
    Two_Days_Later_Close - Predicted: 5705.36181640625, Actual: 5870.6201171875
--------------------------------------------------
Sample 4 (Index 740) - Max Error: 278.41
    SP500_Close - Predicted: 5706.9677734375, Actual: 5985.3798828125
    Two_Days_Later_Close - Predicted: 5712.2978515625, Actual: 5870.6201171875
--------------------------------------------------

In [457]:
# Rows generation section: one record per test sample with its pubDate and, for each
# target, the predicted value, the actual value and the absolute error.

# List of feature names
feature_names = ["SP500_Close", "Two_Days_Later_Close"]

# Row i of the predictions is paired with entry i of pub_dates_test below, so the
# two must have the same length.
assert len(pub_dates_test) == len(predictions_unscaled), (
    "pub_dates_test and predictions_unscaled have different lengths"
)

rows = []
for i in range(len(predictions_unscaled)):
    row = {"Sample": i, "pubDate": pub_dates_test.iloc[i]}
    for feature_idx, feature_name in enumerate(feature_names):
        predicted_value = predictions_unscaled[i, feature_idx]
        actual_value = y_test[i, feature_idx]
        row[f"{feature_name}_Predicted"] = predicted_value
        row[f"{feature_name}_Actual"] = actual_value
        row[f"{feature_name}_Error"] = abs(predicted_value - actual_value)
    rows.append(row)

# Convert rows to a DataFrame
df_pred_vs_actual = pd.DataFrame(rows)

# Export to CSV, creating the output directory first so the write cannot fail on a
# fresh checkout where it does not exist yet.
csv_filename = "results/pred_vs_actual.csv"
output_dir = os.path.dirname(csv_filename)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df_pred_vs_actual.to_csv(csv_filename, index=False)

print(f"Predicted vs Actual data has been exported to '{csv_filename}'")

Predicted vs Actual data has been exported to 'results/pred_vs_actual.csv'


In [458]:
# Final evaluation: reload the exported table and summarise the absolute errors.
df_pred_vs_actual = pd.read_csv("results/pred_vs_actual.csv")

# Every column whose name contains "Error" is treated as an error column.
error_columns = [name for name in df_pred_vs_actual.columns if "Error" in name]

# Mean and median of each error column, keyed by column name.
error_stats = {
    name: {
        "Mean": df_pred_vs_actual[name].mean(),
        "Median": df_pred_vs_actual[name].median(),
    }
    for name in error_columns
}

# Print out the error statistics
for error_column, stats in error_stats.items():
    print(f"{error_column} - Mean Error: {stats['Mean']:.2f}, Median Error: {stats['Median']:.2f}")


SP500_Close_Error - Mean Error: 84.16, Median Error: 69.12
Two_Days_Later_Close_Error - Mean Error: 83.30, Median Error: 68.30
