## Project setup

### Step 1
Load the data. You need to create a folder in your google drive called dataset and upload the csv file. Rename it to finance.csv
### Step 2
Train the neural network,
### Step 3
Use this map in the testing phase to determine how a news article will affect a given stock


### **Step 1.1 Loading the dataset**

In [1]:
import sys
print(sys.version)


3.11.14 | packaged by Anaconda, Inc. | (main, Oct 21 2025, 18:30:03) [MSC v.1929 64 bit (AMD64)]


In [None]:
# for colab users uncomment the following lines to mount your google drive
#from google.colab import drive
# drive.mount('/content/drive')

In [2]:
#! pip install lightning > /dev/null
#! pip install tokenizers > /dev/null
#! pip install pandas > /dev/null
# ! pip install lightning
# ! pip install tokenizers
import pandas as pd
#glbl_ds_pth = "/content/drive/MyDrive/dataset" # for colab users
file_path = "finance.csv" #TODO: 
print(f"Loading data from: {file_path}")
df = pd.read_csv(file_path, encoding="ISO-8859-1")
print(f"üìä Original data shape: {df.shape}")



Loading data from: finance.csv
üìä Original data shape: (602248, 28)


In [None]:
#! python --version

### **Step 1.2 make headlines lowercase + special character removal**

In [None]:
# SCRAPPED for FINbert
#
# import string
# # This section makes the words easier to work with
# cleaner = str.maketrans('', '', string.punctuation)

# df['news_article'] = df['news_article'].apply(lambda x: x.translate(cleaner).lower())
# print(f"üìä Original data shape: {df.shape}")

### Step 1.3 Add daily price change

In [None]:
def calculate_score(row):
    close_price = row['close_price']
    open_price = row['open_price']
    percent_diff = (close_price - open_price) / open_price
    if percent_diff > 0.02:
        return 5
    elif percent_diff > 0.005:
        return 4
    elif percent_diff > -0.005:
        return 3
    elif percent_diff > -0.02:
        return 2
    else:
        return 1
   


    #return round(percent_diff * 100, 3)

for index, row in df.iterrows():
    score = calculate_score(row)
    df.at[index, 'price_change'] = score

# Count how many rows belong to each class
class_counts = df["price_change"].value_counts().sort_index()

print("Price Change Class Counts:")
print(class_counts)

#print(df["price_change"])

### Step 1.4: Extract sentiment and embeddings from FINbert

In [3]:
%pip install tqdm transformers torch
import torch
torch.cuda.is_available()

Note: you may need to restart the kernel to use updated packages.


False

In [None]:


import torch.nn.functional as F
from sklearn.decomposition import PCA
import numpy as np
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd

# FinBERT sentiment model
model_name = "ProsusAI/finbert"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, output_hidden_states=False)

#put on GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Clean Data Set
class HeadLineDataset(Dataset):
  def __init__(self, texts):
    self.texts = texts

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    return self.texts[idx]

def collate_fn(batch):
    enc = tokenizer(
        batch,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )
    return batch, enc

print("üöÄ Starting FinBERT processing...")

batch_size = 32
dataset = HeadLineDataset(df["news_article"].tolist())
loader = DataLoader(
    dataset,
    batch_size=batch_size,
    collate_fn=collate_fn
)

all_sentiments = []
all_embeddings = []

print(f" Processing {len(dataset)} headlines in batches of {batch_size}...")

# Batch Processing Loop
for texts, inputs in tqdm(loader, desc="Processing FinBERT"):
  inputs = {k: v.to(device) for k, v in inputs.items()}

  with torch.no_grad():
    # Get sentiment probabilities
    outputs = model(**inputs)
    probs = F.softmax(outputs.logits, dim=-1).cpu().numpy()
    all_sentiments.extend(probs)

    # Get embeddings from base model
    base_outputs = model.bert(**inputs)
    cls_vectors = base_outputs.last_hidden_state[:, 0, :].cpu().numpy()
    all_embeddings.extend(cls_vectors)

    #   CRITICAL: Store original indices for each processed row
    # all_original_indices.extend(indices)

print("üìä Compressing embeddings with PCA...")
all_embeddings_array = np.array(all_embeddings)
pca = PCA(n_components=128)
reduced_embeddings = pca.fit_transform(all_embeddings_array)

print(f" Compression complete! Retained {np.sum(pca.explained_variance_ratio_):.2%} variance")

finbert_df = pd.DataFrame()

# finbert_df['original_index'] = all_original_indices

# Add sentiment probabilities (3 columns)
finbert_df[['sent_neg', 'sent_neu', 'sent_pos']] = all_sentiments

# Add all embeddings (128 columns)
for i in range(128):
  finbert_df[f'finbert_emb_{i}'] = reduced_embeddings[:, i]

# VERIFICATION: Check alignment
print("\n VERIFICATION REPORT:")
print(f"   Original dataset rows: {len(df)}")
print(f"   Processed features rows: {len(finbert_df)}")
print(f"   Rows filtered out: {len(df) - len(finbert_df)}")

print("üéâ PROCESSING COMPLETE!")
print(f"    Final shape: {finbert_df.shape}")
print(f"    Columns: {len(finbert_df.columns)} (3 sentiments + 128 embeddings)")

In [None]:
print("üîç SAMPLING KEY COLUMNS:")
print("=" * 50)

# finbert_df = pd.read_parquet(f"{glbl_ds_pth}/finbert_features.parquet") # USED TO SKIP ABOVE STEP, REMOVE WHEN FINISHED
# finbert_df = (finbert_df
#     .drop('original_index', axis=1)
#     .rename(columns=lambda x: x.replace('emb_', 'finbert_emb_') if x.startswith('emb_') else x)
# )
# # Reorder columns: embeddings first, then sentiments
# embedding_cols = [col for col in finbert_df.columns if 'finbert_emb_' in col]
# sentiment_cols = ['sent_neg', 'sent_neu', 'sent_pos']
# other_cols = [col for col in finbert_df.columns if col not in embedding_cols + sentiment_cols]

# finbert_df = finbert_df[other_cols + sentiment_cols + embedding_cols]

# Show original_index + sentiments + first 5 embeddings
sample_cols = ['sent_neg', 'sent_neu', 'sent_pos']
sample_cols.extend([f'finbert_emb_{i}' for i in range(5)])  # First 5 embeddings

print(finbert_df[sample_cols].head(-10).round(3))  # Round to 3 decimal places

In [None]:
print(f"üìä Original data shape: {df.shape}")

# Your FinBERT features (already loaded as features_df)
print(f"üìà FinBERT features shape: {finbert_df.shape}")

### 1.5: getting numerics of categories

In [None]:
from sklearn.preprocessing import LabelEncoder
import json
import os

def debug_label_encoded_csvs():
    """
    Debug version with extensive error handling
    """
    print("üêõ DEBUG MODE: Starting label encoding...")


    # Load your data
    try:
        print(f" First few rows:")
        print(df[['stock_name', 'sector', 'industry']].head(2))
    except Exception as e:
        print(f" Failed to load CSV: {e}")
        return

    # Categorical columns to encode
    categorical_columns = ['stock_name', 'sector', 'industry']

    # Check if columns exist
    print(f"\nüîç CHECKING COLUMNS:")
    for col in categorical_columns:
        exists = col in df.columns
        print(f"   {col}: {' EXISTS' if exists else ' MISSING'}")
        if exists:
            print(f"      dtype: {df[col].dtype}")
            print(f"      non-null: {df[col].notna().sum()}/{len(df)}")
            print(f"      sample: {df[col].iloc[:3].tolist()}")

    # Store encoders and results
    encoders = {}
    results = {}

    for col in categorical_columns:
        if col in df.columns:
            print(f"\nüéØ Processing {col}...")

            try:
                # Check for null values
                null_count = df[col].isnull().sum()
                if null_count > 0:
                    print(f"    Warning: {null_count} null values in {col}")
                    # Fill nulls with a placeholder
                    df[col] = df[col].fillna('UNKNOWN')

                # Create label encoder
                encoder = LabelEncoder()
                encoded_values = encoder.fit_transform(df[col])
                print(f"    Label encoding successful")

                # Create DataFrame for this category
                encoded_df = pd.DataFrame({
                    'original_index': df.index,
                    f'{col}_encoded': encoded_values
                })
                print(f"    DataFrame created: {encoded_df.shape}")

                # Store encoder info
                encoders[col] = {
                    'encoder': encoder,
                    'classes': encoder.classes_.tolist(),
                    'num_classes': len(encoder.classes_)
                }

                results[col] = {
                    'shape': encoded_df.shape,
                    'num_classes': len(encoder.classes_)
                }

                print(f"    Unique values: {len(encoder.classes_)}")
                print(f"    Sample mapping: {encoder.classes_[:3]} ‚Üí [0, 1, 2]")

            except Exception as e:
                print(f"    Error processing {col}: {e}")
                import traceback
                traceback.print_exc()
                continue

    # Save encoder mappings to JSON
    if encoders:
        try:
            encoder_info = {}
            for col, info in encoders.items():
                encoder_info[col] = {
                    'num_classes': info['num_classes'],
                    'classes_sample': info['classes'][:10]
                }

            encoder_file = f"label_encoder_mappings.json"
            with open(encoder_file, 'w') as f:
                json.dump(encoder_info, f, indent=2)

            print(f"\n Encoder mappings saved to: {encoder_file}")
            print(f" JSON file exists: {os.path.exists(encoder_file)}")

        except Exception as e:
            print(f" Failed to save JSON: {e}")
    else:
        print(" No encoders created - skipping JSON save")

    # Create merged version
    try:
        print(f"\nüîÑ Creating merged dataset...")
        merged_df = df.copy()
        print(f" Merged base shape: {merged_df.shape}")

        for col in categorical_columns:
            if col in df.columns and col in encoders:
                merged_df[f'{col}_encoded'] = encoders[col]['encoder'].transform(df[col])

        # Drop original categorical columns
        #merged_df = pd.concat([merged_df_, finbert_df], axis=1) uncomment this 

        merged_file = "finance.csv"
        merged_df.to_csv(merged_file, index=False)

        print(f" Merged dataset saved to: {merged_file}")
        print(f" Merged shape: {merged_df.shape}")
        print(f" Merged file exists: {os.path.exists(merged_file)}")

    except Exception as e:
        print(f" Failed to create merged dataset: {e}")
        merged_df = None

    # List all files in output directory
    print(f"\n FILES IN OUTPUT DIRECTORY:")
    try:
        files = os.listdir(".")
        if files:
            for file in files:
                file_path = os.path.join(".", file)
                file_size = os.path.getsize(file_path)
                print(f"    {file} ({file_size} bytes)")
        else:
            print("   ‚ùå No files found!")
    except Exception as e:
        print(f"   ‚ùå Could not list directory: {e}")

    return results, encoders, merged_df

### 1.6: getting numerics of categories

In [None]:
# Run the debug version
print("STARTING DEBUG LABEL ENCODING...")
results, encoders, merged_df = debug_label_encoded_csvs()


## This section allows you to use a precreated merged_df
Please use this to avoid heavy finBERT computation, after_fin_finance.csv contains its output and is the merged_df from above

In [7]:
import pandas as pd
merged_df = pd.read_csv("after_fin_finance.csv", encoding="ISO-8859-1")

def calculate_score(row):
    close_price = row['close_price']
    open_price = row['open_price']
    percent_diff = (close_price - open_price) / open_price
    if percent_diff > 0.02:
        return 5
    elif percent_diff > 0.005:
        return 4
    elif percent_diff > -0.005:
        return 3
    elif percent_diff > -0.02:
        return 2
    else:
        return 1
   

for index, row in merged_df.iterrows():
    score = calculate_score(row)
    merged_df.at[index, 'price_change'] = score

# Count how many rows belong to each class
class_counts = merged_df["price_change"].value_counts().sort_index()

print("Price Change Class Counts:") # verify even distribution
print(class_counts)



Price Change Class Counts:
price_change
1.0    113917
2.0    116029
3.0    135805
4.0    123292
5.0    113205
Name: count, dtype: int64


In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

scale_columns = [
    'volume',                    # Millions/billions
    'open_price', 'high_price', 'low_price', 'close_price',  # Dollar values
    'float_shares',              # Large numbers
    'shares_outstanding',        # Very large numbers
    'market_cap',                # Billions/trillions
    'pe_ratio',                  # Can be 0-100+ range
    'daily_volatility_pct'       # Percentages but can vary widely
]
optional_scale = [
    #'signed_price_move_pct', using this is leakage
    'sent_neg', 'sent_neu', 'sent_pos', 'has_fda', 'has_merger','has_upgrade', 'has_downgrade'

]
total_scale = scale_columns + optional_scale


# Split data first (important!)
train_df, val_df = train_test_split(merged_df, test_size=0.3, random_state=42)

scalar = StandardScaler()
train_df[total_scale] = scalar.fit_transform(train_df[total_scale])
val_df[total_scale] = scalar.transform(val_df[total_scale])

print(f"\nüìà Scaled {len(total_scale)} features to mean=0, std=1")
print(f"   train_df shape: {train_df.shape}")
print(f"   val_df shape: {val_df.shape}")
print(f"   train_df columns: {train_df.columns}")
print(f"   val_df columns: {val_df.columns}")
print(f"   train_df head: {train_df.head(2)}")
print(f"   val_df head: {val_df.head(2)}")


üìà Scaled 17 features to mean=0, std=1
   train_df shape: (421573, 159)
   val_df shape: (180675, 159)
   train_df columns: Index(['date_only', 'news_article', 'stock_name', 'volume', 'open_price',
       'high_price', 'low_price', 'close_price', 'signed_price_move_pct',
       'move_category',
       ...
       'finbert_emb_118', 'finbert_emb_119', 'finbert_emb_120',
       'finbert_emb_121', 'finbert_emb_122', 'finbert_emb_123',
       'finbert_emb_124', 'finbert_emb_125', 'finbert_emb_126',
       'finbert_emb_127'],
      dtype='object', length=159)
   val_df columns: Index(['date_only', 'news_article', 'stock_name', 'volume', 'open_price',
       'high_price', 'low_price', 'close_price', 'signed_price_move_pct',
       'move_category',
       ...
       'finbert_emb_118', 'finbert_emb_119', 'finbert_emb_120',
       'finbert_emb_121', 'finbert_emb_122', 'finbert_emb_123',
       'finbert_emb_124', 'finbert_emb_125', 'finbert_emb_126',
       'finbert_emb_127'],
      dtype='obj

# Training Loop



## Prepare DataLoaders

In [9]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, confusion_matrix

# Make sure we're using GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# ---- 1. Define which columns to use as features ----

# FinBERT embedding columns (if present)
embedding_cols = [f"finbert_emb_{i}" for i in range(128) if f"finbert_emb_{i}" in train_df.columns]

# Sentiment columns
sent_cols = ["sent_neg", "sent_neu", "sent_pos"]

# Encoded categorical columns
cat_cols = ["stock_name_encoded", "sector_encoded", "industry_encoded"]

# Numeric columns you already scaled earlier
num_cols = total_scale  # from your previous code

# Final feature list
feature_cols = embedding_cols + sent_cols + cat_cols + num_cols
#leak_features = ["signed_price_move_pct"] not using to begin with
deployment_features = [col for col in feature_cols]
print("Number of features:", len(feature_cols))
print("price_change in feature_cols?", "price_change" in feature_cols)

# ---- 2. Clean NaNs just in case ----
target_col = "price_change"

train_df_m = train_df.dropna(subset=feature_cols + [target_col]).copy()
val_df_m   = val_df.dropna(subset=feature_cols + [target_col]).copy()

# ---- 3. Numpy arrays for X and y ----
X_train = train_df_m[feature_cols].values.astype("float32")
X_val   = val_df_m[feature_cols].values.astype("float32")

# price_change is in {1,2,3,4,5}; CrossEntropyLoss expects class indices 0..(C-1). Looks like there are no 1's in price change to create the zero, causing 4 to be out of bounds.
y_train = train_df_m[target_col].values.astype("int64") - 1
y_val   = val_df_m[target_col].values.astype("int64") - 1

num_classes = len(np.unique(y_train))
print("Classes:", np.unique(y_train), "‚Üí num_classes =", num_classes)

# ---- 4. Torch Dataset wrappers ----
class FinanceDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.from_numpy(X)       # float32
        self.y = torch.from_numpy(y)       # int64
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = FinanceDataset(X_train, y_train)
val_dataset   = FinanceDataset(X_val, y_val)

#atch_size = 128

train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True, drop_last=False)
val_loader   = DataLoader(val_dataset, batch_size=512, shuffle=False, drop_last=False)


# Or check for any negative values after transformation
negative_mask = y_train < 0
if negative_mask.any():
    print(f"Warning: Found {negative_mask.sum()} negative values in y_train")
    print("Negative values:", y_train[negative_mask])

Using device: cpu
Number of features: 151
price_change in feature_cols? False
Classes: [0 1 2 3 4] ‚Üí num_classes = 5


## Implement MLP

In [15]:
class MLPClassifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.2),

            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.2),

            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        return self.net(x)


input_dim = X_train.shape[1]
model = MLPClassifier(input_dim=input_dim, num_classes=num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=5e-5)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode="min",
    factor=0.5,
    patience=3
)



## Training loop + validation
An interesting observation is that while the model correctly predicts the price change category with ~40-45% accuracy, it is far more successful in correctly guessing the direction of the price change, where (0,1) is negative and (2,3,4) is positive

In [17]:
import torch.nn as nn
epochs = 50
best_val_loss = float("inf")
best_epoch = -1
best_state = None
for epoch in range(1, epochs + 1):
    # ---- Train ----
    model.train()
    running_loss = 0.0
    running_conf = 0.0


    for batch_X, batch_y in train_loader:
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()
        logits = model(batch_X)              # shape: (batch, num_classes)

        loss = criterion(logits, batch_y) # problem here
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * batch_X.size(0)

        probs = torch.softmax(logits, dim=1)
        max_probs, preds = torch.max(probs, dim=1) # Correctly get max probabilities (confidence) and indices (predictions)
        running_conf += max_probs.sum().item()             # sum of confidences

    epoch_train_loss = running_loss / len(train_dataset)
    epoch_train_conf = running_conf / len(train_dataset)



    # ---- Validate ----
    model.eval()
    val_loss = 0.0
    all_preds = []
    all_true = []
    val_conf_sum = 0.0 # Initialize val_conf_sum for the validation loop

    with torch.no_grad():
        for batch_X, batch_y in val_loader:
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)

            logits = model(batch_X)
            loss = criterion(logits, batch_y)
            val_loss += loss.item() * batch_X.size(0)


            probs = torch.softmax(logits, dim=1)
            max_probs, preds = torch.max(probs, dim=1) # Correctly get max probabilities (confidence) and indices (predictions)

            val_conf_sum += max_probs.sum().item()      # Use max_probs for sum of confidences

            all_preds.extend(preds.cpu().numpy())
            all_true.extend(batch_y.cpu().numpy())

    epoch_val_loss = val_loss / len(val_dataset)
    epoch_val_conf = val_conf_sum / len(val_dataset)      # avg val confidence

    accuracy = (np.array(all_preds) == np.array(all_true)).mean()

    dir_true = np.where(np.array(all_true) <= 1, 0, 1)
    dir_pred = np.where(np.array(all_preds) <= 1, 0, 1)
    direction_acc = (dir_true == dir_pred).mean()

    scheduler.step(epoch_val_loss)

    if epoch_val_loss < best_val_loss:
        best_val_loss = epoch_val_loss
        best_epoch = epoch
        best_state = {
            "model": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "epoch": epoch,
        }



    print(
        f"Epoch {epoch:02d} | "
        f"Train Loss: {epoch_train_loss:.4f} | "
        f"Val Loss: {epoch_val_loss:.4f} | "
        f"Val Acc: {accuracy:.4f} | "
        f"Dir Acc: {direction_acc:.4f} | "
        f"Train Conf: {epoch_train_conf:.3f} | "
        f"Val Conf: {epoch_val_conf:.3f}"
    )
print(f"\nBest epoch: {best_epoch} | Best Val Loss: {best_val_loss:.4f}")
torch.save(best_state, "best_mlp_checkpoint.pth")




Epoch 01 | Train Loss: 1.5175 | Val Loss: 1.4815 | Val Acc: 0.3218 | Dir Acc: 0.6420 | Train Conf: 0.319 | Val Conf: 0.310
Epoch 02 | Train Loss: 1.4697 | Val Loss: 1.4695 | Val Acc: 0.3198 | Dir Acc: 0.6472 | Train Conf: 0.335 | Val Conf: 0.350
Epoch 03 | Train Loss: 1.4373 | Val Loss: 1.5035 | Val Acc: 0.3270 | Dir Acc: 0.6484 | Train Conf: 0.347 | Val Conf: 0.434
Epoch 04 | Train Loss: 1.4083 | Val Loss: 1.4245 | Val Acc: 0.3435 | Dir Acc: 0.6419 | Train Conf: 0.357 | Val Conf: 0.396
Epoch 05 | Train Loss: 1.3827 | Val Loss: 1.4194 | Val Acc: 0.3518 | Dir Acc: 0.6446 | Train Conf: 0.365 | Val Conf: 0.403
Epoch 06 | Train Loss: 1.3598 | Val Loss: 1.3464 | Val Acc: 0.3797 | Dir Acc: 0.6403 | Train Conf: 0.373 | Val Conf: 0.363
Epoch 07 | Train Loss: 1.3359 | Val Loss: 1.3477 | Val Acc: 0.3819 | Dir Acc: 0.5921 | Train Conf: 0.382 | Val Conf: 0.330
Epoch 08 | Train Loss: 1.3183 | Val Loss: 1.3634 | Val Acc: 0.3749 | Dir Acc: 0.6129 | Train Conf: 0.390 | Val Conf: 0.331
Epoch 09 | Train

## Evaluate

In [18]:
# ---- 7. Final evaluation: classification report & confusion matrix ----
print("\nClassification report (classes 0‚Äì4 correspond to original price_change 1‚Äì5):")
print(classification_report(all_true, all_preds, digits=4))

print("Confusion matrix:")
print(confusion_matrix(all_true, all_preds))



Classification report (classes 0‚Äì4 correspond to original price_change 1‚Äì5):
              precision    recall  f1-score   support

           0     0.3908    0.5520    0.4576      5034
           1     0.3168    0.3272    0.3219      7715
           2     0.7187    0.2783    0.4013      9665
           3     0.3377    0.4346    0.3801      8346
           4     0.3650    0.4562    0.4055      4798

    accuracy                         0.3884     35558
   macro avg     0.4258    0.4097    0.3933     35558
weighted avg     0.4479    0.3884    0.3876     35558

Confusion matrix:
[[2779  362    0  285 1608]
 [1042 2524  519 2855  775]
 [ 523 2351 2690 3637  464]
 [ 817 2406  534 3627  962]
 [1950  323    0  336 2189]]


## Deployment

View output in a text editor or scrollable element

In [None]:
import random
import numpy as np
import torch
import pandas as pd

import torch

input_dim = len(deployment_features)   # safer than X_train.shape[1] here
model = MLPClassifier(input_dim=input_dim, num_classes=num_classes).to(device)

ckpt = torch.load("best_mlp_checkpoint.pth", map_location=device)
model.load_state_dict(ckpt["model"])

model.eval()  # just to be explicit
print(f"Loaded model from epoch {ckpt['epoch']} with best val loss {best_val_loss if 'best_val_loss' in globals() else 'unknown'}")


def deployment_demo(model, df, deployment_features, num_examples=5):
    """
    Deployment-style demo using the SAME feature space as training.
    Assumes `df` already has features in the same transformed form as X_train/X_val.
    """
    print("üöÄ REAL-WORLD DEPLOYMENT DEMONSTRATION")
    print("Using ONLY information available when news breaks")
    print("=" * 60)

    # Reset index to avoid weird index behavior
    df = df.reset_index(drop=True)

    model.eval()
    results = []

    # 5-class mapping (your categories)
    categories = {
        1: "Small Down",
        2: "Medium Down",
        3: "Neutral",
        4: "Medium Up",
        5: "Large Up"
    }

    for i in range(num_examples):
        idx = random.randint(0, len(df) - 1)
        row = df.iloc[idx]

        print(f"\nüì∞ EXAMPLE {i+1}:")
        print(f"Headline: {row['news_article'][:120]}...")
        print(f"Company: {row.get('stock_name', 'N/A')}")

        # Build feature vector in the SAME order as during training
        X = row[deployment_features].values.astype("float32").reshape(1, -1)

        # No extra scaling here ‚Äì df is already in model's feature space
        X_tensor = torch.from_numpy(X).to(device)

        with torch.no_grad():
            logits = model(X_tensor)
            probs = torch.softmax(logits, dim=1)
            confidence, pred_idx = torch.max(probs, dim=1)

            # y_train = price_change - 1, so:
            # model outputs 0..4 -> add 1 to get back 1..5
            predicted_pc = int(pred_idx.item()) + 1
            actual_pc = int(row["price_change"])

        print(f"ü§ñ PREDICTION (made at news time):")
        print(f"  Category: {categories.get(predicted_pc, predicted_pc)}")
        print(f"  Confidence: {confidence.item():.1%}")

        print(f"üéØ ACTUAL RESULT (unknown at prediction time):")
        print(f"  Actual Category: {categories.get(actual_pc, actual_pc)}")

        if "signed_price_move_pct" in row.index:
            print(f"  Actual % Move: {row['signed_price_move_pct']:.2f}%")

        is_correct = (predicted_pc == actual_pc)
        print(f"üìä RESULT: {'‚úÖ CORRECT' if is_correct else '‚ùå INCORRECT'}")

        results.append(is_correct)

    print(f"\n{'=' * 60}")
    accuracy = sum(results) / len(results)
    print("Deployment Model Performance (sampled from val set):")
    print(f"  Demo Accuracy: {accuracy:.1%} ({sum(results)}/{len(results)} correct)")
    print("\nüí° This shows how the model would behave on new headlines in a live setting.")


deployment_demo(model, val_df_m, deployment_features, num_examples=30)


Loaded model from epoch 42 with best val loss 1.22179864293592
üöÄ REAL-WORLD DEPLOYMENT DEMONSTRATION
Using ONLY information available when news breaks

üì∞ EXAMPLE 1:
Headline: Earnings Scheduled For August 25, 2016...
Company: MDT
ü§ñ PREDICTION (made at news time):
  Category: Medium Down
  Confidence: 34.3%
üéØ ACTUAL RESULT (unknown at prediction time):
  Actual Category: Medium Down
  Actual % Move: 0.38%
üìä RESULT: ‚úÖ CORRECT

üì∞ EXAMPLE 2:
Headline: Shares of several financial services companies are trading higher amid economic optimism, stemming from some US states r...
Company: DB
ü§ñ PREDICTION (made at news time):
  Category: Large Up
  Confidence: 49.6%
üéØ ACTUAL RESULT (unknown at prediction time):
  Actual Category: Medium Up
  Actual % Move: 1.05%
üìä RESULT: ‚ùå INCORRECT

üì∞ EXAMPLE 3:
Headline: Advanced Micro Devices shares are trading lower. Not seeing any news to justify the price action....
Company: AMD
ü§ñ PREDICTION (made at news time):
  Catego