## Transformer Classification Model

In [89]:
import numpy as np
import pandas as pd
import seaborn as sns
import calendar
import matplotlib.pyplot as plt
import yfinance as yf
import re
from nltk.tokenize import word_tokenize

from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

import torch

sns.set_theme(style="whitegrid")

In [90]:
data = pd.read_csv("grouped_dataset.csv")

In [91]:
train_dataset, test_dataset = train_test_split(data, test_size=0.2, shuffle=True)

In [92]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") 
text = " ".join(train_dataset['Headlines'].to_list())
# tokenizer(
#         text,
#         padding="max_length",
#         truncation=True,
#         max_length=800,
#         return_tensors="pt"
#         )

In [93]:
MAX_LENGTH = 600
BATCH_SIZE = 32
EMBED_DIM = 256
NUM_HEAD = 16
NUM_LAYER = 4
EPOCHS = 25
LEARNING_RATE = 1e-5

In [94]:
from torch.utils.data import Dataset

class NewsDataset(Dataset):
    def __init__(self, headlines, labels, tokenizer, max_length):
        self.headlines = headlines
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.headlines)

    def __getitem__(self, idx):
        # Tokenize individual headline
        text = self.headlines[idx]
        tokens = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        return tokens['input_ids'].squeeze(0), torch.tensor(self.labels[idx], dtype=torch.float)
        

In [95]:
from torch.utils.data import DataLoader


train_PYdataset = NewsDataset(
    headlines=train_dataset['Headlines'].tolist(),
    labels=train_dataset['trend_up'].astype(int).tolist(),
    tokenizer=tokenizer,
    max_length=MAX_LENGTH
)
train_dataloader = DataLoader(
    train_PYdataset, 
    batch_size=BATCH_SIZE,
    shuffle=True,
)


test_PYdataset = NewsDataset(
    headlines=test_dataset['Headlines'].tolist(),
    labels=test_dataset['trend_up'].astype(int).tolist(),
    tokenizer=tokenizer,
    max_length=MAX_LENGTH
)
test_dataloader = DataLoader(
    test_PYdataset, 
    batch_size=BATCH_SIZE,
    shuffle=True,
)

In [96]:
import torch.nn as nn

class UntrainedTransformerClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, num_layers, max_length, dropout = 0.2):
        super(UntrainedTransformerClassifier, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.positional_encoding = nn.Embedding(max_length,embed_dim)

        # Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=embed_dim * 4,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        #
        self.linear = nn.Linear(embed_dim, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids):
        seq_length = input_ids.size(1)

        # Token embedding + positional encoding
        x = self.embedding(input_ids) + self.positional_encoding(torch.arange(seq_length,device=input_ids.device))

        x = self.transformer(x)

        x = self.dropout(x)
        x = x.max(dim=1)[0]
        out = self.linear(x)
        return out   


In [97]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [98]:
def evaluate(model, dataloader, device):
    model.eval()
    correct = 0
    total_samples = 0

    with torch.no_grad():
        for X, Y in dataloader:
            input_ids = X.to(device)
            labels = Y.to(device)

            logits = model(input_ids)
            outputs = torch.sigmoid(logits).squeeze(1)


            preds = (outputs >= 0.5).float()
            correct += (preds == labels).sum().item()
            total_samples += labels.size(0)

    accuracy = correct / total_samples
    model.train()
    return accuracy

In [99]:
import torch.optim as optim
from torch.optim.lr_scheduler import ExponentialLR

model = UntrainedTransformerClassifier(vocab_size=tokenizer.vocab_size, 
                                       embed_dim= EMBED_DIM, 
                                       num_heads= NUM_HEAD, 
                                       num_layers=NUM_LAYER, 
                                       max_length=MAX_LENGTH).to(device)

criterion = nn.BCEWithLogitsLoss()


optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
scheduler = ExponentialLR(optimizer, gamma=0.9)

for epoch in range(EPOCHS):
    for xb, yb in train_dataloader:
        xb, yb = xb.to(device), yb.to(device)

        logits = model(xb).squeeze(1)
        loss = criterion(logits, yb)
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    scheduler.step()
    print(f"Epoch [{epoch+1}/{EPOCHS}], Loss: {loss.item():.4f}, Train Acc: {evaluate(model,train_dataloader,device):.4f}, Test Acc: {evaluate(model,test_dataloader,device):.4f}")

Epoch [1/25], Loss: 0.8634, Train Acc: 0.5455, Test Acc: 0.6077
Epoch [2/25], Loss: 0.6839, Train Acc: 0.5358, Test Acc: 0.6000
Epoch [3/25], Loss: 0.6115, Train Acc: 0.5126, Test Acc: 0.5462
Epoch [4/25], Loss: 0.7115, Train Acc: 0.5319, Test Acc: 0.6000
Epoch [5/25], Loss: 0.8547, Train Acc: 0.5493, Test Acc: 0.5692
Epoch [6/25], Loss: 0.5545, Train Acc: 0.5435, Test Acc: 0.5692
Epoch [7/25], Loss: 0.6965, Train Acc: 0.5571, Test Acc: 0.6077
Epoch [8/25], Loss: 0.7752, Train Acc: 0.5609, Test Acc: 0.6154
Epoch [9/25], Loss: 0.8426, Train Acc: 0.5629, Test Acc: 0.5846
Epoch [10/25], Loss: 0.7632, Train Acc: 0.5667, Test Acc: 0.5692
Epoch [11/25], Loss: 0.6054, Train Acc: 0.5725, Test Acc: 0.5846
Epoch [12/25], Loss: 0.6682, Train Acc: 0.5822, Test Acc: 0.6077
Epoch [13/25], Loss: 0.5943, Train Acc: 0.5667, Test Acc: 0.5923
Epoch [14/25], Loss: 0.7452, Train Acc: 0.5745, Test Acc: 0.5769
Epoch [15/25], Loss: 0.7003, Train Acc: 0.5725, Test Acc: 0.6154
Epoch [16/25], Loss: 0.5680, Train