## Transformer Classification Model

In [32]:
import numpy as np
import pandas as pd
import seaborn as sns
import calendar
import matplotlib.pyplot as plt
import yfinance as yf
import re
from nltk.tokenize import word_tokenize

from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

import torch

sns.set_theme(style="whitegrid")

In [33]:
grouped_data = pd.read_csv("grouped_dataset.csv")

In [34]:
train_dataset, test_dataset = train_test_split(grouped_data, test_size=0.2)

In [35]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") 
text = " ".join(train_dataset['Headlines'].to_list())
# tokenizer(
#         text,
#         padding="max_length",
#         truncation=True,
#         max_length=800,
#         return_tensors="pt"
#         )

In [36]:
from torch.utils.data import Dataset

class NewsDataset(Dataset):
    def __init__(self, headlines, labels, tokenizer, max_length):
        self.headlines = headlines
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.headlines)

    def __getitem__(self, idx):
        # Tokenize individual headline
        text = self.headlines[idx]
        tokens = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        return tokens['input_ids'].squeeze(0), torch.tensor(self.labels[idx], dtype=torch.long)
        

In [37]:
from torch.utils.data import DataLoader

batch_size = 16  # Adjust as needed


train_PYdataset = NewsDataset(
    headlines=train_dataset['Headlines'].tolist(),
    labels=train_dataset['trend_up'].astype(int).tolist(),
    tokenizer=tokenizer,
    max_length=32
)
train_dataloader = DataLoader(
    train_PYdataset, 
    batch_size=batch_size,
    shuffle=True,  # Shuffle the data for better generalization
    num_workers=0,  # Set number of workers for parallel data loading
)


test_PYdataset = NewsDataset(
    headlines=test_dataset['Headlines'].tolist(),
    labels=test_dataset['trend_up'].astype(int).tolist(),
    tokenizer=tokenizer,
    max_length=32
)
test_dataloader = DataLoader(
    test_PYdataset, 
    batch_size=batch_size,
    shuffle=True,  # Shuffle the data for better generalization
    num_workers=0  # Set number of workers for parallel data loading
)

In [38]:
import torch
import torch.nn as nn
import math

class UntrainedTransformerClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, num_layers, num_classes, max_length):
        """
        Args:
            vocab_size: Size of the vocabulary.
            embed_dim: Dimension of the embedding layer.
            num_heads: Number of attention heads.
            num_layers: Number of transformer encoder layers.
            num_classes: Number of output classes for classification.
            max_length: Maximum sequence length.
        """
        super(UntrainedTransformerClassifier, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.positional_encoding = nn.Embedding(max_length,embed_dim)

        # Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=embed_dim * 4,
            dropout=0.1,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Classification head
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, input_ids):
        seq_length = input_ids.size(1)

        # Token embedding + positional encoding
        x = self.embedding(input_ids) + self.positional_encoding(torch.arange(seq_length,device=input_ids.device))

        # Transformer encoder
        transformer_output = self.transformer(x)

        # Classification head: Use the first token's representation (similar to CLS in BERT)
        cls_output = transformer_output[:, 0, :]
        logits = self.fc(cls_output)

        return logits


In [39]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [40]:
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate(model, dataloader, device):
    model.eval()
    correct = 0
    total_samples = 0

    with torch.no_grad():  # Disable gradient computation
        for X,Y in dataloader:
            input_ids = X.to(device)  # Move to device
            labels = Y.to(device)        # Move to device


            logits = model(input_ids)
            # Calculate predictions
            _,preds = torch.max(logits.data, 1)
            correct += (preds == labels).sum().item()
            total_samples += labels.size(0)


    accuracy = correct / total_samples
    model.train()
    return accuracy

In [41]:
import torch.optim as optim
model = UntrainedTransformerClassifier(vocab_size=tokenizer.vocab_size, embed_dim= 64, num_heads=4, num_layers=4, num_classes=2, max_length=32).to(device)
criterion = nn.CrossEntropyLoss()  # Loss function for classification
optimizer = optim.Adam(model.parameters(), lr=1e-5)  

for epoch in range(50):
    for xb, yb in train_dataloader:
        xb, yb = xb.to(device), yb.to(device)

        logits = model(xb)
        loss = criterion(logits, yb)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch [{epoch+1}/{100}], Loss: {loss.item():.4f}, Test Acc: {evaluate(model, test_dataloader,device):.4f}")

Epoch [1/100], Loss: 0.6409, Test Acc: 0.4923
Epoch [2/100], Loss: 0.7256, Test Acc: 0.4923
Epoch [3/100], Loss: 0.5970, Test Acc: 0.4923
Epoch [4/100], Loss: 0.5673, Test Acc: 0.4923
Epoch [5/100], Loss: 0.6911, Test Acc: 0.4923
Epoch [6/100], Loss: 0.5652, Test Acc: 0.4923
Epoch [7/100], Loss: 0.6297, Test Acc: 0.4846
Epoch [8/100], Loss: 0.6918, Test Acc: 0.4923
Epoch [9/100], Loss: 0.5463, Test Acc: 0.5154
Epoch [10/100], Loss: 0.5858, Test Acc: 0.4923
Epoch [11/100], Loss: 0.7219, Test Acc: 0.4769
Epoch [12/100], Loss: 0.7367, Test Acc: 0.5000
Epoch [13/100], Loss: 0.8260, Test Acc: 0.4923
Epoch [14/100], Loss: 0.9123, Test Acc: 0.4846
Epoch [15/100], Loss: 0.5889, Test Acc: 0.4923
Epoch [16/100], Loss: 0.6282, Test Acc: 0.5000
Epoch [17/100], Loss: 0.4975, Test Acc: 0.4923
Epoch [18/100], Loss: 0.6826, Test Acc: 0.4846
Epoch [19/100], Loss: 0.5783, Test Acc: 0.4846
Epoch [20/100], Loss: 0.6978, Test Acc: 0.4923
Epoch [21/100], Loss: 0.7157, Test Acc: 0.5154
Epoch [22/100], Loss: 

KeyboardInterrupt: 