In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm.notebook import tqdm
from transformers import BertModel, BertTokenizer

In [None]:
# Query CUDA availability. The boolean result is discarded: it is not the
# last expression of the cell, so the notebook does not display it.
torch.cuda.is_available()
# Release unoccupied cached GPU memory held by PyTorch's caching allocator.
torch.cuda.empty_cache()

In [None]:
# Notebook-wide configuration.

# Number of samples per mini-batch handed to the DataLoaders.
BATCH_SIZE = 64
# Identifier of the pre-trained BERT checkpoint used for both the tokenizer
# and the encoder.
BERT_MODEL = "google-bert/bert-base-uncased"
# Compute device: the GPU when CUDA is usable, the CPU otherwise.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Load the "kdave/Indian_Financial_News" dataset and convert its 'train'
# split into a pandas DataFrame.
df = load_dataset("kdave/Indian_Financial_News")['train'].to_pandas()
# Preview 5 randomly chosen rows. No random_state is passed, so the rows
# shown differ from run to run.
df.sample(5)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Unnamed: 0,URL,Content,Summary,Sentiment
20153,https://economictimes.indiatimes.com/markets/e...,Unlock Leadership Excellence with a Range of C...,a range of CXO courses offering college course...,Positive
10133,https://economictimes.indiatimes.com/markets/e...,Unlock Leadership Excellence with a Range of C...,independent analyst says if you have to be in ...,Neutral
6637,https://www.financialexpress.com/economy/vivad...,Neeraj Sharma\n\nThe Budget 2020 introduced Di...,the scheme was introduced in Parliament on 5 F...,Negative
5653,http://www.financialexpress.com/market/stock-m...,After the stock market correction continued fo...,the stock market correction continues for the ...,Negative
14387,https://economictimes.indiatimes.com/news/econ...,NEW DELHI: The government has started working ...,the government is considering a booster shot t...,Neutral


In [None]:
# Summary statistics per column (count / unique / top / freq for these text columns).
df.describe()

Unnamed: 0,URL,Content,Summary,Sentiment
count,26961,26961,26961,26961
unique,13121,12710,12657,3
top,https://economictimes.indiatimes.com/news/inte...,"Amid Boardroom Feuds, Spotlight Falls on Women...",women playing key roles in boardroom conflicts...,Negative
freq,13,198,198,8987


In [None]:
# Distribution of article length in characters for the 'Content' column.
# Note: TextDataset below tokenizes the 'Summary' column, not 'Content'.
df['Content'].str.len().describe()

Unnamed: 0,Content
count,26961.0
mean,3722.494195
std,3389.123049
min,52.0
25%,2067.0
50%,3035.0
75%,4509.0
max,49680.0


In [None]:
class TextDataset(Dataset):
    """Dataset that tokenizes the 'Summary' text and pairs it with a sentiment label.

    Each item is a dict with 'input_ids', 'token_type_ids' and 'attention_mask'
    tensors of shape ``(max_length,)`` plus a scalar ``torch.long`` 'label'.

    Args:
        dataframe: Frame with a 'Summary' text column and a 'Sentiment' column
            holding 'positive' / 'neutral' / 'negative' in any letter case.
            The frame is NOT modified; an internal copy carrying an extra
            'Label' column is stored instead.
        max_length: Length every sequence is padded / truncated to.
        tokenizer: Optional callable with the Hugging Face tokenizer call
            signature. When omitted, ``BertTokenizer.from_pretrained(BERT_MODEL)``
            is loaded, exactly as before.

    Raises:
        ValueError: If 'Sentiment' contains a value outside the label map
            (missing values included).
    """

    # Lower-cased sentiment string -> integer class id.
    LABEL_MAP = {'positive': 2, 'neutral': 1, 'negative': 0}

    def __init__(self, dataframe, max_length=512, tokenizer=None):
        labels = dataframe['Sentiment'].str.lower().map(self.LABEL_MAP)
        unmapped = labels.isna()
        if unmapped.any():
            # Fail loudly: a NaN label would otherwise only surface (or be
            # silently cast to garbage) deep inside the training loop.
            unknown = sorted(dataframe.loc[unmapped, 'Sentiment'].astype(str).unique())
            raise ValueError(f"Unmapped Sentiment values: {unknown}")

        # assign() returns a new frame, so the caller's DataFrame is left
        # untouched; reset_index gives a clean 0..n-1 positional index.
        self.data = dataframe.assign(Label=labels.astype('int64')).reset_index(drop=True)
        self.tokenizer = (
            tokenizer if tokenizer is not None
            else BertTokenizer.from_pretrained(BERT_MODEL)
        )
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (samples)."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize the row at integer position ``index`` and return its tensors."""
        # Positional access (.iloc) instead of label lookup: random_split and
        # DataLoader hand out positions, which need not match index labels.
        inputs = self.tokenizer(
            self.data['Summary'].iloc[index],
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt',
            truncation=True,
        )
        label = torch.tensor(int(self.data['Label'].iloc[index]), dtype=torch.long)

        # squeeze(0) drops only the leading batch dimension added by
        # return_tensors='pt'; a bare squeeze() would also collapse the
        # sequence dimension when max_length == 1.
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'token_type_ids': inputs['token_type_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'label': label
        }

    @staticmethod
    def collate_fn(batch):
        """Stack a list of per-sample dicts into one dict of batched tensors."""
        input_ids = torch.stack([item['input_ids'] for item in batch])
        token_type_ids = torch.stack([item['token_type_ids'] for item in batch])
        attention_mask = torch.stack([item['attention_mask'] for item in batch])
        label = torch.stack([item['label'] for item in batch])

        return {
            'input_ids': input_ids,
            'token_type_ids': token_type_ids,
            'attention_mask': attention_mask,
            'label': label
        }

In [None]:
# Wrap the loaded frame in the tokenizing dataset.
dataset = TextDataset(df)
# Seed the 80/20 split so train/test membership is identical after a kernel
# restart; an unseeded split would make the held-out set change on every
# run and evaluation numbers incomparable between runs.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = random_split(dataset, [0.8, 0.2], generator=split_generator)
# Training batches are reshuffled every epoch; the test order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=TextDataset.collate_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=TextDataset.collate_fn)



In [None]:
class BERTSentimentClassifier(nn.Module):
    """Pre-trained BERT encoder followed by a two-layer classification head.

    Args:
        num_classes: Number of output logits produced by the head.
    """

    def __init__(self, num_classes=3):
        super().__init__()
        # Encoder: pre-trained weights, put in training mode and moved to DEVICE.
        backbone = BertModel.from_pretrained(BERT_MODEL)
        self.bert = backbone.train().to(DEVICE)

        # Head: dropout -> hidden_size->256 -> ReLU -> dropout -> 256->num_classes.
        hidden_size = self.bert.config.hidden_size
        head_layers = [
            nn.Dropout(p=0.3),
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(p=0.3),
            nn.Linear(256, num_classes),
        ]
        head = nn.Sequential(*head_layers)
        self.classifier = head.train().to(DEVICE)

        # Only the freshly created head is re-initialised; the encoder keeps
        # its pre-trained weights.
        BERTSentimentClassifier.initialize_weights(self.classifier)

    def forward(self, inputs):
        """Return class logits for a dict of tokenizer tensors.

        ``inputs`` is unpacked as keyword arguments into the BERT encoder, and
        the encoder's ``pooler_output`` is fed to the classification head.
        """
        pooled = self.bert(**inputs).pooler_output
        return self.classifier(pooled)

    @staticmethod
    def initialize_weights(model):
        """Draw the weights of conv / batch-norm / linear layers from N(0, 0.02).

        Walks every sub-module of ``model`` in place. Only ``weight`` is
        re-drawn; biases are left as they were.
        """
        target_types = (nn.Conv2d, nn.ConvTranspose2d, nn.BatchNorm2d, nn.Linear)
        with torch.no_grad():
            for layer in filter(lambda mod: isinstance(mod, target_types), model.modules()):
                nn.init.normal_(layer.weight, mean=0.0, std=0.02)

In [None]:
# Fine-tune the classifier and record the per-batch loss for later plotting.
model = BERTSentimentClassifier()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()
# Mixed precision is only enabled when actually running on CUDA; on CPU both
# autocast and the scaler become explicit no-ops instead of emitting warnings.
use_amp = DEVICE.type == 'cuda'
scaler = torch.amp.GradScaler('cuda', enabled=use_amp)
losses = []  # one entry per training batch; read by the loss-curve cell
num_epochs = 5

for epoch in range(num_epochs):
    epoch_losses = []
    for batch in (pbar := tqdm(train_loader)):
        # Separate the targets from the model inputs, then move both to DEVICE.
        labels = batch.pop('label').to(DEVICE)
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        with torch.amp.autocast(DEVICE.type, enabled=use_amp):
            outputs = model(batch)
            loss = criterion(outputs, labels)

        # Scaled backward pass + optimizer step (plain step when AMP is off).
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

        # Read the scalar once and record it; previously `losses` was never
        # filled, so the loss curve below was empty.
        loss_value = loss.item()
        losses.append(loss_value)
        epoch_losses.append(loss_value)
        pbar.set_postfix({'loss': loss_value})

        # Drop references to this batch's tensors and return cached GPU
        # blocks to the allocator before the next batch.
        del batch, labels, outputs
        torch.cuda.empty_cache()

    # Report the mean loss over the epoch rather than the last batch only;
    # also safe when the loader yielded no batches.
    mean_loss = sum(epoch_losses) / len(epoch_losses) if epoch_losses else float('nan')
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {mean_loss}")

In [None]:
# Draw the recorded training loss, one point per batch.
fig, ax = plt.subplots()
ax.plot(losses)
ax.set_xlabel('Batch')
ax.set_ylabel('Loss')
ax.set_title('Loss Curve')
plt.show()

In [None]:
# Persist the learned parameters (state_dict) of the fine-tuned classifier.
model_path = Path('../Models/sentiment_classification/bert_classifier.model')
# Create the target directory first: torch.save does not create missing
# parent folders, and failing here would lose the whole training run.
model_path.parent.mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), model_path)