In [1]:
import pandas as pd
import numpy as np
from transformers import BertModel, BertTokenizer
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split

In [2]:
# Release cached CUDA allocations before the notebook starts using the GPU.
# On a host without CUDA there is nothing to release, so the call is skipped.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [3]:
# Notebook-wide configuration.
# Number of articles per DataLoader batch.
BATCH_SIZE = 16
# Hugging Face checkpoint id used for both the BERT encoder and its tokenizer.
BERT_MODEL = 'google-bert/bert-base-uncased'
# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

In [4]:
class BERTSentimentClassifier(nn.Module):
    """Pre-trained BERT encoder followed by a small dense classification head.

    Args:
        num_classes: Size of the final linear layer's output (default 3).
    """

    def __init__(self, num_classes=3):
        super().__init__()
        # Pre-trained encoder, put in train mode and moved to the target device.
        encoder = BertModel.from_pretrained(BERT_MODEL)
        self.bert = encoder.train().to(DEVICE)

        # Dense head: dropout -> hidden_size x 256 -> ReLU -> dropout -> 256 x num_classes.
        encoder_width = self.bert.config.hidden_size
        head = nn.Sequential(
            nn.Dropout(p=0.3),
            nn.Linear(encoder_width, 256),
            nn.ReLU(),
            nn.Dropout(p=0.3),
            nn.Linear(256, num_classes),
        )
        self.classifier = head.train().to(DEVICE)

        # Only the head is re-initialised; the encoder keeps its loaded weights.
        BERTSentimentClassifier.initialize_weights(self.classifier)

    def forward(self, inputs):
        """Return the classification head's output for a batch of encoder inputs.

        Args:
            inputs: Mapping unpacked as keyword arguments into the BERT encoder.

        Returns:
            Output of the head applied to the encoder's ``pooler_output``; the
            last dimension has ``num_classes`` entries.
        """
        pooled = self.bert(**inputs).pooler_output
        return self.classifier(pooled)

    @staticmethod
    def initialize_weights(model):
        """Redraw weights of conv / batch-norm / linear sub-modules from N(0, 0.02).

        Biases and every other module type are left untouched.

        Args:
            model: Module whose sub-modules (including itself) are visited.
        """
        reinit_types = (nn.Conv2d, nn.ConvTranspose2d, nn.BatchNorm2d, nn.Linear)
        with torch.no_grad():
            for layer in model.modules():
                if not isinstance(layer, reinit_types):
                    continue
                nn.init.normal_(layer.weight, mean=0.0, std=0.02)

In [5]:
class NewsDataset(Dataset):
    """Tokenised view over a news DataFrame for BERT inference.

    Each item yields the row's index label together with tokenizer outputs for
    the row's ``summary`` and ``description`` text, both padded or truncated to
    ``max_length`` tokens.

    Args:
        dataframe: Frame providing ``summary`` and ``description`` columns.
        max_length: Token length every encoded text is padded/truncated to.
    """

    # Tokenizer output fields carried through to every sample and batch dict.
    _FIELDS = ('input_ids', 'token_type_ids', 'attention_mask')

    def __init__(self, dataframe, max_length=512):
        self.tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)
        self.max_length = max_length
        self.data = dataframe

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def _encode(self, text):
        """Tokenise one text value into a dict of per-sample tensors."""
        encoded = self.tokenizer(
            str(text),
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt',
            truncation=True,
        )
        # Drop only the leading batch axis added by return_tensors='pt'; a bare
        # squeeze() would also collapse the sequence axis when max_length == 1.
        return {field: encoded[field].squeeze(0) for field in self._FIELDS}

    def __getitem__(self, index):
        """Return ``(label, summary_dict, description_dict)`` for one row.

        Args:
            index: Zero-based row position, as produced by a DataLoader sampler.

        Returns:
            The row's index label (equal to ``index`` for a default RangeIndex,
            and suitable for ``DataFrame.loc``) followed by the encoded summary
            and description dicts.
        """
        # Rows are read by position so frames whose index is not 0..n-1 (for
        # example after filtering) still work; the label is handed back so the
        # caller can write results to the matching row.
        label = self.data.index[index]
        summary_dict = self._encode(self.data['summary'].iloc[index])
        description_dict = self._encode(self.data['description'].iloc[index])

        return label, summary_dict, description_dict

    @staticmethod
    def collate_fn(batch):
        """Merge ``__getitem__`` results into one batch.

        Args:
            batch: Sequence of ``(label, summary_dict, description_dict)`` items.

        Returns:
            ``(indices, summary_dict, description_dict)`` where ``indices`` is
            the list of labels and each dict holds its fields stacked along a
            new leading axis.
        """
        indices = [item[0] for item in batch]

        summary_dict = {
            field: torch.stack([item[1][field] for item in batch])
            for field in NewsDataset._FIELDS
        }
        description_dict = {
            field: torch.stack([item[2][field] for item in batch])
            for field in NewsDataset._FIELDS
        }

        return indices, summary_dict, description_dict


In [None]:
model = BERTSentimentClassifier()

# weights_only=True restricts unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot execute arbitrary code while being loaded.
state_dict = torch.load(
    '../Dataset/BERT/bert_classifier.pth',
    map_location=DEVICE,
    weights_only=True,
)
# Remove the 'module.' prefix from the keys if present (presumably left by a
# DataParallel-style wrapper at save time - confirm against the training code).
new_state_dict = {
    (k[len('module.'):] if k.startswith('module.') else k): v
    for k, v in state_dict.items()
}
# strict=False tolerates key mismatches, which would otherwise leave layers at
# their freshly initialised values without any notice - so report them.
load_result = model.load_state_dict(new_state_dict, strict=False)
if load_result.missing_keys or load_result.unexpected_keys:
    print(
        f'Checkpoint mismatch - missing keys: {load_result.missing_keys}; '
        f'unexpected keys: {load_result.unexpected_keys}'
    )

# Switch to evaluation mode (disables dropout); the returned module is shown
# as the cell output.
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  state_dict = torch.load('/content/drive/MyDrive/DSP_project/Dataset/BERT/bert_classifier.pth', map_location=DEVICE)


BERTSentimentClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, e

In [None]:
# Read the pre-processed company news articles and preview the first rows.
news_csv_path = '../Dataset/newsapi/preprocessed/company_news_preprocessed.csv'
news_df = pd.read_csv(news_csv_path)
news_df.head(n=5)

Unnamed: 0,source,authors,title,description,publishedAt,url,summary,companyName,source_url,summary_vader,description_vader
0,Forbes,"John Kang, Forbes Staff, \n John Kang, Forbes ...",Hyundai Motor India Shares Slump As Trading Be...,The South Korean car maker’s $3.3 billion shar...,2024-10-22 16:00:08,https://www.forbes.com/sites/johnkang/2024/10/...,Hyundai Motor India managing director Unsoo Ki...,Tata Motors,https://www.forbes.com,0.0,0.296
1,Forbes,"Gloria Haraito, Forbes Staff, \n Gloria Harait...",What’s Driving The Son Of India’s Richest Woma...,"Amid India’s EV push, steel magnate Sajjan Jin...",2024-10-09 22:03:27,https://www.forbes.com/sites/gloriaharaito/202...,Sajjan (left) and Parth Jindal.\nJSW Group\nTh...,Tata Motors,https://www.forbes.com,0.7783,0.4767
2,Livemint,George Skaria,Tata’s next challenge: Leadership void at Trusts,"With Ratan Tata’s passing, the lack of a clear...",2024-10-11 05:50:52,https://www.livemint.com/opinion/ratan-tata-de...,With the passing of Ratan Naval Tata (1937-202...,Tata Motors,https://www.livemint.com,0.0772,0.6486
3,Business Standard,Dev Chatterjee,"Tata Capital, Tata Motors Finance merger recei...",Tata Capital-Tata Motors Finance Merger: Throu...,2024-10-14 13:41:19,https://www.business-standard.com/companies/ne...,"Through this merger, Tata Capital aims to attr...",Tata Motors,https://www.business-standard.com,0.4939,0.4939
4,Livemint,Nikita Prasad,Ratan Tata passes away at 86: Top business tyc...,"Ratan Tata breathed his last on Wednesday, Oct...",2024-10-09 19:46:32,https://www.livemint.com/companies/people/rata...,"Ratan Tata passes away:Ratan Tata, chairman em...",Tata Motors,https://www.livemint.com,-0.3182,-0.1779


In [12]:
# Wrap the articles in a dataset and iterate them in file order, one
# BATCH_SIZE-sized batch at a time, using the dataset's own collate function.
dataset = NewsDataset(news_df)
loader = DataLoader(
    dataset,
    collate_fn=NewsDataset.collate_fn,
    shuffle=False,
    batch_size=BATCH_SIZE,
)

In [13]:
# Score every article with the classifier and write the predicted class id of
# its summary and description back into news_df.

# Mixed precision is only enabled when the model actually runs on a GPU.
use_amp = DEVICE.type == 'cuda'
# Clear the allocator cache once up front; doing it on every batch only forces
# the allocator to re-request the same memory each iteration.
torch.cuda.empty_cache()

for batch in (pbar := tqdm(loader)):
    indices, summary, description = batch
    summary = {k: v.to(DEVICE) for k, v in summary.items()}
    description = {k: v.to(DEVICE) for k, v in description.items()}
    # no_grad: this is pure inference, so skip building autograd graphs (which
    # would otherwise hold on to every intermediate activation).
    with torch.no_grad(), torch.amp.autocast(DEVICE.type, enabled=use_amp):
        summary_logits = model(summary).cpu()
        description_logits = model(description).cpu()

    # One argmax per batch instead of one per row.
    summary_preds = summary_logits.argmax(dim=1).tolist()
    description_preds = description_logits.argmax(dim=1).tolist()
    for index, summary_pred, description_pred in zip(indices, summary_preds, description_preds):
        news_df.loc[index, 'summary_sentiment'] = summary_pred
        news_df.loc[index, 'description_sentiment'] = description_pred

  0%|          | 0/722 [00:00<?, ?it/s]

In [14]:
# Preview the first rows, now including the two predicted-sentiment columns.
news_df.head(n=5)

Unnamed: 0,source,authors,title,description,publishedAt,url,summary,companyName,source_url,summary_vader,description_vader,summary_sentiment,description_sentiment
0,Forbes,"John Kang, Forbes Staff, \n John Kang, Forbes ...",Hyundai Motor India Shares Slump As Trading Be...,The South Korean car maker’s $3.3 billion shar...,2024-10-22 16:00:08,https://www.forbes.com/sites/johnkang/2024/10/...,Hyundai Motor India managing director Unsoo Ki...,Tata Motors,https://www.forbes.com,0.0,0.296,1.0,2.0
1,Forbes,"Gloria Haraito, Forbes Staff, \n Gloria Harait...",What’s Driving The Son Of India’s Richest Woma...,"Amid India’s EV push, steel magnate Sajjan Jin...",2024-10-09 22:03:27,https://www.forbes.com/sites/gloriaharaito/202...,Sajjan (left) and Parth Jindal.\nJSW Group\nTh...,Tata Motors,https://www.forbes.com,0.7783,0.4767,2.0,2.0
2,Livemint,George Skaria,Tata’s next challenge: Leadership void at Trusts,"With Ratan Tata’s passing, the lack of a clear...",2024-10-11 05:50:52,https://www.livemint.com/opinion/ratan-tata-de...,With the passing of Ratan Naval Tata (1937-202...,Tata Motors,https://www.livemint.com,0.0772,0.6486,1.0,0.0
3,Business Standard,Dev Chatterjee,"Tata Capital, Tata Motors Finance merger recei...",Tata Capital-Tata Motors Finance Merger: Throu...,2024-10-14 13:41:19,https://www.business-standard.com/companies/ne...,"Through this merger, Tata Capital aims to attr...",Tata Motors,https://www.business-standard.com,0.4939,0.4939,2.0,2.0
4,Livemint,Nikita Prasad,Ratan Tata passes away at 86: Top business tyc...,"Ratan Tata breathed his last on Wednesday, Oct...",2024-10-09 19:46:32,https://www.livemint.com/companies/people/rata...,"Ratan Tata passes away:Ratan Tata, chairman em...",Tata Motors,https://www.livemint.com,-0.3182,-0.1779,0.0,1.0


In [None]:
# Save the scored articles as an Excel workbook, leaving out the row index.
ratings_xlsx_path = '../Dataset/news_ratings/company_article_df_with_ratings.xlsx'
news_df.to_excel(ratings_xlsx_path, index=False)

In [None]:
# Save the scored articles as CSV, leaving out the row index.
ratings_csv_path = '../Dataset/news_ratings/company_article_df_with_ratings.csv'
news_df.to_csv(ratings_csv_path, index=False)