<a href="https://colab.research.google.com/github/Bene939/BERT_News_Sentiment_Classifier/blob/main/BERT_News_Sentiment_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install torch
!pip install pandas
!pip install pathlib
!pip install sklearn
!pip install numpy
#!pip install simpletransformers

In [None]:
from transformers import BertForSequenceClassification, AdamW, BertTokenizer, get_linear_schedule_with_warmup, Trainer, TrainingArguments
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import pandas as pd
from pathlib import Path
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
from torch.nn import functional as F
from collections import defaultdict
import random

#from simpletransformers.classification import ClassificationModel

In [None]:
#defining tokenizer and model, then moving the model to the available device
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
# three output labels: negative / neutral / positive
model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=3)

# prefer the GPU when CUDA is usable, otherwise stay on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
  print("\nUsing: ", torch.cuda.get_device_name(0))
else:
  print("\nUsing: CPU")
model = model.to(device)


In [None]:
#loading dataset
labeled_dataset_file = Path("news_headlines_sentiment.csv")
# Fail fast when the CSV is missing: nothing inside a polling loop could make
# the file appear, so waiting on it would print "File not Found" forever.
if not labeled_dataset_file.exists():
  print("File not Found")
  raise FileNotFoundError(
      "Expected '{}' in the working directory; upload it and re-run this cell.".format(labeled_dataset_file)
  )
labeled_dataset = pd.read_csv(labeled_dataset_file)
file_loaded = True
print("Dataset Loaded")
print(labeled_dataset)

#counting sentiments
# 0 is tallied as negative, 1 as neutral; every other value counts as positive
sentiment_values = labeled_dataset["sentiment"]
negative = int((sentiment_values == 0).sum())
neutral = int((sentiment_values == 1).sum())
positive = len(sentiment_values) - negative - neutral
print("Unbalanced Dataset")
print("negative: ", negative)
print("neutral: ", neutral)
print("positive: ", positive)

#balancing dataset to 1/3 per sentiment
# Class key per row: 0 (negative) and 1 (neutral) are kept, every other label
# is folded into 2 (positive) -- the same convention the counting code uses.
sentiment_class = labeled_dataset["sentiment"].where(labeled_dataset["sentiment"].isin([0, 1]), 2)
class_sizes = sentiment_class.value_counts()
if len(class_sizes) > 0:
  # Downsample every class to the size of the smallest one. Keeping the last
  # rows of each group means the earliest surplus rows are the ones removed.
  # Selecting rows (instead of dropping while iterating) keeps the frame
  # consistent and makes re-running this cell a no-op.
  smallest_class = int(class_sizes.min())
  labeled_dataset = labeled_dataset.groupby(sentiment_class, sort=False).tail(smallest_class)

# recount on the balanced frame
sentiment_class = labeled_dataset["sentiment"].where(labeled_dataset["sentiment"].isin([0, 1]), 2)
negative = int((sentiment_class == 0).sum())
neutral = int((sentiment_class == 1).sum())
positive = int((sentiment_class == 2).sum())
print("Balanced Dataset:")
print("negative: ", negative)
print("neutral: ", neutral)
print("positive: ", positive)

In [None]:
#loading phrase bank dataset
phrase_bank_dataset_file = Path("all-data.csv")
# Fail fast when the CSV is missing: nothing inside a polling loop could make
# the file appear, so waiting on it would print "File not Found" forever.
if not phrase_bank_dataset_file.exists():
  print("File not Found")
  raise FileNotFoundError(
      "Expected '{}' in the working directory; upload it and re-run this cell.".format(phrase_bank_dataset_file)
  )
# NOTE(review): read_csv treats the first line as a header by default; if
# all-data.csv has no header row, its first sample is consumed as column
# names here -- confirm against the file.
# The frame is flattened to a list of rows (one list per CSV line).
phrase_bank_dataset = pd.read_csv(phrase_bank_dataset_file, encoding='latin-1').values.tolist()
file_loaded = True
print("Dataset Loaded")

In [None]:
#correcting the format of phrase bank dataset
# Each row of phrase_bank_dataset is read as [sentiment_text, headline].
# Collect all records first and build the frame once; appending with
# .loc[len(df)] inside the loop re-allocates the frame for every row.
phrase_records = [
  {
    "news": ele[1],
    #converting sentiment text into numbers
    "sentiment": 0 if ele[0] == 'negative' else 1 if ele[0] == 'neutral' else 2,
  }
  for ele in phrase_bank_dataset
]
phrase_dataset = pd.DataFrame(phrase_records, columns=["news", "sentiment"])
print(phrase_dataset)

In [None]:
# Merge both datasets (currently disabled).
# The statements below sit inside a bare triple-quoted string, so nothing in
# this cell executes and `final_dataset` is never created. The quoted code
# appends every row of `phrase_dataset`, then every row of `labeled_dataset`,
# into a fresh "news"/"sentiment" frame and prints it.
"""
final_dataset = pd.DataFrame(columns=["news", "sentiment"])
for idx,row in phrase_dataset.iterrows():
  final_dataset.loc[len(final_dataset)] = [row["news"], row["sentiment"]]
for idx,row in labeled_dataset.iterrows():
  final_dataset.loc[len(final_dataset)] = [row["news"], row["sentiment"]]
print(final_dataset)
"""

In [None]:
#custom dataset class

class NewsSentimentDataset(torch.utils.data.Dataset):
  """Map-style dataset that pairs tokenizer encodings with sentiment labels.

  Args:
    encodings: mapping from field name to a per-sample indexable sequence
      (it must support ``.items()`` and integer indexing of each value).
    labels: indexable sequence of labels; its length defines the dataset size.
  """

  def __init__(self, encodings, labels):
    self.labels = labels
    self.encodings = encodings

  def __len__(self):
    """Return the number of samples, taken from the label sequence."""
    return len(self.labels)

  def __getitem__(self, idx):
    """Return sample ``idx`` as a dict of tensors, with the label under 'labels'."""
    sample = {}
    for field, values in self.encodings.items():
      sample[field] = torch.tensor(values[idx])
    sample['labels'] = torch.tensor(self.labels[idx])
    return sample

In [None]:
#method for tokenizing dataset list

def tokenize_headlines(headlines, labels, tokenizer, max_length=None):
  """Tokenize headlines and wrap them, with their labels, in a dataset.

  Args:
    headlines: iterable of headline strings.
    labels: indexable sequence of labels, one per headline.
    tokenizer: tokenizer exposing ``batch_encode_plus``.
    max_length: optional length every sequence is padded/truncated to.
      With the default ``None`` the tokenizer picks the length itself
      (for ``padding='max_length'`` that is the model's maximum input
      length), which is the previous behaviour.

  Returns:
    A ``NewsSentimentDataset`` holding the encodings and ``labels``.
  """
  encodings = tokenizer.batch_encode_plus(
      # materialize so any iterable of strings (not only a list) is accepted
      list(headlines),
      add_special_tokens = True,
      truncation = True,
      padding = 'max_length',
      max_length = max_length,
      return_attention_mask = True,
      return_token_type_ids = True
  )

  dataset = NewsSentimentDataset(encodings, labels)
  return dataset

In [None]:
#splitting dataset into training and validation set
#TODO: split dataset into train-val-test .7-.1-.2
#load news sentiment dataset

#all_headlines = phrase_dataset['news'].tolist()
#all_labels = phrase_dataset['sentiment'].tolist()

all_headlines = labeled_dataset['news'].tolist()
all_labels = labeled_dataset['sentiment'].tolist()

# NOTE(review): no random_state is passed, so the 80/20 split changes between runs.
train_headlines, val_headlines, train_labels, val_labels = train_test_split(all_headlines, all_labels, test_size=.2)

val_dataset = tokenize_headlines(val_headlines, val_labels, tokenizer)
# The training headlines must be paired with train_labels: the dataset length
# comes from the labels, so any other label list both mislabels and truncates
# the training set.
train_dataset = tokenize_headlines(train_headlines, train_labels, tokenizer)

# sanity check: one label per headline in both splits
assert len(train_dataset) == len(train_headlines)
assert len(val_dataset) == len(val_headlines)

In [None]:
#data loader
# batch sizes for the training and validation phases
train_batch_size, val_batch_size = 16, 8

# training batches are drawn in shuffled order
#alternative to shuffle:
#sampler=RandomSampler(train_dataset)
train_data_loader = DataLoader(
    dataset=train_dataset,
    batch_size=train_batch_size,
    shuffle=True,
)
# validation batches follow the dataset order
val_data_loader = DataLoader(
    dataset=val_dataset,
    batch_size=val_batch_size,
    sampler=SequentialSampler(val_dataset),
)

In [None]:
#optimizer and scheduler
num_epochs = 1
# total number of optimizer/scheduler steps: one per training batch per epoch
num_steps = len(train_data_loader) * num_epochs
# torch.optim.AdamW replaces the deprecated transformers.AdamW. weight_decay is
# pinned to 0.0 to keep the deprecated class's default (torch defaults to 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-8, weight_decay=0.0)
# warm up over the first 6% of the steps; the scheduler expects a whole number
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(num_steps * 0.06),
    num_training_steps=num_steps,
)

In [None]:
# Simple Transformers training (currently disabled).
# Everything between the triple quotes is a bare string literal, so none of it
# runs. The quoted code builds an `args` dict, creates a `ClassificationModel`
# (rebinding `model`), trains it on an 80/20 split of `labeled_dataset`, and
# scores the argmax predictions with `sklearn.metrics.accuracy_score`.
# Enabling it also requires the commented-out `simpletransformers` import.
"""
#simple transformers model
args = {
  "output_dir": "outputs/",
    "cache_dir": "cache_dir/",

    "fp16": False,
    "fp16_opt_level": "O1",
    "max_seq_length": 128,
    "train_batch_size": 8,
    "gradient_accumulation_steps": 1,
    "eval_batch_size": 8,
    "num_train_epochs": 1,
    "weight_decay": 0,
    "learning_rate": 5e-5,
    "adam_epsilon": 1e-8,
    "warmup_ratio": 0.06,
    "warmup_steps": 0,
    "max_grad_norm": 1.0,

    "logging_steps": 50,
    "save_steps": 2000,

    "overwrite_output_dir": True,
    "reprocess_input_data": False,

    "manual_seed": 64,
    "n_gpu": 1
}
model = ClassificationModel('bert', 'bert-base-cased', num_labels=3,use_cuda=True, args=args)
train,eva = train_test_split(labeled_dataset,test_size = 0.2)

train_df = pd.DataFrame({
    'text': train['news'],
    'label': train['sentiment']
})

eval_df = pd.DataFrame({
    'text': eva['news'],
    'label': eva['sentiment']
})

model.train_model(train_df)

result, model_outputs, wrong_predictions = model.eval_model(eval_df)


lst = []
for arr in model_outputs:
    lst.append(np.argmax(arr))
true = eval_df['label'].tolist()
predicted = lst
print(predicted)
print(true)
sklearn.metrics.accuracy_score(true,predicted)
"""


In [None]:
#training and evaluation
seed_val = 64

# seed Python, NumPy and PyTorch (CPU and all GPUs) before the epochs start
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

for epoch in range(num_epochs):

  print("\n###################################################")
  print("Epoch: {}/{}".format(epoch+1, num_epochs))
  print("###################################################\n")

  #training phase
  average_train_loss = 0.0
  train_correct = 0
  train_total = 0
  model.train()
  for step, batch in enumerate(train_data_loader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    token_type_ids = batch['token_type_ids'].to(device)

    outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

    # outputs[0] holds the classification logits
    loss = F.cross_entropy(outputs[0], labels)
    # accumulate a plain float: adding the tensor itself would chain every
    # batch's autograd history into the running total
    average_train_loss += loss.item()

    if step % 40 == 0:
      print("Training Loss: ", loss.item())

    predictions = np.argmax(outputs[0].detach().cpu().numpy(), axis=1)
    label_ids = labels.to('cpu').numpy()

    # count hits per sample so a shorter last batch is weighted correctly
    train_correct += int((predictions == label_ids).sum())
    train_total += len(label_ids)
    print("predictions: ",predictions)
    print("labels:      ",label_ids)
    print("#############")

    optimizer.zero_grad()
    loss.backward()
    #maximum gradient clipping
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    optimizer.step()
    scheduler.step()

  # max(..., 1) keeps an empty loader from raising ZeroDivisionError
  average_train_loss = average_train_loss / max(len(train_data_loader), 1)
  average_train_acc = train_correct / max(train_total, 1)
  print("======Average Training Loss: {:.5f}======".format(average_train_loss))
  print("======Average Training Accuracy: {:.2f}%======".format(average_train_acc*100))

  #validation phase
  average_val_loss = 0.0
  val_correct = 0
  val_total = 0
  model.eval()
  for step, batch in enumerate(val_data_loader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    token_type_ids = batch['token_type_ids'].to(device)

    # no gradients are needed while validating
    with torch.no_grad():
      outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
      loss = F.cross_entropy(outputs[0], labels)

    average_val_loss += loss.item()

    predictions = np.argmax(outputs[0].detach().cpu().numpy(), axis=1)
    label_ids = labels.to('cpu').numpy()
    print("predictions: ",predictions)
    print("labels:      ",label_ids)
    print("#############")

    val_correct += int((predictions == label_ids).sum())
    val_total += len(label_ids)

  average_val_loss = average_val_loss / max(len(val_data_loader), 1)
  average_val_acc = val_correct / max(val_total, 1)

  print("======Average Validation Loss: {:.5f}======".format(average_val_loss))
  print("======Average Validation Accuracy: {:.2f}%======".format(average_val_acc*100))

In [None]:
# Training and evaluation with the Trainer module from Hugging Face (currently disabled).
# Everything between the triple quotes is a bare string literal, so none of it
# runs. The quoted code defines a `compute_metrics` callback (accuracy plus
# micro-averaged precision/recall/F1), configures `TrainingArguments`, and
# trains and evaluates `model` on `train_dataset` / `val_dataset`.
"""
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='micro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=8,   # batch size for evaluation
    warmup_steps=0,                # number of warmup steps for learning rate scheduler
    weight_decay=0,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)


trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset  ,          # evaluation dataset
    compute_metrics=compute_metrics           
)

trainer.train()
trainer.evaluate()
"""