<a href="https://colab.research.google.com/github/Bene939/BERT_News_Sentiment_Classifier/blob/main/BERT_News_Sentiment_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install torch
!pip install pandas
!pip install pathlib
!pip install sklearn
!pip install numpy

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/3a/83/e74092e7f24a08d751aa59b37a9fc572b2e4af3918cb66f7766c3affb1b4/transformers-3.5.1-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 19.1MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 51.0MB/s 
Collecting tokenizers==0.9.3
[?25l  Downloading https://files.pythonhosted.org/packages/4c/34/b39eb9994bc3c999270b69c9eea40ecc6f0e97991dba28282b9fd32d44ee/tokenizers-0.9.3-cp36-cp36m-manylinux1_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 27.4MB/s 
Collecting sentencepiece==0.1.91
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |█

In [2]:
from transformers import BertModel, DistilBertModel, BertForSequenceClassification, AdamW, BertTokenizer, get_linear_schedule_with_warmup, Trainer, TrainingArguments
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import pandas as pd
from pathlib import Path
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
from torch.nn import functional as F
from collections import defaultdict
import random
import os

In [3]:
# setting device as gpu or cpu

if torch.cuda.is_available():
  print("\nUsing: ", torch.cuda.get_device_name(0))
  device = torch.device('cuda')
else:
  print("\nUsing: CPU")
  device = torch.device('cpu')


Using:  Tesla T4


In [4]:
# loading dataset

labeled_dataset = "news_headlines_sentiment.csv"
labeled_dataset_file = Path(labeled_dataset)
file_loaded = False
while not file_loaded:
  if labeled_dataset_file.exists():
    labeled_dataset = pd.read_csv(labeled_dataset_file)
    file_loaded = True
    print("Dataset Loaded")
  else:
    print("File not Found")
print(labeled_dataset)

Dataset Loaded
                                                   news  sentiment
0     UPDATE 3-Brazil economy back to 2009 size afte...          1
1     GLOBAL MARKETS-Manufacturing data lifts stocks...          2
2     TREASURIES-Yields move higher after U.S. manuf...          0
3     UPDATE 2-Dollar weakness lifts pound to 8-mont...          0
4     UPDATE 1-U.S. House Oversight Committee to sub...          1
...                                                 ...        ...
7995  Trian Investment in Comcast Fuels Debate on Br...          1
7996                               Is Roku Stock a Buy?          2
7997                10 Most Profitable TV Shows in 2020          0
7998  Comcasts Amy Banse Transitions to Senior Advis...          2
7999  Comcast and REVOLT Sign Agreement to Expand th...          0

[8000 rows x 2 columns]


In [5]:
# loading phrase bank dataset and correcting format

phrase_bank_dataset = "all-data.csv"
phrase_bank_dataset_file = Path(phrase_bank_dataset)
file_loaded = False
while not file_loaded:
  if phrase_bank_dataset_file.exists():
    phrase_dataset = pd.read_csv(phrase_bank_dataset, encoding='latin-1', names=["sentiment", "news"])
    phrase_dataset = phrase_dataset[["news", "sentiment"]]
    phrase_dataset["sentiment"].replace(['positive', 'negative', 'neutral'], [0,1,2], inplace=True)
    file_loaded = True
    print("Dataset Loaded")
  else:
    print("File not Found")
print(phrase_dataset)

Dataset Loaded
                                                   news  sentiment
0     According to Gran , the company has no plans t...          2
1     Technopolis plans to develop in stages an area...          2
2     The international electronic industry company ...          1
3     With the new production plant the company woul...          0
4     According to the company 's updated strategy f...          0
...                                                 ...        ...
4841  LONDON MarketWatch -- Share prices ended lower...          1
4842  Rinkuskiai 's beer sales fell by 6.5 per cent ...          2
4843  Operating profit fell to EUR 35.4 mn from EUR ...          1
4844  Net sales of the Paper segment decreased to EU...          1
4845  Sales in Finland decreased by 10.5 % in Januar...          1

[4846 rows x 2 columns]


In [6]:
# merge both datasets
merged_dataset = pd.concat([phrase_dataset, labeled_dataset], axis=0)
print(merged_dataset)

                                                   news  sentiment
0     According to Gran , the company has no plans t...          2
1     Technopolis plans to develop in stages an area...          2
2     The international electronic industry company ...          1
3     With the new production plant the company woul...          0
4     According to the company 's updated strategy f...          0
...                                                 ...        ...
7995  Trian Investment in Comcast Fuels Debate on Br...          1
7996                               Is Roku Stock a Buy?          2
7997                10 Most Profitable TV Shows in 2020          0
7998  Comcasts Amy Banse Transitions to Senior Advis...          2
7999  Comcast and REVOLT Sign Agreement to Expand th...          0

[12846 rows x 2 columns]


In [7]:
# custom dataset class which returns the encodings and labels when called by the data loader
# code is from https://huggingface.co/transformers/custom_datasets.html

class NewsSentimentDataset(torch.utils.data.Dataset):
  def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

  def __getitem__(self, idx):
      item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
      item['labels'] = torch.tensor(self.labels[idx])
      return item

  def __len__(self):
      return len(self.labels)

In [8]:
# expects data frame and tokenizer as input
# returns encoded data as NewsSentimentDataset
# token type ids are not included in encoding as they are only for Q&A to mark if its a question or an answer

def tokenize_headlines(df, tokenizer):
  encodings = tokenizer.batch_encode_plus(
      df["news"].tolist(),           # input the news headlines
      add_special_tokens = True,     # special tokens added to mark beginning & end of sentence
      truncation = True,             # make all sentences a fixed length
      padding = 'max_length',        # pad with zeros to max length
      return_attention_mask = True,  # include attention mask in encoding
      return_tensors = 'pt'          # return as pytorch tensor
  )

  dataset = NewsSentimentDataset(encodings, df["sentiment"].tolist())
  return dataset

In [28]:
#splitting dataset into training and validation set
#load news sentiment dataset
#defining tokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
#Options for Dataset:
# labeled_dataset = my own dataset
# phrase_dataset = dataset from kaggle
# merged_dataset = both datasets merged toegether
#merged_dataset = pd.concat([phrase_dataset, labeled_train_data], axis=0) => for testing on unknown labeled date with merged dataset

train_data, val_data = train_test_split(phrase_dataset, test_size=.2)
merged_train_data, merged_val_data = train_test_split(merged_dataset, test_size=.2)
labeled_train_data, labeled_val_data = train_test_split(labeled_dataset, test_size=.2)


print("Train Dataset\n", train_data.reset_index(drop=True))
print("Validation Dataset\n", val_data.reset_index(drop=True))

train_dataset = tokenize_headlines(merged_train_data, tokenizer)
val_dataset = tokenize_headlines(merged_val_data, tokenizer)

Train Dataset
                                                    news  sentiment
0     ` Nordic infrastructure construction is one of...          2
1     As a result of these negotiations the company ...          1
2     Tikkurila has an interesting growth strategy ,...          2
3     Net sales grew in the period to  x20ac 402 mil...          0
4     After completion of the acquisition , Poyry 's...          2
...                                                 ...        ...
3871  Water Treatment Products In Australia Today , ...          2
3872  No blind-spots coming from 1 vantage point all...          2
3873  It 's `` finger-friendly '' , and to my opinio...          2
3874  Shares of Standard Chartered ( STAN ) rose 1.2...          0
3875  The employee negotiations are to address measu...          2

[3876 rows x 2 columns]
Validation Dataset
                                                   news  sentiment
0    A downloadable instruction sheet , instruction...          2
1   

In [29]:
#TODO: Calculate per class accuracy
#TODO: Try out different Loss function
#TODO: Optimizer zero grad etc necessary?
#TODO: Cased vs Uncased

model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=3)
model = model.to(device)
#data loader
train_batch_size = 8
val_batch_size = 8

train_data_loader = DataLoader(train_dataset, batch_size = train_batch_size, sampler=RandomSampler(train_dataset))
val_data_loader = DataLoader(val_dataset, batch_size = val_batch_size, sampler=SequentialSampler(val_dataset))

#optimizer and scheduler
num_epochs = 1
num_steps = len(train_data_loader) * num_epochs
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
#makes learning rate increase during warum up steps and decrease linearly during training.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_steps)

#training and evaluation
seed_val = 64
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

for epoch in range(num_epochs):

  print("\n###################################################")
  print("Epoch: {}/{}".format(epoch+1, num_epochs))
  print("###################################################\n")

  #training phase
  average_train_loss = 0
  average_train_acc = 0
  for step, batch in enumerate(train_data_loader):

    model.train() 
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    # set the gradient to zero as by default pytorch is accumulating the gradients
    model.zero_grad()

    loss, logits = model(input_ids=input_ids,
                   attention_mask=attention_mask,
                   labels=labels)

    #loss is cross entropy loss by default
    average_train_loss += loss
    
    if step % 100 == 0:
      print("At Step {} Training Loss: {:.5f}".format(step, loss.item()))

    #backpropagation
    loss.backward()
    #maximum gradient clipping
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    #update parameters
    optimizer.step()
    #update learning rate
    scheduler.step()

    logits_for_acc = logits.detach().cpu().numpy()
    label_for_acc = labels.to('cpu').numpy()
    average_train_acc += sklearn.metrics.accuracy_score(label_for_acc, np.argmax(logits_for_acc, axis=-1))

    #print out sentences + sentiment predictions + labels
    #print(tokenizer.batch_decode(input_ids, skip_special_tokens=True))
    #print("Predictions: ",np.argmax(logits_for_acc, axis=1))
    #print("Labels:      ",label_for_acc)
    #print("#############")
      

  average_train_loss = average_train_loss / len(train_data_loader)
  average_train_acc = average_train_acc / len(train_data_loader)
  print("======Average Training Loss: {:.5f}=========".format(average_train_loss))
  print("======Average Training Accuracy: {:.2f}%========".format(average_train_acc*100))

  #validation phase
  average_val_loss = 0
  average_val_acc = 0
  
  for step,batch in enumerate(val_data_loader):
    model.eval()
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    with torch.no_grad():
      loss, logits = model(input_ids=input_ids,
                     attention_mask=attention_mask,
                     labels=labels)

    #loss is cross entropy loss by default
    average_val_loss += loss.item()

    logits_for_acc = logits.detach().cpu().numpy()
    label_for_acc = labels.to('cpu').numpy()
    average_val_acc += sklearn.metrics.accuracy_score(label_for_acc, np.argmax(logits_for_acc, axis=-1))

    #print out sentences + sentiment predictions + labels
    #print(tokenizer.batch_decode(input_ids, skip_special_tokens=True))
    #print("Predictions: ",np.argmax(logits_for_acc, axis=1))
    #print("Labels:      ",label_for_acc)
    #print("#############")
    

  average_val_loss = average_val_loss / len(val_data_loader)
  average_val_acc = average_val_acc / len(val_data_loader)
  print("======Average Validation Loss: {:.5f}=========".format(average_val_loss))
  print("======Average Validation Accuracy: {:.2f}%======".format(average_val_acc*100))


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b


###################################################
Epoch: 1/1
###################################################



  # Remove the CWD from sys.path while we load stuff.


At Step 0 Training Loss: 1.10420
At Step 100 Training Loss: 0.89819
At Step 200 Training Loss: 0.31455
At Step 300 Training Loss: 0.36738
At Step 400 Training Loss: 0.59081
At Step 500 Training Loss: 0.53577
At Step 600 Training Loss: 0.58356
At Step 700 Training Loss: 0.32109
At Step 800 Training Loss: 0.16012
At Step 900 Training Loss: 0.49118
At Step 1000 Training Loss: 0.36884
At Step 1100 Training Loss: 0.51396


In [11]:
#Following Code is from: https://github.com/huggingface/transformers/blob/35ff345fc9df9e777b27903f11fa213e4052595b/examples/run_glue.py#L495
#Saving model

output_dir = './model_save/'
#Output directory
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)

#Save model and tokenizer
model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)



Saving model to ./model_save/


NameError: ignored

In [None]:
#Load trained model

model = model_class.from_pretrained(output_dir)
tokenizer = tokenizer_class.from_pretrained(output_dir)
model.to(device)