In [26]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
# The loop variables (including `_`) are bound at notebook top level, so they
# stay in the kernel namespace after this cell runs.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/bert-sentiment-classification/test_data/dev.csv
/kaggle/input/bert-sentiment-classification/test_data/fixed_data_deepwd_aug_03.csv
/kaggle/input/bert-sentiment-classification/test_data/fixed_data_deepwd_aug_named_constrained.csv
/kaggle/input/bert-sentiment-classification/test_data/fixed_data_pwws_aug.csv
/kaggle/input/bert-sentiment-classification/test_data/README
/kaggle/input/bert-sentiment-classification/test_data/fixed_data_deepwd_aug.csv
/kaggle/input/bert-sentiment-classification/test_data/random_test.csv
/kaggle/input/bert-sentiment-classification/test_data/fixed_test.csv
/kaggle/input/bert-sentiment-classification/test_data/fixed_data_pwws_aug_named_constrained.csv
/kaggle/input/bert-sentiment-classification/test_data/fixed_data_deepwd_aug_02_lv40.csv
/kaggle/input/bert/transformers/default/1/BERT-Trained/bert_trained/config.json
/kaggle/input/bert/transformers/default/1/BERT-Trained/bert_trained/tokenizer_config.json
/kaggle/input/bert/transformers/default/1/BER

**Notebook for BERT predictions and evaluation**

**Package Loading**

In [8]:
!pip install tensorflow
!pip install transformers
!pip install scikit-learn



In [14]:
import numpy as np
import pandas as pd
import os
import time
import datetime
import random
import torch
import tensorflow as tf

from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from transformers import BertForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import multilabel_confusion_matrix, f1_score, precision_score, recall_score

In [15]:
def encode_sentiment(sentiment):
  """Map a sentiment string to its integer class id.

  'Negative' -> 0, 'Neutral' -> 1, and every other value -> 2.
  """
  # The position in this tuple is the class id of the label.
  for class_id, label in enumerate(('Negative', 'Neutral')):
    if sentiment == label:
      return class_id
  # Anything that is not one of the labels above falls into class 2.
  return 2

In [16]:
def process_data(data, tokenizer, max_length=None):
  """Tokenize every split in ``data`` and wrap it in a ``TensorDataset``.

  Args:
    data: Mapping of split name -> DataFrame. Each frame is read through its
      'MASKED_DOCUMENT' (text) and 'TRUE_SENTIMENT' (label) columns.
    tokenizer: Hugging Face tokenizer, called once per split on the list of
      documents.
    max_length: Length every sequence is padded/truncated to. When omitted,
      the notebook-level ``MAX_LENGTH`` constant is used.

  Returns:
    dict mapping each split name to a ``TensorDataset`` holding
    ``(input_ids, attention_mask, labels)``.
  """
  if max_length is None:
    max_length = MAX_LENGTH

  dataset = {}

  for key, frame in data.items():
    print(key)

    # Calling the tokenizer on the whole list replaces the deprecated
    # per-sentence ``encode_plus`` loop. With fixed-length padding and
    # return_tensors='pt' the result is already one stacked tensor per field,
    # so no torch.cat is needed.
    encoded = tokenizer(
        frame['MASKED_DOCUMENT'].tolist(),  # Sentences to encode.
        add_special_tokens=True,            # Add '[CLS]' and '[SEP]'.
        max_length=max_length,              # Pad & truncate all sentences.
        padding='max_length',
        truncation=True,
        return_attention_mask=True,         # Construct attn. masks.
        return_tensors='pt',                # Return pytorch tensors.
    )

    # Go through NumPy so the label tensor is built positionally and does not
    # depend on the index labels of the DataFrame (e.g. after filtering rows).
    label_ids = frame['TRUE_SENTIMENT'].apply(encode_sentiment).to_numpy()

    dataset[key] = TensorDataset(
        encoded['input_ids'],
        encoded['attention_mask'],
        torch.tensor(label_ids),
    )

  return dataset

In [17]:
def generate_dataloaders(config, dataset, shuffle_keys=('train',)):
  """Create one ``DataLoader`` per split.

  Args:
    config: Mapping of split name -> batch size.
    dataset: Mapping of split name -> torch ``Dataset``.
    shuffle_keys: Split names that are drawn in random order. Every other
      split is iterated sequentially so that evaluation predictions come back
      in dataset order and can be matched to their source rows.

  Returns:
    dict mapping each split name in ``dataset`` to its ``DataLoader``.

  Raises:
    KeyError: if a split in ``dataset`` has no batch size in ``config``.
  """
  dataloaders = {}

  for key, split in dataset.items():
    # Shuffling only makes sense for training; evaluation stays deterministic.
    sampler_cls = RandomSampler if key in shuffle_keys else SequentialSampler
    dataloaders[key] = DataLoader(
        split,
        sampler=sampler_cls(split),
        batch_size=config[key],
    )

  return dataloaders

In [18]:
# Function to calculate the accuracy of our predictions vs labels
def get_accuracy(preds, labels):
  """Return the fraction of positions where ``preds`` equals ``labels``.

  Args:
    preds: Predicted class ids (array-like).
    labels: True class ids (array-like) with the same shape as ``preds``.

  Returns:
    Number of matching positions divided by the number of labels.

  Raises:
    ValueError: if ``preds`` and ``labels`` have different shapes.
  """
  # Coerce to arrays so that plain Python lists are compared element-wise;
  # `list == list` would collapse to a single boolean.
  preds = np.asarray(preds)
  labels = np.asarray(labels)
  if preds.shape != labels.shape:
    raise ValueError(
        f"preds and labels must have the same shape, got {preds.shape} and {labels.shape}"
    )
  return np.sum(preds == labels) / labels.size

def multiclass_confusion_matrix(preds, labels):
  """Thin wrapper around scikit-learn's ``multilabel_confusion_matrix``.

  Accepts predictions first and true labels second, and forwards them in the
  (y_true, y_pred) order that scikit-learn expects.

  NOTE(review): scikit-learn documents the result as one 2x2 matrix per class
  rather than a single square class-by-class matrix - confirm that is what
  callers want.
  """
  confusion = multilabel_confusion_matrix(y_true=labels, y_pred=preds)
  return confusion

In [19]:
def make_predictions(model, dataloader, device=None):
  """Run ``model`` in evaluation mode over ``dataloader`` and collect logits.

  Args:
    model: Module called as ``model(input_ids, token_type_ids=None,
      attention_mask=..., return_dict=True)`` whose result exposes ``.logits``.
    dataloader: Iterable of ``(input_ids, attention_mask, labels)`` tensor
      batches.
    device: Device the batches are moved to. When omitted, the device of the
      model's first parameter is used (CPU if the model has no parameters), so
      the function does not depend on a notebook-level ``device`` variable.

  Returns:
    ``(predictions, true_labels)``: two lists holding one NumPy array per
    batch - the logits and the label ids respectively.
  """
  if device is None:
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

  model.eval()

  # Tracking variables
  predictions, true_labels = [], []

  # Inference only: disable gradient tracking for the whole loop to save
  # memory and speed up prediction.
  with torch.no_grad():
    for batch in dataloader:
      # Move the batch to the target device and unpack it.
      b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

      # Forward pass, calculate logit predictions.
      result = model(b_input_ids,
                     token_type_ids=None,
                     attention_mask=b_input_mask,
                     return_dict=True)

      # Move logits and labels to CPU and store them.
      predictions.append(result.logits.detach().cpu().numpy())
      true_labels.append(b_labels.to('cpu').numpy())

  return predictions, true_labels

def get_precision(cm):
  """Sum the per-class precision values of a square confusion matrix.

  Each diagonal entry is divided by its column total (``axis=0``) and the
  resulting per-class ratios are added together.

  NOTE(review): this returns the sum, not the mean, of the per-class values -
  confirm that is intended.
  """
  correct_per_class = np.diag(cm)
  column_totals = np.sum(cm, axis=0)
  per_class_precision = correct_per_class / column_totals
  return np.sum(per_class_precision)

def get_recall(cm):
  """Sum the per-class recall values of a square confusion matrix.

  Each diagonal entry is divided by its row total (``axis=1``) and the
  resulting per-class ratios are added together.

  NOTE(review): this returns the sum, not the mean, of the per-class values -
  confirm that is intended.
  """
  correct_per_class = np.diag(cm)
  row_totals = np.sum(cm, axis=1)
  per_class_recall = correct_per_class / row_totals
  return np.sum(per_class_recall)

def evaluate_predictions(predictions, true_labels):
  """Score a set of predictions against the true labels.

  Args:
    predictions: Array of scores; the predicted class is the argmax over
      axis 1.
    true_labels: NumPy array of true class ids (flattened before scoring).

  Returns:
    Tuple ``(acc, p, r, f1, macro_f1)``: accuracy, then precision, recall and
    F1 computed with ``average=None``, then the macro-averaged F1.
  """
  y_pred = np.argmax(predictions, axis=1).flatten()
  y_true = true_labels.flatten()

  acc = get_accuracy(y_pred, y_true)

  # The three un-averaged scores share one call signature.
  p, r, f1 = (
      score_fn(y_true, y_pred, average=None)
      for score_fn in (precision_score, recall_score, f1_score)
  )
  macro_f1 = f1_score(y_true, y_pred, average='macro')

  return acc, p, r, f1, macro_f1

In [20]:
# Token length that process_data pads/truncates every sentence to.
MAX_LENGTH = 512

# DataLoader batch size for each split, keyed by split name.
batch_sizes = dict(train=8, dev=4, random_test=4, fixed_test=4)

In [21]:
# TensorFlow is only used here to confirm that a GPU is visible. By default it
# reserves almost all GPU memory as soon as it initialises the device, which
# would leave little for the PyTorch model, so ask it to grow on demand first.
for _gpu in tf.config.list_physical_devices('GPU'):
    try:
        tf.config.experimental.set_memory_growth(_gpu, True)
    except RuntimeError:
        # The GPU was already initialised (e.g. this cell was re-run); the
        # setting can no longer be changed, so carry on with the check.
        pass

# Get the GPU device name.
device_name = tf.test.gpu_device_name()

# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    # Report the miss explicitly instead of leaving the cell output empty.
    print('No TensorFlow GPU device found (device_name={!r}).'.format(device_name))

Found GPU at: /device:GPU:0


In [22]:
# Pick the PyTorch device once: CUDA when available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # Report how many GPUs are visible and which one will be used.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


**Load Data**

In [24]:
data_path = "/kaggle/input/bert-sentiment-classification/test_data"

# One DataFrame per evaluation split, read from <data_path>/<split>.csv.
data = {
    item: pd.read_csv(os.path.join(data_path, item + '.csv'))
    for item in ('random_test', 'fixed_test')
}

**BERT MODEL**

In [29]:
# If using a custom-trained model with correct files:
# the tokenizer and the classifier are both loaded from the same directory.
bert_dir = "/kaggle/input/bert/transformers/default/1/BERT-Trained/bert_trained/"
tokenizer = BertTokenizer.from_pretrained(bert_dir)
model = BertForSequenceClassification.from_pretrained(bert_dir)

# If files are missing, use a standard BERT model:
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

# Move the model onto the GPU whenever the selected device is not the CPU.
model = model.cuda() if device.type != 'cpu' else model

In [30]:
# Tokenize the loaded splits into TensorDatasets; process_data prints each
# split name as it starts on it.
dataset = process_data(data, tokenizer)

random_test
fixed_test


In [31]:
# Wrap each split's dataset in a DataLoader, using the per-split batch sizes
# from batch_sizes.
dataloaders = generate_dataloaders(batch_sizes, dataset)

In [32]:
# Evaluate every split. Metrics are stored per split and printed inside the
# loop, so that earlier splits are not lost when the loop variables are
# overwritten by the next iteration.
results = {}

for key, dataloader in dataloaders.items():
  predictions, true_labels = make_predictions(model, dataloader)
  # make_predictions returns one array per batch; stitch them back together.
  predictions = np.concatenate(predictions)
  true_labels = np.concatenate(true_labels)
  acc, p, r, f1, macro_f1 = evaluate_predictions(predictions, true_labels)

  results[key] = {
      'accuracy': acc,
      'precision': p,
      'recall': r,
      'f1': f1,
      'macro_f1': macro_f1,
  }

  print(key)
  print(f"Accuracy: {acc}")
  print(f"Precision: {p}")
  print(f"Recall: {r}")
  print(f"F1_Score: {f1}")
  print(f"Macro_f1 Score: {macro_f1}")
  print()

In [35]:
# Report the metrics currently bound to key / acc / f1 / macro_f1, i.e. those
# of the last split processed by the evaluation loop.
print(
    key,
    f"Accuracy: {acc}",
    f"F1_Score: {f1}",
    f"Macro_f1 Score: {macro_f1}",
    sep="\n",
)

fixed_test
Accuracy: 0.4727932285368803
F1_Score: [0.13714286 0.44072948 0.57003654]
Macro_f1 Score: 0.3826362937431432
