In [None]:
import random
random.seed(42)

from sklearn.metrics import cohen_kappa_score

import pandas as pd 
# Load the two annotators' label sheets; both are read for their `label` column.
df1 = pd.read_excel('/content/annotator1.xlsx')
df2 = pd.read_excel('/content/annotator2.xlsx')

# Contingency table: rows are annotator 1's labels, columns are annotator 2's.
cont_table = pd.crosstab(df1['label'], df2['label'])

# crosstab only emits a row/column for labels that actually occur, so the table
# is not square when one annotator never used a label the other one did.
# Reindex both axes over the union of labels so every diagonal cell exists.
all_labels = cont_table.index.union(cont_table.columns)
cont_table = cont_table.reindex(index=all_labels, columns=all_labels, fill_value=0)

# Calculate the percentage agreement.
total_instances = cont_table.values.sum()  # total number of doubly-annotated entries
agreement_instances = cont_table.values.diagonal().sum()  # diagonal cells = both annotators chose the same label
raw_agreement = (agreement_instances / total_instances) * 100  # share of agreed entries, in percent

print(f"Raw Percent Agreement: {raw_agreement:.2f}%")

# Chance-corrected agreement, using the sklearn implementation of Cohen's kappa.
print(f"Cohens Kappa Score: {cohen_kappa_score(df1['label'].values, df2['label'].values)}")

Raw Percent Agreement: 73.33%
Cohens Kappa Score: 0.19463087248322153


Now we'll move on to fine-tuning pretrained language models specifically on your dataset. Since we're dealing with large models, the first step is to change to a GPU runtime.

## Adding a hardware accelerator

Please go to the menu and add a GPU as follows:

`Edit > Notebook Settings > Hardware accelerator > (GPU)`

Run the following cell to confirm that the GPU is detected.

In [None]:
import torch

# Confirm that the GPU is detected; fail with an actionable message otherwise.
assert torch.cuda.is_available(), (
    "No CUDA GPU detected. In Colab choose "
    "Edit > Notebook Settings > Hardware accelerator > GPU, then re-run this cell."
)

# Report which GPU was allocated and how many are visible.
device_name = torch.cuda.get_device_name()
n_gpu = torch.cuda.device_count()
print(f"Found device: {device_name}, n_gpu: {n_gpu}")

# Device handle used by later cells to move tensors onto the GPU.
device = torch.device("cuda")

Found device: Tesla T4, n_gpu: 1


## Installing Hugging Face's Transformers library
We will use Hugging Face's Transformers (https://github.com/huggingface/transformers), an open-source library that provides general-purpose architectures for natural language understanding and generation with a collection of various pretrained models made by the NLP community. This library will allow us to easily use pretrained models like `BERT` and perform experiments on top of them. We can use these models to solve downstream target tasks, such as text classification, question answering, and sequence labeling.

Note that you will be asked to link with your Google Drive account to download some of these files. If you're concerned about security risks (there have not been any issues in previous semesters), feel free to make a new Google account and use it!

In [None]:
# Install Hugging Face Transformers and PyDrive into the running Colab kernel.
# NOTE(review): neither install is version-pinned; the recorded output of this
# cell shows transformers 4.27.4 being resolved. A later cell imports `AdamW`
# from transformers -- confirm the installed release still exports it.
!pip install transformers
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate the Colab user, then build a PyDrive client that uses the
# application-default Google credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
print('success!')

# NOTE(review): `os` and `zipfile` are not referenced anywhere in this cell.
import os
import zipfile

# Fetch the helper module from Google Drive by file id and save it as
# helpers.py in the working directory (later cells import from `helpers`).
helper_file = drive.CreateFile({'id': '16HW-z9Y1tM3gZ_vFpJAuwUDohz91Aac-'})
helper_file.GetContentFile('helpers.py')
print('helper file downloaded! (helpers.py)')

# Fetch the sample tweets file from Google Drive by file id and save it as
# tweets.csv in the working directory.
data_file = drive.CreateFile({'id': '1QcoAmjOYRtsMX7njjQTYooIbJHPc6Ese'})
data_file.GetContentFile('tweets.csv')
print('sample tweets downloaded! (tweets.csv)')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m90.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m105.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.4
success!
helper file downloaded! (helpers.py)
sa

In [None]:
from helpers import tokenize_and_format, flat_accuracy

In [None]:
from helpers import tokenize_and_format, flat_accuracy
import pandas as pd

# Load the annotated dataset; the `text` and `label` columns are used below.
df = pd.read_excel('final_data.xlsx')

# Shuffle the rows once. The seed is fixed because the train/val/test split in
# the next cell is taken from contiguous slices of this order: without
# `random_state` every run would produce different splits (seeding Python's
# `random` module does not affect pandas).
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

texts = df.text.values
labels = df.label.values

### tokenize_and_format() is a helper function provided in helpers.py ###
input_ids, attention_masks = tokenize_and_format(texts)

# Convert the per-example lists into single tensors (one row per example).
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Print sentence 0, now as a list of IDs.
print('Original: ', texts[0])
print('Token IDs:', input_ids[0])

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Original:  if 2020 was a nose it’d be mine 🐦
Token IDs: tensor([  101,  2065, 12609,  2001,  1037,  4451,  2009,  1521,  1040,  2022,
         3067,   100,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0])


## Create train/test/validation splits

Here we split your dataset into 3 parts: a training set, a validation set, and a testing set. Each item in your dataset will be a 3-tuple containing an input_id tensor, an attention_mask tensor, and a label tensor.



In [None]:

# Split sizes: 80% train, 10% validation, and whatever remains for test.
total = len(df)
num_train = int(total * .8)
num_val = int(total * .1)
num_test = total - num_train - num_val

# The dataframe was already shuffled in the cell above, so contiguous slices
# serve as the splits. Pair each example's tensors into a 3-tuple once, then
# slice the resulting list.
examples = list(zip(input_ids, attention_masks, labels))
train_set = examples[:num_train]
val_set = examples[num_train:num_train + num_val]
test_set = examples[num_train + num_val:total]

# Raw texts for each split, in the same order as the tuples above.
train_text = list(texts[:num_train])
val_text = list(texts[num_train:num_train + num_val])
test_text = list(texts[num_train + num_val:total])


In [None]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Options for the classification model built on top of the pretrained encoder.
classifier_options = dict(
    num_labels=2,                # the number of output labels
    output_attentions=False,     # do not return attention weights
    output_hidden_states=False,  # do not return all hidden states
)

# "bert-base-uncased" is the 12-layer BERT model with an uncased vocab.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", **classifier_options)

# Tell pytorch to run this model on the GPU (as the last expression of the
# cell, this also echoes the module structure).
model.cuda()


Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
# Hyperparameter search space. The grid-search cell tries every combination
# of these values (3 * 5 * 4 * 4 = 240 configurations).
batch_size = [8, 16, 32]
learning_rate = [
    2e-5,
    5e-5,
    2e-4,
    5e-4,
    5e-3,
]
weight_decay = [
    2e-3,
    5e-3,
    1e-2,
    0.1,
]
epochs = [5, 10, 15, 20]

In [None]:
import numpy as np
# function to get validation accuracy
def get_validation_performance(val_set, b_size):
    """Compute the accuracy of the notebook-level ``model`` on a labelled set.

    Args:
        val_set: Sequence of ``(input_ids, attention_mask, label)`` tensor
            3-tuples, as built by the split cell.
        b_size: Number of examples evaluated per forward pass; must be > 0.

    Returns:
        Fraction of examples in ``val_set`` whose highest-scoring class equals
        the label, as a float.

    Raises:
        ValueError: If ``val_set`` is empty or ``b_size`` is not positive.

    Note:
        Reads the globals ``model`` and ``device`` and leaves ``model`` in
        evaluation mode.
    """
    if len(val_set) == 0:
        # Previously this surfaced as an opaque ZeroDivisionError at the end.
        raise ValueError("val_set is empty; cannot compute accuracy")
    if b_size <= 0:
        raise ValueError(f"b_size must be a positive integer, got {b_size}")

    # Put the model in evaluation mode (disables dropout).
    model.eval()

    total_correct = 0

    # Walk the set in strides of b_size; the last batch may be shorter.
    for start in range(0, len(val_set), b_size):
        batch = val_set[start:start + b_size]

        # Collate the batch and move it to the GPU.
        b_input_ids = torch.stack([data[0] for data in batch]).to(device)
        b_input_mask = torch.stack([data[1] for data in batch]).to(device)
        b_labels = torch.stack([data[2] for data in batch]).to(device)

        # No compute graph is needed for evaluation. Labels are not passed to
        # the model because the loss is not used here; only logits are needed.
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        # Count the correctly labelled examples in this batch. Labels are
        # flattened so a trailing singleton dimension cannot trigger
        # broadcasting in the comparison.
        predictions = outputs.logits.argmax(dim=1)
        total_correct += (predictions == b_labels.flatten()).sum().item()

    # Report the final accuracy for this validation run.
    return total_correct / len(val_set)



In [None]:
import random

# Grid search: fine-tune a fresh BERT classifier for every combination of
# (epochs, learning rate, batch size, weight decay) and record the training
# loss and validation accuracy after each epoch.
from itertools import product

# One row per trained epoch:
# [epoc, learning_r, b_size, weight_d, total_train_loss, val_acc].
# Note that `epoc` is the configured epoch budget of the run, so a run with
# epoc=N contributes N rows that all carry the value N.
performance_track = []

# product() iterates with the right-most list varying fastest, i.e. the same
# order as four nested loops over epochs > learning_rate > batch_size > weight_decay.
for epoc, learning_r, b_size, weight_d in product(epochs, learning_rate, batch_size, weight_decay):

    # Seed torch before building each model so that every configuration starts
    # from the same classifier-head initialisation and dropout stream;
    # otherwise differences between configurations are confounded with
    # run-to-run noise. (Seeding Python's `random` module does not seed torch.)
    torch.manual_seed(42)

    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
        num_labels = 2, # The number of output labels.
        output_attentions = False, # Whether the model returns attentions weights.
        output_hidden_states = False, # Whether the model returns all hidden-states.
    )
    model.cuda()
    optimizer = AdamW(model.parameters(),
                      lr = learning_r,
                      weight_decay = weight_d)

    # Training loop: one full pass over the training set per epoch.
    for epoch_i in range(epoc):
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epoc))
        print('Training...')

        # Sum of the per-batch losses for this epoch.
        total_train_loss = 0

        # Put the model into training mode (the validation helper switches it
        # to eval mode at the end of every epoch).
        model.train()

        # Walk the training set in strides of b_size; the last batch may be shorter.
        for start in range(0, len(train_set), b_size):
            batch = train_set[start:start + b_size]

            # Collate the batch and move it to the GPU.
            b_input_ids = torch.stack([data[0] for data in batch]).to(device)
            b_input_mask = torch.stack([data[1] for data in batch]).to(device)
            b_labels = torch.stack([data[2] for data in batch]).to(device)

            # Clear the previously calculated gradient.
            model.zero_grad()

            # Forward pass; passing labels makes the model return the loss.
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)
            loss = outputs.loss
            total_train_loss += loss.item()

            # Backward pass, then update the parameters.
            loss.backward()
            optimizer.step()

        # ========================================
        #               Validation
        # ========================================
        # After each training epoch, measure performance on the validation set.
        print("Parameters:")
        print(f"Epochs: {epoc}, Learning rate: {learning_r}, Batch size: {b_size}, Weight Decay: {weight_d}")
        print(f"Total loss: {total_train_loss}")
        val_acc = get_validation_performance(val_set, b_size)
        performance_track.append([epoc, learning_r, b_size, weight_d, total_train_loss, val_acc])
        print(f"Validation accuracy: {val_acc}")

print("")
print("Training complete!")


In [None]:
# Train the model with the chosen hyperparameter values.
#
# The scalar settings use `final_*` names on purpose: the grid-search cell
# iterates over the lists `epochs`, `learning_rate` and `weight_decay`, and
# rebinding those names to scalars here would make that cell fail with a
# TypeError if it were run again afterwards.
final_epochs = 10
b_size = 32  # name kept as-is: later cells pass `b_size` to the evaluation helpers
final_weight_decay = 0.1
final_learning_rate = 5e-5

# Seed torch so the classifier-head initialisation and dropout are repeatable
# (seeding Python's `random` module does not seed torch).
torch.manual_seed(42)

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)
model.cuda()
optimizer = AdamW(model.parameters(),
                  lr = final_learning_rate,
                  weight_decay = final_weight_decay)

# Training loop: one full pass over the training set per epoch.
for epoch_i in range(final_epochs):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, final_epochs))
    print('Training...')

    # Sum of the per-batch losses for this epoch.
    total_train_loss = 0

    # Put the model into training mode (the validation helper switches it to
    # eval mode at the end of every epoch).
    model.train()

    # Walk the training set in strides of b_size; the last batch may be shorter.
    for start in range(0, len(train_set), b_size):
        batch = train_set[start:start + b_size]

        # Collate the batch and move it to the GPU.
        b_input_ids = torch.stack([data[0] for data in batch]).to(device)
        b_input_mask = torch.stack([data[1] for data in batch]).to(device)
        b_labels = torch.stack([data[2] for data in batch]).to(device)

        # Clear the previously calculated gradient.
        model.zero_grad()

        # Forward pass; passing labels makes the model return the loss.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = outputs.loss
        total_train_loss += loss.item()

        # Backward pass, then update the parameters.
        loss.backward()
        optimizer.step()

    # ========================================
    #               Validation
    # ========================================
    # After each training epoch, measure performance on the validation set.
    print("Parameters:")
    print(f"Total loss: {total_train_loss}")
    val_acc = get_validation_performance(val_set, b_size)
    print(f"Validation accuracy: {val_acc}")

print("")
print("Training complete!")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


Training...
Parameters:
Total loss: 2.269478678703308
Validation accuracy: 0.8333333333333334

Training...
Parameters:
Total loss: 1.7473209500312805
Validation accuracy: 0.8333333333333334

Training...
Parameters:
Total loss: 1.5795238614082336
Validation accuracy: 0.8333333333333334

Training...
Parameters:
Total loss: 1.4203079044818878
Validation accuracy: 0.8333333333333334

Training...
Parameters:
Total loss: 1.2445179224014282
Validation accuracy: 0.8333333333333334

Training...
Parameters:
Total loss: 0.9792076945304871
Validation accuracy: 0.8333333333333334

Training...
Parameters:
Total loss: 0.7272263169288635
Validation accuracy: 0.8333333333333334

Training...
Parameters:
Total loss: 0.4920831471681595
Validation accuracy: 0.8333333333333334

Training...
Parameters:
Total loss: 0.3618459478020668
Validation accuracy: 0.9166666666666666

Training...
Parameters:
Total loss: 0.26559726893901825
Validation accuracy: 0.8333333333333334

Training complete!


# Evaluation on the test set


In [None]:
# Accuracy of the current global `model` on the held-out test split, reusing
# the validation helper; as the cell's last expression the value is displayed.
get_validation_performance(test_set, b_size)

0.9166666666666666

In [None]:
##ERROR ANALYSIS CODE
def get_sentence_idx(val_set, b_size):
    """Flag which examples of ``val_set`` the notebook-level ``model`` gets wrong.

    Args:
        val_set: Sequence of ``(input_ids, attention_mask, label)`` tensor
            3-tuples, as built by the split cell.
        b_size: Number of examples evaluated per forward pass.

    Returns:
        1-D boolean NumPy array with one entry per example of ``val_set``, in
        order; ``True`` means the predicted class differs from the label.
        An empty array is returned for an empty ``val_set``.

    Note:
        Reads the globals ``model`` and ``device`` and leaves ``model`` in
        evaluation mode.
    """
    model.eval()

    # One boolean mask per batch. They are concatenated at the end so the
    # result covers the whole set, not just the final batch.
    wrong_per_batch = []

    # Walk the set in strides of b_size; the last batch may be shorter.
    for start in range(0, len(val_set), b_size):
        batch = val_set[start:start + b_size]

        # Collate the batch and move it to the GPU.
        b_input_ids = torch.stack([data[0] for data in batch]).to(device)
        b_input_mask = torch.stack([data[1] for data in batch]).to(device)
        b_labels = torch.stack([data[2] for data in batch]).to(device)

        # Labels are not passed to the model: only the logits are needed.
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        logits = outputs.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        pred_flat = np.argmax(logits, axis=1).flatten()
        labels_flat = label_ids.flatten()

        # True where the model's prediction disagrees with the label.
        wrong_per_batch.append(pred_flat != labels_flat)

    if not wrong_per_batch:
        return np.zeros(0, dtype=bool)
    return np.concatenate(wrong_per_batch)

# Collect the test examples the model misclassifies, add a few hand-written
# adversarial sentences, and re-score the model on that combined set.
incorrect_text = []
ea_labels = []

# One 0/1 flag per test example; 1 means the prediction was wrong.
idx = [int(i) for i in get_sentence_idx(test_set, b_size)]
if len(idx) != len(test_set):
    # A shorter mask would be silently mis-paired with the test examples.
    raise RuntimeError(
        f"get_sentence_idx returned {len(idx)} flags for {len(test_set)} test examples"
    )

# `test_text` was built in the split cell in the same order as `test_set`, so
# the flag position indexes both directly. This avoids scanning the whole
# dataset for equal token ids, which is quadratic and appends a text more than
# once when two examples tokenize identically.
for position, is_wrong in enumerate(idx):
    if is_wrong:
        incorrect_text.append(test_text[position])
        ea_labels.append(int(test_set[position][2]))

# Hand-written adversarial examples, all given label 0.
adversarial_text = [
    "Wow, thanks for the feedback. I had no idea that my presentation skills were so terrible.#notreally",
    "Oh great, another meeting. I can't wait to sit through hours of mind-numbing presentations.",
    "I love sitting in traffic for hours. It gives me so much time to contemplate life's mysteries.",
    "Oh, I just love filling out paperwork. It's my favorite thing to do on a Friday afternoon.",
]
incorrect_text.extend(adversarial_text)
ea_labels.extend([0] * len(adversarial_text))

# Tokenize the collected sentences the same way as the main dataset.
ea_input_ids, ea_attention_masks = tokenize_and_format(incorrect_text)
ea_input_ids = torch.cat(ea_input_ids, dim=0)
ea_attention_masks = torch.cat(ea_attention_masks, dim=0)
ea_labels = torch.tensor(ea_labels)

# Score the model on the whole analysis set in a single batch.
analysis_set = [(ea_input_ids[i], ea_attention_masks[i], ea_labels[i]) for i in range(len(ea_input_ids))]
print(f"Error Analysis performace: {get_validation_performance(analysis_set,len(analysis_set))}")

## print out the collected test set examples and adversarial examples with their labels
for i in range(len(incorrect_text)):
  print(f"Incorrect text: {incorrect_text[i]}, Actual label: {ea_labels[i]}")

Error Analysis performace: 0.0
Incorrect text: Trying to know all this history tonight is gonna kill me, Actual label: 0
Incorrect text: Wow, thanks for the feedback. I had no idea that my presentation skills were so terrible.#notreally, Actual label: 0
Incorrect text: Oh great, another meeting. I can't wait to sit through hours of mind-numbing presentations., Actual label: 0
Incorrect text: I love sitting in traffic for hours. It gives me so much time to contemplate life's mysteries., Actual label: 0
Incorrect text: Oh, I just love filling out paperwork. It's my favorite thing to do on a Friday afternoon., Actual label: 0
