In [1]:
# Check GPU
!nvidia-smi

Fri Jul 22 11:58:11 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# install dependencies not already in the Google Colab runtime

!pip install datasets > /dev/null
!pip install transformers > /dev/null

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.[0m


# Initial Data Cleaning

In [3]:
# Cleaning the MNLI dataset to make it easier to use sklearn's train_test_split
# The original MNLI dataset has separate columns for premise and hypothesis
# This code will concatenate them into a text column, i.e. premise [SEP] hypothesis

from datasets import load_dataset

MNLI_GLUE = load_dataset("glue", "mnli")

Downloading builder script:   0%|          | 0.00/7.78k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

Downloading and preparing dataset glue/mnli (download: 298.29 MiB, generated: 78.65 MiB, post-processed: Unknown size, total: 376.95 MiB) to /root/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data:   0%|          | 0.00/313M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating validation_matched split:   0%|          | 0/9815 [00:00<?, ? examples/s]

Generating validation_mismatched split:   0%|          | 0/9832 [00:00<?, ? examples/s]

Generating test_matched split:   0%|          | 0/9796 [00:00<?, ? examples/s]

Generating test_mismatched split:   0%|          | 0/9847 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


  0%|          | 0/5 [00:00<?, ?it/s]

In [4]:
MNLI_GLUE # inspect dataset

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 392702
    })
    validation_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9815
    })
    validation_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9832
    })
    test_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9847
    })
})

In [6]:
import pandas as pd

# only interested in the training set for now (will still use train_test_splitting in experiments from this dataset)

train_dict = MNLI_GLUE["train"]

In [7]:
# converting each dictionary to dataframe

train_df = pd.DataFrame.from_dict(train_dict)

In [8]:
# dropping the neutrals (1) from the df
# only interested in positive and negative pairs to study binary classification

df_train = train_df.loc[train_df['label'] != 1]

In [9]:
# convert the 2(contradiction) into 1 (previously was neutral)
# now it is only entailment (0) and contradiction (1)

df_train['label'] = df_train['label'].replace([2], 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [10]:
df_train.head(5)

Unnamed: 0,premise,hypothesis,label,idx
1,you know during the season and i guess at at y...,You lose the things to the following level if ...,0,1
2,One of our number will carry out your instruct...,A member of my team will execute your orders w...,0,2
3,How do you know? All this is their information...,This information belongs to them.,0,3
5,my walkman broke so i'm upset now i just have ...,I'm upset that my walkman broke and now I have...,0,5
7,(Read for Slate 's take on Jackson's findings.),Slate had an opinion on Jackson's findings.,0,7


In [11]:
df_train = df_train.reset_index() # fix the index from removing neutrals

In [12]:
df_train.head(5)

Unnamed: 0,index,premise,hypothesis,label,idx
0,1,you know during the season and i guess at at y...,You lose the things to the following level if ...,0,1
1,2,One of our number will carry out your instruct...,A member of my team will execute your orders w...,0,2
2,3,How do you know? All this is their information...,This information belongs to them.,0,3
3,5,my walkman broke so i'm upset now i just have ...,I'm upset that my walkman broke and now I have...,0,5
4,7,(Read for Slate 's take on Jackson's findings.),Slate had an opinion on Jackson's findings.,0,7


In [13]:
df_train.iloc[0]["premise"] # inspect data

'you know during the season and i guess at at your level uh you lose them to the next level if if they decide to recall the the parent team the Braves decide to call to recall a guy from triple A then a double A guy goes up to replace him and a single A guy goes up to replace him'

In [14]:
# initailize text column with empty strings

df_train["text"] = ""

# loop through MNLI data concatenating premise and hypothesis

for i in range(len(df_train)):
  if i % 10_000 == 9_999:
    print(i)
  # concatenate premise and hypothesis into a new text column
  df_train["text"][i] = df_train.iloc[i]["premise"] + " [SEP] " + df_train.iloc[i]["hypothesis"]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


9999
19999
29999
39999
49999
59999
69999
79999
89999
99999
109999
119999
129999
139999
149999
159999
169999
179999
189999
199999
209999
219999
229999
239999
249999
259999


In [15]:
df_train['text'][0] # inspect new column

'you know during the season and i guess at at your level uh you lose them to the next level if if they decide to recall the the parent team the Braves decide to call to recall a guy from triple A then a double A guy goes up to replace him and a single A guy goes up to replace him [SEP] You lose the things to the following level if the people recall.'

In [16]:
df_train.head(5) # inspect data

Unnamed: 0,index,premise,hypothesis,label,idx,text
0,1,you know during the season and i guess at at y...,You lose the things to the following level if ...,0,1,you know during the season and i guess at at y...
1,2,One of our number will carry out your instruct...,A member of my team will execute your orders w...,0,2,One of our number will carry out your instruct...
2,3,How do you know? All this is their information...,This information belongs to them.,0,3,How do you know? All this is their information...
3,5,my walkman broke so i'm upset now i just have ...,I'm upset that my walkman broke and now I have...,0,5,my walkman broke so i'm upset now i just have ...
4,7,(Read for Slate 's take on Jackson's findings.),Slate had an opinion on Jackson's findings.,0,7,(Read for Slate 's take on Jackson's findings...


In [17]:
df_train = df_train.drop(['index', 'hypothesis', 'premise'], axis = 1) # clean data

In [18]:
df_train = df_train.drop(['idx'], axis = 1) # clean data

In [19]:
df_train.head(5)

Unnamed: 0,label,text
0,0,you know during the season and i guess at at y...
1,0,One of our number will carry out your instruct...
2,0,How do you know? All this is their information...
3,0,my walkman broke so i'm upset now i just have ...
4,0,(Read for Slate 's take on Jackson's findings...


In [20]:
# save cleaned DataFrame to local file
# I have stored this cleaned dataset on HuggingFace at: erikacardenas300/MNLI-Processed

df_train.to_csv("MNLI-Processed.csv")

# Load the cleaned dataset

In [21]:
from datasets import load_dataset

In [22]:
# load cleaned data from above
raw_training_data = load_dataset("erikacardenas300/MNLI-Processed")["train"]

raw_training_data

Using custom data configuration erikacardenas300--MNLI-Processed-6bc530ee41a997d2


Downloading and preparing dataset csv/erikacardenas300--MNLI-Processed to /root/.cache/huggingface/datasets/erikacardenas300___csv/erikacardenas300--MNLI-Processed-6bc530ee41a997d2/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/48.6M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/erikacardenas300___csv/erikacardenas300--MNLI-Processed-6bc530ee41a997d2/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['Unnamed: 0', 'label', 'text'],
    num_rows: 261802
})

In [23]:
from sklearn.model_selection import train_test_split

# Divide data into training and test splits to evaluate model performance
train_x, test_x, train_y, test_y = train_test_split(raw_training_data["text"], raw_training_data["label"], test_size=0.2)

In [24]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from transformers import AutoTokenizer
import random

# Custom PyTorch Dataset through Inheritance
class MNLI_Labeled_Negatives(Dataset):

  def __init__(self, text_pairs, labels):
    # call PyTorch Dataset init()
    super().__init__()

    # store tokenizer to Dataset class, used in sampling rather than tokenizing the data offline
    self.tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    self.text_pairs = text_pairs
    self.labels = labels # 0 entailment 1 contradiction

    # An Example of an Instance in the Dataset
    # The NLP task is: NLI (Natural Language Inference)
    # ==========================================================================
    #
    # How do you know? All this is their information again. [SEP] This information belongs to them.
    # LABEL ^ =====> 0 (entailment)
    #
    # ==========================================================================

  def __getitem__(self, index):
    # get a text pair with the index
    new_premise_hypothesis_pair = self.text_pairs[index]
    # tokenize it
    new_tokenized_pair = self.tokenizer.encode(new_premise_hypothesis_pair, truncation=True, max_length=512)
    # empty attention mask (this is used to prevent padded 0s from getting a gradient in the loss)
    attn_mask = torch.zeros(512,)
    # tell the attention mask where there is text to "pay attention" to
    attn_mask[:len(new_tokenized_pair)] = 1.
    # pad to make sure each sequence contains 512 tokens
    while (len(new_tokenized_pair) < 512):
      new_tokenized_pair.append(0)
    # return minibatch
    return np.array(new_tokenized_pair), attn_mask.numpy(), np.array([self.labels[index]])
  
  # length of the dataset
  def __len__(self):
    return len(self.text_pairs)


In [30]:
# construct training and test sets with the custom PyTorch Dataset
training_set = MNLI_Labeled_Negatives(train_x, train_y)
testing_set = MNLI_Labeled_Negatives(test_x, test_y)

# Pass the custom PyTorch Dataset into a PyTorch DataLoader
train_loader = torch.utils.data.DataLoader(training_set, batch_size=32, shuffle=True, num_workers=1) # batch_size hyperparameter might explain poor performance
test_loader = torch.utils.data.DataLoader(testing_set, batch_size=32, shuffle=True, num_workers=1)

In [31]:
# test to make sure the dataset is setup properly
dataiter = iter(train_loader)
dataiter.next()

[tensor([[  101,  2027,  2020,  ...,     0,     0,     0],
         [  101,  2210,  2011,  ...,     0,     0,     0],
         [  101,  2054,  2057,  ...,     0,     0,     0],
         ...,
         [  101,  2070,  5097,  ...,     0,     0,     0],
         [  101,  2087, 16511,  ...,     0,     0,     0],
         [  101,  1037, 17524,  ...,     0,     0,     0]]),
 tensor([[1., 1., 1.,  ..., 0., 0., 0.],
         [1., 1., 1.,  ..., 0., 0., 0.],
         [1., 1., 1.,  ..., 0., 0., 0.],
         ...,
         [1., 1., 1.,  ..., 0., 0., 0.],
         [1., 1., 1.,  ..., 0., 0., 0.],
         [1., 1., 1.,  ..., 0., 0., 0.]]),
 tensor([[0],
         [0],
         [0],
         [0],
         [1],
         [0],
         [1],
         [0],
         [1],
         [0],
         [1],
         [1],
         [1],
         [1],
         [0],
         [0],
         [1],
         [0],
         [1],
         [0],
         [0],
         [1],
         [0],
         [0],
         [1],
         [1],
    

# Define the Model and Load Pre-Trained Weights

In [33]:
from transformers import AutoModelForSequenceClassification

# Get the HuggingFace pre-trained DistilBERT model, set labels to 2 to use cross-entropy + softmax for binary classification
Supervised_NLI_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.w

In [34]:
# count the number of parameters in the DistilBERT model
sum(param.numel() for param in Supervised_NLI_model.parameters())

66955010

# Train the Model

In [40]:
import torch.optim as optim
import torch.nn as nn

# move the model to the GPU
device = "cuda:0"
Supervised_NLI_model.to(device)

# define the Cross Entropy loss function
criterion = nn.CrossEntropyLoss()

# Learning rate is a great hyper-parameter to begin debugging potential errors
optimizer = optim.SGD(Supervised_NLI_model.parameters(), lr = 1e-2, momentum = 0.9)

import time # needed in PyTorch to record how long training takes
start = time.time() # begin training

# The NLI dataset is very large (200,000 instances), so we do not take several epochs through it
for epoch in range(1):
  ministep = 0
  for i, data in enumerate(train_loader, 0):
    # get the batch of data and store it in variables for the optimization logic
    inputs, attn_masks, labels = data[0].to(device), data[1].to(device), data[2].to(device)
    outputs = Supervised_NLI_model(inputs, attn_masks).logits
    # for debugging might want
    # ("\n")
    # for i, output in enumerate(outputs)
    # ("\n")
    loss = criterion(outputs, labels.squeeze(1)) # labels.squeeze(1) is used to compress the labels from (16,1) to (16,)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    
    ministep += 1
    if ministep == 500:
      break

  print("\n")
  print("Epoch: " + str(epoch) + " Loss: " + str(loss.item()))
  print("\n")
print("Finished Training!")
print("Ran in: " + str(int(time.time() - start)) + " seconds.")



Epoch: 0 Loss: 0.48043084144592285


Finished Training!
Ran in: 680 seconds.


# Evaluate the Model

In [41]:
print("TRAINING SET EVALUATION")

correct = 0
total = 0
import time

# dictionary to store predictions
pred_counter = {
    0: 0, 1: 0
}

# dictionary to store the errors
confusion_matrix_tracker = {
    "True Positive": 0,
    "False Positive": 0,
    "True Negative": 0,
    "False Negative": 0
}

# time how long this takes
start = time.time()
ministep = 0 # ministep used to monitor evaluation (takes a while for 200,000 instances)
with torch.no_grad():
  for data in train_loader:
    test_xs, test_masks, test_ys = data[0].to(device), data[1].to(device), data[2].to(device) # get data
    outputs = Supervised_NLI_model(test_xs, test_masks).logits # run inference
    confidence, y_preds = torch.max(outputs, 1) # get the prediction from the logits
    for i, pred in enumerate(y_preds):
      prediction = pred.item() # convert to numpy from tensor
      # update the monitoring dictionaries
      pred_counter[prediction] += 1
      if (prediction == 1 and test_ys[i] == 1):
        confusion_matrix_tracker["True Positive"] += 1
        correct += 1
      elif (prediction == 1 and test_ys[i] == 0):
        confusion_matrix_tracker["False Positive"] += 1
      elif (prediction == 0 and test_ys[i] == 0):
        confusion_matrix_tracker["True Negative"] += 1
        correct += 1
      elif (prediction == 0 and test_ys[i] == 1):
        confusion_matrix_tracker["False Negative"] += 1

    # check on evaluation progress
    ministep += 1
    if ministep % 200 == 199:
      print("\n")
      print(ministep)
      print(confusion_matrix_tracker)
      print("\n")
      break # not interested in full evaluation for now
    # add to total
    total += len(test_ys)

print("Training Accuracy = : " + str(correct / total * 100) + "%.")
print("\n")
print(confusion_matrix_tracker)
print("\n")

# Add Total Negatives and Total Positives

print("Calculated in: " + str(time.time() - start) + " seconds.")

TRAINING SET EVALUATION


199
{'True Positive': 2095, 'False Positive': 362, 'True Negative': 2791, 'False Negative': 1120}


Training Accuracy = : 77.11489898989899%.


{'True Positive': 2095, 'False Positive': 362, 'True Negative': 2791, 'False Negative': 1120}


Calculated in: 108.61632490158081 seconds.


In [42]:
print("TESTING SET EVALUATION")

correct = 0
total = 0
import time

# dictionary to store predictions
pred_counter = {
    0: 0, 1: 0
}

# dictionary to store the errors
confusion_matrix_tracker = {
    "True Positive": 0,
    "False Positive": 0,
    "True Negative": 0,
    "False Negative": 0
}

# time how long this takes
start = time.time()
ministep = 0 # ministep used to monitor evaluation (takes a while for 200,000 instances)
with torch.no_grad():
  for data in test_loader:
    test_xs, test_masks, test_ys = data[0].to(device), data[1].to(device), data[2].to(device) # get data
    outputs = Supervised_NLI_model(test_xs, test_masks).logits # run inference
    confidence, y_preds = torch.max(outputs, 1) # get the prediction from the logits
    for i, pred in enumerate(y_preds):
      prediction = pred.item() # convert to numpy from tensor
      # update the monitoring dictionaries
      pred_counter[prediction] += 1
      if (prediction == 1 and test_ys[i] == 1):
        confusion_matrix_tracker["True Positive"] += 1
        correct += 1
      elif (prediction == 1 and test_ys[i] == 0):
        confusion_matrix_tracker["False Positive"] += 1
      elif (prediction == 0 and test_ys[i] == 0):
        confusion_matrix_tracker["True Negative"] += 1
        correct += 1
      elif (prediction == 0 and test_ys[i] == 1):
        confusion_matrix_tracker["False Negative"] += 1

    # check on evaluation progress
    ministep += 1
    if ministep % 200 == 199:
      print("\n")
      print(ministep)
      print(confusion_matrix_tracker)
      print("\n")
      break # not interested in full evaluation for now
    # add to total
    total += len(test_ys)

print("Testing Accuracy = : " + str(correct / total * 100) + "%.")
print("\n")
print(confusion_matrix_tracker)
print("\n")

# Add Total Negatives and Total Positives

print("Calculated in: " + str(time.time() - start) + " seconds.")

TESTING SET EVALUATION


199
{'True Positive': 2011, 'False Positive': 439, 'True Negative': 2751, 'False Negative': 1167}


Testing Accuracy = : 75.15782828282829%.


{'True Positive': 2011, 'False Positive': 439, 'True Negative': 2751, 'False Negative': 1167}


Calculated in: 108.92818093299866 seconds.
