In [1]:
import os
import pandas as pd
import torch
from transformers import BertTokenizer
from collections.abc import Iterable
from torchtext import data
from transformers import BertModel
import torch.nn as nn
from transformers.optimization import *
import torch.optim as optim
import math

In [2]:
# Locations of the raw SNLI dump and of the processed output files.
ROOT_PATH = "/home/rahman75/Research/Project_Conflict/Data/SNLI/"
LOADING_PATH = os.path.join(ROOT_PATH, "snli_raw")
SAVING_PATH = os.path.join(ROOT_PATH, "snli_processed")

# Read the three tab-separated SNLI splits into dataframes (train, dev, test order).
df_train, df_dev, df_test = (
  pd.read_csv(os.path.join(LOADING_PATH, split_file), sep="\t")
  for split_file in ("snli_1.0_train.txt", "snli_1.0_dev.txt", "snli_1.0_test.txt")
)

In [3]:
# keep only the gold label and the two sentences of every pair, for all three splits
df_train, df_dev, df_test = (
  frame[['gold_label', 'sentence1', 'sentence2']]
  for frame in (df_train, df_dev, df_test)
)

In [4]:
# NOTE: only the last expression of a cell is rendered, so this preview is not displayed
df_train.head()
# Analyzing the data: per-label count of non-null values in each sentence column
df_train.groupby('gold_label').count()

Unnamed: 0_level_0,sentence1,sentence2
gold_label,Unnamed: 1_level_1,Unnamed: 2_level_1
-,785,785
contradiction,183187,183185
entailment,183416,183414
neutral,182764,182762


In [5]:
# NOTE: only the last expression of a cell is rendered, so this preview is not displayed
df_dev.head()
# Analyzing the data: per-label count of non-null values in each sentence column
df_dev.groupby('gold_label').count()

Unnamed: 0_level_0,sentence1,sentence2
gold_label,Unnamed: 1_level_1,Unnamed: 2_level_1
-,158,158
contradiction,3278,3278
entailment,3329,3329
neutral,3235,3235


In [6]:
# NOTE: only the last expression of a cell is rendered, so this preview is not displayed
df_test.head()
# Analyzing the data: per-label count of non-null values in each sentence column
df_test.groupby('gold_label').count()

Unnamed: 0_level_0,sentence1,sentence2
gold_label,Unnamed: 1_level_1,Unnamed: 2_level_1
-,176,176
contradiction,3237,3237
entailment,3368,3368
neutral,3219,3219


In [7]:
# discard every pair whose gold label is the placeholder '-' (train, dev and test)
df_train = df_train.loc[df_train['gold_label'].ne('-')]
df_dev = df_dev.loc[df_dev['gold_label'].ne('-')]
df_test = df_test.loc[df_test['gold_label'].ne('-')]

In [8]:
# keep only the training rows whose second sentence is present (not NaN)
df_train = df_train[df_train['sentence2'].notna()]
# per-label counts after the clean-up
df_train.groupby('gold_label').count()

Unnamed: 0_level_0,sentence1,sentence2
gold_label,Unnamed: 1_level_1,Unnamed: 2_level_1
contradiction,183185,183185
entailment,183414,183414
neutral,182762,182762


In [9]:
# using the same tokenizer used in pre-training: loaded from the "bert-base-uncased"
# checkpoint, the same checkpoint name the BertModel is loaded from further down
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [10]:
# special-token strings exposed by the BertTokenizer
sep_token, cls_token, pad_token, unk_token = (
  tokenizer.sep_token,
  tokenizer.cls_token,
  tokenizer.pad_token,
  tokenizer.unk_token,
)
# vocabulary ids of the same special tokens
sep_token_idx, cls_token_idx, pad_token_idx, unk_token_idx = (
  tokenizer.sep_token_id,
  tokenizer.cls_token_id,
  tokenizer.pad_token_id,
  tokenizer.unk_token_id,
)

<p>Let's have some improvement here.</p>

In [11]:
# defining the maximum length of the sequence: looked up in the tokenizer's
# max_model_input_sizes table for the 'bert-base-uncased' checkpoint
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']
# defining the maximum length of each sentence; trim_sentence applies it to
# whitespace-separated words, i.e. before BERT word-piece tokenization
max_sentence_length = 200
# display the resolved sequence limit
max_input_length

512

In [12]:
# function to tokenize the sentences using BertTokenizer
def tokenize_sentences(sentence):
  """Return the token list produced by the module-level `tokenizer` for `sentence`."""
  return tokenizer.tokenize(sentence)
# function to reduce the size of the sentence to the max_input_length
def reduce_sentence_length(sentence):
  """Split the stripped `sentence` on single spaces and keep the first max_input_length pieces."""
  return sentence.strip().split(" ")[:max_input_length]
# function to trim the sentence to the max_sentence_length
def trim_sentence(sentence, max_length=None):
  """Keep at most `max_length` whitespace-separated words of `sentence`.

  Args:
    sentence: text to trim; it is split on any whitespace run.
    max_length: maximum number of words to keep. When omitted, the
      module-level `max_sentence_length` is used.

  Returns:
    The kept words re-joined with single spaces (whitespace is normalised
    even when nothing is cut).
  """
  # The limit comes from one place only; the previous hard-coded 200 in the
  # length check silently diverged from max_sentence_length when it changed.
  limit = max_sentence_length if max_length is None else max_length
  # Slicing is a no-op for sentences already within the limit.
  return " ".join(sentence.split()[:limit])

In [13]:
# trimming both sentences of every split up to the maximum length
for frame in (df_train, df_dev, df_test):
  for column in ('sentence1', 'sentence2'):
    frame[column] = frame[column].apply(trim_sentence)

In [None]:
# wrapping the pair with the special tokens:
#   t_sentence1 = cls_token + sentence1 + sep_token (with a trailing space)
#   t_sentence2 = sentence2 + sep_token
for frame in (df_train, df_dev, df_test):
  frame['t_sentence1'] = cls_token + ' ' + frame['sentence1'] + ' ' + sep_token + ' '
  frame['t_sentence2'] = frame['sentence2'] + ' ' + sep_token

Don't run the next cell.

In [14]:
# NOTE: this cell does NOT add the [cls] and [sep] tokens. It copies the raw sentences
# into t_sentence1 / t_sentence2, so running it after the previous cell overwrites the
# columns that were built there with cls_token / sep_token.
df_train['t_sentence1'] = df_train['sentence1'] 
df_dev['t_sentence1'] =  df_dev['sentence1']
df_test['t_sentence1'] = df_test['sentence1'] 
df_train['t_sentence2'] = df_train['sentence2'] 
df_dev['t_sentence2'] = df_dev['sentence2']
df_test['t_sentence2'] = df_test['sentence2']

In [15]:
# applying the BertTokenizer to the newly generated sentences (t_* -> b_* columns)
for frame in (df_train, df_dev, df_test):
  for source, target in (('t_sentence1', 'b_sentence1'), ('t_sentence2', 'b_sentence2')):
    frame[target] = frame[source].apply(tokenize_sentences)

In [16]:
# function to get the token type id's of the sentence-01
def token_type_ids_sent_01(sentence):
  """Return one segment id 0 per element of `sentence`.

  Args:
    sentence: a sized collection of tokens.

  Returns:
    A list of zeros with the same length as `sentence`, or an empty list when
    `sentence` has no length (for example a float NaN cell).
  """
  try:
    return [0] * len(sentence)
  except TypeError:
    # Only "object has no len()" is treated as empty; a bare except here
    # would also hide KeyboardInterrupt and genuine bugs.
    return []
# function to get the token type id's of the sentence-02
def token_type_ids_sent_02(sentence):
  """Return one segment id 1 per element of `sentence`.

  Args:
    sentence: a sized collection of tokens.

  Returns:
    A list of ones with the same length as `sentence`, or an empty list when
    `sentence` has no length (for example a float NaN cell).
  """
  try:
    return [1] * len(sentence)
  except TypeError:
    # Only "object has no len()" is treated as empty; a bare except here
    # would also hide KeyboardInterrupt and genuine bugs.
    return []


# getting the token type ids for the sentences: 0s for sentence 1, 1s for sentence 2
for frame in (df_train, df_dev, df_test):
  frame['sentence1_token_type'] = frame['b_sentence1'].apply(token_type_ids_sent_01)
  frame['sentence2_token_type'] = frame['b_sentence2'].apply(token_type_ids_sent_02)

In [17]:
# obtain the sequence by concatenating, row by row, the token lists of both sentences
for frame in (df_train, df_dev, df_test):
  frame['sequence'] = frame['b_sentence1'] + frame['b_sentence2']

In [18]:
# function to get the attention mask of the given sentence
def attention_mask_sentence(sentence):
  """Return an attention mask with one 1 per element of `sentence`.

  Args:
    sentence: a sized collection of tokens.

  Returns:
    A list of ones with the same length as `sentence`, or an empty list when
    `sentence` has no length (for example a float NaN cell).
  """
  try:
    return [1] * len(sentence)
  except TypeError:
    # Only "object has no len()" is treated as empty; a bare except here
    # would also hide KeyboardInterrupt and genuine bugs.
    return []

# generating the attention mask from the combined sequence of every split
for frame in (df_train, df_dev, df_test):
  frame['attention_mask'] = frame['sequence'].apply(attention_mask_sentence)

In [19]:
# combining the token type ids of both sentences into one list per row
for frame in (df_train, df_dev, df_test):
  frame['token_type'] = frame['sentence1_token_type'] + frame['sentence2_token_type']

In [20]:
# drop the dev rows that lack either the combined sequence or the token type ids
df_dev = df_dev.dropna(subset = ['sequence', 'token_type'])

In [21]:
# function to convert the attention_mask and token_type ids to int
def convert_to_int(ids):
  """Return a new list holding int(x) for every element x of `ids`."""
  return list(map(int, ids))

# cast the attention mask and the token type ids of every split to int
for column in ('attention_mask', 'token_type'):
  for frame in (df_train, df_dev, df_test):
    frame[column] = frame[column].apply(convert_to_int)



In [22]:
# function to combine the sequences from lists
def combine_sequence(sequence):
  """Join the string items of `sequence` into one space-separated string."""
  separator = " "
  return separator.join(sequence)
# function to combine the masks
def combine_mask(mask):
  """Render every item of `mask` with str() and join them with single spaces."""
  return " ".join(map(str, mask))

In [23]:
# Converting the list-valued inputs to space-separated strings for the torchtext Field
for frame in (df_train, df_dev, df_test):
  frame['sequence'] = frame['sequence'].apply(combine_sequence)
  for column in ('attention_mask', 'token_type'):
    frame[column] = frame[column].apply(combine_mask)

In [None]:
# quick look at the training token-type column after it was joined into strings
df_train['token_type']

In [24]:
# keep only the columns that are written to disk: label, sequence, mask and token types
df_train, df_dev, df_test = (
  frame[['gold_label', 'sequence', 'attention_mask', 'token_type']]
  for frame in (df_train, df_dev, df_test)
)

In [None]:
# text field for sequence: the cell text is split by reduce_sentence_length and the
# pieces are mapped to ids with tokenizer.convert_tokens_to_ids, so no torchtext
# vocabulary is used (use_vocab = False); padding uses the tokenizer's pad id
TEXT = data.Field(batch_first = True,
                 use_vocab = False,
                 tokenize = reduce_sentence_length,
                 preprocessing = tokenizer.convert_tokens_to_ids,
                 pad_token = pad_token_idx,
                 unk_token = unk_token_idx)
# label field for label
LABEL = data.LabelField()
# text field for attention mask: pieces are cast to int by convert_to_int
# NOTE(review): padding uses pad_token_idx (the tokenizer's pad id), not a literal 0 --
# presumably both are 0 for this checkpoint; confirm
ATTENTION = data.Field(batch_first = True,
                      use_vocab = False,
                      tokenize = reduce_sentence_length,
                      preprocessing = convert_to_int,
                      pad_token = pad_token_idx)
# text field for token type ids: pieces are cast to int by convert_to_int
# NOTE(review): padded positions get token type 1 -- presumably harmless because
# those positions are masked out by the attention mask; confirm
TTYPE = data.Field(batch_first = True, 
                  use_vocab = False,
                  tokenize = reduce_sentence_length,
                  preprocessing = convert_to_int,
                  pad_token = 1)


In [None]:
# (attribute name, Field) pairs, listed in the same order as the saved columns
# gold_label, sequence, attention_mask, token_type
# NOTE(review): presumably TabularDataset matches these to CSV columns by position -- confirm
fields = [('label', LABEL), ('sequence', TEXT), ('attention_mask', ATTENTION), ('token_type', TTYPE)]

In [25]:
# saving every processed split as a CSV file (without the index) under SAVING_PATH
for frame, file_name in ((df_train, 'snli_1.0_train_smpl.csv'),
                         (df_dev, 'snli_1.0_dev_smpl.csv'),
                         (df_test, 'snli_1.0_test_smpl.csv')):
  frame.to_csv(os.path.join(SAVING_PATH, file_name), index=False)

In [None]:
# Load the processed CSV files as torchtext datasets, skipping the header row.
# NOTE(review): the save cell writes 'snli_1.0_{train,dev,test}_smpl.csv' into this same
# directory, but this cell reads 'snli_1.0_{train,dev,test}.csv'. The files loaded here
# are therefore not the ones written by the save cell -- confirm which set is intended.
train_data, valid_data, test_data = data.TabularDataset.splits(
                                    path = os.path.join(ROOT_PATH,'snli_processed'),
                                    train = 'snli_1.0_train.csv',
                                    validation = 'snli_1.0_dev.csv',
                                    test = 'snli_1.0_test.csv',
                                    format = 'csv',
                                    fields = fields,
                                    skip_header = True)
# number of training examples; used later to compute the total number of steps
train_data_len = len(train_data)

In [None]:
# building the vocabulary for labels from the training split only;
# len(LABEL.vocab) is later used as the model's output dimension
LABEL.build_vocab(train_data)

In [None]:
# using bucketiterator for preparing batches for training
BATCH_SIZE = 16
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def _sequence_length(example):
  """Sort key for bucketing: length of the example's `sequence` attribute."""
  return len(example.sequence)

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
  (train_data, valid_data, test_data),
  batch_size = BATCH_SIZE,
  sort_key = _sequence_length,
  sort_within_batch = False,
  device = device,
)

In [None]:
# pre-trained encoder, loaded from the same checkpoint name as the tokenizer
bert_model = BertModel.from_pretrained('bert-base-uncased')
class BERTNLIModel(nn.Module):
  """BERT encoder followed by one linear layer that maps to `output_dim` scores.

  Args:
    bert_model: encoder module; its config must report 'hidden_size'.
    hidden_dim: accepted for interface compatibility; not used by the model.
    output_dim: number of output scores per example.
  """

  def __init__(self, bert_model, hidden_dim, output_dim):
    super().__init__()
    self.bert = bert_model
    # size of the encoder output that feeds the linear layer
    config = bert_model.config.to_dict()
    self.out = nn.Linear(config['hidden_size'], output_dim)

  def forward(self, sequence, attn_mask, token_type):
    """Run the encoder and apply the linear layer to its second output."""
    encoder_outputs = self.bert(input_ids = sequence,
                                attention_mask = attn_mask,
                                token_type_ids = token_type)
    # element 1 of the encoder output is the per-example vector fed to the classifier
    second_output = encoder_outputs[1]
    return self.out(second_output)

In [None]:
# loading the model
# NOTE: HIDDEN_DIM is passed as `hidden_dim`, which BERTNLIModel.__init__ accepts but never uses
HIDDEN_DIM = 512
# one output score per entry of the label vocabulary
OUTPUT_DIM = len(LABEL.vocab)
model = BERTNLIModel(bert_model, HIDDEN_DIM, OUTPUT_DIM,).to(device)

In [None]:
# optimizer over all model parameters
# NOTE(review): AdamW is presumably supplied by `from transformers.optimization import *`
# (torch.optim is only imported as `optim`) -- confirm
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-6, correct_bias=False)
def get_scheduler(optimizer, warmup_steps):
  """Wrap get_constant_schedule_with_warmup, passing `warmup_steps` as num_warmup_steps."""
  return get_constant_schedule_with_warmup(optimizer, num_warmup_steps = warmup_steps)
# using the cross entropy loss
criterion = nn.CrossEntropyLoss().to(device)
# mixed-precision switch read by train(); the apex import below is commented out, so
# `amp` is undefined and fp16 must stay False unless that block is restored
fp16 = False
#if fp16:
#   try:
#     from apex import amp
#   except ImportError:
#     raise ImportError("Please install apex from
#           https://www.github.com/nvidia/apex to use fp16 training.")
#model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

In [None]:
# function to calculate the accuracy of model
def accuracy(pred, y):
  """Fraction of rows of `pred` whose highest-scoring column index equals the entry of `y`."""
  predicted_labels = pred.argmax(dim = 1)
  hits = (predicted_labels == y).float().sum()
  return hits / len(y)

In [None]:
# upper bound on the total gradient norm applied before every optimizer step
max_grad_norm = 1
def train(model, iterator, optimizer, criterion, scheduler):
  """Run one training epoch and return the mean batch loss and mean batch accuracy.

  Args:
    model: module called as model(sequence, attention_mask, token_type).
    iterator: sized iterable of batches exposing `sequence`, `attention_mask`,
      `token_type` and `label` attributes.
    optimizer: optimizer stepped once per batch.
    criterion: loss called as criterion(predictions, label).
    scheduler: learning-rate scheduler stepped once per batch, after the optimizer.

  Returns:
    (mean loss, mean accuracy), both averaged over the number of batches.

  Reads the module-level `fp16` flag, `max_grad_norm` and `accuracy`. The fp16
  branch additionally requires apex `amp` to be imported and initialised.
  """
  epoch_loss = 0
  epoch_acc = 0
  model.train()
  for batch in iterator:
    optimizer.zero_grad() # clear gradients first
    torch.cuda.empty_cache() # releases all unoccupied cached memory
    label = batch.label
    predictions = model(batch.sequence, batch.attention_mask, batch.token_type)
    loss = criterion(predictions, label)
    acc = accuracy(predictions, label)
    if fp16:
      with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
      # clip after leaving the context so the norm is measured on unscaled gradients
      torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                     max_grad_norm)
    else:
      loss.backward()
      # previously max_grad_norm was only applied in the fp16 branch, so the
      # default full-precision path ran without any gradient clipping
      torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    optimizer.step()
    scheduler.step()
    epoch_loss += loss.item()
    epoch_acc += acc.item()
  return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):
  """Score `model` on every batch of `iterator` without tracking gradients.

  Puts the model in eval mode and returns (mean loss, mean accuracy), both
  averaged over the number of batches. Uses the module-level `accuracy`.
  """
  model.eval()
  total_loss, total_acc = 0, 0
  with torch.no_grad():
    for batch in iterator:
      labels = batch.label
      predictions = model(batch.sequence, batch.attention_mask, batch.token_type)
      total_loss += criterion(predictions, labels).item()
      total_acc += accuracy(predictions, labels).item()
  n_batches = len(iterator)
  return total_loss / n_batches, total_acc / n_batches

In [None]:
# number of passes over the training data and share of the steps spent warming up
N_EPOCHS = 1
warmup_percent = 0.2
# optimizer steps over all epochs, rounded up to count the last partial batch
total_steps = math.ceil(N_EPOCHS * train_data_len / BATCH_SIZE)
warmup_steps = int(warmup_percent * total_steps)
scheduler = get_scheduler(optimizer, warmup_steps)
# lowest validation loss seen so far; any finite loss improves on it
best_valid_loss = float('inf')


In [None]:
# Checkpoint location; the directory is created up front so that torch.save
# cannot fail on a missing folder after a full epoch of training.
checkpoint_path = os.path.join(ROOT_PATH, 'saved_models/bert-nli.pt')
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

for epoch in range(N_EPOCHS):
  train_loss, train_acc = train(model, train_iterator, optimizer,
                                criterion, scheduler)
  valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
  # keep only the weights with the lowest validation loss seen so far
  if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), checkpoint_path)

  print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
  print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

In [None]:
# GPU sanity check: move a small tensor to CUDA only when a device is available,
# so the cell also runs (and shows the CPU tensor) on machines without a GPU
cuda_probe = torch.tensor([0.12, 0.32])
if torch.cuda.is_available():
  cuda_probe = cuda_probe.cuda()
cuda_probe

In [None]:
# (disabled) incomplete expression `data.` left over from tab-completion; it is a
# SyntaxError when executed and stops a full "Run All" of the notebook