# * * * ReCAM: Reading Comprehension of Abstract Meaning * * *

## * * * * * * * * * * License Thesis * * * * * * * * * *

### Load Corpus

*  load the available dataset provided by the SemEval-2021 competition

In [1]:
!git clone https://github.com/boyuanzheng010/SemEval2021-Reading-Comprehension-of-Abstract-Meaning.git  

Cloning into 'SemEval2021-Reading-Comprehension-of-Abstract-Meaning'...
remote: Enumerating objects: 153, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 153 (delta 9), reused 7 (delta 7), pack-reused 135[K
Receiving objects: 100% (153/153), 13.12 MiB | 8.74 MiB/s, done.
Resolving deltas: 100% (51/51), done.


*  install the necessary Python APIs

In [2]:
!pip install transformers
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 22.0 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 15.1 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 47.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 30.1 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninsta

*  import the necessary libraries

In [3]:
import logging
import random
from tqdm import tqdm, trange
import csv
import json

import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

*  prepare a logging mechanism to log the necessary results


In [4]:
logging.basicConfig(filename='/output.log', 
                    filemode='w',
                    format = '%(asctime)s :: %(levelname)s :: %(message)s',
                    level = logging.INFO)
logger = logging.getLogger(__name__)

*  initialize the model's arguments

In [5]:
# Arguments
max_seq_length = 256 # we cannot use 512 token because => 'CUDA out of memory'
train_batch_size = 8 #16 => the smaller, the less the probability to get a memory outrage
eval_batch_size = 8       
learning_rate = 1e-5      
num_train_epochs = 2            # Epochs - less epochs to be used for BERT
warmup_proportion = 0.1         # How to use? depends on the optimizer
seed = 42                       # Random seed, distil bert
optimize_on_cpu = True          # Whether to perform optimization and keep the optimizer averages on CPU
fp16 = True                     # Whether to use 16-bit float precision instead of 32-bit
                                # see the NVIDIA doc
loss_scale = 128                # Loss scaling, positive power of 2 values can improve fp16 convergence
gradient_accumulation_steps = 1 
weight_decay_rate = 0.01

*  define the file paths to each set of data:

In [6]:
# ~ original

data_file_path = './SemEval2021-Reading-Comprehension-of-Abstract-Meaning/data/'
test_data_relative_path = 'trail_data/'
train_data_relative_path = 'training_data/'

# Imperceptibility
task_1_train_data_file_path = data_file_path + train_data_relative_path + 'Task_1_train.jsonl'
task_1_dev_data_file_path = data_file_path + train_data_relative_path + 'Task_1_dev.jsonl'
task_1_test_data_file_path = data_file_path + test_data_relative_path + 'Task_1_Imperceptibility.jsonl'

# Nonspecificity
task_2_train_data_file_path = data_file_path + train_data_relative_path + 'Task_2_train.jsonl'
task_2_dev_data_file_path = data_file_path + train_data_relative_path + 'Task_2_dev.jsonl'
task_2_test_data_file_path = data_file_path + test_data_relative_path + 'Task_2_Nonspecificity.jsonl'


### Necessary functions:

*     read samples from jsonl file

In [7]:
# ~ original
def read_dataset_from_jsonl_file(file_name):
  with open(file_name, 'r', encoding = 'utf-8') as json_line_file_pointer:
      dataset_list = list(json_line_file_pointer)

      dataset_elements = []
      for element in dataset_list:
          parsed_result = json.loads(element)

          dataset_elements.append({
              "article" : parsed_result['article'],
              "question" : parsed_result['question'],
              "options" : [parsed_result['option_0'], parsed_result['option_1'], parsed_result['option_2'], parsed_result['option_3'], parsed_result['option_4']],
              "label" : int(parsed_result['label']) 
          })

  return dataset_elements

*    pre-process input features structure

In [8]:
class InputFeatures(object):
    def __init__(self, features , masked_labels, options, label):
        
        self.choices_features = []
        for _, input_ids, input_mask in features: 
          self.choices_features.append({
              'input_ids': input_ids,
              'input_mask': input_mask
          })

        self.masked_labels = masked_labels
        self.options = options
        self.label = label

*     pre-process samples and convert them into features to serve as input for the model

In [9]:
def convert_samples_into_features(samples, tokenizer, max_seq_length):
    
    features = []
    masking_token_id = tokenizer.mask_token_id
    masking_token = tokenizer.mask_token
    classification_token = "[CLS]"
    separation_token = "[SEP]"

    for sample_index, sample in enumerate(samples):
        choices_features = []

        article_tokens = tokenizer.tokenize(sample['article'])
        question_tokens = tokenizer.tokenize(sample['question'].replace("@placeholder", masking_token))

        options = []
        for option in sample['options']:
            article_tokens_option = article_tokens[:]
            question_tokens_option = question_tokens + tokenizer.tokenize(option) 
        
            _truncate_seq_pair(article_tokens_option, question_tokens_option, max_seq_length - 3)

            tokens = [classification_token] + question_tokens_option + [separation_token] + article_tokens_option + [separation_token]

            input_ids = tokenizer.convert_tokens_to_ids(tokens)                                   
            input_mask = [1] * len(input_ids)

            padding = [0] * (max_seq_length - len(input_ids))
            input_ids += padding
            input_mask += padding

            # Heuristic : Loss calculated only for Masked Token!
            masked_labels = [-100 if t_id != masking_token_id else tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sample['options'][sample['label']]))[0] for t_id in input_ids]

            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length
            assert len(masked_labels) == max_seq_length
            choices_features.append((tokens, input_ids, input_mask))

        features.append(
            InputFeatures(
                features = choices_features,
                masked_labels = masked_labels,
                options = options,
                label = sample['label']
            )
        )
    
    return features

*  optimize the model to work on cpu based on the optimizer parameters

In [10]:
# ~ original
def improve_model_from_optimizer(model_parameters, optimizer_parameters):
    for (optimizer, optimizer_parameter), (model, model_parameters) in zip(optimizer_parameters, model_parameters):
        if optimizer == model:
          model_parameters.data.copy_(optimizer_parameter.data)

*  optimize on CPU by copying the gradient of the GPU parameters to the CPU/RAMM copy of the model

In [11]:
def set_optimizer_params_grad(named_params_optimizer, named_params_model, test_nan=False):
    """ Utility function for optimize_on_cpu and 16-bits training.
        Copy the gradient of the GPU parameters to the CPU/RAMM copy of the model
    """
    is_nan = False
    for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer, named_params_model):
        if name_opti != name_model:
            logging.error("name_opti != name_model: {} {}".format(name_opti, name_model))
            raise ValueError
        if param_model.grad is not None:
            if test_nan and torch.isnan(param_model.grad).sum() > 0:
                is_nan = True
            if param_opti.grad is None:
                param_opti.grad = torch.nn.Parameter(param_opti.data.new().resize_(*param_opti.data.size()))
            param_opti.grad.data.copy_(param_model.grad.data)
        else:
            param_opti.grad = None
    return is_nan

*  function to extract a specific field from a feature

In [12]:
# ~ original
def get_feature_specific_field(features, field):
  feature_fields = []
  for feature in features:
    feature_fields.append(feature.choices_features[0][field])
  
  return feature_fields   

*  truncate article and question tokens in order to fit into the model  

In [13]:
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

def _truncate_seq(tokens_a, max_length):
    while True:
        if len(tokens_a) <= max_length:
            break
        else:
            tokens_a.pop()
    
    return tokens_a

### **HYPERPARAMETER FINE-TUNING**

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

print("device:")
print(device)
print("device count (number of GPUs):")
print(n_gpu)

train_batch_size = int(train_batch_size / gradient_accumulation_steps)
#Initialise seeds
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if n_gpu > 0:
    torch.cuda.manual_seed_all(seed)



device:
cuda
device count (number of GPUs):
1


In [15]:
from transformers import AdamW, BertTokenizer, BertForMaskedLM      
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)        # Write the tokenizer to be used
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

# tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")      # Write the tokenizer to be used
# model = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased")

#model = DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')

# tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')        # Write the tokenizer to be used
# model = BertForMaskedLM.from_pretrained('bert-large-uncased')    # Write the model to be used

# tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
# model = ElectraForMaskedLM.from_pretrained('google/electra-base-discriminator')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
# Optimisations
if fp16:
    model.half()
if n_gpu > 1:
    model = torch.nn.DataParallel(model)

# Load Model to device (cuda here)
model.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [17]:
if fp16:
    param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \
                        for n, param in model.named_parameters()]
elif optimize_on_cpu:
    param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \
                        for n, param in model.named_parameters()]
else:
    param_optimizer = list(model.named_parameters())

no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': weight_decay_rate},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
    ]

In [18]:
number_of_samples = 2000

train_examples = read_dataset_from_jsonl_file(task_1_train_data_file_path) # Training Examples: task_1_train_data_file_path, task_2_train_data_file_path
train_examples = train_examples[:number_of_samples]

num_train_steps = int((len(train_examples) * gradient_accumulation_steps * num_train_epochs) / train_batch_size)
t_total = num_train_steps

# Optimiser is Adam
optimizer = AdamW(optimizer_grouped_parameters,
                         lr = learning_rate)



In [19]:
global_step = 0
train_features = convert_samples_into_features(train_examples, tokenizer, max_seq_length)

logging.info("***** Running training *****")
logging.info("  Num examples = %d", len(train_examples))
logging.info("  Batch size = %d", train_batch_size)
logging.info("  Num steps = %d", num_train_steps)

# all_options = torch.tensor([f.options for f in train_features], dtype=torch.long)
all_labels = torch.tensor([f.label for f in train_features], dtype=torch.long)
all_masked_labels = torch.tensor([f.masked_labels for f in train_features], dtype=torch.long)

all_input_ids_1 = torch.tensor(get_feature_specific_field(train_features, 'input_ids'), dtype=torch.long)
all_input_mask_1 = torch.tensor(get_feature_specific_field(train_features, 'input_mask'), dtype=torch.long)
#all_segment_ids_1 = torch.tensor(get_feature_specific_field(train_features, 'segment_ids'), dtype=torch.long)

train_data = TensorDataset(all_input_ids_1, all_input_mask_1, all_labels, all_masked_labels) # 3rd param: all_segment_ids_1
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

model.train()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [None]:
correct_ans = 0
wrong_list = []
total_ans = 0

from transformers import get_linear_schedule_with_warmup
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=2000)

for _ in trange(int(num_train_epochs), desc="Epoch"):
    #model.train() #new

    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    
    for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, label, lm_label_ids = batch # 3rd arg: , segment_ids
        
        #optimizer.zero_grad() #new
        
        outputs = model(input_ids, attention_mask=input_mask, labels=lm_label_ids) #last arg: , token_type_ids = segment_ids  #TOkens with labels set to -100 are ignored             
        loss = outputs.loss
        # print()
        # print("HELLLLOOO")
        # print(outputs.last_hidden_state.numpy().shape)

        if n_gpu > 1:
            loss = loss.mean() # mean() to average on multi-gpu.
        if fp16 and loss_scale != 1.0:
            # rescale loss for fp16 training
            # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
            loss = loss * loss_scale
        if gradient_accumulation_steps > 1:
            loss = loss / gradient_accumulation_steps

        loss.backward()

        #optimizer.step() #new
        #scheduler.step() #new

        tr_loss += loss.item()
        nb_tr_steps += 1
        if (step + 1) % gradient_accumulation_steps == 0:
            if fp16 or optimize_on_cpu:
                if fp16 and loss_scale != 1.0:
                    # scale down gradients for fp16 training
                    for param in model.parameters():
                        if param.grad is not None:
                            param.grad.data = param.grad.data / loss_scale
                is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True)
                if is_nan:
                    logging.info("FP16 TRAINING: Nan in gradients, reducing loss scaling")
                    loss_scale = loss_scale / 2
                    model.zero_grad()
                    continue
                optimizer.step()
                improve_model_from_optimizer(model.named_parameters(), param_optimizer)
            else:
                optimizer.step()

            train_loss = tr_loss / nb_tr_steps
            print("\tTraining loss : ", train_loss)
            print("\tTraining steps : ", nb_tr_steps)

            model.zero_grad()
            global_step += 1
            
          
# torch.save({
#     'model': model.state_dict()
# }, 'saved_file_3.txt')
   

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]
Iteration:   0%|          | 0/250 [00:00<?, ?it/s][A
Iteration:   0%|          | 1/250 [00:01<06:44,  1.63s/it][A

	Training loss :  452.0
	Training steps :  1



Iteration:   1%|          | 2/250 [00:02<05:02,  1.22s/it][A

	Training loss :  616.25
	Training steps :  2



Iteration:   1%|          | 3/250 [00:03<04:23,  1.07s/it][A

	Training loss :  683.0
	Training steps :  3



Iteration:   2%|▏         | 4/250 [00:04<04:05,  1.00it/s][A

	Training loss :  637.8125
	Training steps :  4



Iteration:   2%|▏         | 5/250 [00:05<03:55,  1.04it/s][A

	Training loss :  661.45
	Training steps :  5



Iteration:   2%|▏         | 6/250 [00:06<03:48,  1.07it/s][A

	Training loss :  678.7916666666666
	Training steps :  6



Iteration:   3%|▎         | 7/250 [00:07<03:44,  1.08it/s][A

	Training loss :  658.0357142857143
	Training steps :  7



Iteration:   3%|▎         | 8/250 [00:07<03:41,  1.09it/s][A

	Training loss :  661.96875
	Training steps :  8



Iteration:   4%|▎         | 9/250 [00:08<03:38,  1.10it/s][A

	Training loss :  660.8055555555555
	Training steps :  9



Iteration:   4%|▍         | 10/250 [00:09<03:35,  1.11it/s][A

	Training loss :  663.225
	Training steps :  10



Iteration:   4%|▍         | 11/250 [00:10<03:33,  1.12it/s][A

	Training loss :  645.7272727272727
	Training steps :  11



Iteration:   5%|▍         | 12/250 [00:11<03:32,  1.12it/s][A

	Training loss :  652.2916666666666
	Training steps :  12



Iteration:   5%|▌         | 13/250 [00:12<03:31,  1.12it/s][A

	Training loss :  655.2692307692307
	Training steps :  13



Iteration:   6%|▌         | 14/250 [00:13<03:30,  1.12it/s][A

	Training loss :  670.8571428571429
	Training steps :  14



Iteration:   6%|▌         | 15/250 [00:14<03:29,  1.12it/s][A

	Training loss :  672.8
	Training steps :  15



Iteration:   6%|▋         | 16/250 [00:15<03:27,  1.13it/s][A

	Training loss :  663.40625
	Training steps :  16



Iteration:   7%|▋         | 17/250 [00:15<03:27,  1.12it/s][A

	Training loss :  660.3823529411765
	Training steps :  17



Iteration:   7%|▋         | 18/250 [00:16<03:26,  1.12it/s][A

	Training loss :  655.5
	Training steps :  18



Iteration:   8%|▊         | 19/250 [00:17<03:30,  1.10it/s][A

	Training loss :  665.2631578947369
	Training steps :  19



Iteration:   8%|▊         | 20/250 [00:18<03:37,  1.06it/s][A

	Training loss :  660.025
	Training steps :  20



Iteration:   8%|▊         | 21/250 [00:19<03:36,  1.06it/s][A

	Training loss :  672.4761904761905
	Training steps :  21



Iteration:   9%|▉         | 22/250 [00:20<03:31,  1.08it/s][A

	Training loss :  675.3181818181819
	Training steps :  22



Iteration:   9%|▉         | 23/250 [00:21<03:47,  1.00s/it][A

	Training loss :  676.6521739130435
	Training steps :  23



Iteration:  10%|▉         | 24/250 [00:23<04:02,  1.07s/it][A

	Training loss :  671.7708333333334
	Training steps :  24



Iteration:  10%|█         | 25/250 [00:24<04:00,  1.07s/it][A

	Training loss :  681.92
	Training steps :  25



Iteration:  10%|█         | 26/250 [00:24<03:47,  1.01s/it][A

	Training loss :  673.4326923076923
	Training steps :  26



Iteration:  11%|█         | 27/250 [00:25<03:38,  1.02it/s][A

	Training loss :  674.9166666666666
	Training steps :  27



Iteration:  11%|█         | 28/250 [00:26<03:31,  1.05it/s][A

	Training loss :  676.3482142857143
	Training steps :  28



Iteration:  12%|█▏        | 29/250 [00:27<03:25,  1.08it/s][A

	Training loss :  675.3706896551724
	Training steps :  29



Iteration:  12%|█▏        | 30/250 [00:28<03:22,  1.08it/s][A

	Training loss :  679.8083333333333
	Training steps :  30



Iteration:  12%|█▏        | 31/250 [00:29<03:19,  1.10it/s][A

	Training loss :  682.6209677419355
	Training steps :  31



Iteration:  13%|█▎        | 32/250 [00:30<03:16,  1.11it/s][A

	Training loss :  675.1484375
	Training steps :  32



Iteration:  13%|█▎        | 33/250 [00:31<03:14,  1.11it/s][A

	Training loss :  672.4924242424242
	Training steps :  33



Iteration:  14%|█▎        | 34/250 [00:32<03:13,  1.12it/s][A

	Training loss :  676.3897058823529
	Training steps :  34



Iteration:  14%|█▍        | 35/250 [00:32<03:12,  1.12it/s][A

	Training loss :  674.4071428571428
	Training steps :  35



Iteration:  14%|█▍        | 36/250 [00:33<03:10,  1.12it/s][A

	Training loss :  675.2708333333334
	Training steps :  36



Iteration:  15%|█▍        | 37/250 [00:34<03:09,  1.12it/s][A

	Training loss :  670.9121621621622
	Training steps :  37



Iteration:  15%|█▌        | 38/250 [00:35<03:08,  1.12it/s][A

	Training loss :  669.046052631579
	Training steps :  38



Iteration:  16%|█▌        | 39/250 [00:36<03:08,  1.12it/s][A

	Training loss :  669.3525641025641
	Training steps :  39



Iteration:  16%|█▌        | 40/250 [00:37<03:07,  1.12it/s][A

	Training loss :  671.78125
	Training steps :  40



Iteration:  16%|█▋        | 41/250 [00:38<03:07,  1.12it/s][A

	Training loss :  670.5914634146342
	Training steps :  41



Iteration:  17%|█▋        | 42/250 [00:39<03:05,  1.12it/s][A

	Training loss :  671.7916666666666
	Training steps :  42



Iteration:  17%|█▋        | 43/250 [00:40<03:04,  1.12it/s][A

	Training loss :  673.3779069767442
	Training steps :  43



Iteration:  18%|█▊        | 44/250 [00:40<03:03,  1.12it/s][A

	Training loss :  671.4488636363636
	Training steps :  44



Iteration:  18%|█▊        | 45/250 [00:41<03:02,  1.12it/s][A

	Training loss :  669.9944444444444
	Training steps :  45



Iteration:  18%|█▊        | 46/250 [00:42<03:01,  1.12it/s][A

	Training loss :  668.125
	Training steps :  46



Iteration:  19%|█▉        | 47/250 [00:43<03:00,  1.12it/s][A

	Training loss :  664.281914893617
	Training steps :  47


In [None]:
# torch.save({
#     'model': model.state_dict()
# }, 'saved_file_epoch1.txt')

# number_of_test_samples = 1000
dev_examples_task_1 = read_dataset_from_jsonl_file(task_1_dev_data_file_path)
# dev_examples_task_1 = dev_examples_task_1[:number_of_test_samples] 

test_examples_task_1 = read_dataset_from_jsonl_file(task_1_test_data_file_path)
# test_examples_task_1 = test_examples_task_1[:number_of_test_samples]



dev_examples_task_2 = read_dataset_from_jsonl_file(task_2_dev_data_file_path)
# dev_examples_task_2 = dev_examples_task_2[:number_of_test_samples] 

test_examples_task_2 = read_dataset_from_jsonl_file(task_2_test_data_file_path)
# test_examples_task_2 = test_examples_task_2[:number_of_test_samples]

model.eval()

In [None]:
#torch.cuda.memory_summary(device=None, abbreviated=False)

import torch
torch.cuda.empty_cache()

In [None]:
def compute_accuracy(examples):
  correct_score = 0
  wrong_list = []
  
  for idx, example in enumerate(examples) :

      article = example['article']
      ques_tokens = tokenizer.tokenize(example['question'].replace("@placeholder", tokenizer.mask_token))        
      tokenized_article = tokenizer.tokenize(article)

      _truncate_seq_pair(tokenized_article, ques_tokens, max_seq_length - 1)

      tokens =  [tokenizer.cls_token] + ques_tokens + [tokenizer.sep_token] + tokenized_article + [tokenizer.sep_token]

      # tokens =  ques_tokens + [tokenizer.sep_token]

      masked_index = tokens.index(tokenizer.mask_token)


      candidates = example['options']
      candidates_ids = []
      for c in candidates:
          candidates_ids.append(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(c))[0])


      indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)        
      segments_ids = [0] * (len(ques_tokens) + 2) + [1] * (len(tokenized_article) + 1)
      input_mask = [1] * len(indexed_tokens)

      # Zero-pad up to the sequence length.
      padding = [0] * (max_seq_length - len(indexed_tokens))
      indexed_tokens += padding#[tokenizer.pad_token_id] * len(padding)
      input_mask += padding
      segments_ids += padding
      

      tokens_tensor = torch.tensor([indexed_tokens])
      #segments_tensors = torch.tensor([segments_ids])
      mask_tensors = torch.tensor([input_mask])
      
      mask_tensors = mask_tensors.to(device)
      tokens_tensor = tokens_tensor.to(device)
      #segments_tensors = segments_tensors.to(device)


      predictions = model(input_ids = tokens_tensor, attention_mask=mask_tensors) # last arg: , token_type_ids = segments_tensors
      predictions_candidates = predictions.logits[0, masked_index, candidates_ids]
      answer_idx = torch.argmax(predictions_candidates).item()
      #print(answer_idx)
      
      
      #print("Correct answer : ", answer_idx, "\tLabel :", example['label'], '\n') 
      if(answer_idx == example['label']):
          correct_score += 1
      else :
          wrong_list.append(idx)

  return correct_score / len(examples)  

In [None]:
accurary_task_1_dev = compute_accuracy(dev_examples_task_1)
accurary_task_1_test = compute_accuracy(test_examples_task_1)

accurary_task_3_dev = compute_accuracy(dev_examples_task_2)
accurary_task_3_test = compute_accuracy(test_examples_task_2)

In [None]:
with open('outfile.csv', 'w') as f :
      writer = csv.writer(f, delimiter = ',')
      writer.writerow(['Train task', 'Evaluation task', 'Epochs', 'Learning rate', 'Weight decay rate', 'Accurary','Model'])

In [None]:
with open('outfile.csv', 'a') as f :
      writer = csv.writer(f, delimiter = ',')

      writer.writerow(['task 1', 'task 1 dev', num_train_epochs, learning_rate, weight_decay_rate, accurary_task_1_dev, 'bert-base-uncased'])
      writer.writerow(['task 1', 'task 1 test', num_train_epochs, learning_rate, weight_decay_rate, accurary_task_1_test, 'bert-base-uncased'])

      writer.writerow(['task 1', 'task 2 dev', num_train_epochs, learning_rate, weight_decay_rate, accurary_task_3_dev, 'bert-base-uncased'])
      writer.writerow(['task 1', 'task 2 test', num_train_epochs, learning_rate, weight_decay_rate, accurary_task_3_test, 'bert-base-uncased'])

In [30]:
# accuracy = correct_score / len(examples)  
# print("Accuracy :", accuracy)
# print("Correct answers :", correct_score) 

# print("total input items: 1000")
# print("wrong list items length :\n", len(wrong_list)) 

### **LOGISTIC REGRESSION **

*  function for pre-processing the samples

In [None]:
log_regr_max_samples = 500

def prepare_first_data_grouping(dataset_list):
  
  input_data = []
  labels = []
  for element in dataset_list[:log_regr_max_samples]:

    first = element['question'].replace("@placeholder", element['options'][0])
    second = element['question'].replace("@placeholder", element['options'][1])
    third = element['question'].replace("@placeholder", element['options'][2])
    forth = element['question'].replace("@placeholder", element['options'][3])
    fifth = element['question'].replace("@placeholder", element['options'][4])

    input_data.append(
        first + ' '+ element['article']
    )
    input_data.append(
        second + ' '+ element['article']
    )
    input_data.append(
        third + ' '+ element['article']
    )
    input_data.append(
        forth + ' '+ element['article']
    )
    input_data.append(
        fifth + ' '+ element['article']
    )

    label = int(element['label'])
    if (label == 0):
      labels.append(1)
    else:
      labels.append(0)

    if (label == 1):
      labels.append(1)
    else:
      labels.append(0)
    
    if (label == 2):
      labels.append(1)
    else:
      labels.append(0)

    if (label == 3):
      labels.append(1)
    else:
      labels.append(0)

    if (label == 4):
      labels.append(1)
    else:
      labels.append(0)
  
  return input_data, labels

*  read and pre-process test, dev and test samples

In [None]:
# task 1
train_task_1_dataset_list = read_dataset_from_jsonl_file(task_1_train_data_file_path) 
dev_task_1_dataset_list = read_dataset_from_jsonl_file(task_1_dev_data_file_path) 
test_task_1_dataset_list = read_dataset_from_jsonl_file(task_1_test_data_file_path) 

processed_train_data_task_1, train_labels_task_1 = prepare_first_data_grouping(train_task_1_dataset_list)
processed_dev_data_task_1, dev_labels_task_1 = prepare_first_data_grouping(dev_task_1_dataset_list)
processed_test_data_task_1, test_labels_task_1 = prepare_first_data_grouping(test_task_1_dataset_list)

#task 2
train_task_2_dataset_list = read_dataset_from_jsonl_file(task_2_train_data_file_path) 
dev_task_2_dataset_list = read_dataset_from_jsonl_file(task_2_dev_data_file_path) 
test_task_2_dataset_list = read_dataset_from_jsonl_file(task_2_test_data_file_path) 

processed_train_data_task_2, train_labels_task_2 = prepare_first_data_grouping(train_task_2_dataset_list)
processed_dev_data_task_2, dev_labels_task_2 = prepare_first_data_grouping(dev_task_2_dataset_list)
processed_test_data_task_2, test_labels_task_2 = prepare_first_data_grouping(test_task_2_dataset_list)

KeyError: ignored

*  load the pre-trained model

In [None]:
from transformers import BertTokenizer, BertModel, AdamW       

pretrained_weights = 'distilbert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(pretrained_weights, do_lower_case=True) 
model = BertModel.from_pretrained(pretrained_weights) 

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.
You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing BertModel: ['distilbert.transformer.layer.0.output_layer_norm.weight', 'distilbert.transformer.layer.3.ffn.lin2.weight', 'distilbert.embeddings.LayerNorm.bias', 'distilbert.transformer.layer.3.sa_layer_norm.weight', 'distilbert.transformer.layer.3.attention.k_lin.weight', 'distilbert.transformer.layer.2.ffn.lin2.bias', 'distilbert.transformer.layer.3.ffn.lin2.bias', 'distilbert.transformer.layer.0.sa_layer_norm.weight', 'distilbert.transformer.layer.4.sa_layer_norm.bias', 'distilbert.transformer.layer.4.output_layer_norm.weight', 'distilbert.transformer.layer.0.attention.k_lin.weight', 'vocab_projector.weight', 'distilbert.transformer.layer.4.attention.k_lin.bias', 'distilbert.transformer.layer.3.attention.q_lin.weight', 'distilbert.transformer.layer.1.ffn.lin2.weight', 'distilbert.transformer.layer.1.attention.q_lin.bias', 'distilbert.transformer.layer.0.ffn.lin1.bias', 'distilbert.transfor

*  tokenize the samples

In [None]:
def tokenize_samples(samples):
  tokenized = []
  for sample in samples:
    original_tokenized = tokenizer.encode(sample, add_special_tokens=True)
    truncated_tokenized = _truncate_seq_pair(original_tokenized, 512)
    tokenized.append(truncated_tokenized) 

  return tokenized

tokenized_train_data_task_1 = tokenize_samples(processed_train_data_task_1)
tokenized_dev_data_task_1 = tokenize_samples(processed_dev_data_task_1)
tokenized_test_data_task_1 = tokenize_samples(processed_test_data_task_1)

tokenized_train_data_task_2 = tokenize_samples(processed_train_data_task_2)
tokenized_dev_data_task_2 = tokenize_samples(processed_dev_data_task_2)
tokenized_test_data_task_2 = tokenize_samples(processed_test_data_task_2)

Token indices sequence length is longer than the specified maximum sequence length for this model (661 > 512). Running this sequence through the model will result in indexing errors


512

*  add padding 

In [None]:
def add_padding(tokenized):
  max_len = 0
  for t in tokenized:
      if len(t) > max_len:
          max_len = len(t)

  padded = np.array([t + [0]*(max_len-len(t)) for t in tokenized])
  
  return padded

padded_train_data_task_1  = add_padding(tokenized_train_data_task_1)
padded_dev_data_task_1  = add_padding(tokenized_dev_data_task_1)
padded_test_data_task_1  = add_padding(tokenized_test_data_task_1)

padded_train_data_task_2  = add_padding(tokenized_train_data_task_2)
padded_dev_data_task_2  = add_padding(tokenized_dev_data_task_2)
padded_test_data_task_2  = add_padding(tokenized_test_data_task_2)

*  test token sequence size after padding

In [None]:
np.array(padded_train_data_task_1).shape

(2500, 512)

*  build the attention mask

In [None]:
attention_mask_train_data_task_1 = np.where(padded_train_data_task_1 != 0, 1, 0)
attention_mask_dev_data_task_1 = np.where(padded_dev_data_task_1 != 0, 1, 0)
attention_mask_test_data_task_1 = np.where(padded_test_data_task_1 != 0, 1, 0)

attention_mask_train_data_task_2 = np.where(padded_train_data_task_2 != 0, 1, 0)
attention_mask_dev_data_task_2 = np.where(padded_dev_data_task_2 != 0, 1, 0)
attention_mask_test_data_task_2 = np.where(padded_test_data_task_2 != 0, 1, 0)

(2500, 512)

*  test attention mask shape

In [None]:
attention_mask_train_data_task_1.shape

*  convert input ids and attention mask into tensors

In [None]:
input_ids_train_data_task_1 = torch.tensor(padded_train_data_task_1) 
input_ids_dev_data_task_1 = torch.tensor(padded_dev_data_task_1) 
input_ids_test_data_task_1 = torch.tensor(padded_test_data_task_1) 

input_ids_train_data_task_2 = torch.tensor(padded_train_data_task_2) 
input_ids_dev_data_task_2 = torch.tensor(padded_dev_data_task_2) 
input_ids_test_data_task_2 = torch.tensor(padded_test_data_task_2) 

attention_mask_train_data_task_1 = torch.tensor(attention_mask_train_data_task_1)
attention_mask_dev_data_task_1 = torch.tensor(attention_mask_dev_data_task_1)
attention_mask_test_data_task_1 = torch.tensor(attention_mask_test_data_task_1)

attention_mask_train_data_task_2 = torch.tensor(attention_mask_train_data_task_2)
attention_mask_dev_data_task_2 = torch.tensor(attention_mask_dev_data_task_2)
attention_mask_test_data_task_2 = torch.tensor(attention_mask_test_data_task_2)

*  prepare train, dev and test data loading

In [None]:
train_data_task_1 = TensorDataset(input_ids_train_data_task_1, attention_mask_train_data_task_1)
train_sampler_task_1 = RandomSampler(train_data_task_1)
train_dataloader_task_1 = DataLoader(train_data_task_1, sampler=train_sampler_task_1, batch_size=train_batch_size)

dev_data_task_1 = TensorDataset(input_ids_dev_data_task_1, attention_mask_dev_data_task_1)
dev_sampler_task_1 = RandomSampler(dev_data_task_1)
dev_dataloader_task_1 = DataLoader(dev_data_task_1, sampler=dev_sampler_task_1, batch_size=train_batch_size)

test_data_task_1 = TensorDataset(input_ids_test_data_task_1, attention_mask_test_data_task_1)
test_sampler_task_1 = RandomSampler(test_data_task_1)
test_dataloader_task_1 = DataLoader(test_data_task_1, sampler=test_sampler_task_1, batch_size=train_batch_size)


train_data_task_2 = TensorDataset(input_ids_train_data_task_2, attention_mask_train_data_task_2)
train_sampler_task_2 = RandomSampler(train_data_task_2)
train_dataloader_task_2 = DataLoader(train_data_task_2, sampler=train_sampler_task_2, batch_size=train_batch_size)

dev_data_task_2 = TensorDataset(input_ids_dev_data_task_2, attention_mask_dev_data_task_2)
dev_sampler_task_2 = RandomSampler(dev_data_task_2)
dev_dataloader_task_2 = DataLoader(dev_data_task_2, sampler=dev_sampler_task_2, batch_size=train_batch_size)

test_data_task_2 = TensorDataset(input_ids_test_data_task_2, attention_mask_test_data_task_2)
test_sampler_task_2 = RandomSampler(test_data_task_2)
test_dataloader_task_2 = DataLoader(test_data_task_2, sampler=test_sampler_task_2, batch_size=train_batch_size)

*  get vector representation of data

In [None]:
def getModelFeatures(train_dataloader):
  features = []
  for i in trange(int(num_train_epochs), desc="Epoch"):
    print(i)
    with torch.no_grad():
      for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
              input_ids, input_mask = batch

              outputs = model(input_ids, attention_mask=input_mask)
              outputs.last_hidden_state.numpy().shape

              if (i == num_train_epochs):
                features.append(outputs.last_hidden_state[:,0,:].numpy())
    
  return features

features_train_task_1 = getModelFeatures(train_dataloader_task_1)
features_dev_task_1 = getModelFeatures(dev_dataloader_task_1)
features_test_task_1 = getModelFeatures(test_dataloader_task_1)

features_train_task_2 = getModelFeatures(train_dataloader_task_2)
features_dev_task_2 = getModelFeatures(dev_dataloader_task_2)
features_test_task_2 = getModelFeatures(test_dataloader_task_2)

Iteration: 100%|██████████| 625/625 [33:15<00:00,  3.19s/it]


*  append feature batches 

In [None]:
def append_feature_batches(features):
  all_features = np.append(features[0], features[1], axis=0)
  for index, feature in enumerate(features):
    if (index != 0 and index != 1):
      all_features = np.append(all_features, features[index], axis=0)
  
  return all_features

all_features_train_task_1 = append_feature_batches(features_train_task_1)
all_features_dev_task_1 = append_feature_batches(features_dev_task_1)
all_features_test_task_1 = append_feature_batches(features_test_task_1)

all_features_train_task_2 = append_feature_batches(features_train_task_2)
all_features_dev_task_2 = append_feature_batches(features_dev_task_2)
all_features_test_task_2 = append_feature_batches(features_test_task_2)

2500


*  import LogRegr libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [None]:
def get_grid_search_best_parameter(features, labels):
  grid_search = GridSearchCV(LogisticRegression(max_iter = 10000), {'C': np.linspace(0.000001, 1000, 20)})
  grid_search.fit(features, labels)

  return grid_search.best_params_

C_train_task_1 = get_grid_search_best_parameter(all_features_train_task_1, train_labels_task_1)
# C_dev_task_1 = get_grid_search_best_parameter(all_features_dev_task_1, dev_labels_task_1)
# C_test_task_1 = get_grid_search_best_parameter(all_features_test_task_1, test_labels_task_1)

C_train_task_2 = get_grid_search_best_parameter(all_features_train_task_2, train_labels_task_2)
# C_dev_task_2 = get_grid_search_best_parameter(all_features_dev_task_2, dev_labels_task_2)
# C_test_task_2 = get_grid_search_best_parameter(all_features_test_task_2, test_labels_task_2)
#  print('best parameters: ', grid_search.best_params_)
#  print('best scrores: ', grid_search.best_score_)

best parameters:  {'C': 0.0001}
best scrores:  0.7978666666666667


In [None]:
lr_clf = LogisticRegression(max_iter = 3000, C = C_train_task_1)
lr_clf.fit(all_features_train_task_1, train_labels_task_1)
dev_task_1_accuracy = lr_clf.score(all_features_dev_task_1, dev_labels_task_1)
test_task_1_accuracy = lr_clf.score(all_features_test_task_1, test_labels_task_1)

print("task 1:")
print("dev:")
print(dev_task_1_accuracy)
print("test:")
print(test_task_1_accuracy)


lr_clf = LogisticRegression(max_iter = 3000, C = C_train_task_2)
lr_clf.fit(all_features_train_task_2, train_labels_task_2)
dev_task_2_accuracy = lr_clf.score(all_features_dev_task_2, dev_labels_task_2)
test_task_2_accuracy = lr_clf.score(all_features_test_task_2, test_labels_task_2)

print("task 2:")
print("dev:")
print(dev_task_2_accuracy)
print("test:")
print(test_task_2_accuracy)

LogisticRegression(C=0.0001, max_iter=3000)