In [1]:
%%capture
!pip install transformers
!pip install datasets

# Data processing

In [1]:
import pandas as pd
import os
questions = pd.read_csv("../MSR/testing_data.csv")
answers = pd.read_csv("../MSR/test_answer.csv")

In [2]:
def processfiles(files, train_dir):
    texts = []
    for i, a_file in enumerate(files):
        text = ""
        try:
            with open(os.path.join(train_dir, a_file)) as instream:
                for line in instream:
                    text += line
                texts.append(text)
        except UnicodeDecodeError:
            print(f"Unicode error for this file {a_file}")
    return texts

In [3]:
import glob
file_names = glob.glob('../MSR/Holmes_Training_Data/*')

In [4]:
texts = processfiles(file_names, '../MSR/')

Unicode error for this file ../MSR/Holmes_Training_Data/LLIFE10.TXT
Unicode error for this file ../MSR/Holmes_Training_Data/PHIL410.TXT
Unicode error for this file ../MSR/Holmes_Training_Data/ACHOE10.TXT
Unicode error for this file ../MSR/Holmes_Training_Data/MOHIC10.TXT
Unicode error for this file ../MSR/Holmes_Training_Data/DTROY10.TXT
Unicode error for this file ../MSR/Holmes_Training_Data/TNGLW10.TXT
Unicode error for this file ../MSR/Holmes_Training_Data/KRSON10.TXT
Unicode error for this file ../MSR/Holmes_Training_Data/GHROS10.TXT
Unicode error for this file ../MSR/Holmes_Training_Data/MFRND10.TXT
Unicode error for this file ../MSR/Holmes_Training_Data/HHOHG10.TXT
Unicode error for this file ../MSR/Holmes_Training_Data/WTSLW10.TXT
Unicode error for this file ../MSR/Holmes_Training_Data/HFDTR10.TXT
Unicode error for this file ../MSR/Holmes_Training_Data/TBTAS10.TXT


Creating hugging face dataset from text files

In [5]:
from transformers import AutoTokenizer
import datasets
from datasets import Dataset

In [6]:
ds_len = 128
texts_dict = {'text': [t for t in texts[:ds_len]]}

In [7]:
ds = Dataset.from_dict(texts_dict)

In [8]:
ds

Dataset({
    features: ['text'],
    num_rows: 128
})

In [9]:
ds = ds.train_test_split(test_size=0.1)

In [10]:
model_name = 'distilroberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenised_ds = ds.map(lambda batch: tokenizer(batch['text']),
                      batched=True,
                      num_proc=2,
                      remove_columns=['text'])



   

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (65336 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (91992 > 512). Running this sequence through the model will result in indexing errors


   

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (8123 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (124716 > 512). Running this sequence through the model will result in indexing errors


In [11]:
block_size = 128


def group_texts(b):
    concat_text = {k: sum(b[k], []) for k in b.keys()}
    total_length = len(concat_text[list(b.keys())[0]])
    total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i:i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concat_text.items()
    }
    result['labels'] = result['input_ids'].copy()
    return result

In [12]:
blocked_ds = tokenised_ds.map(group_texts, batched=True, num_proc=2, batch_size=1000)

   

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

   

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

In [13]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [14]:
tokenizer.convert_ids_to_tokens(50264)

'<mask>'

In [15]:
from transformers import AutoModelForMaskedLM, Trainer, TrainingArguments

trainer_model = AutoModelForMaskedLM.from_pretrained(model_name,
                                                     return_dict=True)

train_args = TrainingArguments("test-mlm",
                               evaluation_strategy='epoch',
                               learning_rate=2e-5,
                               weight_decay=0.01,
                               num_train_epochs=3,
                               per_device_train_batch_size=8,
                               save_steps=20000)

trainer = Trainer(model=trainer_model,
                  args=train_args,
                  train_dataset=blocked_ds['train'],
                  eval_dataset=blocked_ds['test'],
                  data_collator=data_collator)

In [16]:
trainer.train()

***** Running training *****
  Num examples = 104459
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 39174


Epoch,Training Loss,Validation Loss
1,2.1404,2.125037
2,2.0543,2.036787
3,1.984,2.005885


***** Running Evaluation *****
  Num examples = 11419
  Batch size = 8
Saving model checkpoint to test-mlm/checkpoint-20000
Configuration saved in test-mlm/checkpoint-20000/config.json
Model weights saved in test-mlm/checkpoint-20000/pytorch_model.bin
IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

***** Running Evaluation *****
  Num examples = 11419
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=39174, training_loss=2.125062181883033, metrics={'train_runtime': 2813.6989, 'train_samples_per_second': 111.375, 'train_steps_per_second': 13.923, 'total_flos': 1.0390155971095296e+16, 'train_loss': 2.125062181883033, 'epoch': 3.0})

In [17]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
import numpy as np
import re

choices = {'a)':1, 'b)':2, 'c)':3, 'd)':4, 'e)':5}
model_name = 'distilroberta-base'


In [18]:
class LanguageModelEvaluator():

    def __init__(self, q, a, c, mn, model):
        self.questions, self.answers, self.choices, self.model_name = q, a, c, mn
        print(len(self.questions))
        self.process_questions_and_answers()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = model
        self.sent_encodings, self.word_encodings, self.mask_idxs = self.make_encodings()

    def run_model_and_evaluate(self):
        output = self.make_predictions()
        self.accuracy = self.get_model_accuracy(output, self.questions['answer'])

    def process_questions_and_answers(self, s='_____'):
        answer_idxs, candidate_questions = [], []
        for index, row in self.questions.iterrows():
            answer = answers.iloc[index].answer + ')'
            answer_idxs.append(self.choices.get(answer))
            candidate_questions.append([re.sub(s, row.loc[c], row.loc['question']) for c in self.choices.keys()])
        self.questions.loc[:, 'candidate_questions'] = candidate_questions
        self.questions.loc[:, 'answer'] = answer_idxs

    def get_sublist_idxs_in_list(self, word, sentence):
        # find mask indicies for encoded sentence
        possibles = np.where(sentence == word[0])[0]
        for p in possibles:
            check = sentence[p:p + len(word)]
            if np.all(check == word):
                return list(range(p, (p + len(word))))

    def make_encodings(self):
        sent_encodings, word_encodings, mask_idxs = [], [], []
        for index, row in self.questions.iterrows():
            _sent_encodings, _word_encodings, _mask_idxs = [], [], []
            for i, (word, sentence) in enumerate(zip(row[self.choices.keys()], row.loc['candidate_questions'])):
                encoded_word = self.tokenizer.encode(str(" " + word), add_special_tokens=False)
                encoded_sent = self.tokenizer.encode_plus(sentence, add_special_tokens=True, return_tensors='pt',
                                                        padding='max_length', max_length=128, return_attention_mask=True)
                tokens_to_mask_idx = self.get_sublist_idxs_in_list(np.array(encoded_word), np.array(encoded_sent['input_ids'][0]))
                encoded_sent['input_ids'][0][tokens_to_mask_idx] = self.tokenizer.mask_token_id
                _sent_encodings.append(encoded_sent)
                _word_encodings.append(encoded_word)
                _mask_idxs.append(tokens_to_mask_idx)
            sent_encodings.append(_sent_encodings)
            word_encodings.append(_word_encodings)
            mask_idxs.append(_mask_idxs)
        return sent_encodings, word_encodings, mask_idxs

    def make_predictions(self):
        output = []
        for q_idx, (w, s, m) in enumerate(zip(self.word_encodings, self.sent_encodings, self.mask_idxs)):
            print(f'Question {q_idx}')
            predictions = []
            candidate_input_ids = torch.stack([inp_ids['input_ids'].squeeze(0) for inp_ids in s])
            candidate_attention_masks = torch.stack([am['attention_mask'].squeeze(0) for am in s])
            candidate_logits = self.model(candidate_input_ids, attention_mask=candidate_attention_masks).logits
            for idx, (token, mask_idxs) in enumerate(zip(w, m)):
                mask_token_logits = candidate_logits[idx, mask_idxs, token]
                candidate_score = float(torch.mean(mask_token_logits))
                predictions.append(candidate_score)
            output.append(np.argmax(predictions) + 1)
        return output 

    def get_model_accuracy(self, predictions, ground_truth):
        correct = 0
        for pred, gt in zip(predictions, ground_truth):
            if pred == gt:
                correct += 1
        return correct/len(ground_truth)



In [19]:
questions = pd.read_csv("../MSR/testing_data.csv")
answers = pd.read_csv("../MSR/test_answer.csv")
evaluator = LanguageModelEvaluator(questions, answers, choices, 'distilroberta-base', trainer.model)

1040


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/distilroberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/42d6b7c87cbac84fcdf35aa69504a5ccfca878fcee2a1a9b9ff7a3d1297f9094.aa95727ac70adfa1aaf5c88bea30a4f5e50869c68e68bce96ef1ec41b5facf46
Model config RobertaConfig {
  "_name_or_path": "distilroberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.0",
  "type_vocab_size": 1,


In [20]:
trainer.model.cpu()

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [21]:
evaluator.run_model_and_evaluate()

Question 0
Question 1
Question 2
Question 3
Question 4
Question 5
Question 6
Question 7
Question 8
Question 9
Question 10
Question 11
Question 12
Question 13
Question 14
Question 15
Question 16
Question 17
Question 18
Question 19
Question 20
Question 21
Question 22
Question 23
Question 24
Question 25
Question 26
Question 27
Question 28
Question 29
Question 30
Question 31
Question 32
Question 33
Question 34
Question 35
Question 36
Question 37
Question 38
Question 39
Question 40
Question 41
Question 42
Question 43
Question 44
Question 45
Question 46
Question 47
Question 48
Question 49
Question 50
Question 51
Question 52
Question 53
Question 54
Question 55
Question 56
Question 57
Question 58
Question 59
Question 60
Question 61
Question 62
Question 63
Question 64
Question 65
Question 66
Question 67
Question 68
Question 69
Question 70
Question 71
Question 72
Question 73
Question 74
Question 75
Question 76
Question 77
Question 78
Question 79
Question 80
Question 81
Question 82
Question 83
Qu

In [22]:
evaluator.accuracy

0.7346153846153847