In [None]:
%%capture
!pip install transformers
!pip install dataset

In [None]:
# Mount Google Drive at /content/drive (requires the Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import os
# Folder on the mounted Drive that holds the two CSV files read below.
mrscc_dir = '/content/drive/MyDrive/mrscc'
# Read the question sheet first, then the answer key.
questions, answers = (
    pd.read_csv(os.path.join(mrscc_dir, csv_name))
    for csv_name in ('testing_data.csv', 'test_answer.csv')
)

In [None]:
# Display the DataFrame read from testing_data.csv.
questions

In [None]:
# Display the DataFrame read from test_answer.csv.
answers

Applying the model to the masked language modelling (MLM) task

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
import numpy as np
import re

# Option column labels paired with their 1-based option numbers.
choices = dict(zip(('a)', 'b)', 'c)', 'd)', 'e)'), range(1, 6)))
# Checkpoint name handed to the tokenizer/model loaders.
model_name = 'distilroberta-base'

In [None]:
class LanguageModelEvaluator():
  """Scores multiple-choice sentence-completion questions with a masked LM.

  For every question, each candidate word is substituted into the blank, the
  word's tokens are replaced by the mask token, and the model's logits for the
  original tokens at the masked positions are averaged. The candidate with the
  highest mean logit is the prediction.

  Args:
    q: DataFrame with a 'question' column and one column per key of `c`.
    a: DataFrame with an 'answer' column holding the correct option letter;
      rows are looked up positionally using the index labels of `q`.
    c: dict mapping option column labels (e.g. 'a)') to option numbers.
    mn: checkpoint name given to AutoTokenizer / AutoModelForMaskedLM.
    max_length: token length every encoded sentence is padded/truncated to.
  """

  def __init__(self, q, a, c, mn, max_length=128):
    # Work on a copy: columns are added below and `q` is often a slice of a
    # larger frame, which would otherwise raise SettingWithCopyWarning.
    self.questions, self.answers, self.choices, self.model_name = q.copy(), a, c, mn
    self.max_length = max_length
    print(len(self.questions))
    self.process_questions_and_answers()
    # Load the checkpoint passed to the constructor, not a notebook global.
    self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
    self.model = AutoModelForMaskedLM.from_pretrained(self.model_name)
    self.sent_encodings, self.word_encodings, self.mask_idxs = self.make_encodings()

  def run_model_and_evaluate(self):
    """Predict every question, store the accuracy on `self.accuracy` and return it."""
    output = self.make_predictions()
    self.accuracy = self.get_model_accuracy(output, self.questions['answer'])
    return self.accuracy

  def process_questions_and_answers(self, s='_____'):
    """Add 'candidate_questions' and 'answer' columns to `self.questions`.

    'candidate_questions' holds, per row, the question with the blank filled by
    each option in the order of `self.choices`; 'answer' holds the option
    number of the correct letter (None when the letter is not in `self.choices`).

    Args:
      s: regex pattern marking the blank inside each question.
    """
    answer_idxs, candidate_questions = [], []
    for index, row in self.questions.iterrows():
      # Use the answers given to the constructor, not a notebook global.
      answer = self.answers.iloc[index].answer + ')'
      answer_idxs.append(self.choices.get(answer))
      # A callable replacement inserts the word literally, so a backslash in a
      # candidate is not interpreted as a regex group reference.
      candidate_questions.append(
          [re.sub(s, lambda _match, word=row.loc[c]: word, row.loc['question'])
           for c in self.choices.keys()])
    self.questions['candidate_questions'] = candidate_questions
    self.questions['answer'] = answer_idxs

  def get_sublist_idxs_in_list(self, word, sentence):
    """Return the positions of the first occurrence of `word` in `sentence`.

    Args:
      word: 1-D numpy array of token ids to look for.
      sentence: 1-D numpy array of token ids to search in.

    Returns:
      List of consecutive indices covering the match, or None when `word` is
      empty or does not occur in `sentence`.
    """
    if len(word) == 0:
      return None
    possibles = np.where(sentence == word[0])[0]
    for p in possibles:
      check = sentence[p:p + len(word)]
      # A window cut short by the end of the sentence cannot be a match; without
      # the length check numpy would broadcast the shorter window.
      if len(check) == len(word) and np.all(check == word):
        return list(range(p, (p + len(word))))
    return None

  def _find_word_tokens(self, word, sent_ids):
    """Return (token ids of `word`, their positions in `sent_ids`).

    The word is encoded with a leading space first and, if that is not found,
    without one, so a word that opens the sentence can still be located.

    Raises:
      ValueError: if neither encoding occurs in `sent_ids`.
    """
    for text in (" " + word, word):
      encoded_word = self.tokenizer.encode(text, add_special_tokens=False)
      idxs = self.get_sublist_idxs_in_list(np.array(encoded_word), sent_ids)
      if idxs is not None:
        return encoded_word, idxs
    # Indexing the tensor with None would silently mask the whole sequence.
    raise ValueError(f'Could not locate the tokens of {word!r} in its encoded sentence')

  def make_encodings(self):
    """Tokenize every candidate sentence and mask the candidate word in it.

    Returns:
      Three lists with one entry per question, each entry holding one item per
      candidate: the encoded sentences (candidate word replaced by the mask
      token), the candidate word token ids, and the masked positions.

    Raises:
      ValueError: if a candidate word cannot be found in its encoded sentence.
    """
    sent_encodings, word_encodings, mask_idxs = [], [], []
    option_columns = list(self.choices.keys())
    for _, row in self.questions.iterrows():
      _sent_encodings, _word_encodings, _mask_idxs = [], [], []
      for word, sentence in zip(row[option_columns], row.loc['candidate_questions']):
        # truncation=True keeps every tensor at max_length so that the
        # candidates of a question can be stacked into one batch.
        encoded_sent = self.tokenizer.encode_plus(sentence, add_special_tokens=True, return_tensors='pt',
                                                  padding='max_length', truncation=True,
                                                  max_length=self.max_length, return_attention_mask=True)
        encoded_word, tokens_to_mask_idx = self._find_word_tokens(
            word, np.array(encoded_sent['input_ids'][0]))
        encoded_sent['input_ids'][0][tokens_to_mask_idx] = self.tokenizer.mask_token_id
        _sent_encodings.append(encoded_sent)
        _word_encodings.append(encoded_word)
        _mask_idxs.append(tokens_to_mask_idx)
      sent_encodings.append(_sent_encodings)
      word_encodings.append(_word_encodings)
      mask_idxs.append(_mask_idxs)
    return sent_encodings, word_encodings, mask_idxs

  def make_predictions(self):
    """Return, per question, the 1-based position of the best-scoring candidate."""
    output = []
    for q_idx, (w, s, m) in enumerate(zip(self.word_encodings, self.sent_encodings, self.mask_idxs)):
      print(f'Question {q_idx}')
      predictions = []
      candidate_input_ids = torch.stack([inp_ids['input_ids'].squeeze(0) for inp_ids in s])
      candidate_attention_masks = torch.stack([am['attention_mask'].squeeze(0) for am in s])
      # Inference only: no autograd graph is needed.
      with torch.no_grad():
        candidate_logits = self.model(candidate_input_ids, attention_mask=candidate_attention_masks).logits
      for idx, (token, mask_idxs) in enumerate(zip(w, m)):
        # Pairs every masked position with the token id that was masked there.
        mask_token_logits = candidate_logits[idx, mask_idxs, token]
        candidate_score = float(torch.mean(mask_token_logits))
        predictions.append(candidate_score)
      output.append(np.argmax(predictions) + 1)
    return output

  def get_model_accuracy(self, predictions, ground_truth):
    """Return the fraction of `predictions` equal to `ground_truth` (0.0 if empty)."""
    total = len(ground_truth)
    if total == 0:
      return 0.0
    correct = sum(1 for pred, gt in zip(predictions, ground_truth) if pred == gt)
    return correct / total


In [None]:
# Build the evaluator on the first 100 questions only; the constructor also
# loads the tokenizer/model and encodes every candidate sentence.
evaluator = LanguageModelEvaluator(questions[:100],answers,choices,model_name)

In [None]:
# Score every encoded question and compare the predictions with the stored answer numbers.
evaluator.get_model_accuracy(evaluator.make_predictions(), evaluator.questions['answer'])

In [None]:
def processfiles(files, train_dir, encoding=None):
  """Read each file under `train_dir` and return one string per file.

  Args:
    files: iterable of file names, each joined onto `train_dir`.
    train_dir: directory the file names are relative to.
    encoding: text encoding passed to `open`; None uses the platform default.

  Returns:
    List with the full contents of every file that could be decoded, in the
    order given. Files raising UnicodeDecodeError are reported and skipped.
  """
  texts = []
  for a_file in files:
    try:
      with open(os.path.join(train_dir, a_file), encoding=encoding) as instream:
        text = instream.read()
    except UnicodeDecodeError:
      print(f"Unicode error for this file: {a_file}")
      continue
    # Append once per file: appending inside a per-line loop would store every
    # growing prefix of the file as a separate text.
    texts.append(text)
  return texts

In [None]:
import glob
# glob returns paths in arbitrary order; sort them so the 100-file sample is
# the same on every run.
file_names = sorted(glob.glob('/content/drive/MyDrive/Holmes_Training_Data/*'))
sample_files = file_names[:100]

In [None]:
# NOTE(review): sample_files come from glob on an absolute pattern, so they are
# already absolute paths and os.path.join inside processfiles discards
# mrscc_dir; the files are read from Holmes_Training_Data, not from mrscc_dir.
texts = processfiles(sample_files, mrscc_dir)

In [None]:
!pip install datasets

In [None]:
from transformers import AutoTokenizer
import datasets
from datasets import Dataset

In [None]:
# Number of texts placed in the dataset.
ds_len = 5
# Single-column mapping in the shape Dataset.from_dict expects.
texts_dict = {'text': list(texts[:ds_len])}

In [None]:
# Build a datasets.Dataset with a single 'text' column from the dict above.
ds = Dataset.from_dict(texts_dict)

In [None]:
# Display the dataset summary.
ds

In [None]:
model_name = 'distilroberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# remove_columns is an argument of Dataset.map, not of the tokenizer call: it
# drops the raw 'text' column so only the tokenizer outputs remain.
tokenized_ds = ds.map(lambda batch: tokenizer(batch['text']), batched=True, num_proc=2,
                      remove_columns=['text'])

In [None]:
block_size = 128

def group_texts(text):
  """Concatenate a batch of tokenized examples and cut it into fixed-size blocks.

  Args:
    text: dict mapping each feature name (must include 'input_ids') to a list
      of per-example token lists, as supplied by a batched `Dataset.map`.

  Returns:
    Dict with the same keys plus 'labels' (a copy of 'input_ids'); every value
    is a list of blocks of exactly `block_size` tokens. Tokens that do not
    fill a whole block at the end of the batch are dropped.
  """
  # Flatten the per-example lists of every feature of the batch argument (not
  # the notebook-level `texts` list) into one long list per feature.
  concat_text = {k: [token for example in text[k] for token in example] for k in text.keys()}
  total_length = len(concat_text[list(text.keys())[0]])
  # Round down to a whole number of blocks.
  total_length = (total_length // block_size) * block_size
  # Split into blocks; returning the flat list would make each token its own row.
  result = {
      k: [tokens[i:i + block_size] for i in range(0, total_length, block_size)]
      for k, tokens in concat_text.items()
  }
  result['labels'] = result['input_ids'].copy()
  return result

In [None]:
# Apply group_texts to the tokenized dataset batch by batch, using two worker processes.
lm_dataset = tokenized_ds.map(group_texts, batched=True, num_proc=2)