<a href="https://colab.research.google.com/github/2002sairuthvik/DL_Learning/blob/main/BERT_Retraining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
%%capture
!pip install transformers
!pip install dataset

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
import pandas as pd
import os
mrscc_dir = '/content/drive/MyDrive/mrscc'
questions = pd.read_csv(os.path.join(mrscc_dir, 'testing_data.csv'))
answers = pd.read_csv(os.path.join(mrscc_dir, 'test_answer.csv'))

In [9]:
questions

Unnamed: 0,id,question,a),b),c),d),e)
0,1,I have it from the same source that you are bo...,crying,instantaneously,residing,matched,walking
1,2,It was furnished partly as a sitting and partl...,daintily,privately,inadvertently,miserably,comfortably
2,3,"As I descended , my old ally , the _____ , cam...",gods,moon,panther,guard,country-dance
3,4,"We got off , _____ our fare , and the trap rat...",rubbing,doubling,paid,naming,carrying
4,5,"He held in his hand a _____ of blue paper , sc...",supply,parcel,sign,sheet,chorus
...,...,...,...,...,...,...,...
1035,1036,The bedrooms in this _____ are on the ground f...,wing,coach,balcony,kingdom,neighbourhood
1036,1037,Our visitor bore every mark of being an averag...,blind,energetic,eloquent,pompous,sandy-haired
1037,1038,"The terror of his face lay in his eyes , howev...",cruelty,novitiate,justice,broker,success
1038,1039,"It is your commonplace , _____ crimes which ar...",underlying,featureless,theological,flattering,inevitable


In [10]:
answers

Unnamed: 0,id,answer
0,1,c
1,2,a
2,3,d
3,4,c
4,5,d
...,...,...
1035,1036,a
1036,1037,d
1037,1038,a
1038,1039,b


Applying model for MLM taks

In [7]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
import numpy as np
import re

choices = {'a)':1, 'b)':2, 'c)':3, 'd)':4, 'e)':5}
model_name = 'distilroberta-base'

In [8]:
class LanguageModelEvaluator():

  def __init__(self, q, a, c, mn):
    self.questions, self.answers, self.choices, self.model_name = q, a, c, mn
    print(len(self.questions))
    self.process_questions_and_answers()
    self.tokenizer = AutoTokenizer.from_pretrained(model_name)
    self.model = AutoModelForMaskedLM.from_pretrained(model_name)
    self.sent_encodings, self.word_encodings, self.mask_idxs = self.make_encodings()

  def run_model_and_evaluate(self):
    output = self.make_predictions()
    self.accuracy = self.get_model_accuracy(output, self.questions['answer'])

  def process_questions_and_answers(self, s='_____'):
    answer_idxs, candidate_questions = [], []
    for index, row in self.questions.iterrows():
      answer = answers.iloc[index].answer + ')'
      answer_idxs.append(self.choices.get(answer))
      candidate_questions.append([re.sub(s, row.loc[c], row.loc['question']) for c in self.choices.keys()])
    self.questions.loc[:, 'candidate_questions'] = candidate_questions
    self.questions.loc[:, 'answer'] = answer_idxs

  def get_sublist_idxs_in_list(self, word, sentence):
    # find mask indicies for encoded sentence
    possibles = np.where(sentence == word[0])[0]
    for p in possibles:
      check = sentence[p:p + len(word)]
      if np.all(check == word):
          return list(range(p, (p + len(word))))

  def make_encodings(self):
    sent_encodings, word_encodings, mask_idxs = [], [], []
    for index, row in self.questions.iterrows():
        _sent_encodings, _word_encodings, _mask_idxs = [], [], []
        for i, (word, sentence) in enumerate(zip(row[self.choices.keys()], row.loc['candidate_questions'])):
          encoded_word = self.tokenizer.encode(str(" " + word), add_special_tokens=False)
          encoded_sent = self.tokenizer.encode_plus(sentence, add_special_tokens=True, return_tensors='pt',
                                                padding='max_length', max_length=128, return_attention_mask=True)
          tokens_to_mask_idx = self.get_sublist_idxs_in_list(np.array(encoded_word), np.array(encoded_sent['input_ids'][0]))
          encoded_sent['input_ids'][0][tokens_to_mask_idx] = self.tokenizer.mask_token_id
          _sent_encodings.append(encoded_sent)
          _word_encodings.append(encoded_word)
          _mask_idxs.append(tokens_to_mask_idx)
        sent_encodings.append(_sent_encodings)
        word_encodings.append(_word_encodings)
        mask_idxs.append(_mask_idxs)
    return sent_encodings, word_encodings, mask_idxs

  def make_predictions(self):
    output = []
    for q_idx, (w, s, m) in enumerate(zip(self.word_encodings, self.sent_encodings, self.mask_idxs)):
      print(f'Question {q_idx}')
      predictions = []
      candidate_input_ids = torch.stack([inp_ids['input_ids'].squeeze(0) for inp_ids in s])
      candidate_attention_masks = torch.stack([am['attention_mask'].squeeze(0) for am in s])
      candidate_logits = self.model(candidate_input_ids, attention_mask=candidate_attention_masks).logits
      for idx, (token, mask_idxs) in enumerate(zip(w, m)):
        mask_token_logits = candidate_logits[idx, mask_idxs, token]
        candidate_score = float(torch.mean(mask_token_logits))
        predictions.append(candidate_score)
      output.append(np.argmax(predictions) + 1)
    return output

  def get_model_accuracy(self, predictions, ground_truth):
    correct = 0
    for pred, gt in zip(predictions, ground_truth):
      if pred == gt:
        correct += 1
    return correct/len(ground_truth)


In [9]:
evaluator = LanguageModelEvaluator(questions[:100],answers,choices,model_name)

100


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.questions.loc[:, 'candidate_questions'] = candidate_questions
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.questions.loc[:, 'answer'] = answer_idxs
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended b

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
evaluator.get_model_accuracy(evaluator.make_predictions(), evaluator.questions['answer'])

Question 0
Question 1
Question 2
Question 3
Question 4
Question 5
Question 6
Question 7
Question 8
Question 9
Question 10
Question 11
Question 12
Question 13
Question 14
Question 15
Question 16
Question 17
Question 18
Question 19
Question 20
Question 21
Question 22
Question 23
Question 24
Question 25
Question 26
Question 27
Question 28
Question 29
Question 30
Question 31
Question 32
Question 33
Question 34
Question 35
Question 36
Question 37
Question 38
Question 39
Question 40
Question 41
Question 42
Question 43
Question 44
Question 45
Question 46
Question 47
Question 48
Question 49
Question 50
Question 51
Question 52
Question 53
Question 54
Question 55
Question 56
Question 57
Question 58
Question 59
Question 60
Question 61
Question 62
Question 63
Question 64
Question 65
Question 66
Question 67
Question 68
Question 69
Question 70
Question 71
Question 72
Question 73
Question 74
Question 75
Question 76
Question 77
Question 78
Question 79
Question 80
Question 81
Question 82
Question 83
Qu

0.68

In [8]:
def processfiles(files, train_dir):
  texts = []
  for i, a_file in enumerate(files):
    text = ""
    try:
      with open(os.path.join(train_dir,a_file)) as instream:
        for line in instream:
          text += line
          texts.append(text)
    except UnicodeDecodeError:
      print(f"Unicode error for this file: {a_file}")
  return texts

In [9]:
import glob
file_names = glob.glob('/content/drive/MyDrive/Holmes_Training_Data/*')
sample_files = file_names[:100]

In [10]:
texts = processfiles(sample_files, mrscc_dir)

Unicode error for this file: /content/drive/MyDrive/Holmes_Training_Data/TNGLW10.TXT
Unicode error for this file: /content/drive/MyDrive/Holmes_Training_Data/WTSLW10.TXT


In [12]:
!pip install datasets

Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-4.0.0-py3-none-any.whl (494 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.8/494.8 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (

In [13]:
from transformers import AutoTokenizer
import datasets
from datasets import Dataset

In [14]:
ds_len = 5
texts_dict = {'text':[t for t in texts[:ds_len]]}

In [15]:
ds = Dataset.from_dict(texts_dict)

In [16]:
ds

Dataset({
    features: ['text'],
    num_rows: 5
})

In [19]:
model_name = 'distilroberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenized_ds = ds.map(lambda batch : tokenizer(batch['text'], remove_columns=['text']), batched=True, num_proc=2)

Map (num_proc=2):   0%|          | 0/5 [00:00<?, ? examples/s]

TypeError: PreTrainedTokenizerFast._batch_encode_plus() got an unexpected keyword argument 'remove_columns'

In [None]:
block_size = 128

def group_texts(text):
  concat_text = {k:sum(texts[k],[]) for k in texts.keys()}
  total_length = len(concat_text[list(text.keys())[0]])
  total_length = (total_length // block_size) * block_size
  result = {k:concat_text[k][:total_length] for k in concat_text.keys()}
  result['labels'] = result['input_ids'].copy()
  return result

In [20]:
lm_dataset = tokenized_ds.map(group_texts, batched=True, num_proc=2)

NameError: name 'tokenized_ds' is not defined