<a href="https://colab.research.google.com/github/BriniMohamedAyechi/DistilBERT_QA_Squad/blob/main/Finetuning_Distillbert_on_Q%26A_Squad.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import requests

In [None]:
!pip install datasets


In [None]:
!pip install transformers

In [4]:
# exist_ok=True keeps this cell re-runnable: os.mkdir raises FileExistsError
# as soon as the directory is already there (e.g. on a second "Run all").
os.makedirs('squad', exist_ok=True)

In [5]:
# Base URL of the SQuAD dataset files; the file name is appended to it when downloading.
url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/"


In [6]:
# Download the train and dev splits into the local squad/ folder.
for file in ['train-v1.1.json', 'dev-v1.1.json']:
    # stream=True avoids buffering the whole file in memory; the timeout keeps a
    # stalled connection from hanging the cell forever.
    res = requests.get(f'{url}{file}', stream=True, timeout=60)
    # Fail loudly on 4xx/5xx instead of silently saving an error page as "JSON".
    res.raise_for_status()
    with open(f'squad/{file}', 'wb') as f:
        # 1 MiB chunks: the previous 4-byte chunks meant millions of tiny writes.
        for chunk in res.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)


In [7]:
import json

In [8]:
# Read the raw training split from disk and parse it into `squad_dict`.
with open('squad/train-v1.1.json', 'rb') as f:
    squad_dict = json.loads(f.read())

In [9]:
def read_squad(path):
    """Flatten a SQuAD-format JSON file into three parallel lists.

    Args:
        path: Path of the JSON file to read.

    Returns:
        A tuple ``(contexts, questions, answers)``. One entry is produced per
        answer, so a question with several answers repeats its context and
        question text once for each of them. Each answer is the dict taken
        unchanged from the file.
    """
    with open(path, 'rb') as handle:
        raw = json.load(handle)

    contexts, questions, answers = [], [], []
    for article in raw['data']:
        for paragraph in article['paragraphs']:
            passage_text = paragraph['context']
            for qa in paragraph['qas']:
                question_text = qa['question']
                # When a 'plausible_answers' key is present it takes precedence
                # over the regular 'answers' list.
                answer_key = 'plausible_answers' if 'plausible_answers' in qa else 'answers'
                for candidate in qa[answer_key]:
                    contexts.append(passage_text)
                    questions.append(question_text)
                    answers.append(candidate)
    return contexts, questions, answers


In [None]:
# A smaller hand-uploaded training file is used when present to speed up training.
# The notebook itself never creates that file, so fall back to the full SQuAD
# training split downloaded above to keep a fresh "Run all" working.
train_path = '/content/sample_data/train-v1.1.json'
if not os.path.exists(train_path):
    train_path = 'squad/train-v1.1.json'

train_contexts, train_questions, train_answers = read_squad(train_path)
val_contexts, val_questions, val_answers = read_squad('squad/dev-v1.1.json')

In [None]:
def add_end_idx(answers, contexts):
    """Add an exclusive character end index ('answer_end') to every answer, in place.

    Args:
        answers: List of answer dicts with 'text' and 'answer_start' keys.
        contexts: List of context strings, parallel to ``answers``.

    The annotated 'answer_start' is sometimes off by one or two characters.
    When the answer text is found one or two characters earlier than annotated,
    both 'answer_start' and 'answer_end' are shifted to the matching position.
    When no alignment matches, the annotated span is kept so that 'answer_end'
    is always present for later steps.
    """
    for answer, context in zip(answers, contexts):
        # gold_text is the answer string we expect to find in the context
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # Default to the annotated span; without this a non-matching answer
        # would be left without an 'answer_end' key at all.
        answer['answer_end'] = end_idx
        if context[start_idx:end_idx] == gold_text:
            continue

        # Try the span shifted back by one, then two, characters.
        for n in (1, 2):
            shifted_start = start_idx - n
            # A negative start would wrap around to the end of the string.
            if shifted_start >= 0 and context[shifted_start:end_idx - n] == gold_text:
                answer['answer_start'] = shifted_start
                answer['answer_end'] = end_idx - n
                break

In [None]:
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [None]:
# @title #Tokenize/encode


In [None]:
from transformers import DistilBertTokenizerFast
import torch

# Tokenizer files are cached in a local directory of our choosing.
cache_dir = './tokenizer_cache'
tokenizer = DistilBertTokenizerFast.from_pretrained(
    'distilbert-base-uncased',
    cache_dir=cache_dir,
)

# Encode each (context, question) pair with truncation and padding enabled,
# asking for PyTorch tensors ('pt') as the return type.
train_encodings = tokenizer(
    train_contexts,
    train_questions,
    truncation=True,
    padding=True,
    return_tensors='pt',
)
val_encodings = tokenizer(
    val_contexts,
    val_questions,
    truncation=True,
    padding=True,
    return_tensors='pt',
)


In [None]:
def add_token_positions(encodings, answers):
    """Convert character-level answer spans to token positions, in place.

    Args:
        encodings: Tokenizer output offering ``char_to_token(batch_index,
            char_index)`` and ``update(dict)``; it receives the new
            'start_positions' and 'end_positions' lists.
        answers: List of dicts with 'answer_start' (inclusive) and
            'answer_end' (exclusive) character indices, parallel to the
            examples in ``encodings``.

    An answer whose start character maps to no token (e.g. it was truncated
    away) gets ``tokenizer.model_max_length`` for both positions.
    """
    start_positions = []
    end_positions = []
    for i, answer in enumerate(answers):
        start_char = answer['answer_start']
        # 'answer_end' is exclusive, so the last answer character sits one
        # before it; looking up 'answer_end' itself would label the token that
        # follows the answer (e.g. trailing punctuation).
        end_char = max(answer['answer_end'] - 1, start_char)

        start_token = encodings.char_to_token(i, start_char)
        end_token = None
        if start_token is not None:
            # char_to_token returns None for characters that belong to no token
            # (such as spaces), so step back until a token is found, never
            # leaving the answer span.
            while end_token is None and end_char >= start_char:
                end_token = encodings.char_to_token(i, end_char)
                end_char -= 1

        if start_token is None or end_token is None:
            # No usable span: mark both ends with the same out-of-range value
            # instead of walking the end back into unrelated context.
            start_token = end_token = tokenizer.model_max_length

        start_positions.append(start_token)
        end_positions.append(end_token)

    # update our encodings object with the new token-based start/end positions
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# Attach token-level answer positions to both splits (each encodings object is updated in place).
for _encodings, _answers in (
    (train_encodings, train_answers),
    (val_encodings, val_answers),
):
    add_token_positions(_encodings, _answers)

In [None]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    """Map-style dataset that serves one example at a time from tokenizer output.

    ``encodings`` is a mapping from feature name to a per-example sequence or
    tensor and must contain an 'input_ids' entry, which defines the length.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return a dict of tensors holding every feature of example ``idx``."""
        item = {}
        for key, values in self.encodings.items():
            value = values[idx]
            if isinstance(value, torch.Tensor):
                # torch.tensor(<tensor>) emits a copy-construct UserWarning on
                # every item; clone().detach() is the supported way to copy.
                item[key] = value.clone().detach()
            else:
                item[key] = torch.tensor(value)
        return item

    def __len__(self):
        # Key access works for plain dicts as well as tokenizer output objects.
        return len(self.encodings['input_ids'])

# Wrap the encoded train and validation splits in SquadDataset objects.
train_dataset, val_dataset = (
    SquadDataset(split_encodings)
    for split_encodings in (train_encodings, val_encodings)
)

In [None]:
# @title Fine-Tuning


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab-only API); the fine-tuned model
# is saved under this mount point at the end of the notebook.
drive.mount('/content/drive')


In [None]:
from transformers import DistilBertForQuestionAnswering
# Load the "distilbert-base-uncased" checkpoint into the question-answering
# model class; this `model` object is what the training cell below fine-tunes.
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

In [None]:
from torch.utils.data import DataLoader
# transformers' own AdamW was deprecated and has been removed from recent
# releases; torch.optim.AdamW is the supported replacement and keeps the
# `AdamW` name available.
from torch.optim import AdamW
from tqdm import tqdm

# setup GPU/CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# move model over to detected device
model.to(device)
# activate training mode of model
model.train()
# AdamW optimizer (Adam with decoupled weight decay; torch's default decay is used)
optim = AdamW(model.parameters(), lr=5e-5)

# Data loader over the TRAINING split. This used to be built from val_dataset,
# which trained the model on the validation data.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

for epoch in range(3):
    # set model to train mode
    model.train()
    # setup loop (we use tqdm for the progress bar)
    loop = tqdm(train_loader, leave=True)
    # the label only depends on the epoch, so set it once per epoch
    loop.set_description(f'Epoch {epoch}')
    for batch in loop:
        # reset gradients accumulated in the previous step
        optim.zero_grad()
        # pull all the tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        # forward pass; passing the positions makes the model return the loss
        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)
        # the loss is the first element of the outputs
        loss = outputs[0]
        # backpropagate to compute gradients for every trainable parameter
        loss.backward()
        # update parameters
        optim.step()
        # show the current batch loss on the progress bar
        loop.set_postfix(loss=loss.item())

In [None]:
# Write the fine-tuned model and its tokenizer into one local folder for later use.
save_dir = 'bert-question-answer'
os.makedirs(save_dir, exist_ok=True)
for artefact in (model, tokenizer):
    artefact.save_pretrained(save_dir)

In [None]:
import os
from transformers import BertTokenizer, BertForSequenceClassification
from google.colab import drive

# `model` and `tokenizer` are the objects fine-tuned and created in the cells above.
new_model_name = 'bert-question-answer'

# Folder inside the mounted Google Drive that will hold the exported model
drive_path = '/content/drive/MyDrive/mymodels/'
os.makedirs(drive_path, exist_ok=True)

# drive_path ends with '/', so plain concatenation yields <folder>/<model name>
export_dir = drive_path + new_model_name
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)