In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
import json
from pathlib import Path

In [2]:
from google.colab import files
files.upload()

Saving train_bangla_samples_fixed_preprocessed.json to train_bangla_samples_fixed_preprocessed.json
Saving valid_bangla_samples_fixed_preprocessed.json to valid_bangla_samples_fixed_preprocessed.json


In [2]:
import json
from pathlib import Path

def read_squad(path):
    path = Path(path)
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
    return contexts, questions, answers                

In [3]:
train_contexts, train_questions, train_answers = read_squad('/content/train_bangla_samples_fixed_preprocessed.json')
val_contexts, val_questions, val_answers = read_squad('/content/valid_bangla_samples_fixed_preprocessed.json')

In [4]:
train_answers[50]

{'answer_start': 1014, 'text': ' শহরগুলি এ'}

In [4]:
def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

       
        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        elif context[start_idx-1:end_idx-1] == gold_text:
            answer['answer_start'] = start_idx - 1
            answer['answer_end'] = end_idx - 1   
        elif context[start_idx-2:end_idx-2] == gold_text:
            answer['answer_start'] = start_idx - 2
            answer['answer_end'] = end_idx - 2

In [5]:
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [6]:
from transformers import AutoTokenizer
import torch
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-uncased")

In [7]:
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True,return_tensors='pt',max_length=50)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True,return_tensors='pt',max_length=50)

In [8]:
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end']))

        # if start position is None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})


In [9]:
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [10]:
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask'])

In [10]:
class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)



In [11]:
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [12]:
from transformers import  BertForQuestionAnswering
model = BertForQuestionAnswering.from_pretrained('bert-base-multilingual-uncased',return_dict=True,output_attentions=True)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-b

In [13]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

In [14]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(1):
    model.train()
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs[0]
        loss.backward()
        optim.step()
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  
Epoch 0: 100%|██████████| 2121/2121 [04:20<00:00,  8.13it/s, loss=1.38]


In [15]:
model_path='/content/models'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('/content/models/tokenizer_config.json',
 '/content/models/special_tokens_map.json',
 '/content/models/vocab.txt',
 '/content/models/added_tokens.json')

In [17]:
model_path = '/content/models'
model = BertForQuestionAnswering.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)


In [18]:
model = model.to(device)
model.eval()
val_loader = DataLoader(val_dataset, batch_size=16)
acc = []
for batch in val_loader:
    with torch.no_grad():
         input_ids = batch['input_ids'].to(device)
         attention_mask = batch['attention_mask'].to(device)
         start_true = batch['start_positions'].to(device)
         end_true = batch['end_positions'].to(device)
         outputs = model(input_ids, attention_mask=attention_mask)
         start_pred = torch.argmax(outputs['start_logits'], dim=1)
         end_pred = torch.argmax(outputs['end_logits'], dim=1)
         acc.append(((start_pred == start_true).sum()/len(start_pred)).item())
         acc.append(((end_pred == end_true).sum()/len(end_pred)).item())
acc = sum(acc)/len(acc)
print(acc)

  


0.07952724358974358


In [19]:
for contxt,quas in zip(val_contexts,val_questions):
    val_encodings=tokenizer(contxt, quas, truncation=True, padding=True)
    with torch.no_grad():
         outputs = model(input_ids, attention_mask=attention_mask)
         start_pred = torch.argmax(outputs['start_logits'], dim=1)
         end_pred = torch.argmax(outputs['end_logits'], dim=1)
         all_tokens = tokenizer.convert_ids_to_tokens(val_encodings["input_ids"])
         answer = ' '.join(all_tokens[start_pred[0]:end_pred[0]])
         print("The qusetion is :",quas)
         print("The answer is :",answer)
         print('*'*50)

The qusetion is : চার্লসটন, দক্ষিণ ক্যারোলিনা কোন কাউন্টিতে অবস্থিত?  
The answer is : 
**************************************************
The qusetion is : চার্লসটন কোন বন্দরে অবস্থিত?  
The answer is : 
**************************************************
The qusetion is : চার্লসটন হারবার কোন মহাসাগরের খাঁড়ি?  
The answer is : 
**************************************************
The qusetion is : কোপার নদীর সাথে কোন নদী মিশে গিয়ে চার্লস্টন হারবার গঠন করে?  
The answer is : 
**************************************************
The qusetion is : চার্লসটন কোন কাউন্টিতে অবস্থিত?  
The answer is : 
**************************************************
The qusetion is : চার্লসটন হারবার কোন সমুদ্রের উপর গঠিত?  
The answer is : 
**************************************************
The qusetion is : চার্লসটন হারবার থেকে অ্যাশলে নদীর সাথে কোন নদী মিশে গেছে?  
The answer is : 
**************************************************
The qusetion is : চার্লসটন কোন সালে প্রতিষ্ঠিত হয়েছিল?  
The answer is : 
*

KeyboardInterrupt: ignored