In [None]:
import json

In [None]:
import os
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/gdrive.
drive.mount('/content/gdrive')
import sys
# NOTE(review): sys.path is the list of directories searched by `import`;
# appending the path of a JSON file does not make open('training_set.json')
# find it. The path also does not start with the '/content/gdrive' mount
# point used above — TODO confirm where training_set.json actually lives.
sys.path.append('drive/gdrive/MyDrive/training_set.json')

In [None]:
# Parse the training file (opened in binary mode) into a Python object.
with open('training_set.json', 'rb') as squad_file:
    squad = json.load(squad_file)

UNDERSTANDING THE DATA STRUCTURE AND TYPE

In [None]:
# Type of the top-level object parsed from the JSON file.
type(squad)

In [None]:
# Top-level keys of the parsed JSON object.
squad.keys()

In [None]:
# Summarise the top-level structure (key -> type name) instead of echoing the
# entire dataset, which would flood the notebook output.
{key: type(value).__name__ for key, value in squad.items()}

In [None]:
# Type of the value stored under the 'data' key.
type(squad['data'])

In [None]:
# Number of entries under the 'data' key.
print(len(squad['data']))

In [None]:
# Preview a few entries (title and paragraph count) rather than dumping the
# whole 'data' list into the notebook output.
[(article['title'], len(article['paragraphs'])) for article in squad['data'][:5]]

In [None]:
# Type of a single element of the 'data' list.
type(squad['data'][0])

In [None]:
# Keys available on a single element of the 'data' list.
squad['data'][0].keys()

In [None]:
# Type of the 'title' value. Inspect the value itself: indexing a string with
# [0] only yields its first character (also a str) and fails on an empty
# title. Element 0 is used to match the other inspection cells.
type(squad['data'][0]['title'])

In [None]:
# Type of a single element of the 'paragraphs' list.
type(squad['data'][0]['paragraphs'][0])

In [None]:
# Keys available on a single paragraph entry.
squad['data'][0]['paragraphs'][0].keys()

In [None]:
# Type of the paragraph's 'context' value itself; indexing it with [0] would
# only report the type of its first character.
type(squad['data'][0]['paragraphs'][0]['context'])

In [None]:
# Type of a single element of the paragraph's 'qas' list.
type(squad['data'][0]['paragraphs'][0]['qas'][0])

In [None]:
# Keys available on a single question/answer entry.
squad['data'][0]['paragraphs'][0]['qas'][0].keys()

In [None]:
# Type of a single element of the 'answers' list of the first question.
type(squad['data'][0]['paragraphs'][0]['qas'][0] ['answers'][0])

In [None]:
# Keys available on a single answer entry.
squad['data'][0]['paragraphs'][0]['qas'][0] ['answers'][0].keys()

In [None]:
# Type of the question's 'id' value itself; indexing it with [0] would only
# report the type of its first character.
type(squad['data'][0]['paragraphs'][0]['qas'][0]['id'])

In [None]:
# Type of the 'question' value itself; indexing it with [0] would only report
# the type of its first character.
type(squad['data'][0]['paragraphs'][0]['qas'][0]['question'])

CONSOLIDATION OF DATA TYPE

# POINT 1 : 'squad' is a complicated dictionary with two keys 'data' and 'version'. 

# POINT 2 : For the work at hand, the value of the key 'version' plays no role at all.

# POINT 3:  All of the analysis that follows revolves around the value corresponding to the 'data' key.

# POINT 4: The value corresponding to 'data' key is itself a list comprising 442 elements. 
#          Since we need to split the given data set into training and validation, it is at the 'data' key level we possibly
#          have to do the splitting.

# POINT 5: Each element of the 'data' list is itself a dictionary having two keys,
#          'title' and 'paragraphs'.

# POINT 6: 'title' is of type string whereas 'paragraphs' is a list of dictionaries, each with two keys 'context' and 'qas'.

# POINT 7:  'context' is of type string and 'qas' is a list of dictionaries.

# POINT 8: Each dictionary in 'qas' contains three keys, namely 'answers', 'question' and 'id'. 'id' and 'question' are strings
#          whereas 'answers' is a list of dictionaries, each with keys 'answer_start' and 'text'.

# POINT 9 : Also note that the original data set does not have any answer_end index, which is required when we use BERT or its variants for building the Q/A application.


Extracting the Contexts, Questions and Answers

In [None]:
# Refer : https://huggingface.co/transformers/v4.0.1/custom_datasets.html
import json
from pathlib import Path

def read_squad(path):
    """Load a SQuAD-format JSON file and flatten it into three parallel lists.

    One entry is produced per answer, so a context (and a question) is
    repeated once for every answer attached to it. Questions without any
    answers contribute nothing.

    Args:
        path: location of the JSON file (str or path-like).

    Returns:
        Tuple ``(contexts, questions, answers)`` of equal-length lists, where
        ``answers`` holds the answer dicts exactly as stored in the file.
    """
    with Path(path).open('rb') as squad_file:
        squad_dict = json.load(squad_file)

    contexts, questions, answers = [], [], []
    for article in squad_dict['data']:
        for paragraph in article['paragraphs']:
            paragraph_text = paragraph['context']
            for qa_entry in paragraph['qas']:
                question_text = qa_entry['question']
                gold_answers = qa_entry['answers']
                # Repeat the context and question once per answer so that the
                # three lists stay aligned index by index.
                contexts.extend([paragraph_text] * len(gold_answers))
                questions.extend([question_text] * len(gold_answers))
                answers.extend(gold_answers)

    return contexts, questions, answers

# Flatten the training file into parallel context / question / answer lists.
train_contexts, train_questions, train_answers = read_squad('training_set.json')

In [None]:
# Print the 'title' of every entry in the 'data' list, one per line.
for group in squad['data']:
    print(group['title'])

Printing the Contexts, Questions and Answers

In [None]:
# Note the repetition in the contexts list: read_squad appends the same
# context string once for every answer extracted from it.
train_contexts[:10]

In [None]:
# First ten answer dicts, aligned by index with train_contexts.
train_answers[:10]

In [None]:
# First ten questions, aligned by index with train_contexts and train_answers.
train_questions[:10]

In [None]:
# Number of context entries (one per extracted answer).
len(train_contexts)

In [None]:
# Number of question entries; should equal len(train_contexts).
len(train_questions)

In [None]:
# Number of answer entries; should equal len(train_contexts).
len(train_answers)

Computing the end index (To understand end of answers)

In [None]:
# Refer : https://huggingface.co/transformers/v4.0.1/custom_datasets.html
 
def add_end_index(answers, contexts):
    """Add an exclusive 'answer_end' character index to every answer, in place.

    The end index is ``answer_start + len(text)``. The answer text is also
    checked against its context: according to the Hugging Face tutorial this
    function is based on, SQuAD answer offsets are sometimes off by one or two
    characters. When the text is found one or two characters earlier than the
    stored offset, 'answer_start' is corrected as well. When the text cannot
    be located at any of those offsets, the stored start is kept and
    'answer_end' is still derived from it, so every answer always receives an
    'answer_end' key.

    Args:
        answers: list of dicts with 'text' and 'answer_start' keys; each dict
            is mutated.
        contexts: list of context strings, parallel to ``answers``.

    Returns:
        None.
    """
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_index = start_idx + len(gold_text)

        # Try the stored offset first, then one and two characters earlier.
        for shift in (0, 1, 2):
            shifted_start = start_idx - shift
            # Skip negative starts: they would wrap around to the string's end.
            if shifted_start < 0:
                break
            if context[shifted_start:end_index - shift] == gold_text:
                start_idx = shifted_start
                end_index = end_index - shift
                break

        answer['answer_start'] = start_idx
        answer['answer_end'] = end_index

In [None]:
# Add 'answer_end' to every training answer (mutates the dicts in train_answers).
add_end_index(train_answers,train_contexts)

In [None]:
# Check that the first ten answer dicts now carry an 'answer_end' key.
print(train_answers[:10])

Installing Transformers

In [None]:
! pip install transformers

Performing Encoding

In [None]:
from transformers import DistilBertTokenizerFast
# Load the fast tokenizer for the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
# Encode each (context, question) pair with padding and truncation enabled.
train_encodings = tokenizer(train_contexts,train_questions,padding=True,truncation=True)

In [None]:
#Refer : https://huggingface.co/docs/transformers/preprocessing
# Decode the first encoded example back to text to inspect what the tokenizer produced.
tokenizer.decode(train_encodings['input_ids'][0])

In [None]:
# Decode the encoded example at index 5 back to text for a second look.
tokenizer.decode(train_encodings['input_ids'][5])

ADD TOKEN POSITIONS

In [None]:
#Refer : https://huggingface.co/docs/transformers/preprocessing
def add_token_positions(encodings, answers):
    """Convert character-level answer spans to token positions, in place.

    For example ``i`` the token containing character ``answer_start`` and the
    token containing the last answer character (``answer_end - 1``) are looked
    up with ``encodings.char_to_token``. A lookup that returns ``None``
    (presumably because the answer was truncated away) is replaced by the
    module-level ``tokenizer.model_max_length``.

    Args:
        encodings: object exposing ``char_to_token(batch_index, char_index)``
            and ``update(dict)``; it gains 'start_positions' and
            'end_positions' entries.
        answers: list of dicts with 'answer_start' and 'answer_end' keys,
            aligned by index with the encoded examples.

    Returns:
        None.
    """
    start_positions, end_positions = [], []
    for example_idx in range(len(answers)):
        span = answers[example_idx]
        start_token = encodings.char_to_token(example_idx, span['answer_start'])
        # 'answer_end' is exclusive, so the last answer character is one before it.
        end_token = encodings.char_to_token(example_idx, span['answer_end'] - 1)

        # A None lookup means no token covers that character.
        start_positions.append(
            tokenizer.model_max_length if start_token is None else start_token
        )
        end_positions.append(
            tokenizer.model_max_length if end_token is None else end_token
        )

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# Attach start/end token positions to the training encodings (updates train_encodings in place).
add_token_positions(train_encodings, train_answers)

In [None]:
# The encodings should now also list 'start_positions' and 'end_positions'.
train_encodings.keys()

In [None]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    """Torch dataset view over tokenizer encodings.

    ``encodings`` must offer ``items()`` yielding ``(field, per-example
    values)`` pairs and expose an ``input_ids`` attribute, whose length is
    used as the dataset size.
    """

    def __init__(self, encodings):
        # Keep a reference only; tensors are created lazily per item.
        self.encodings = encodings

    def __len__(self):
        """Return the number of encoded examples."""
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict mapping each field to a tensor."""
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        return example

# Wrap the training encodings in a torch Dataset.
train_dataset = SquadDataset(train_encodings)

In [None]:
from transformers import DistilBertForQuestionAnswering
# Load a DistilBERT question-answering model from the 'distilbert-base-uncased' checkpoint.
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")