# prepare the dataset

In [1]:
from datasets import load_dataset

raw_datasets = load_dataset('squad')

Downloading builder script:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.67k [00:00<?, ?B/s]

Downloading and preparing dataset squad/plain_text to C:/Users/Ethan/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset squad downloaded and prepared to C:/Users/Ethan/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [8]:
raw_datasets['train'][0]['answers']

{'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}

In [9]:
print("Context: ", raw_datasets["train"][0]["context"])
print("Question: ", raw_datasets["train"][0]["question"])
print("Answer: ", raw_datasets["train"][0]["answers"])

Context:  Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.
Question:  To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Answer:  {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}


In [10]:
#make sure only one answer for each question
raw_datasets['train'].filter(lambda x: len(x['answers']['text']) != 1)

Filter:   0%|          | 0/87599 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 0
})

In [12]:
# Unlike the training split, a validation example can carry several reference answers
raw_datasets['validation'][2]['answers']

{'text': ['Santa Clara, California',
  "Levi's Stadium",
  "Levi's Stadium in the San Francisco Bay Area at Santa Clara, California."],
 'answer_start': [403, 355, 355]}

# preprocess the data

### convert text

In [13]:
from transformers import AutoTokenizer

model_checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [14]:
tokenizer.is_fast

True

In [20]:
# trial tokenize the inputs for question and context
context = raw_datasets["train"][0]["context"]
question = raw_datasets["train"][0]["question"]

inputs = tokenizer(question, context)
tokenizer.decode(inputs["input_ids"])

'[CLS] To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? [SEP] Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend " Venite Ad Me Omnes ". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive ( and in a direct line that connects through 3 statues and the Gold Dome ), is a simple, modern stone statue of Mary. [SEP]'

### since the text might be longer than the model's limit, we use a sliding window to split the context into overlapping features, with max length 100 and stride 50

In [21]:
# Tokenize one (question, context) pair into several overlapping windows.
inputs = tokenizer(
    question,
    context,
    max_length = 100,  # upper bound on the number of tokens per window
    truncation = "only_second",  # only the context (second sequence) is cut, never the question
    stride = 50,  # number of tokens shared between two consecutive windows
    return_overflowing_tokens=True,  # keep the cut-off tokens as additional windows
)

# Decode every window to see how the context was split.
for ids in inputs['input_ids']:
    print(tokenizer.decode(ids))
# The context is split into four overlapping windows. Windows that do not
# contain the answer are labeled later with start = end = 0.

[CLS] To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? [SEP] Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend " Venite Ad Me Omnes ". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basi [SEP]
[CLS] To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? [SEP] the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend " Venite Ad Me Omnes ". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin [SEP]
[CLS] To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? [SEP] Next to the Main Building is the B

### besides that, we also need the end position of the answer, computed as start + length of the answer text

In [22]:
# Same sliding-window tokenization as before, now also asking for offsets.
inputs = tokenizer(
    question,
    context,
    max_length = 100,
    truncation = "only_second",
    stride = 50,
    return_overflowing_tokens=True,
    return_offsets_mapping=True, # adds 'offset_mapping': one (start, end) pair per token, used later to locate the answer
)
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping'])

In [30]:
inputs["offset_mapping"]

[[(0, 0),
  (0, 2),
  (3, 7),
  (8, 11),
  (12, 15),
  (16, 22),
  (23, 27),
  (28, 37),
  (38, 44),
  (45, 47),
  (48, 52),
  (53, 55),
  (56, 59),
  (59, 63),
  (64, 70),
  (70, 71),
  (0, 0),
  (0, 13),
  (13, 15),
  (15, 16),
  (17, 20),
  (21, 27),
  (28, 31),
  (32, 33),
  (34, 42),
  (43, 52),
  (52, 53),
  (54, 56),
  (56, 58),
  (59, 62),
  (63, 67),
  (68, 76),
  (76, 77),
  (77, 78),
  (79, 83),
  (84, 88),
  (89, 91),
  (92, 93),
  (94, 100),
  (101, 107),
  (108, 110),
  (111, 114),
  (115, 121),
  (122, 126),
  (126, 127),
  (128, 139),
  (140, 142),
  (143, 148),
  (149, 151),
  (152, 155),
  (156, 160),
  (161, 169),
  (170, 173),
  (174, 180),
  (181, 183),
  (183, 184),
  (185, 187),
  (188, 189),
  (190, 196),
  (197, 203),
  (204, 206),
  (207, 213),
  (214, 218),
  (219, 223),
  (224, 226),
  (226, 229),
  (229, 232),
  (233, 237),
  (238, 241),
  (242, 248),
  (249, 250),
  (250, 251),
  (251, 254),
  (254, 256),
  (257, 259),
  (260, 262),
  (263, 264),
  (264, 2

In [31]:
inputs["overflow_to_sample_mapping"]

[0, 0, 0, 0]

### let's try more examples

In [32]:
# Tokenize four training records (rows 2..5) at once with the same sliding-window settings.
inputs = tokenizer(
    raw_datasets['train'][2:6]['question'],
    raw_datasets['train'][2:6]['context'],
    max_length = 100,
    truncation = "only_second",
    stride = 50,
    return_overflowing_tokens=True,
    return_offsets_mapping=True, # adds 'offset_mapping': one (start, end) pair per token, used later to locate the answer
)

In [33]:
print(f"The 4 examples gave {len(inputs['input_ids'])} features.")
print(f"Here is where each comes from: {inputs['overflow_to_sample_mapping']}.")

The 4 examples gave 19 features.
Here is where each comes from: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3].


## mapping the answer back to features

## try with only 4 records first

In [84]:
inputs['offset_mapping']

[[(0, 0),
  (0, 3),
  (4, 12),
  (13, 15),
  (16, 19),
  (20, 26),
  (27, 32),
  (33, 35),
  (36, 41),
  (42, 46),
  (47, 49),
  (50, 56),
  (57, 59),
  (60, 65),
  (66, 75),
  (75, 76),
  (0, 0),
  (0, 13),
  (13, 15),
  (15, 16),
  (17, 20),
  (21, 27),
  (28, 31),
  (32, 33),
  (34, 42),
  (43, 52),
  (52, 53),
  (54, 56),
  (56, 58),
  (59, 62),
  (63, 67),
  (68, 76),
  (76, 77),
  (77, 78),
  (79, 83),
  (84, 88),
  (89, 91),
  (92, 93),
  (94, 100),
  (101, 107),
  (108, 110),
  (111, 114),
  (115, 121),
  (122, 126),
  (126, 127),
  (128, 139),
  (140, 142),
  (143, 148),
  (149, 151),
  (152, 155),
  (156, 160),
  (161, 169),
  (170, 173),
  (174, 180),
  (181, 183),
  (183, 184),
  (185, 187),
  (188, 189),
  (190, 196),
  (197, 203),
  (204, 206),
  (207, 213),
  (214, 218),
  (219, 223),
  (224, 226),
  (226, 229),
  (229, 232),
  (233, 237),
  (238, 241),
  (242, 248),
  (249, 250),
  (250, 251),
  (251, 254),
  (254, 256),
  (257, 259),
  (260, 262),
  (263, 264),
  (264,

In [70]:
sequence_ids = inputs.sequence_ids(7)
sequence_ids

[None,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 None,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 None]

In [74]:
print(sequence_ids[20])

1


In [77]:
# Read the answer straight from the dataset so this cell does not depend on
# `answers`, which is only assigned in a later cell (breaks Restart & Run All).
# Index 3 is the last of the four records (rows 2..5) tokenized above.
answer = raw_datasets['train'][2:6]['answers'][3]
answer

{'text': ['September 1876'], 'answer_start': [248]}

In [83]:
offset

[(0, 0),
 (0, 4),
 (5, 8),
 (9, 12),
 (13, 15),
 (15, 18),
 (18, 23),
 (24, 32),
 (33, 35),
 (36, 41),
 (42, 45),
 (45, 46),
 (47, 52),
 (53, 63),
 (63, 64),
 (0, 0),
 (1052, 1053),
 (1054, 1056),
 (1057, 1061),
 (1061, 1062),
 (1063, 1067),
 (1068, 1073),
 (1074, 1082),
 (1083, 1091),
 (1092, 1096),
 (1097, 1100),
 (1101, 1106),
 (1107, 1113),
 (1114, 1115),
 (1116, 1123),
 (1124, 1128),
 (1128, 1129),
 (1130, 1133),
 (1134, 1146),
 (1147, 1152),
 (1153, 1158),
 (1159, 1164),
 (1165, 1169),
 (1170, 1174),
 (1175, 1185),
 (1185, 1186),
 (1187, 1194),
 (1195, 1200),
 (1201, 1203),
 (1204, 1213),
 (1214, 1216),
 (1217, 1222),
 (1223, 1225),
 (1226, 1229),
 (1230, 1238),
 (1238, 1239),
 (1240, 1247),
 (1247, 1248),
 (1249, 1252),
 (1253, 1258),
 (1259, 1262),
 (1263, 1274),
 (1275, 1277),
 (1278, 1281),
 (1282, 1290),
 (1290, 1291),
 (1292, 1299),
 (1299, 1300),
 (1301, 1303),
 (1304, 1310),
 (1311, 1315),
 (1316, 1318),
 (1319, 1332),
 (1333, 1340),
 (1341, 1344),
 (1345, 1354),
 (1355, 

In [88]:
# Label every sliding-window feature with the token span of its answer.
# Features whose window does not fully contain the answer get (0, 0).
answers = raw_datasets['train'][2:6]['answers']
start_positions = []
end_positions = []

for i, offset in enumerate(inputs['offset_mapping']):
    # Which of the four original records this window was cut from.
    sample_idx = inputs['overflow_to_sample_mapping'][i]
    answer = answers[sample_idx]

    # Character span [start_char, end_char) of the answer inside the context.
    start_char = answer['answer_start'][0]
    end_char = start_char + len(answer['text'][0])

    # Per-token origin: None for special tokens, 0 for the question, 1 for the context.
    sequence_ids = inputs.sequence_ids(i)

    # Skip the question to find the first context token ...
    idx = 0
    while sequence_ids[idx] != 1:
        idx += 1
    context_start = idx

    # ... then walk to the end of the context. The loop stops on the special
    # token (None) that follows the context, so the last context token is idx - 1.
    while sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    # The answer is outside this window if the window starts after the answer
    # begins, or if the window's last token ENDS before the answer ends.
    # (The end check must use the token's end offset [1], not its start [0];
    # otherwise an answer finishing inside the last token is wrongly rejected.)
    if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
        start_positions.append(0)
        end_positions.append(0)
    else:
        # Advance until we pass the answer's first character; the token just
        # before that point is the start token.
        idx = context_start
        while idx <= context_end and offset[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        # Walk backwards until a token ends before the answer's end; the token
        # just after that point is the end token.
        idx = context_end
        while idx >= context_start and offset[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

start_positions, end_positions

([83, 51, 19, 0, 0, 64, 27, 0, 34, 0, 0, 0, 67, 34, 0, 0, 0, 0, 0],
 [85, 53, 21, 0, 0, 70, 33, 0, 40, 0, 0, 0, 68, 35, 0, 0, 0, 0, 0])

In [91]:
print(inputs['overflow_to_sample_mapping'])

[0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3]


### double check with the decode answer

In [96]:
answers

[{'text': ['the Main Building'], 'answer_start': [279]},
 {'text': ['a Marian place of prayer and reflection'], 'answer_start': [381]},
 {'text': ['a golden statue of the Virgin Mary'], 'answer_start': [92]},
 {'text': ['September 1876'], 'answer_start': [248]}]

In [100]:
# Compare the dataset's answer text with the token span we labeled for one feature.
i = 0  # index of the feature (sliding window) to inspect
# Use `i` everywhere (the original hard-coded 0 in two lookups), so changing
# `i` inspects one consistent feature.
sample_idx = inputs['overflow_to_sample_mapping'][i]
answer = answers[sample_idx]['text'][0]  # the reference answer text

start = start_positions[i]
end = end_positions[i]
# Decode the labeled token span (end is inclusive) back to text.
labeled_answer = tokenizer.decode(inputs['input_ids'][i][start : end + 1])
print(f"actual answer : {answer}, predicted answer: {labeled_answer}")

actual answer : the Main Building, predicted answer: the Main Building


In [101]:
idx = 4
sample_idx = inputs["overflow_to_sample_mapping"][idx]
answer = answers[sample_idx]["text"][0]

decoded_example = tokenizer.decode(inputs["input_ids"][idx])
print(f"Theoretical answer: {answer}, decoded example: {decoded_example}")

Theoretical answer: a Marian place of prayer and reflection, decoded example: [CLS] What is the Grotto at Notre Dame? [SEP] Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend " Venite Ad Me Omnes ". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grot [SEP]


# the actual function for preprocessing the training data

In [112]:
max_length=384
stride=128

def preprocess_training_examples(examples):
    """Tokenize a batch of training examples and label each feature's answer span.

    Reads the module-level ``tokenizer``, ``max_length`` and ``stride``.

    Args:
        examples: a batch (as passed by ``Dataset.map(..., batched=True)``)
            exposing the ``"question"``, ``"context"`` and ``"answers"`` columns.
            Only the first answer of each example is used.

    Returns:
        The tokenizer output with ``offset_mapping`` and
        ``overflow_to_sample_mapping`` removed and two lists added,
        ``start_positions`` and ``end_positions`` (one entry per feature).
        Features that do not fully contain the answer are labeled ``(0, 0)``.
    """
    # Strip the questions of THIS batch. (The original read
    # raw_datasets['question'], which is not the batch being processed.)
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # These two entries are only needed to compute the labels.
    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples['answers']
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        # Map the feature back to the example it was cut from.
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        # Character span [start_char, end_char) of the answer in the context.
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the first and last context tokens (sequence id 1).
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Start token: the last one whose start offset is <= start_char.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # End token: the first one (from the right) whose end offset is >= end_char.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [116]:
train_dataset = raw_datasets['train'].map(
    preprocess_training_examples,
    batched = True,
    remove_columns = raw_datasets['train'].column_names,
)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

In [117]:
len(raw_datasets['train']), len(train_dataset)

(87599, 88729)

### the actual function for preprocessing the validation data

In [128]:
raw_datasets['validation']['id']

['56be4db0acb8001400a502ec',
 '56be4db0acb8001400a502ed',
 '56be4db0acb8001400a502ee',
 '56be4db0acb8001400a502ef',
 '56be4db0acb8001400a502f0',
 '56be8e613aeaaa14008c90d1',
 '56be8e613aeaaa14008c90d2',
 '56be8e613aeaaa14008c90d3',
 '56bea9923aeaaa14008c91b9',
 '56bea9923aeaaa14008c91ba',
 '56bea9923aeaaa14008c91bb',
 '56beace93aeaaa14008c91df',
 '56beace93aeaaa14008c91e0',
 '56beace93aeaaa14008c91e1',
 '56beace93aeaaa14008c91e2',
 '56beace93aeaaa14008c91e3',
 '56bf10f43aeaaa14008c94fd',
 '56bf10f43aeaaa14008c94fe',
 '56bf10f43aeaaa14008c94ff',
 '56bf10f43aeaaa14008c9500',
 '56bf10f43aeaaa14008c9501',
 '56d20362e7d4791d009025e8',
 '56d20362e7d4791d009025e9',
 '56d20362e7d4791d009025ea',
 '56d20362e7d4791d009025eb',
 '56d600e31c85041400946eae',
 '56d600e31c85041400946eb0',
 '56d600e31c85041400946eb1',
 '56d9895ddc89441400fdb50e',
 '56d9895ddc89441400fdb510',
 '56be4e1facb8001400a502f6',
 '56be4e1facb8001400a502f9',
 '56be4e1facb8001400a502fa',
 '56beaa4a3aeaaa14008c91c2',
 '56beaa4a3aea

In [132]:
for k, o in enumerate(inputs['offset_mapping'][0]):
    print(k,o)

0 (0, 0)
1 (0, 3)
2 (4, 12)
3 (13, 15)
4 (16, 19)
5 (20, 26)
6 (27, 32)
7 (33, 35)
8 (36, 41)
9 (42, 46)
10 (47, 49)
11 (50, 56)
12 (57, 59)
13 (60, 65)
14 (66, 75)
15 (75, 76)
16 (0, 0)
17 (0, 13)
18 (13, 15)
19 (15, 16)
20 (17, 20)
21 (21, 27)
22 (28, 31)
23 (32, 33)
24 (34, 42)
25 (43, 52)
26 (52, 53)
27 (54, 56)
28 (56, 58)
29 (59, 62)
30 (63, 67)
31 (68, 76)
32 (76, 77)
33 (77, 78)
34 (79, 83)
35 (84, 88)
36 (89, 91)
37 (92, 93)
38 (94, 100)
39 (101, 107)
40 (108, 110)
41 (111, 114)
42 (115, 121)
43 (122, 126)
44 (126, 127)
45 (128, 139)
46 (140, 142)
47 (143, 148)
48 (149, 151)
49 (152, 155)
50 (156, 160)
51 (161, 169)
52 (170, 173)
53 (174, 180)
54 (181, 183)
55 (183, 184)
56 (185, 187)
57 (188, 189)
58 (190, 196)
59 (197, 203)
60 (204, 206)
61 (207, 213)
62 (214, 218)
63 (219, 223)
64 (224, 226)
65 (226, 229)
66 (229, 232)
67 (233, 237)
68 (238, 241)
69 (242, 248)
70 (249, 250)
71 (250, 251)
72 (251, 254)
73 (254, 256)
74 (257, 259)
75 (260, 262)
76 (263, 264)
77 (264, 265)
78 

In [149]:
def preprocess_validation_examples(examples):
    """Tokenize a batch of validation examples into sliding-window features.

    Reads the module-level ``tokenizer``, ``max_length`` and ``stride``.

    Args:
        examples: a batch exposing the ``"question"``, ``"context"`` and
            ``"id"`` columns.

    Returns:
        The tokenizer output without ``overflow_to_sample_mapping``, with
        every non-context entry of ``offset_mapping`` replaced by ``None``
        and an added ``example_id`` list naming the source example of each
        feature.
    """
    stripped_questions = [question.strip() for question in examples["question"]]
    inputs = tokenizer(
        stripped_questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    feature_to_example = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for feature_idx in range(len(inputs["input_ids"])):
        # Record the id of the example this feature was cut from.
        example_ids.append(examples["id"][feature_to_example[feature_idx]])

        # Keep offsets only for context tokens (sequence id 1); everything
        # else becomes None so it can be skipped during post-processing.
        token_sources = inputs.sequence_ids(feature_idx)
        masked_offsets = []
        for token_idx, span in enumerate(inputs["offset_mapping"][feature_idx]):
            masked_offsets.append(span if token_sources[token_idx] == 1 else None)
        inputs["offset_mapping"][feature_idx] = masked_offsets

    inputs["example_id"] = example_ids
    return inputs

In [150]:
validation_dataset = raw_datasets["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
)
len(raw_datasets["validation"]), len(validation_dataset)

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

(10570, 10822)

# post processing

### test on small validation sets

In [151]:
small_eval_set = raw_datasets['validation'].select(range(100))
trained_checkpoint = 'distilbert-base-cased-distilled-squad'

tokenizer = AutoTokenizer.from_pretrained(trained_checkpoint)

eval_set = small_eval_set.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets['validation'].column_names,
)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [169]:
eval_set

Dataset({
    features: ['input_ids', 'attention_mask', 'offset_mapping', 'example_id'],
    num_rows: 100
})

In [153]:
#switch back to the old tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

### a test run to get some simulated outputs

In [155]:
import torch
from transformers import AutoModelForQuestionAnswering

# 'example_id' and 'offset_mapping' are only read from eval_set during
# post-processing, so they are dropped from the copy handed to the model.
eval_set_for_model = eval_set.remove_columns(['example_id', 'offset_mapping'])
eval_set_for_model.set_format('torch')

# Prefer the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move every remaining column, as a whole tensor, onto the chosen device.
batch = {
    column: eval_set_for_model[column].to(device)
    for column in eval_set_for_model.column_names
}

trained_model = AutoModelForQuestionAnswering.from_pretrained(trained_checkpoint)
trained_model = trained_model.to(device)

# Inference only: no gradients needed.
with torch.no_grad():
    outputs = trained_model(**batch)

In [156]:
start_logits = outputs.start_logits.cpu().numpy()
end_logits = outputs.end_logits.cpu().numpy()

In [176]:
start_logits[0]

array([-2.2607305 , -5.1783237 , -5.2708955 , -6.085875  , -6.450712  ,
       -6.267092  , -5.314638  , -6.303294  , -6.0470333 , -7.300931  ,
       -5.6778746 , -3.7498558 , -4.7882442 , -0.3857287 , -4.396611  ,
       -1.9304978 , -5.377072  , -4.38861   , -2.5399942 , -4.352415  ,
       -4.3879633 , -4.332361  , -4.679052  , -3.5798018 , -2.3069317 ,
       -6.5530424 , -2.7710173 , -0.8722055 , -3.0604641 , -2.9521334 ,
       -4.1954746 , -1.3516878 , -3.7250593 , -4.7030296 , -4.068798  ,
       -0.8678019 , -3.6668823 , -1.882703  ,  4.400489  ,  2.943784  ,
       -0.79794705, -1.3878798 , -0.6945326 ,  1.5780797 , -1.7355437 ,
        0.52237374, 10.694445  ,  4.4599714 , -1.3703669 , -0.04970801,
        2.0126493 , -2.671808  , -2.298387  , -1.5135034 ,  0.02073927,
       -2.7323515 , -0.02802638,  9.803682  ,  2.7017295 , -1.8932886 ,
       -6.2241735 , -3.2045467 , -3.996812  , -4.2383113 , -3.569338  ,
       -2.5027401 , -4.011326  , -5.996893  , -4.775754  , -2.77

### create id to index map

In [None]:
import collections

# Group feature row numbers by the example they belong to:
# {example_id: [feature index, ...]}.
example_to_features = collections.defaultdict(list)
for idx, feature in enumerate(eval_set):
    owner_id = feature['example_id']
    example_to_features[owner_id].append(idx)

In [163]:
example_to_features

defaultdict(list,
            {'56be4db0acb8001400a502ec': [0],
             '56be4db0acb8001400a502ed': [1],
             '56be4db0acb8001400a502ee': [2],
             '56be4db0acb8001400a502ef': [3],
             '56be4db0acb8001400a502f0': [4],
             '56be8e613aeaaa14008c90d1': [5],
             '56be8e613aeaaa14008c90d2': [6],
             '56be8e613aeaaa14008c90d3': [7],
             '56bea9923aeaaa14008c91b9': [8],
             '56bea9923aeaaa14008c91ba': [9],
             '56bea9923aeaaa14008c91bb': [10],
             '56beace93aeaaa14008c91df': [11],
             '56beace93aeaaa14008c91e0': [12],
             '56beace93aeaaa14008c91e1': [13],
             '56beace93aeaaa14008c91e2': [14],
             '56beace93aeaaa14008c91e3': [15],
             '56bf10f43aeaaa14008c94fd': [16],
             '56bf10f43aeaaa14008c94fe': [17],
             '56bf10f43aeaaa14008c94ff': [18],
             '56bf10f43aeaaa14008c9500': [19],
             '56bf10f43aeaaa14008c9501': [20],
     

In [181]:
start_logits[0][46]

10.694445

In [186]:
# numpy has not been imported under the name `np` anywhere above this cell,
# so import it here to keep the notebook runnable from a fresh kernel.
import numpy as np

# Token positions of the first feature, ordered from lowest to highest start logit.
np.argsort(start_logits[0])

array([360, 361, 362, 364, 370, 365, 359, 354, 374, 356, 363, 357, 373,
       358, 222, 328, 280, 383, 355, 198, 216, 369, 375, 206, 381, 219,
       242, 366, 371, 220, 380, 231, 376, 221, 353, 379, 352, 382, 368,
       247, 367, 215, 344, 235, 207, 241, 244, 372, 217, 243, 224, 226,
       340, 351, 180, 184, 349, 202, 199, 330, 193, 223, 262, 203, 347,
       204, 236, 282, 211, 225, 200, 205, 201, 208, 263, 378, 213, 197,
       339, 188, 248, 195, 196, 258, 377, 182, 257, 194, 246, 249, 240,
       261, 336, 209, 348, 212, 277, 324, 210, 228, 178, 275, 341, 260,
       189, 183, 335, 181, 343, 345, 342, 279, 271, 254, 185, 268, 281,
       259, 350, 278, 325, 192, 214, 334, 253, 238, 329, 267, 172, 332,
       239, 187, 173, 273, 285, 179, 232, 218, 230, 191, 346, 233, 272,
       276, 326, 255, 284, 227, 237, 234, 327, 331, 288, 264, 286, 293,
       190, 266, 337, 256, 174, 283, 250, 245, 186, 297, 302, 177, 229,
       296, 252, 290, 322, 323, 292, 338, 175, 176, 298, 265, 27

In [199]:
eval_set['offset_mapping'][70]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 [0, 3],
 [4, 11],
 [12, 16],
 [17, 19],
 [20, 25],
 [26, 30],
 [31, 33],
 [34, 39],
 [40, 44],
 [45, 47],
 [48, 51],
 [52, 57],
 [58, 65],
 [65, 66],
 [67, 73],
 [74, 77],
 [78, 85],
 [86, 88],
 [89, 95],
 [95, 96],
 [96, 97],
 [98, 105],
 [105, 106],
 [107, 112],
 [113, 119],
 [120, 123],
 [124, 129],
 [130, 135],
 [136, 139],
 [140, 146],
 [147, 150],
 [151, 155],
 [156, 161],
 [162, 170],
 [170, 171],
 [171, 172],
 [173, 182],
 [183, 184],
 [185, 191],
 [192, 197],
 [198, 202],
 [203, 212],
 [213, 216],
 [217, 218],
 [219, 228],
 [228, 229],
 [230, 236],
 [237, 247],
 [248, 251],
 [252, 258],
 [259, 262],
 [263, 268],
 [269, 274],
 [275, 279],
 [280, 283],
 [283, 284],
 [285, 294],
 [295, 299],
 [300, 304],
 [305, 312],
 [312, 313],
 [314, 315],
 [315, 316],
 [317, 322],
 [322, 323],
 [324, 327],
 [328, 331],
 [332, 338],
 [339, 345],
 [345, 346],
 [346, 347],
 None,
 None,
 None,
 None,
 Non

### compute the index for the answers

In [205]:
import numpy as np  # was `import numpy as numpy`, but the code below uses `np`

n_best = 20  # how many top start / end candidates to combine per feature
max_answer_length = 30  # longest accepted answer, in tokens
predicted_answers = []

for example in small_eval_set:
    example_id = example['id']
    context = example['context']
    answers = []  # candidate answers collected over all features of this example

    # Visit every feature (sliding window) that was cut from this example.
    for feature_idx in example_to_features[example_id]:
        start_logit = start_logits[feature_idx]
        end_logit = end_logits[feature_idx]
        # Read the single row rather than materialising the whole
        # 'offset_mapping' column on every iteration.
        offsets = eval_set[feature_idx]['offset_mapping']

        # Indices of the n_best highest logits, best first.
        start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
        end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()

        for start_index in start_indexes:
            for end_index in end_indexes:
                # Offsets are None outside the context (question, special
                # tokens, padding): such positions cannot bound an answer.
                if offsets[start_index] is None or offsets[end_index] is None:
                    continue
                # Reject reversed spans and spans longer than max_answer_length.
                if (
                    end_index < start_index
                    or end_index - start_index + 1 > max_answer_length
                ):
                    continue

                answers.append(
                    {
                        # Slice the answer text out of the original context.
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        # Score of the span = start logit + end logit.
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                )

    # Keep the highest-scoring candidate. Fall back to an empty prediction
    # when no valid span was found, instead of crashing in max() on an empty list.
    if answers:
        best_answer = max(answers, key=lambda x: x['logit_score'])
        predicted_answers.append({"id": example_id, "prediction_text": best_answer['text']})
    else:
        predicted_answers.append({"id": example_id, "prediction_text": ""})

In [206]:
predicted_answers

[{'id': '56be4db0acb8001400a502ec', 'prediction_text': 'Denver Broncos'},
 {'id': '56be4db0acb8001400a502ed', 'prediction_text': 'Carolina Panthers'},
 {'id': '56be4db0acb8001400a502ee',
  'prediction_text': "Levi's Stadium in the San Francisco Bay Area at Santa Clara, California"},
 {'id': '56be4db0acb8001400a502ef', 'prediction_text': 'Carolina Panthers'},
 {'id': '56be4db0acb8001400a502f0', 'prediction_text': 'gold'},
 {'id': '56be8e613aeaaa14008c90d1', 'prediction_text': 'golden anniversary'},
 {'id': '56be8e613aeaaa14008c90d2', 'prediction_text': 'February 7, 2016'},
 {'id': '56be8e613aeaaa14008c90d3',
  'prediction_text': 'Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference'},
 {'id': '56bea9923aeaaa14008c91b9', 'prediction_text': 'golden anniversary'},
 {'id': '56bea9923aeaaa14008c91ba',
  'prediction_text': 'American Football Conference'},
 {'id': '56bea9923aeaaa14008c9

In [207]:
import evaluate

metric = evaluate.load('squad') # load squad metric for evaluation

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

In [216]:
theoretical_answers = [
    {"id":ex['id'], "answers":ex['answers']} for ex in small_eval_set
]

In [218]:
theoretical_answers[0]['answers']

{'text': ['Denver Broncos', 'Denver Broncos', 'Denver Broncos'],
 'answer_start': [177, 177, 177]}

In [219]:
metric.compute(predictions=predicted_answers, references=theoretical_answers)
# trial evaluate, making sure the data we processed in a reasonable way

{'exact_match': 83.0, 'f1': 88.25000000000004}

# the actual compute_metrics function

In [220]:
from tqdm.auto import tqdm

def compute_metrics(start_logits, end_logits, features, examples):
    """Turn start/end logits into text predictions and score them.

    Reads the module-level ``n_best``, ``max_answer_length`` and ``metric``.

    Args:
        start_logits: per-feature start logits, indexable by feature position.
        end_logits: per-feature end logits, indexable by feature position.
        features: the tokenized features; each row provides ``"example_id"``
            and ``"offset_mapping"`` (``None`` for non-context tokens).
        examples: the original examples, each providing ``"id"``,
            ``"context"`` and ``"answers"``.

    Returns:
        Whatever ``metric.compute`` returns for the predictions against the
        reference answers.
    """
    # example id -> positions of all features cut from that example
    feature_ids_by_example = collections.defaultdict(list)
    for feature_pos, feature in enumerate(features):
        feature_ids_by_example[feature["example_id"]].append(feature_pos)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        candidates = []

        for feature_pos in feature_ids_by_example[example_id]:
            feature_start = start_logits[feature_pos]
            feature_end = end_logits[feature_pos]
            offsets = features[feature_pos]["offset_mapping"]

            # Positions of the n_best highest logits, best first.
            top_starts = np.argsort(feature_start)[-1 : -n_best - 1 : -1].tolist()
            top_ends = np.argsort(feature_end)[-1 : -n_best - 1 : -1].tolist()

            for start_index in top_starts:
                # A start outside the context rules out every pairing with it.
                if offsets[start_index] is None:
                    continue
                for end_index in top_ends:
                    if offsets[end_index] is None:
                        continue
                    # Span length in tokens; must be in 1..max_answer_length.
                    span_length = end_index - start_index + 1
                    if span_length < 1 or span_length > max_answer_length:
                        continue

                    first_char = offsets[start_index][0]
                    last_char = offsets[end_index][1]
                    candidates.append(
                        {
                            "text": context[first_char:last_char],
                            "logit_score": feature_start[start_index] + feature_end[end_index],
                        }
                    )

        # Best-scoring candidate, or an empty string when nothing qualified.
        if candidates:
            prediction_text = max(candidates, key=lambda c: c["logit_score"])["text"]
        else:
            prediction_text = ""
        predicted_answers.append({"id": example_id, "prediction_text": prediction_text})

    theoretical_answers = []
    for ex in examples:
        theoretical_answers.append({"id": ex["id"], "answers": ex["answers"]})
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

In [221]:
compute_metrics(start_logits, end_logits, eval_set, small_eval_set)

  0%|          | 0/100 [00:00<?, ?it/s]

{'exact_match': 83.0, 'f1': 88.25000000000004}

# Fine tuning