In [1]:
from datasets import load_dataset

# Load the SQuAD dataset; the result is a DatasetDict with a "train" and a "validation" split.
raw_datasets = load_dataset('squad')

Found cached dataset squad (C:/Users/Ethan/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
# Display the available splits with their columns and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [3]:
# Inspect the first training example: the context paragraph, the question, and the
# answer record (answer text plus the character offset where it starts in the context).
print("Context: ", raw_datasets["train"][0]["context"])
print("Question: ", raw_datasets["train"][0]["question"])
print("Answer: ", raw_datasets["train"][0]["answers"])

Context:  Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.
Question:  To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Answer:  {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}


In [4]:
# Sanity check: keep only the training rows whose number of answers is NOT 1.
# An empty result (num_rows: 0) means every training question has exactly one answer.
raw_datasets['train'].filter(lambda x: len(x['answers']['text']) != 1)

Loading cached processed dataset at C:\Users\Ethan\.cache\huggingface\datasets\squad\plain_text\1.0.0\d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453\cache-0c357a91ac1d3bad.arrow


Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 0
})

In [5]:
# Validation examples, in contrast, can carry several reference answers.
raw_datasets['validation'][2]['answers']

{'text': ['Santa Clara, California',
  "Levi's Stadium",
  "Levi's Stadium in the San Francisco Bay Area at Santa Clara, California."],
 'answer_start': [403, 355, 355]}

# Preprocess the data

### convert Text

In [6]:
from transformers import AutoTokenizer

# Checkpoint used for the tokenizer here and for the model that is fine-tuned later on.
model_checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [7]:
# Confirm that AutoTokenizer returned a fast tokenizer.
# NOTE(review): the offset mappings requested below are presumably only available on
# fast tokenizers -- confirm against the transformers documentation.
tokenizer.is_fast

True

In [8]:
# Trial run: tokenize one (question, context) pair together and decode it again to see
# how the two texts are joined: [CLS] question [SEP] context [SEP].
context = raw_datasets["train"][0]["context"]
question = raw_datasets["train"][0]["question"]

inputs = tokenizer(question, context)
tokenizer.decode(inputs["input_ids"])

'[CLS] To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? [SEP] Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend " Venite Ad Me Omnes ". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive ( and in a direct line that connects through 3 statues and the Gold Dome ), is a simple, modern stone statue of Mary. [SEP]'

### Since the text might be longer than the model's limit, we split it with a sliding window; here each window has a maximum length of 100 tokens and a stride of 50

In [9]:
# Split the pair into overlapping windows: each feature holds at most 100 tokens, only the
# context (the second sequence) is truncated, and stride=50 is passed so that neighbouring
# windows overlap.
inputs = tokenizer(
    question,
    context,
    max_length = 100,
    truncation = "only_second",
    stride = 50,
    return_overflowing_tokens=True,
)

# Decode every window to see which part of the context it covers.
for ids in inputs['input_ids']:
    print(tokenizer.decode(ids))
# The example is split into four features. Features that do not contain the complete
# answer are labelled further down with start position = end position = 0.

[CLS] To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? [SEP] Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend " Venite Ad Me Omnes ". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basi [SEP]
[CLS] To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? [SEP] the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend " Venite Ad Me Omnes ". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin [SEP]
[CLS] To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? [SEP] Next to the Main Building is the B

### Besides that, we also need the end index of the answer, computed as start + length of the answer text

In [10]:
inputs = tokenizer(
    question,
    context,
    max_length = 100,
    truncation = "only_second",
    stride = 50,
    return_overflowing_tokens=True,
    return_offsets_mapping=True, # also return, for every token, its (start_char, end_char) span in the original text
)
# Besides the model inputs, the encoding now contains 'offset_mapping' and
# 'overflow_to_sample_mapping'.
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping'])

In [11]:
# One list per feature with a (start_char, end_char) pair for every token.
# Special tokens appear as (0, 0), and the offsets restart at 0 where the context begins.
inputs["offset_mapping"]

[[(0, 0),
  (0, 2),
  (3, 7),
  (8, 11),
  (12, 15),
  (16, 22),
  (23, 27),
  (28, 37),
  (38, 44),
  (45, 47),
  (48, 52),
  (53, 55),
  (56, 59),
  (59, 63),
  (64, 70),
  (70, 71),
  (0, 0),
  (0, 13),
  (13, 15),
  (15, 16),
  (17, 20),
  (21, 27),
  (28, 31),
  (32, 33),
  (34, 42),
  (43, 52),
  (52, 53),
  (54, 56),
  (56, 58),
  (59, 62),
  (63, 67),
  (68, 76),
  (76, 77),
  (77, 78),
  (79, 83),
  (84, 88),
  (89, 91),
  (92, 93),
  (94, 100),
  (101, 107),
  (108, 110),
  (111, 114),
  (115, 121),
  (122, 126),
  (126, 127),
  (128, 139),
  (140, 142),
  (143, 148),
  (149, 151),
  (152, 155),
  (156, 160),
  (161, 169),
  (170, 173),
  (174, 180),
  (181, 183),
  (183, 184),
  (185, 187),
  (188, 189),
  (190, 196),
  (197, 203),
  (204, 206),
  (207, 213),
  (214, 218),
  (219, 223),
  (224, 226),
  (226, 229),
  (229, 232),
  (233, 237),
  (238, 241),
  (242, 248),
  (249, 250),
  (250, 251),
  (251, 254),
  (254, 256),
  (257, 259),
  (260, 262),
  (263, 264),
  (264, 2

In [12]:
# For each feature, the index of the example it was cut from (all four come from example 0).
inputs["overflow_to_sample_mapping"]

[0, 0, 0, 0]

### let's try more examples

In [13]:
# Same tokenization as above, applied to a batch of four training examples (rows 2 to 5).
inputs = tokenizer(
    raw_datasets['train'][2:6]['question'],
    raw_datasets['train'][2:6]['context'],
    max_length = 100,
    truncation = "only_second",
    stride = 50,
    return_overflowing_tokens=True,
    return_offsets_mapping=True, # also return, for every token, its (start_char, end_char) span in the original text
)

In [14]:
# Each example yields several features; overflow_to_sample_mapping tells which example
# (0-3 within this batch) every feature belongs to.
print(f"The 4 examples gave {len(inputs['input_ids'])} features.")
print(f"Here is where each comes from: {inputs['overflow_to_sample_mapping']}.")

The 4 examples gave 19 features.
Here is where each comes from: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3].


## mapping the answer back to features

## try with only 4 records first

In [15]:
# Offset mappings of all features produced from the four examples.
inputs['offset_mapping']

[[(0, 0),
  (0, 3),
  (4, 12),
  (13, 15),
  (16, 19),
  (20, 26),
  (27, 32),
  (33, 35),
  (36, 41),
  (42, 46),
  (47, 49),
  (50, 56),
  (57, 59),
  (60, 65),
  (66, 75),
  (75, 76),
  (0, 0),
  (0, 13),
  (13, 15),
  (15, 16),
  (17, 20),
  (21, 27),
  (28, 31),
  (32, 33),
  (34, 42),
  (43, 52),
  (52, 53),
  (54, 56),
  (56, 58),
  (59, 62),
  (63, 67),
  (68, 76),
  (76, 77),
  (77, 78),
  (79, 83),
  (84, 88),
  (89, 91),
  (92, 93),
  (94, 100),
  (101, 107),
  (108, 110),
  (111, 114),
  (115, 121),
  (122, 126),
  (126, 127),
  (128, 139),
  (140, 142),
  (143, 148),
  (149, 151),
  (152, 155),
  (156, 160),
  (161, 169),
  (170, 173),
  (174, 180),
  (181, 183),
  (183, 184),
  (185, 187),
  (188, 189),
  (190, 196),
  (197, 203),
  (204, 206),
  (207, 213),
  (214, 218),
  (219, 223),
  (224, 226),
  (226, 229),
  (229, 232),
  (233, 237),
  (238, 241),
  (242, 248),
  (249, 250),
  (250, 251),
  (251, 254),
  (254, 256),
  (257, 259),
  (260, 262),
  (263, 264),
  (264,

In [16]:
# Sequence ids of feature 7: None marks special tokens, 0 the question tokens
# and 1 the context tokens.
sequence_ids = inputs.sequence_ids(7)
sequence_ids

[None,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 None,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 None]

In [17]:
# Turn the character-level answers of the four examples into token-level
# (start, end) labels for each of the features produced from them.
answers = raw_datasets['train'][2:6]['answers']
start_positions = []
end_positions = []

for i, offset in enumerate(inputs['offset_mapping']):
    # Index (within this batch) of the example the feature was cut from.
    sample_idx = inputs['overflow_to_sample_mapping'][i]
    answer = answers[sample_idx]  # all features of one example share that example's answer
    start_char = answer['answer_start'][0]  # first character of the answer in the context
    end_char = start_char + len(answer['text'][0])  # one past the last answer character
    # Per token: None for special tokens, 0 for the question, 1 for the context.
    sequence_ids = inputs.sequence_ids(i)

    # Find the first and the last context token of this feature.
    idx = 0
    while sequence_ids[idx] != 1:
        idx += 1
    context_start = idx  # skipped [CLS], the question and [SEP]

    while sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1  # the loop stops on the None after the context, so step back one

    # The answer is usable only if it lies completely inside this window: the window must
    # not start after the answer starts, and its last token must not END before the answer
    # ends (hence offset[...][1], the end offset, for the second comparison).
    if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
        start_positions.append(0)
        end_positions.append(0)
    else:
        # Walk forward past every token that starts at or before the answer start;
        # the token just before the stopping point is the first answer token.
        idx = context_start
        while idx <= context_end and offset[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        # Walk backward past every token that ends at or after the answer end;
        # the token just after the stopping point is the last answer token.
        idx = context_end
        while idx >= context_start and offset[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

start_positions, end_positions

([83, 51, 19, 0, 0, 64, 27, 0, 34, 0, 0, 0, 67, 34, 0, 0, 0, 0, 0],
 [85, 53, 21, 0, 0, 70, 33, 0, 40, 0, 0, 0, 68, 35, 0, 0, 0, 0, 0])

In [18]:
# Source example of every feature, for comparison with the label lists above.
print(inputs['overflow_to_sample_mapping'])

[0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3]


### Double-check the labels by decoding the labelled answer span

In [19]:
# Reference answers of the four examples (text and character start offset).
answers

[{'text': ['the Main Building'], 'answer_start': [279]},
 {'text': ['a Marian place of prayer and reflection'], 'answer_start': [381]},
 {'text': ['a golden statue of the Virgin Mary'], 'answer_start': [92]},
 {'text': ['September 1876'], 'answer_start': [248]}]

In [20]:
# Decode the labelled token span of feature `i` and compare it with the reference answer.
# Every lookup goes through `i`, so another feature can be checked by changing this one
# value. Features labelled (0, 0) decode to the token at position 0 only.
i = 0
sample_idx = inputs['overflow_to_sample_mapping'][i]
answer = answers[sample_idx]['text'][0] # the reference answer text

start = start_positions[i]
end = end_positions[i]
# The end label is inclusive, hence the + 1 in the slice.
labeled_answer = tokenizer.decode(inputs['input_ids'][i][start : end + 1])
print(f"actual answer : {answer}, predicted answer: {labeled_answer}")

actual answer : the Main Building, predicted answer: the Main Building


In [21]:
# Feature 4 was labelled (0, 0). Decoding the whole feature shows that its window ends
# before the answer text, so the answer is indeed not contained in it.
idx = 4
sample_idx = inputs["overflow_to_sample_mapping"][idx]
answer = answers[sample_idx]["text"][0]

decoded_example = tokenizer.decode(inputs["input_ids"][idx])
print(f"Theoretical answer: {answer}, decoded example: {decoded_example}")

Theoretical answer: a Marian place of prayer and reflection, decoded example: [CLS] What is the Grotto at Notre Dame? [SEP] Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend " Venite Ad Me Omnes ". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grot [SEP]


# The actual function for preprocessing the training examples

In [22]:
# Window size (in tokens) and stride passed to the tokenizer by both preprocessing functions.
max_length = 384
stride = 128


def preprocess_training_examples(examples):
    """Tokenize a batch of SQuAD training examples and attach token-level answer labels.

    Args:
        examples: batch dict providing the 'question', 'context' and 'answers' columns.

    Returns:
        The tokenizer output (without 'offset_mapping' and
        'overflow_to_sample_mapping') extended by 'start_positions' and
        'end_positions', one entry per feature. A feature that does not contain the
        complete first answer is labelled (0, 0).
    """
    stripped_questions = [question.strip() for question in examples['question']]
    inputs = tokenizer(
        stripped_questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Both mappings are only needed for labelling and are not returned.
    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples['answers']
    start_positions, end_positions = [], []

    for feature_idx, offsets in enumerate(offset_mapping):
        # Character span [answer_start, answer_end) of the first answer of the source example.
        answer = answers[sample_map[feature_idx]]
        answer_start = answer["answer_start"][0]
        answer_end = answer_start + len(answer["text"][0])
        seq_ids = inputs.sequence_ids(feature_idx)

        # First and last token whose sequence id is 1 (the context part).
        ctx_first = 0
        while seq_ids[ctx_first] != 1:
            ctx_first += 1
        ctx_last = ctx_first
        while seq_ids[ctx_last] == 1:
            ctx_last += 1
        ctx_last -= 1

        answer_inside = (
            offsets[ctx_first][0] <= answer_start
            and offsets[ctx_last][1] >= answer_end
        )
        if not answer_inside:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Last context token that starts at or before the answer start.
        token = ctx_first
        while token <= ctx_last and offsets[token][0] <= answer_start:
            token += 1
        start_positions.append(token - 1)

        # First context token (seen from the back) that ends at or after the answer end.
        token = ctx_last
        while token >= ctx_first and offsets[token][1] >= answer_end:
            token -= 1
        end_positions.append(token + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [23]:
# Apply the preprocessing batch-wise. The original columns are removed because one
# example can produce several features, so the row counts no longer match.
train_dataset = raw_datasets['train'].map(
    preprocess_training_examples,
    batched = True,
    remove_columns = raw_datasets['train'].column_names,
)

Loading cached processed dataset at C:\Users\Ethan\.cache\huggingface\datasets\squad\plain_text\1.0.0\d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453\cache-65feacafeb7c2366.arrow


In [24]:
# Number of original examples vs. number of features after windowing.
len(raw_datasets['train']), len(train_dataset)

(87599, 88729)

### The actual function for preprocessing the validation data

In [25]:
def preprocess_validation_examples(examples):
    """Tokenize a batch of SQuAD validation examples for evaluation.

    Args:
        examples: batch dict providing the 'id', 'question' and 'context' columns.

    Returns:
        The tokenizer output (without 'overflow_to_sample_mapping') where
          * 'offset_mapping' keeps the character span only for context tokens and
            holds None for every other position, and
          * 'example_id' holds, per feature, the id of the example it was cut from.
    """
    stripped_questions = [question.strip() for question in examples["question"]]
    inputs = tokenizer(
        stripped_questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    feature_count = len(inputs['input_ids'])

    # Translate the positional sample index of every feature into the example's id.
    example_ids = [examples['id'][sample_map[pos]] for pos in range(feature_count)]

    # Blank out the offsets of everything that is not a context token
    # (sequence id other than 1) so those positions can be skipped later.
    for pos in range(feature_count):
        seq_ids = inputs.sequence_ids(pos)
        inputs['offset_mapping'][pos] = [
            span if seq_ids[token] == 1 else None
            for token, span in enumerate(inputs['offset_mapping'][pos])
        ]

    inputs['example_id'] = example_ids
    return inputs

In [26]:
# Preprocess the whole validation split, then compare the number of examples with the
# number of features produced from them.
validation_dataset = raw_datasets["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
)
len(raw_datasets["validation"]), len(validation_dataset)

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

(10570, 10822)

# post processing

### test on small validation sets

In [27]:
# Work on the first 100 validation examples to test the post-processing.
small_eval_set = raw_datasets['validation'].select(range(100))
# Checkpoint of a model that is loaded below to produce test predictions.
trained_checkpoint = 'distilbert-base-cased-distilled-squad'

# preprocess_validation_examples reads the module-level `tokenizer`, so the name is
# rebound here to the tokenizer of the checkpoint that will make the predictions.
# It must be switched back before any further preprocessing for the BERT model.
tokenizer = AutoTokenizer.from_pretrained(trained_checkpoint)

eval_set = small_eval_set.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets['validation'].column_names,
)

Loading cached processed dataset at C:\Users\Ethan\.cache\huggingface\datasets\squad\plain_text\1.0.0\d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453\cache-297acf91a2624012.arrow


In [28]:
# Columns of the preprocessed evaluation features.
eval_set

Dataset({
    features: ['input_ids', 'attention_mask', 'offset_mapping', 'example_id'],
    num_rows: 100
})

In [29]:
# Switch the module-level tokenizer back to the one of the checkpoint being fine-tuned.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

### a test run to get some simulated outputs

In [30]:
import torch
from transformers import AutoModelForQuestionAnswering

# Drop the two bookkeeping columns: every remaining column is passed to the model as a
# keyword argument below, so only model inputs may be left.
eval_set_for_model = eval_set.remove_columns(['example_id','offset_mapping'])
eval_set_for_model.set_format('torch')

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Build a single batch containing all evaluation features and move it to the device.
batch = {k: eval_set_for_model[k].to(device) for k in eval_set_for_model.column_names}
trained_model = AutoModelForQuestionAnswering.from_pretrained(trained_checkpoint).to(
    device
)

# Inference only: no gradients are needed.
with torch.no_grad():
    outputs = trained_model(**batch)

In [31]:
# Move the start / end logits to the CPU and convert them to NumPy arrays
# (one row per feature).
start_logits = outputs.start_logits.cpu().numpy()
end_logits = outputs.end_logits.cpu().numpy()

In [32]:
# Start logits of the first feature, one value per token position.
start_logits[0]

array([-2.2607305 , -5.178325  , -5.270898  , -6.0858765 , -6.450713  ,
       -6.2670946 , -5.314639  , -6.3032956 , -6.0470343 , -7.3009343 ,
       -5.6778765 , -3.7498586 , -4.7882466 , -0.38573128, -4.3966126 ,
       -1.9304973 , -5.377075  , -4.388612  , -2.539995  , -4.352418  ,
       -4.3879633 , -4.332363  , -4.6790547 , -3.579806  , -2.306936  ,
       -6.553044  , -2.7710211 , -0.872208  , -3.0604684 , -2.9521334 ,
       -4.1954756 , -1.3516891 , -3.725062  , -4.703031  , -4.068803  ,
       -0.8678059 , -3.6668837 , -1.8827084 ,  4.400484  ,  2.943778  ,
       -0.7979491 , -1.3878831 , -0.6945383 ,  1.5780765 , -1.7355467 ,
        0.52236897, 10.694437  ,  4.4599767 , -1.3703719 , -0.04971688,
        2.0126412 , -2.6718123 , -2.2983904 , -1.5135095 ,  0.02073593,
       -2.732355  , -0.02802771,  9.80368   ,  2.7017334 , -1.893289  ,
       -6.2241755 , -3.2045462 , -3.9968174 , -4.2383127 , -3.5693395 ,
       -2.502742  , -4.011329  , -5.996893  , -4.775757  , -2.77

### Create a map from each example id to the indices of its features

In [33]:
import collections

# Map every example id to the list of row indices of its features in eval_set.
# An example that was split into several windows gets several indices.
example_to_features = collections.defaultdict(list)
for idx, feature in enumerate(eval_set):
    example_to_features[feature['example_id']].append(idx)

In [34]:
# Inspect the mapping from example id to feature indices.
example_to_features

defaultdict(list,
            {'56be4db0acb8001400a502ec': [0],
             '56be4db0acb8001400a502ed': [1],
             '56be4db0acb8001400a502ee': [2],
             '56be4db0acb8001400a502ef': [3],
             '56be4db0acb8001400a502f0': [4],
             '56be8e613aeaaa14008c90d1': [5],
             '56be8e613aeaaa14008c90d2': [6],
             '56be8e613aeaaa14008c90d3': [7],
             '56bea9923aeaaa14008c91b9': [8],
             '56bea9923aeaaa14008c91ba': [9],
             '56bea9923aeaaa14008c91bb': [10],
             '56beace93aeaaa14008c91df': [11],
             '56beace93aeaaa14008c91e0': [12],
             '56beace93aeaaa14008c91e1': [13],
             '56beace93aeaaa14008c91e2': [14],
             '56beace93aeaaa14008c91e3': [15],
             '56bf10f43aeaaa14008c94fd': [16],
             '56bf10f43aeaaa14008c94fe': [17],
             '56bf10f43aeaaa14008c94ff': [18],
             '56bf10f43aeaaa14008c9500': [19],
             '56bf10f43aeaaa14008c9501': [20],
     

In [35]:
# Start logit of feature 0 at token position 46 (the largest value visible in the
# array printed above).
start_logits[0][46]

10.694437

In [36]:
# Offset mapping of feature 70 after preprocessing: positions outside the context are
# None, context tokens keep their character spans.
eval_set['offset_mapping'][70]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 [0, 3],
 [4, 11],
 [12, 16],
 [17, 19],
 [20, 25],
 [26, 30],
 [31, 33],
 [34, 39],
 [40, 44],
 [45, 47],
 [48, 51],
 [52, 57],
 [58, 65],
 [65, 66],
 [67, 73],
 [74, 77],
 [78, 85],
 [86, 88],
 [89, 95],
 [95, 96],
 [96, 97],
 [98, 105],
 [105, 106],
 [107, 112],
 [113, 119],
 [120, 123],
 [124, 129],
 [130, 135],
 [136, 139],
 [140, 146],
 [147, 150],
 [151, 155],
 [156, 161],
 [162, 170],
 [170, 171],
 [171, 172],
 [173, 182],
 [183, 184],
 [185, 191],
 [192, 197],
 [198, 202],
 [203, 212],
 [213, 216],
 [217, 218],
 [219, 228],
 [228, 229],
 [230, 236],
 [237, 247],
 [248, 251],
 [252, 258],
 [259, 262],
 [263, 268],
 [269, 274],
 [275, 279],
 [280, 283],
 [283, 284],
 [285, 294],
 [295, 299],
 [300, 304],
 [305, 312],
 [312, 313],
 [314, 315],
 [315, 316],
 [317, 322],
 [322, 323],
 [324, 327],
 [328, 331],
 [332, 338],
 [339, 345],
 [345, 346],
 [346, 347],
 None,
 None,
 None,
 None,
 Non

### compute the index for the answers

In [37]:
import numpy as np

# Turn the start / end logits into one predicted answer text per example.
n_best = 20  # how many of the highest start and end logits are combined per feature
max_answer_length = 30  # longest accepted answer span, measured in tokens
predicted_answers = []

for example in small_eval_set:
    example_id = example['id']
    context = example['context']
    answers = []  # candidate answers collected from all features of this example

    for feature_idx in example_to_features[example_id]:
        start_logit = start_logits[feature_idx]
        end_logit = end_logits[feature_idx]
        # Row access, as in compute_metrics: fetches only this feature's offsets
        # instead of reading the whole 'offset_mapping' column for every feature.
        offsets = eval_set[feature_idx]['offset_mapping']

        # Token positions of the n_best highest logits, best first.
        start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
        end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()

        for start_index in start_indexes:
            for end_index in end_indexes:
                # Offsets are None outside the context (question, special tokens, padding).
                if offsets[start_index] is None or offsets[end_index] is None:
                    continue
                # Skip spans that end before they start or that are too long.
                if (
                    end_index < start_index
                    or end_index - start_index + 1 > max_answer_length
                ):
                    continue

                # Every remaining (start, end) pair is a valid candidate.
                answers.append(
                    {
                        # Cut the answer text out of the context via the character offsets.
                        "text": context[offsets[start_index][0]:offsets[end_index][1]],
                        # Score of the span: start logit plus end logit.
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                )

    # max() raises ValueError on an empty list, so an example without any valid
    # candidate gets an empty prediction (the same fallback compute_metrics uses).
    if answers:
        best_answer = max(answers, key=lambda x: x['logit_score'])
        predicted_answers.append({"id": example_id, "prediction_text": best_answer['text']})
    else:
        predicted_answers.append({"id": example_id, "prediction_text": ""})

In [38]:
# One {'id', 'prediction_text'} record per example.
predicted_answers

[{'id': '56be4db0acb8001400a502ec', 'prediction_text': 'Denver Broncos'},
 {'id': '56be4db0acb8001400a502ed', 'prediction_text': 'Carolina Panthers'},
 {'id': '56be4db0acb8001400a502ee',
  'prediction_text': "Levi's Stadium in the San Francisco Bay Area at Santa Clara, California"},
 {'id': '56be4db0acb8001400a502ef', 'prediction_text': 'Carolina Panthers'},
 {'id': '56be4db0acb8001400a502f0', 'prediction_text': 'gold'},
 {'id': '56be8e613aeaaa14008c90d1', 'prediction_text': 'golden anniversary'},
 {'id': '56be8e613aeaaa14008c90d2', 'prediction_text': 'February 7, 2016'},
 {'id': '56be8e613aeaaa14008c90d3',
  'prediction_text': 'Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference'},
 {'id': '56bea9923aeaaa14008c91b9', 'prediction_text': 'golden anniversary'},
 {'id': '56bea9923aeaaa14008c91ba',
  'prediction_text': 'American Football Conference'},
 {'id': '56bea9923aeaaa14008c9

In [39]:
import evaluate

# Load the SQuAD metric; it reports 'exact_match' and 'f1'.
metric = evaluate.load('squad')

In [40]:
# Reference answers, given as {'id', 'answers'} records built from the raw examples.
theoretical_answers = [
    {"id":ex['id'], "answers":ex['answers']} for ex in small_eval_set
]

In [41]:
# Reference answers of the first example.
theoretical_answers[0]['answers']

{'text': ['Denver Broncos', 'Denver Broncos', 'Denver Broncos'],
 'answer_start': [177, 177, 177]}

In [42]:
# Trial evaluation to check that preprocessing and post-processing fit together.
metric.compute(predictions=predicted_answers, references=theoretical_answers)

{'exact_match': 83.0, 'f1': 88.25000000000004}

# The actual compute_metrics function

In [43]:
from tqdm.auto import tqdm

def compute_metrics(start_logits, end_logits, features, examples):
    """Convert start / end logits into answer texts and score them with `metric`.

    Args:
        start_logits: per-feature sequence of start logits, indexed like `features`.
        end_logits: per-feature sequence of end logits, indexed like `features`.
        features: preprocessed features providing 'example_id' and 'offset_mapping'.
        examples: raw examples providing 'id', 'context' and 'answers'.

    Returns:
        The result of `metric.compute` for the predicted answers.

    Reads the module-level `n_best`, `max_answer_length` and `metric`.
    """
    # Group the feature indices by the example they belong to.
    features_per_example = collections.defaultdict(list)
    for feature_pos, feature in enumerate(features):
        features_per_example[feature["example_id"]].append(feature_pos)

    predictions = []
    for example in tqdm(examples):
        ex_id = example["id"]
        ctx = example["context"]
        candidates = []

        # Collect candidate spans from every feature of this example.
        for feature_pos in features_per_example[ex_id]:
            s_logits = start_logits[feature_pos]
            e_logits = end_logits[feature_pos]
            offsets = features[feature_pos]["offset_mapping"]

            top_starts = np.argsort(s_logits)[-1 : -n_best - 1 : -1].tolist()
            top_ends = np.argsort(e_logits)[-1 : -n_best - 1 : -1].tolist()

            # Keep a pair only if both ends lie in the context (offset not None), the
            # span does not end before it starts, and it is not longer than
            # max_answer_length tokens.
            candidates.extend(
                {
                    "text": ctx[offsets[s][0] : offsets[e][1]],
                    "logit_score": s_logits[s] + e_logits[e],
                }
                for s in top_starts
                for e in top_ends
                if offsets[s] is not None
                and offsets[e] is not None
                and s <= e
                and e - s + 1 <= max_answer_length
            )

        # Best-scoring candidate, or an empty string when there is none.
        best_text = (
            max(candidates, key=lambda c: c["logit_score"])["text"] if candidates else ""
        )
        predictions.append({"id": ex_id, "prediction_text": best_text})

    references = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predictions, references=references)

In [44]:
# Must reproduce the scores of the step-by-step trial evaluation above.
compute_metrics(start_logits, end_logits, eval_set, small_eval_set)

  0%|          | 0/100 [00:00<?, ?it/s]

{'exact_match': 83.0, 'f1': 88.25000000000004}

# Fine tuning

In [45]:
# Load the base checkpoint with a question-answering head. The warning printed below
# reports weights that were not initialised from the checkpoint.
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and a

In [46]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub (the training arguments below set push_to_hub=True).
notebook_login()

Login successful
Your token has been saved to C:\Users\Ethan\.huggingface\token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [47]:
from transformers import TrainingArguments

args = TrainingArguments(
    "bert-finetuned-squad",  # output directory; checkpoints are written below it
    evaluation_strategy='no',  # evaluation during training is switched off
    save_strategy='epoch',  # the logs show one checkpoint per epoch (steps 11092, 22184, 33276)
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,  # the log reports the "cuda_amp half precision backend"
    push_to_hub=True,  # needs the Hub login from the cell above
)

In [48]:
from transformers import Trainer

import os

from transformers.trainer_utils import get_last_checkpoint

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
)

# Resume only when a checkpoint actually exists. Passing resume_from_checkpoint=True
# makes train() raise if the output directory contains no checkpoint, which would
# break the very first run on a fresh kernel. With None, training starts from scratch.
last_checkpoint = (
    get_last_checkpoint(args.output_dir) if os.path.isdir(args.output_dir) else None
)
trainer.train(resume_from_checkpoint=last_checkpoint)

N:\Github\AllAIAnalysis\bert-finetuned-squad is already a clone of https://huggingface.co/EthanWTL/bert-finetuned-squad. Make sure you pull the latest changes with `repo.git_pull()`.
Using cuda_amp half precision backend
Loading model from bert-finetuned-squad\checkpoint-11092.
***** Running training *****
  Num examples = 88729
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 33276
  Number of trainable parameters = 107721218
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 1
  Continuing training from global step 11092
  Will skip the first 1 epochs then the first 0 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


0it [00:00, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
11500,0.7766
12000,0.7424
12500,0.7413
13000,0.7652
13500,0.7388
14000,0.7611
14500,0.7576
15000,0.7878
15500,0.7276
16000,0.7655


Saving model checkpoint to bert-finetuned-squad\checkpoint-22184
Configuration saved in bert-finetuned-squad\checkpoint-22184\config.json
Model weights saved in bert-finetuned-squad\checkpoint-22184\pytorch_model.bin
tokenizer config file saved in bert-finetuned-squad\checkpoint-22184\tokenizer_config.json
Special tokens file saved in bert-finetuned-squad\checkpoint-22184\special_tokens_map.json
tokenizer config file saved in bert-finetuned-squad\tokenizer_config.json
Special tokens file saved in bert-finetuned-squad\special_tokens_map.json


NotADirectoryError: [WinError 267] 目录名称无效。: 'C:\\Users\\Ethan\\AppData\\Local\\Temp\\tmp3qi_sfyu\\lfs_progress'

In [49]:
from transformers import Trainer

# Collect everything the Trainer needs in one place, then build it from that mapping.
trainer_kwargs = dict(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
)
trainer = Trainer(**trainer_kwargs)

# Pick training back up from a previously saved checkpoint rather than starting over.
trainer.train(resume_from_checkpoint=True)

N:\Github\AllAIAnalysis\bert-finetuned-squad is already a clone of https://huggingface.co/EthanWTL/bert-finetuned-squad. Make sure you pull the latest changes with `repo.git_pull()`.
Using cuda_amp half precision backend
Loading model from bert-finetuned-squad\checkpoint-22184.
***** Running training *****
  Num examples = 88729
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 33276
  Number of trainable parameters = 107721218
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 2
  Continuing training from global step 22184
  Will skip the first 2 epochs then the first 0 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


0it [00:00, ?it/s]

Step,Training Loss
22500,0.4871
23000,0.5059
23500,0.5111
24000,0.5358
24500,0.5122
25000,0.5321
25500,0.5067
26000,0.5036
26500,0.5114
27000,0.5124


Saving model checkpoint to bert-finetuned-squad\checkpoint-33276
Configuration saved in bert-finetuned-squad\checkpoint-33276\config.json
Model weights saved in bert-finetuned-squad\checkpoint-33276\pytorch_model.bin
tokenizer config file saved in bert-finetuned-squad\checkpoint-33276\tokenizer_config.json
Special tokens file saved in bert-finetuned-squad\checkpoint-33276\special_tokens_map.json
tokenizer config file saved in bert-finetuned-squad\tokenizer_config.json
Special tokens file saved in bert-finetuned-squad\special_tokens_map.json


NotADirectoryError: [WinError 267] 目录名称无效。: 'C:\\Users\\Ethan\\AppData\\Local\\Temp\\tmp8ylg083_\\lfs_progress'

# evaluate

In [51]:
# Run the trained model over the validation features. The first element of the
# returned tuple holds the raw predictions, which unpack into a
# (start_logits, end_logits) pair.
prediction_output = trainer.predict(validation_dataset)
predictions = prediction_output[0]
start_logits, end_logits = predictions

# Reference baseline: 80.8 and 88.5 (presumably exact match / F1 -- confirm the source).
compute_metrics(start_logits, end_logits, validation_dataset, raw_datasets['validation'])

The following columns in the test set don't have a corresponding argument in `BertForQuestionAnswering.forward` and have been ignored: example_id, offset_mapping. If example_id, offset_mapping are not expected by `BertForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 10822
  Batch size = 8


  0%|          | 0/10570 [00:00<?, ?it/s]

{'exact_match': 81.40964995269631, 'f1': 88.81301224703492}

In [52]:
# Upload the fine-tuned model to the Hub so it can be reloaded later by name.
final_commit_message = "Training complete"
trainer.push_to_hub(commit_message=final_commit_message)

Saving model checkpoint to bert-finetuned-squad
Configuration saved in bert-finetuned-squad\config.json
Model weights saved in bert-finetuned-squad\pytorch_model.bin
tokenizer config file saved in bert-finetuned-squad\tokenizer_config.json
Special tokens file saved in bert-finetuned-squad\special_tokens_map.json
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Question Answering', 'type': 'question-answering'}, 'dataset': {'name': 'squad', 'type': 'squad', 'config': 'plain_text', 'split': 'train', 'args': 'plain_text'}}
To https://huggingface.co/EthanWTL/bert-finetuned-squad
   dd1625b..395e1e0  main -> main



# Using the fine-tuned model

In [53]:
from transformers import pipeline

# Build a question-answering pipeline from the checkpoint that was pushed to the Hub.
model_checkpoint = 'EthanWTL/bert-finetuned-squad'
question_answerer = pipeline(task="question-answering", model=model_checkpoint)

# A short passage and a question about it, used as a quick sanity check.
context = """
🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch and TensorFlow — with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
"""
question = "Which deep learning libraries back 🤗 Transformers?"

qa_inputs = {"question": question, "context": context}
question_answerer(**qa_inputs)

Downloading:   0%|          | 0.00/671 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
loading configuration file config.json from cache at C:\Users\Ethan/.cache\huggingface\hub\models--EthanWTL--bert-finetuned-squad\snapshots\395e1e0fa79d7e6aaa969b429a69fc53f8bf224b\config.json
Model config BertConfig {
  "_name_or_path": "EthanWTL/bert-finetuned-squad",
  "architectures": [
    "BertForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "posi

Downloading:   0%|          | 0.00/431M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at C:\Users\Ethan/.cache\huggingface\hub\models--EthanWTL--bert-finetuned-squad\snapshots\395e1e0fa79d7e6aaa969b429a69fc53f8bf224b\pytorch_model.bin
All model checkpoint weights were used when initializing BertForQuestionAnswering.

All the weights of BertForQuestionAnswering were initialized from the model checkpoint at EthanWTL/bert-finetuned-squad.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForQuestionAnswering for predictions without further training.


Downloading:   0%|          | 0.00/347 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/669k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

loading file vocab.txt from cache at C:\Users\Ethan/.cache\huggingface\hub\models--EthanWTL--bert-finetuned-squad\snapshots\395e1e0fa79d7e6aaa969b429a69fc53f8bf224b\vocab.txt
loading file tokenizer.json from cache at C:\Users\Ethan/.cache\huggingface\hub\models--EthanWTL--bert-finetuned-squad\snapshots\395e1e0fa79d7e6aaa969b429a69fc53f8bf224b\tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at C:\Users\Ethan/.cache\huggingface\hub\models--EthanWTL--bert-finetuned-squad\snapshots\395e1e0fa79d7e6aaa969b429a69fc53f8bf224b\special_tokens_map.json
loading file tokenizer_config.json from cache at C:\Users\Ethan/.cache\huggingface\hub\models--EthanWTL--bert-finetuned-squad\snapshots\395e1e0fa79d7e6aaa969b429a69fc53f8bf224b\tokenizer_config.json


{'score': 0.9948900938034058,
 'start': 78,
 'end': 105,
 'answer': 'Jax, PyTorch and TensorFlow'}

# Custom training loop using Accelerator

In [None]:
from tqdm.auto import tqdm
import numpy as np  # np.concatenate is used below; imported here so the cell is self-contained
import torch

# Manual train / evaluate / save loop driven by an `accelerator` object.
# Relies on names prepared in other cells: model, optimizer, lr_scheduler,
# train_dataloader, eval_dataloader, accelerator, num_training_steps,
# num_train_epochs, validation_dataset, raw_datasets, compute_metrics,
# tokenizer, output_dir and repo.

# One progress-bar tick per optimization step across all epochs.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # ---- Training ----
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        # Backpropagate through the accelerator rather than calling loss.backward() directly.
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # ---- Evaluation ----
    model.eval()
    start_logits = []
    end_logits = []
    accelerator.print("Evaluation!")
    for batch in tqdm(eval_dataloader):
        # No gradients are needed while scoring the evaluation set.
        with torch.no_grad():
            outputs = model(**batch)

        # Gather the logits through the accelerator, then move them to CPU NumPy arrays.
        start_logits.append(accelerator.gather(outputs.start_logits).cpu().numpy())
        end_logits.append(accelerator.gather(outputs.end_logits).cpu().numpy())

    # Stack the per-batch arrays and keep only the first len(validation_dataset) rows.
    # NOTE(review): presumably the gather can return extra rows at the end of the
    # evaluation set in a multi-process run -- confirm.
    start_logits = np.concatenate(start_logits)[: len(validation_dataset)]
    end_logits = np.concatenate(end_logits)[: len(validation_dataset)]

    metrics = compute_metrics(
        start_logits, end_logits, validation_dataset, raw_datasets["validation"]
    )
    # Report through the accelerator, like the "Evaluation!" message above, so the
    # line is not repeated by every process in a multi-process run.
    accelerator.print(f"epoch {epoch}:", metrics)

    # ---- Save and upload ----
    # Let every process finish the epoch before the weights are written out.
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    # Only the main process writes the tokenizer files and pushes to the Hub.
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )