In [1]:
import json
import numpy as np
import random
import torch
from torch.utils.data import DataLoader, Dataset 
from transformers import AdamW
from pathlib import Path
from argparse import Namespace
import wandb
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings("ignore")

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# Fix random seed for reproducibility
def same_seeds(seed):
	torch.manual_seed(seed)
	if torch.cuda.is_available():
			torch.cuda.manual_seed(seed)
			torch.cuda.manual_seed_all(seed)
	np.random.seed(seed)
	random.seed(seed)
	torch.backends.cudnn.benchmark = False
	torch.backends.cudnn.deterministic = True
same_seeds(2)

In [3]:
from transformers import (
  AutoTokenizer,
  AutoModelForQuestionAnswering,
)

model = AutoModelForQuestionAnswering.from_pretrained("bert-base-chinese").to(device)
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

# You can safely ignore the warning message (it pops up because new prediction heads for QA are initialized randomly)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForQuestionAnswering: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-chinese a

In [4]:
def read_data(file):
    with open(file, 'r', encoding="utf-8") as reader:
        data = json.load(reader)
    return data["questions"], data["paragraphs"]

train_questions, train_paragraphs = read_data("D://Code/Machine Translation/hw7_train.json")
dev_questions, dev_paragraphs = read_data("D://Code/Machine Translation/hw7_dev.json")
test_questions, test_paragraphs = read_data("D://Code/Machine Translation/hw7_test.json")

In [5]:
train_questions[0], train_paragraphs[0]

({'id': 0,
  'paragraph_id': 8164,
  'question_text': '保定至西安的電報線路架設後約10年什麼建設開始營運？',
  'answer_text': '盧漢鐵路盧保段',
  'answer_start': 141,
  'answer_end': 147},
 '2010年引入的廣州快速交通運輸系統是世界第二大快速運輸系統。每日載客量可達100萬人次。每小時的客流量峰值高達26,900名乘客，僅次於波哥大的快速交通系統。每10秒有一輛公共汽車，每輛公共汽車在一個方向上行駛350小時。該平台包括橋樑，是世界上最長的國家公共汽車快速運輸系統平台，長度為260米。目前，廣州市的出租車和公交車主要以液化石油氣為燃料，部分公交車採用油電，氣電混合技術。2012年底，一輛LNG燃料公共汽車開始啟動。2014年6月，引入了LNG插電式混合動力公交車取代LPG公交車。2007年1月16日，廣州市政府完全禁止在城市地區駕駛摩托車。違反禁令的機動車將被沒收。廣州市交通局聲稱，禁令的實施導致交通擁堵和車禍大大減少。廣州白雲國際機場位於白雲區與花都區交界處。它於2004年8月5日正式投入運營。它是中國第二繁忙的機場。機場取代了原先位於市中心的舊機場，無法滿足日益增長的航空需求。目前，機場有三個簡易機場，是中國第三個擁有三條跑道的民航機場。比2023年香港國際機場第三條跑道的預計完工時間提前了8年。')

In [6]:
len_q = []
len_p = []
for q in train_questions:
    len_q.append(len(q["question_text"]))
for p in train_paragraphs:
    len_p.append(len(p))

In [7]:
np.max(len_q)

222

In [8]:
np.mean(len_p)

437.5714658490443

In [9]:
# Tokenize questions and paragraphs separately
# 「add_special_tokens」 is set to False since special tokens will be added when tokenized questions and paragraphs are combined in datset __getitem__ 

train_questions_tokenized = tokenizer([train_question["question_text"] for train_question in train_questions], add_special_tokens=False)
dev_questions_tokenized = tokenizer([dev_question["question_text"] for dev_question in dev_questions], add_special_tokens=False)
test_questions_tokenized = tokenizer([test_question["question_text"] for test_question in test_questions], add_special_tokens=False) 

train_paragraphs_tokenized = tokenizer(train_paragraphs, add_special_tokens=False)
dev_paragraphs_tokenized = tokenizer(dev_paragraphs, add_special_tokens=False)
test_paragraphs_tokenized = tokenizer(test_paragraphs, add_special_tokens=False)

# You can safely ignore the warning message as tokenized sequences will be futher processed in datset __getitem__ before passing to model

Token indices sequence length is longer than the specified maximum sequence length for this model (566 > 512). Running this sequence through the model will result in indexing errors


In [10]:
train_questions_tokenized[3]

Encoding(num_tokens=23, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [11]:
train_paragraphs_tokenized[0]

Encoding(num_tokens=430, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [12]:
class QA_Dataset(Dataset):
    def __init__(self, split, questions, tokenized_questions, tokenized_paragraphs):
        self.split = split
        self.questions = questions
        self.tokenized_questions = tokenized_questions
        self.tokenized_paragraphs = tokenized_paragraphs
        self.max_question_len = 60
        self.max_paragraph_len = 150
        
        ##### TODO: Change value of doc_stride #####
        # self.doc_stride = 150
        self.doc_stride = 75

        # Input sequence length = [CLS] + question + [SEP] + paragraph + [SEP]
        self.max_seq_len = 1 + self.max_question_len + 1 + self.max_paragraph_len + 1

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        tokenized_question = self.tokenized_questions[idx]
        tokenized_paragraph = self.tokenized_paragraphs[question["paragraph_id"]]

        ##### TODO: Preprocessing #####
        # Hint: How to prevent model from learning something it should not learn
        if self.split == "train":
            # Convert answer's start/end positions in paragraph_text to start/end positions in tokenized_paragraph  
            answer_start_token = tokenized_paragraph.char_to_token(question["answer_start"])
            answer_end_token = tokenized_paragraph.char_to_token(question["answer_end"])

            # A single window is obtained by slicing the portion of paragraph containing the answer
            mid = (answer_start_token + answer_end_token) // 2
            
            paragraph_start = max(0, min(mid - self.max_paragraph_len // 2, len(tokenized_paragraph) - self.max_paragraph_len))
            paragraph_end = paragraph_start + self.max_paragraph_len

            # Slice question/paragraph and add special tokens (101: CLS, 102: SEP)
            input_ids_question = [101] + tokenized_question.ids[:self.max_question_len] + [102] 
            input_ids_paragraph = tokenized_paragraph.ids[paragraph_start : paragraph_end] + [102]		
            
            # Convert answer's start/end positions in tokenized_paragraph to start/end positions in the window  
            answer_start_token += len(input_ids_question) - paragraph_start
            answer_end_token += len(input_ids_question) - paragraph_start
            
            # Pad sequence and obtain inputs to model 
            input_ids, token_type_ids, attention_mask = self.padding(input_ids_question, input_ids_paragraph)
            return torch.tensor(input_ids), torch.tensor(token_type_ids), torch.tensor(attention_mask), answer_start_token, answer_end_token

        # Validation/Testing
        else:
            input_ids_list, token_type_ids_list, attention_mask_list = [], [], []
            
            # Paragraph is split into several windows, each with start positions separated by step "doc_stride"
            for i in range(0, len(tokenized_paragraph), self.doc_stride):
                
                # Slice question/paragraph and add special tokens (101: CLS, 102: SEP)
                input_ids_question = [101] + tokenized_question.ids[:self.max_question_len] + [102]
                input_ids_paragraph = tokenized_paragraph.ids[i : i + self.max_paragraph_len] + [102]
                
                # Pad sequence and obtain inputs to model
                input_ids, token_type_ids, attention_mask = self.padding(input_ids_question, input_ids_paragraph)
                
                input_ids_list.append(input_ids)
                token_type_ids_list.append(token_type_ids)
                attention_mask_list.append(attention_mask)
            
            return torch.tensor(input_ids_list), torch.tensor(token_type_ids_list), torch.tensor(attention_mask_list)

    def padding(self, input_ids_question, input_ids_paragraph):
        # Pad zeros if sequence length is shorter than max_seq_len
        padding_len = self.max_seq_len - len(input_ids_question) - len(input_ids_paragraph)
        # Indices of input sequence tokens in the vocabulary
        input_ids = input_ids_question + input_ids_paragraph + [0] * padding_len
        # Segment token indices to indicate first and second portions of the inputs. Indices are selected in [0, 1]
        token_type_ids = [0] * len(input_ids_question) + [1] * len(input_ids_paragraph) + [0] * padding_len
        # Mask to avoid performing attention on padding token indices. Mask values selected in [0, 1]
        attention_mask = [1] * (len(input_ids_question) + len(input_ids_paragraph)) + [0] * padding_len
        
        return input_ids, token_type_ids, attention_mask

train_set = QA_Dataset("train", train_questions, train_questions_tokenized, train_paragraphs_tokenized)
dev_set = QA_Dataset("dev", dev_questions, dev_questions_tokenized, dev_paragraphs_tokenized)
test_set = QA_Dataset("test", test_questions, test_questions_tokenized, test_paragraphs_tokenized)

In [13]:
def evaluate(data, output):
    ##### TODO: Postprocessing #####
    # There is a bug and room for improvement in postprocessing 
    # Hint: Open your prediction file to see what is wrong 
    
    answer = ''
    max_prob = float('-inf')
    num_of_windows = data[0].shape[1]
    
    for k in range(num_of_windows):
        # Obtain answer by choosing the most probable start position / end position
        start_prob, start_index = torch.max(output.start_logits[k], dim=0)
        end_prob, end_index = torch.max(output.end_logits[k], dim=0)
        
        # Probability of answer is calculated as sum of start_prob and end_prob
        prob = start_prob + end_prob
        
        # Replace answer if calculated probability is larger than previous windows
        #fix the bug 
        if start_index <= end_index:
            if prob > max_prob:
                max_prob = prob
                # Convert tokens to chars (e.g. [1920, 7032] --> "大 金")
                answer = tokenizer.decode(data[0][0][k][start_index : end_index + 1])
        
    # Remove spaces in answer (e.g. "大 金" --> "大金")
    return answer.replace(' ','')

In [17]:
config = Namespace(
    #save the checkpoint in wandb
    savedir = "./checkpoints/bert-medium",

    # training epoch
    num_epoch = 3,
    
    #Validate on validation set
    validation = True,
    
    #print the loss per logging step
    logging_step = 100,
    
    #learning rate
    learning_rate = 1e-5,

    # training batch size 
    train_batch_size = 8,

    # logging
    use_wandb=True,
)

In [19]:
from accelerate import Accelerator
from transformers import get_linear_schedule_with_warmup
from torch.optim.lr_scheduler import LambdaLR

#### TODO: gradient_accumulation (optional)####
# Note: train_batch_size * gradient_accumulation_steps = effective batch size
# If CUDA out of memory, you can make train_batch_size lower and gradient_accumulation_steps upper
# Doc: https://huggingface.co/docs/accelerate/usage_guides/gradient_accumulation
gradient_accumulation_steps = 16
optimizer = AdamW(model.parameters(), lr=config.learning_rate)


# dataloader
# Note: Do NOT change batch size of dev_loader / test_loader !
# Although batch size=1, it is actually a batch consisting of several windows from the same QA pair
train_loader = DataLoader(train_set, batch_size=config.train_batch_size, shuffle=True, pin_memory=True)
dev_loader = DataLoader(dev_set, batch_size=1, shuffle=False, pin_memory=True)
test_loader = DataLoader(test_set, batch_size=1, shuffle=False, pin_memory=True)

total_steps = len(train_loader) * config.num_epoch
num_warmup_steps = int(0.2 * total_steps)  # Set warmup steps to 20% of total steps
# [Hugging Face] Apply linear learning rate decay with warmup
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=total_steps
)

# Change "fp16_training" to True to support automatic mixed 
# precision training (fp16)	
fp16_training = True
if fp16_training:    
    accelerator = Accelerator(mixed_precision="fp16", gradient_accumulation_steps=gradient_accumulation_steps)
else:
    accelerator = Accelerator(gradient_accumulation_steps=gradient_accumulation_steps)
if config.use_wandb:
    wandb.init(project="Fintuning-QA-Bert", name=Path(config.savedir).stem, config=config)

# Documentation for the toolkit:  https://huggingface.co/docs/accelerate/
model, optimizer, train_loader, scheduler = accelerator.prepare(model, optimizer, train_loader, scheduler) 

In [16]:
model.train()


print("Start Training ...")

for epoch in range(config.num_epoch):
    step = 1
    train_loss = train_acc = 0
    
    for data in enumerate(tqdm(train_loader)):	
        with accelerator.accumulate(model):
            
            # Load all data into GPU
            data = [i.to(device) for i in data]
            
            # Model inputs: input_ids, token_type_ids, attention_mask, start_positions, end_positions (Note: only "input_ids" is mandatory)
            # Model outputs: start_logits, end_logits, loss (return when start_positions/end_positions are provided)  
            output = model(input_ids=data[0], token_type_ids=data[1], attention_mask=data[2], start_positions=data[3], end_positions=data[4])
            # Choose the most probable start position / end position
            start_index = torch.argmax(output.start_logits, dim=1)
            end_index = torch.argmax(output.end_logits, dim=1)
            
            # Prediction is correct only if both start_index and end_index are correct
            train_acc += ((start_index == data[3]) & (end_index == data[4])).float().mean()
               
            train_loss += output.loss
            
            accelerator.backward(output.loss)
            
            step += 1
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
        
        ##### TODO: Apply linear learning rate decay #####

        # Print training loss and accuracy over past logging step
        if step % config.logging_step == 0:
             if config.use_wandb:
                 wandb.log({
                    "train/loss": train_loss.item() / config.logging_step,
                    "train/acc": train_acc / config.logging_step,
                })
             print(f"Epoch {epoch + 1} | Step {step} | loss = {train_loss.item() / config.logging_step:.3f}, acc = {train_acc / config.logging_step:.3f}")
             train_loss = train_acc = 0

    if config.validation:
        print("Evaluating Dev Set ...")
        model.eval()
        with torch.no_grad():
            dev_acc = 0
            for i, data in enumerate(tqdm(dev_loader)):
                output = model(input_ids=data[0].squeeze(dim=0).to(device), token_type_ids=data[1].squeeze(dim=0).to(device),
                       attention_mask=data[2].squeeze(dim=0).to(device))
                # prediction is correct only if answer text exactly matches
                dev_acc += evaluate(data, output) == dev_questions[i]["answer_text"]
            if config.use_wandb: 
                wandb.log({
                    "Validation/acc": dev_acc / len(dev_loader),
                })
            print(f"Validation | Epoch {config.num_epoch + 1} | acc = {dev_acc / len(dev_loader):.3f}")
        model.train()

# Save a model and its configuration file to the directory 「saved_model」 
# i.e. there are two files under the direcory 「saved_model」: 「pytorch_model.bin」 and 「config.json」
# Saved model can be re-loaded using 「model = BertForQuestionAnswering.from_pretrained("saved_model")」
print("Saving Model ...")
model_save_dir = "saved_model" 
model.save_pretrained(model_save_dir)

Start Training ...


  0%|          | 0/3365 [00:00<?, ?it/s]

Epoch 1 | Step 100 | loss = 3.756, acc = 0.080
Epoch 1 | Step 200 | loss = 2.002, acc = 0.306
Epoch 1 | Step 300 | loss = 1.621, acc = 0.435
Epoch 1 | Step 400 | loss = 1.411, acc = 0.481
Epoch 1 | Step 500 | loss = 1.206, acc = 0.562
Epoch 1 | Step 600 | loss = 1.017, acc = 0.592
Epoch 1 | Step 700 | loss = 1.081, acc = 0.580
Epoch 1 | Step 800 | loss = 0.953, acc = 0.621
Epoch 1 | Step 900 | loss = 0.900, acc = 0.641
Epoch 1 | Step 1000 | loss = 0.859, acc = 0.623
Epoch 1 | Step 1100 | loss = 0.855, acc = 0.650
Epoch 1 | Step 1200 | loss = 0.826, acc = 0.645
Epoch 1 | Step 1300 | loss = 0.809, acc = 0.670
Epoch 1 | Step 1400 | loss = 0.725, acc = 0.699
Epoch 1 | Step 1500 | loss = 0.889, acc = 0.636
Epoch 1 | Step 1600 | loss = 0.690, acc = 0.710
Epoch 1 | Step 1700 | loss = 0.797, acc = 0.683
Epoch 1 | Step 1800 | loss = 0.705, acc = 0.688
Epoch 1 | Step 1900 | loss = 0.703, acc = 0.707
Epoch 1 | Step 2000 | loss = 0.665, acc = 0.706
Epoch 1 | Step 2100 | loss = 0.734, acc = 0.678
E

  0%|          | 0/2863 [00:00<?, ?it/s]

Validation | Epoch 4 | acc = 0.686


  0%|          | 0/3365 [00:00<?, ?it/s]

Epoch 2 | Step 100 | loss = 0.448, acc = 0.772
Epoch 2 | Step 200 | loss = 0.480, acc = 0.769
Epoch 2 | Step 300 | loss = 0.435, acc = 0.797
Epoch 2 | Step 400 | loss = 0.534, acc = 0.746
Epoch 2 | Step 500 | loss = 0.374, acc = 0.819
Epoch 2 | Step 600 | loss = 0.411, acc = 0.799
Epoch 2 | Step 700 | loss = 0.476, acc = 0.764
Epoch 2 | Step 800 | loss = 0.422, acc = 0.777
Epoch 2 | Step 900 | loss = 0.493, acc = 0.752
Epoch 2 | Step 1000 | loss = 0.489, acc = 0.780
Epoch 2 | Step 1100 | loss = 0.432, acc = 0.790
Epoch 2 | Step 1200 | loss = 0.473, acc = 0.782
Epoch 2 | Step 1300 | loss = 0.491, acc = 0.769
Epoch 2 | Step 1400 | loss = 0.464, acc = 0.787
Epoch 2 | Step 1500 | loss = 0.419, acc = 0.762
Epoch 2 | Step 1600 | loss = 0.435, acc = 0.792
Epoch 2 | Step 1700 | loss = 0.441, acc = 0.809
Epoch 2 | Step 1800 | loss = 0.409, acc = 0.817
Epoch 2 | Step 1900 | loss = 0.410, acc = 0.795
Epoch 2 | Step 2000 | loss = 0.467, acc = 0.780
Epoch 2 | Step 2100 | loss = 0.467, acc = 0.785
E

  0%|          | 0/2863 [00:00<?, ?it/s]

Validation | Epoch 4 | acc = 0.701


  0%|          | 0/3365 [00:00<?, ?it/s]

Epoch 3 | Step 100 | loss = 0.278, acc = 0.849
Epoch 3 | Step 200 | loss = 0.218, acc = 0.876
Epoch 3 | Step 300 | loss = 0.260, acc = 0.860
Epoch 3 | Step 400 | loss = 0.250, acc = 0.860
Epoch 3 | Step 500 | loss = 0.250, acc = 0.866
Epoch 3 | Step 600 | loss = 0.313, acc = 0.845
Epoch 3 | Step 700 | loss = 0.232, acc = 0.882
Epoch 3 | Step 800 | loss = 0.231, acc = 0.865
Epoch 3 | Step 900 | loss = 0.254, acc = 0.860
Epoch 3 | Step 1000 | loss = 0.293, acc = 0.836
Epoch 3 | Step 1100 | loss = 0.251, acc = 0.873
Epoch 3 | Step 1200 | loss = 0.270, acc = 0.868
Epoch 3 | Step 1300 | loss = 0.231, acc = 0.876
Epoch 3 | Step 1400 | loss = 0.300, acc = 0.849
Epoch 3 | Step 1500 | loss = 0.289, acc = 0.825
Epoch 3 | Step 1600 | loss = 0.254, acc = 0.870
Epoch 3 | Step 1700 | loss = 0.318, acc = 0.830
Epoch 3 | Step 1800 | loss = 0.287, acc = 0.845
Epoch 3 | Step 1900 | loss = 0.268, acc = 0.849
Epoch 3 | Step 2000 | loss = 0.273, acc = 0.852
Epoch 3 | Step 2100 | loss = 0.273, acc = 0.856
E

  0%|          | 0/2863 [00:00<?, ?it/s]

Validation | Epoch 4 | acc = 0.712
Saving Model ...


In [None]:
print("Evaluating Test Set ...")

result = []

model.eval()
with torch.no_grad():
    for data in tqdm(test_loader):
        output = model(input_ids=data[0].squeeze(dim=0).to(device), token_type_ids=data[1].squeeze(dim=0).to(device),
                       attention_mask=data[2].squeeze(dim=0).to(device))
        result.append(evaluate(data, output))

result_file = "result.csv"
with open(result_file, 'w') as f:	
    f.write("ID,Answer\n")
    for i, test_question in enumerate(test_questions):
    # Replace commas in answers with empty strings (since csv is separated by comma)
    # Answers in kaggle are processed in the same way
        f.write(f"{test_question['id']},{result[i].replace(',','')}\n")

print(f"Completed! Result is in {result_file}")

In [None]:
# import torch
# import random  
# import numpy as np

# # To avoid CUDA_OUT_OF_MEMORY
# torch.set_default_tensor_type(torch.cuda.FloatTensor)

# # Fix random seed for reproducibility
# def same_seeds(seed):
# 	torch.manual_seed(seed)
# 	if torch.cuda.is_available():
# 			torch.cuda.manual_seed(seed)
# 			torch.cuda.manual_seed_all(seed)
# 	np.random.seed(seed)
# 	random.seed(seed)
# 	torch.backends.cudnn.benchmark = False
# 	torch.backends.cudnn.deterministic = True
# same_seeds(2)

In [None]:
# from transformers import AutoTokenizer, AutoModelForCausalLM

# # You can try model with different size
# # When using Colab or Kaggle, models with more than 2 billions parameters may 
# # run out of memory
# tokenizer = AutoTokenizer.from_pretrained("facebook/xglm-1.7B")
# model = AutoModelForCausalLM.from_pretrained("facebook/xglm-1.7B")

In [None]:
# # To clean model output. If you try different prompts, you may have to fix 
# # this function on your own
# def clean_text(text):
#     # Note: When you use unilingual model, the colon may become fullwidth
#     text = text.split("答案:")[-1]
#     text = text.split(" ")[0]
#     return text

In [None]:
# import random
# import json

# with open("hw7_in-context-learning-examples.json", "r") as f: 
#     test = json.load(f)

# # K-shot learning 
# # Give model K examples to make it achieve better accuracy 
# # Note: (1) When K >= 4, CUDA_OUT_OFF_MEMORY may occur.
# #       (2) The maximum input length of XGLM is 2048
# K = 2

# question_ids = [qa["id"] for qa in test["questions"]]

# with open("in-context-learning-result.txt", "w") as f:
#     print("ID,Ground-Truth,Prediction", file = f)
#     with torch.no_grad():
#         for idx, qa in enumerate(test["questions"]):
#             # You can try different prompts
#             prompt = "請從最後一篇的文章中找出最後一個問題的答案\n"
#             exist_question_indexs = [question_ids.index(qa["id"])]

#             # K-shot learning: give the model K examples with answers
#             for i in range(K):
#                 question_index = question_ids.index(qa["id"])
#                 while(question_index in exist_question_indexs): 
#                     question_index = random.randint(0, len(question_ids) - 1)
#                 exist_question_indexs.append(question_index)    
#                 paragraph_id = test["questions"][question_index]["paragraph_id"]
#                 prompt += f'文章：{test["paragraphs"][paragraph_id]}\n'
#                 prompt += f'問題：{test["questions"][question_index]["question_text"]}\n'
#                 prompt += f'答案：{test["questions"][question_index]["answer_text"]}\n'

#             # The final one question without answer
#             paragraph_id = qa["paragraph_id"]
#             prompt += f'文章：{test["paragraphs"][paragraph_id]}\n'
#             prompt += f'問題：{qa["question_text"]}\n'
#             prompt += f'答案：'
            
#             inputs = tokenizer(prompt, add_special_tokens=False, return_tensors="pt") 
#             sample = model.generate(**inputs, max_new_tokens = 20)
#             text = tokenizer.decode(sample[0], skip_special_tokens=True)

#             # Note: You can delete this line to see what will happen
#             text = clean_text(text)
            
#             print(prompt)
#             print(f'正確答案: {qa["answer_text"]}')
#             print(f'模型輸出: {text}')
#             print()

#             print(f"{idx},{qa['answer_text']},{text}", file = f)