In [1]:
# Cell 1: Import required libraries
import json
import numpy as np
import torch
import random, os
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate
from tqdm import tqdm

# Cell 2: Define tool functions (從 util.py)
def read_jsonl(path: str):
    with open(path) as fh:
        return [json.loads(line) for line in fh.readlines() if line]

def remove_key_json(json_data, key_to_remove):
    return [{key: value for key, value in data.items() if key not in key_to_remove} for data in json_data]

def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)  
    torch.backends.cudnn.enabled = False 
    torch.backends.cudnn.benchmark = False

# Cell 3: Set up starting arguments
args = {
    "data_train_pth": './Quantitative-101/QQA/QQA_train.json',
    "data_dev_pth": './Quantitative-101/QQA/QQA_dev.json',
    "data_test_pth": "./Quantitative-101/QQA/QQA_test.json",
    "is_digit_base": False,
    "has_demonstrations": True,
    "model_name": 'google/flan-t5-base',
    "seed": 33,
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch"
}

# 設置隨機種子
set_seed(args["seed"])

# Cell 4: Load data
train_data = read_jsonl(args["data_train_pth"])
dev_data = read_jsonl(args["data_dev_pth"])
test_data = read_jsonl(args["data_test_pth"])

# # 添加採樣比例參數
# sample_ratio = 0.01  # 使用1%的數據進行測試
# sample_size_train = int(len(train_data[0]) * sample_ratio)
# sample_size_dev = int(len(dev_data[0]) * sample_ratio)
# sample_size_test = int(len(test_data[0]) * sample_ratio)

# # 隨機採樣
# train_data = [random.sample(train_data[0], sample_size_train)]
# dev_data = [random.sample(dev_data[0], sample_size_dev)]
# test_data = [random.sample(test_data[0], sample_size_test)]

# Cell 5: Data Type Conversion
def trans_to_dict_qqa(data):
    # 移除不需要的欄位
    data = remove_key_json(data, ['type', 'question_sci_10E', 'question_sci_10E_char', 'question_mask'])
    
    # 初始化字典
    keys = data[0].keys()
    data_dic = {}
    
    for key in keys:
        data_dic[key] = []
    
    # 轉換數據
    for item in data:
        for key in keys:
            sstr = item[key]
            sstr = str(sstr)
            data_dic[key].append(sstr.strip())
    
    return data_dic

# 轉換訓練和驗證數據
train_dict = trans_to_dict_qqa(train_data[0])
dev_dict = trans_to_dict_qqa(dev_data[0])


In [2]:
# 看看處理後的數據結構
print("可用的欄位：", list(train_dict.keys()))
print("\n訓練數據大小：", len(train_dict['question']))
print("驗證數據大小：", len(dev_dict['question']))

# 看看一個樣本的內容
print("\n第一個訓練樣本：")
for key in train_dict.keys():
    print(f"{key}: {train_dict[key][0]}")

可用的欄位： ['question', 'Option1', 'Option2', 'answer', 'question_char']

訓練數據大小： 564
驗證數據大小： 81

第一個訓練樣本：
question: The ranger and the rustler both were riding horses that galloped at the same speed.  The rustler left at 01:00 where as the ranger left at 0500 hours. Who has traveled further??
Option1: the ranger
Option2: the rustler
answer: Option 2
question_char: The ranger and the rustler both were riding horses that galloped at the same speed.  The rustler left at 0 1:0 0 where as the ranger left at 050 0 hours. Who has traveled further??


In [3]:
class instr_template:
    def __init__(self):
        self.input_template = {}
    
    def load_qqa_template(self):
        # 從 instruction_config.py 複製模板
        self.input_template['icl'] = f"""Choose a correct answer to the following questions
Question: Rolling a marble over dirt creates 1.2 mega N resistance, whereas rolling it over sand creates 45 N resistance. This means the marble will travel further over the?
Option 1: sand
Option 2: dirt
Answer: Option 1
Choose a correct answer to the following questions
Question: A toddler is rolling a ball for more than 1 mins on the grass and rolls it on to the sand where it stops after 43 seconds. The sand stopped the ball because it has _____ than the grass.?
Option 1: more friction
Option 2: less friction
Answer: Option 1
Choose a correct answer to the following questions
Question: Marlo weighs 678 N whereas his friend Dan weighs 852 N . The person which has more mass is likely? 
Option 1: Marlo
Option 2: Dan
Answer: Option 2
Choose a correct answer to the following questions
Question: The F-16 usually weighs 9034 kg and the jumbo jet weighs 439987 kg. Therefore, the F-16 was? 
Option 1: slower accelerating
Option 2: faster accelerating
Answer: Option 2
Choose a correct answer to the following questions
Question: {{question}}
Option 1: {{option1}}
Option 2: {{option2}}"""
        self.input_template['instr'] = f"""Choose a correct answer to the following questions
Question: {{question}}
Option 1: {{option1}}
Option 2: {{option2}}"""

qqa_template = instr_template()
qqa_template.load_qqa_template()

if args["has_demonstrations"]:
    input_template = qqa_template.input_template['icl']
else:
    input_template = qqa_template.input_template['instr']

print("模板示例：")
print(input_template.format(question=train_dict['question'][0], option1=train_dict['Option1'][0], option2=train_dict['Option2'][0]))


模板示例：
Choose a correct answer to the following questions
Question: Rolling a marble over dirt creates 1.2 mega N resistance, whereas rolling it over sand creates 45 N resistance. This means the marble will travel further over the?
Option 1: sand
Option 2: dirt
Answer: Option 1
Choose a correct answer to the following questions
Question: A toddler is rolling a ball for more than 1 mins on the grass and rolls it on to the sand where it stops after 43 seconds. The sand stopped the ball because it has _____ than the grass.?
Option 1: more friction
Option 2: less friction
Answer: Option 1
Choose a correct answer to the following questions
Question: Marlo weighs 678 N whereas his friend Dan weighs 852 N . The person which has more mass is likely? 
Option 1: Marlo
Option 2: Dan
Answer: Option 2
Choose a correct answer to the following questions
Question: The F-16 usually weighs 9034 kg and the jumbo jet weighs 439987 kg. Therefore, the F-16 was? 
Option 1: slower accelerating
Option 2: faster

In [4]:
# Cell 6: 設置 tokenizer 和預處理函數
tokenizer = AutoTokenizer.from_pretrained(args["model_name"])

def preprocess_function(examples):
    # 準備輸入
    inputs = [input_template.format(
        question=question,
        option1=option1,
        option2=option2
    ) for question, option1, option2 in zip(
        examples['question'],
        examples["Option1"],
        examples["Option2"]
    )]
    
    model_inputs = tokenizer(inputs, truncation=True, max_length=512)
    
    # 準備標籤
    labels = []
    for answer, option1, option2 in zip(examples["answer"], examples['Option1'], examples['Option2']):
        if '1' in answer:
            labels.append(answer+": "+option1)
        elif '2' in answer:
            labels.append(answer+": "+option2)
    
    model_labels = tokenizer(text_target=labels, truncation=True)
    model_inputs["labels"] = model_labels["input_ids"]
    
    return model_inputs

# 創建數據集
datasets = DatasetDict({
    'train': Dataset.from_dict(train_dict),
    'validation': Dataset.from_dict(dev_dict)
})

# 對數據集進行預處理
tokenized_datasets = datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=datasets["train"].column_names
)

# 檢查處理後的數據
print("\n處理後的數據結構：")
print(tokenized_datasets)

# 查看一個處理後的樣本
print("\n處理後的第一個樣本：")
print("輸入 ID:", tokenized_datasets["train"][0]["input_ids"])
print("標籤:", tokenized_datasets["train"][0]["labels"])

# 解碼檢查
print("\n解碼後的文本：")
print("輸入文本:", tokenizer.decode(tokenized_datasets["train"][0]["input_ids"]))
print("標籤:", tokenizer.decode(tokenized_datasets["train"][0]["labels"]))



  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]


處理後的數據結構：
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 564
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 81
    })
})

處理後的第一個樣本：
輸入 ID: [7023, 3, 9, 2024, 1525, 12, 8, 826, 746, 11860, 10, 6070, 53, 3, 9, 14260, 147, 9404, 482, 7, 3, 10917, 13950, 445, 5673, 6, 3, 10339, 8394, 34, 147, 3, 7, 232, 482, 7, 3479, 445, 5673, 5, 100, 598, 8, 14260, 56, 1111, 856, 147, 8, 58, 10231, 209, 10, 3, 7, 232, 10231, 204, 10, 9404, 11801, 10, 10231, 209, 7023, 3, 9, 2024, 1525, 12, 8, 826, 746, 11860, 10, 71, 13817, 19, 8394, 3, 9, 1996, 21, 72, 145, 209, 3519, 7, 30, 8, 5956, 11, 15246, 34, 30, 12, 8, 3, 7, 232, 213, 34, 10796, 227, 8838, 3978, 5, 37, 3, 7, 232, 4910, 8, 1996, 250, 34, 65, 31020, 145, 8, 5956, 5, 58, 10231, 209, 10, 72, 21764, 10231, 204, 10, 705, 21764, 11801, 10, 10231, 209, 7023, 3, 9, 2024, 1525, 12, 8, 826, 746, 11860, 10, 1571, 40, 32, 11385

In [21]:
# Cell 7: 設置訓練參數
training_args = Seq2SeqTrainingArguments(
    output_dir="./results_qqa",
    evaluation_strategy=args["evaluation_strategy"],
    save_strategy=args["save_strategy"],
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="micro_f1",
    predict_with_generate=True,
    logging_strategy="epoch"
)

f1_metric = evaluate.load("./subtask1/f1.py")
# Cell 8: 定義評估指標
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # 轉換為 0/1 標籤
    decoded_preds = [0 if item.startswith("Option 1") else 1 for item in decoded_preds]
    decoded_labels = [0 if item.startswith("Option 1") else 1 for item in decoded_labels]

    # 計算 F1 分數
    macro_f1 = f1_metric.compute(predictions=decoded_preds, references=decoded_labels, average="macro")
    micro_f1 = f1_metric.compute(predictions=decoded_preds, references=decoded_labels, average="micro")

    return {
        'macro_f1': macro_f1['f1']*100,
        'micro_f1': micro_f1['f1']*100
    }

In [8]:
# Cell 9: 設置模型和訓練器
model = AutoModelForSeq2SeqLM.from_pretrained(args["model_name"])
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Cell 10: 開始訓練
print("開始訓練...")
trainer.train()

  return torch.load(checkpoint_file, map_location="cpu")


開始訓練...


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Macro F1,Micro F1
1,0.5641,0.116329,42.188739,44.444444
2,0.1314,0.119828,32.5,48.148148
3,0.1245,0.113269,32.5,48.148148
4,0.1308,0.109572,44.197138,51.851852
5,0.1137,0.106036,43.20122,43.209877
6,0.1209,0.107886,39.28036,50.617284
7,0.1182,0.10682,42.688679,48.148148
8,0.1125,0.105895,46.390642,46.91358
9,0.1145,0.105814,46.881196,46.91358
10,0.1109,0.10607,45.0,45.679012


  state_dict = torch.load(best_model_path, map_location="cpu")


TrainOutput(global_step=710, training_loss=0.1641433339723399, metrics={'train_runtime': 142.7142, 'train_samples_per_second': 39.52, 'train_steps_per_second': 4.975, 'total_flos': 2585440475504640.0, 'train_loss': 0.1641433339723399, 'epoch': 10.0})

In [19]:
# Cell 11: 定義預測函數
def get_predict(model, tokenized_dataset, batch_size=8, max_new_tokens=25, sample_set='test', device='cuda'):
    model.to(device)
    model.eval()
    
    outputs = []
    outputs_raw = []
    
    dataloader = torch.utils.data.DataLoader(
        tokenized_dataset[sample_set], 
        batch_size=batch_size
    )

    def collate_fn(batch):
        input_ids = [torch.tensor(example['input_ids']) for example in batch]
        attention_mask = [torch.tensor(example['attention_mask']) for example in batch]
        
        # 找出最大長度
        max_len = max(len(ids) for ids in input_ids)
        
        # 填充到最大長度
        padded_input_ids = []
        padded_attention_mask = []
        for ids, mask in zip(input_ids, attention_mask):
            padding_len = max_len - len(ids)
            padded_input_ids.append(torch.cat([ids, torch.ones(padding_len, dtype=torch.long) * tokenizer.pad_token_id]))
            padded_attention_mask.append(torch.cat([mask, torch.zeros(padding_len, dtype=torch.long)]))
        
        return {
            'input_ids': torch.stack(padded_input_ids),
            'attention_mask': torch.stack(padded_attention_mask)
        }
    
    dataloader = torch.utils.data.DataLoader(
        tokenized_dataset[sample_set], 
        batch_size=batch_size,
        collate_fn=collate_fn  # 使用自定義的 collate_fn
    )
    
    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            
            generated_tokens = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=max_new_tokens,
                pad_token_id=tokenizer.pad_token_id
            )
            
            decoded = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
            outputs_raw.extend(decoded)
            
            # 轉換為二進制標籤
            batch_outputs = [0 if out.startswith("Option 1") else 1 for out in decoded]
            outputs.extend(batch_outputs)
            
    return outputs, outputs_raw

# Cell 12: 加載測試數據
print("加載測試數據...")
test_data = read_jsonl(args["data_test_pth"])
test_dict = trans_to_dict_qqa(test_data[0])
test_dataset = Dataset.from_dict(test_dict)

# 對測試數據進行預處理
test_tokenized = test_dataset.map(
    preprocess_function,
    remove_columns=test_dataset.column_names,
    batched=True
)

加載測試數據...


  0%|          | 0/1 [00:00<?, ?ba/s]

In [22]:
# Cell 13: 進行預測和評估
print("開始預測...")
decoded_preds, decoded_preds_raw = get_predict(
    model=model,
    tokenized_dataset={"test": test_tokenized},
    batch_size=8,
    max_new_tokens=25
)

labels = [0 if ans.startswith("Option 1") else 1 for ans in test_dict['answer']]

# 計算評估指標
# f1_metric = evaluate.load("./f1.py")
macro_f1 = f1_metric.compute(predictions=decoded_preds, references=labels, average="macro")
micro_f1 = f1_metric.compute(predictions=decoded_preds, references=labels, average="micro")

# 計算準確率
accuracy = sum(1 for p, t in zip(decoded_preds, labels) if p == t) / len(labels)

print("\n測試集評估結果：")
print(f"Accuracy: {accuracy*100:.2f}%")
print(f"Macro F1: {macro_f1['f1']*100:.2f}%")
print(f"Micro F1: {micro_f1['f1']*100:.2f}%")

開始預測...


100%|██████████████████████████████████████████████████████████| 21/21 [00:01<00:00, 13.41it/s]


測試集評估結果：
Accuracy: 50.62%
Macro F1: 42.14%
Micro F1: 50.62%





In [23]:
save_res = []
for q, o1, o2, ans, pred, pred_raw in zip(
    test_dict['question'],
    test_dict['Option1'],
    test_dict['Option2'],
    test_dict['answer'],
    decoded_preds,
    decoded_preds_raw
):
    save_res.append({
        "question": q,
        "option1": o1,
        "option2": o2,
        "answer": ans,
        "prediction": f"Option {pred+1}",
        "model_output": pred_raw
    })

output_dir = "./results"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "predictions_qqa.json")

print(f"\n保存預測結果到：{output_path}")
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(save_res, f, ensure_ascii=False, indent=2)


保存預測結果到：./results/predictions_qqa.json
