In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

# GPT-2 ships without a PAD token, so the EOS token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
tokenizer.pad_token_id = tokenizer.eos_token_id

# Read only the three columns used downstream; empty cells are replaced by
# the literal string 'nan' so that later string concatenation never hits NaN.
df = pd.read_csv(
    'squad2.0_converted.csv',
    usecols=['context', 'question', 'answer'],
).fillna('nan')

In [None]:
# Wrap every column in its prompt section header. The answer additionally
# gets the EOS token appended so the model learns where the text ends.
df = df.assign(
    context='### Context:\n' + df['context'],
    question='\n### Question:\n' + df['question'],
    answer='\n### Answer:\n' + df['answer'] + tokenizer.eos_token,
)

In [None]:
# Hold out 20% of the rows for validation (fixed seed for reproducibility).
train_df, valid_df = train_test_split(df, train_size=0.8, random_state=46, shuffle=True)

# Preview the first training example. Use positional access: after the
# shuffled split the original index label 0 is not guaranteed to be in
# train_df, so `train_df['context'][0]` could raise KeyError.
first_train_row = train_df.iloc[0]
print(first_train_row['context'], end='')
print(first_train_row['question'], end='')
print(first_train_row['answer'])

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class SquadDataset(Dataset):
    """Dataset over a dataframe with 'context', 'question' and 'answer' columns.

    Items are returned as raw strings; tokenization, padding and label
    construction happen per batch in `collate_fn`.
    """

    def __init__(self, dataframe, tokenizer):
        """Keep references to the source dataframe and the tokenizer."""
        self.dataframe = dataframe
        self.tokenizer = tokenizer

    def __getitem__(self, index):
        """Return the (context, question, answer) strings of the row at position `index`."""
        row = self.dataframe.iloc[index]
        return row['context'], row['question'], row['answer']

    def __len__(self):
        """Number of rows in the underlying dataframe."""
        return len(self.dataframe)

    def tokenize_data(self, texts, max_length=512):
        """Tokenize a group of texts, truncating to `max_length` and padding to the longest.

        Returns:
            A tuple `(input_ids, attention_mask)` taken from the tokenizer output.
        """
        encoded = self.tokenizer(
            list(texts),
            truncation=True,
            padding='longest',
            max_length=max_length,
            return_tensors='pt',
        )
        return encoded.input_ids, encoded.attention_mask

    def collate_fn(self, batch):
        """Turn a list of (context, question, answer) triples into one model batch.

        Each of the three segments is tokenized and padded on its own, then
        the segments are concatenated along the sequence dimension.

        Returns:
            dict with
              'input_ids':      context ids + question ids + answer ids
              'attention_mask': the matching concatenated attention masks
              'labels':         -100 over context and question, the answer ids
                                over the answer, and -100 where the answer is padding
        """
        contexts, questions, answers = zip(*batch)

        # Question and answer go first: the context only gets whatever is
        # left of the 1024-token budget after them.
        q_ids, q_mask = self.tokenize_data(questions)
        a_ids, a_mask = self.tokenize_data(answers)
        context_budget = 1024 - a_ids.shape[1] - q_ids.shape[1]
        c_ids, c_mask = self.tokenize_data(contexts, max_length=context_budget)

        # Model input and mask: context | question | answer.
        input_ids = torch.cat((c_ids, q_ids, a_ids), dim=-1)
        attention_mask = torch.cat((c_mask, q_mask, a_mask), dim=-1)

        # Labels: context and question positions are filled with -100, and so
        # are the padded answer positions; real answer tokens keep their ids.
        c_labels = torch.full((c_ids.shape[0], c_ids.shape[-1]), -100)
        q_labels = torch.full((q_ids.shape[0], q_ids.shape[-1]), -100)
        a_labels = a_ids.masked_fill(a_mask == 0, -100)
        labels = torch.cat((c_labels, q_labels, a_labels), dim=-1)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels,
        }

In [None]:
# Build the datasets.
trainset = SquadDataset(train_df, tokenizer)
validset = SquadDataset(valid_df, tokenizer)

# Create the DataLoaders. Only the training data is shuffled: keeping the
# validation order fixed makes batch composition (and therefore padding)
# identical on every pass, so validation metrics are comparable across epochs.
train_loader = DataLoader(trainset, batch_size=4, shuffle=True, collate_fn=trainset.collate_fn)
valid_loader = DataLoader(validset, batch_size=4, shuffle=False, collate_fn=validset.collate_fn)

In [None]:
import torch.optim as optim
from transformers import get_cosine_with_hard_restarts_schedule_with_warmup
from transformers import AutoModelForCausalLM

# Training setup.
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

optimizer = optim.AdamW(model.parameters(), lr=5e-5)

# The schedule is expressed in optimizer steps: warm up for 20% of one epoch,
# then decay over 10 epochs' worth of batches.
# NOTE(review): 10 should stay in sync with the `epochs` value given to the
# Trainer, and this assumes the scheduler is stepped once per batch - confirm.
steps_per_epoch = len(train_loader)
scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
        optimizer,
        # The API expects an integer step count; `len * 0.2` alone is a float.
        num_warmup_steps=int(steps_per_epoch * 0.2),
        num_training_steps=steps_per_epoch * 10,
        num_cycles=1,
)

In [None]:
from Trainer import Trainer
# Collect the Trainer arguments in one place, then build and run it.
# The optimizer and scheduler are each handed over wrapped in a list.
trainer_kwargs = {
    'epochs': 10,
    'train_loader': train_loader,
    'valid_loader': valid_loader,
    'model': model,
    'optimizer': [optimizer],
    'scheduler': [scheduler],
    'early_stopping': 3,
}
trainer = Trainer(**trainer_kwargs)
trainer.train()

In [None]:
def inference(model, tokenizer, context, question, device, max_new_tokens=128, max_length=1024):
    """Greedy-decode an answer for an already prompt-formatted context and question.

    The prompt is `context + question + '\\n### Answer:\\n'`. Only the context
    is truncated when the prompt is too long, so the question and the answer
    marker always reach the model, and room for `max_new_tokens` generated
    tokens is reserved inside the `max_length` window.

    Args:
        model: Causal LM exposing `generate`.
        tokenizer: Tokenizer matching `model`.
        context: Context text, including its '### Context:' header.
        question: Question text, including its '### Question:' header.
        device: Device the input tensors are moved to (must be the model's device).
        max_new_tokens: Upper bound on the number of generated tokens.
        max_length: Total token window (prompt + generated); 1024 for GPT-2.

    Returns:
        The generated answer with surrounding whitespace stripped, or the
        string 'Error' if tokenization or generation failed (the cause is printed).
    """
    answer_marker = '\n### Answer:\n'
    try:
        # Tokenize the part that must never be truncated first.
        suffix_ids = tokenizer(f"{question}{answer_marker}", return_tensors='pt').input_ids

        # Whatever is left after the suffix and the generation reserve is the
        # budget for the context.
        context_budget = max_length - max_new_tokens - suffix_ids.shape[-1]
        if context_budget < 1:
            raise ValueError(
                f"no room left for the context: max_length={max_length}, "
                f"max_new_tokens={max_new_tokens}, question tokens={suffix_ids.shape[-1]}"
            )
        context_ids = tokenizer(
            context, max_length=context_budget, truncation=True, return_tensors='pt'
        ).input_ids

        input_ids = torch.cat((context_ids, suffix_ids), dim=-1).to(device=device, dtype=torch.long)
        attention_mask = torch.ones_like(input_ids)

        # Generation needs no gradients.
        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=max_new_tokens,
                do_sample=False,
            )

        # Decode only the newly generated tokens; splitting the full text on
        # the marker would pick the wrong piece if the context contains it.
        new_tokens = outputs[0][input_ids.shape[-1]:]
        generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True)
        # Cut off anything after a marker the model may have produced itself.
        return generated_text.split(answer_marker)[0].strip()
    except Exception as exc:
        # Keep the best-effort contract, but surface the cause and do not
        # swallow KeyboardInterrupt / SystemExit like a bare `except:` would.
        print(f"inference failed: {exc!r}")
        return 'Error'

# Load the trained weights and switch to evaluation mode.
# Inference runs on whatever device the model currently lives on; deriving it
# from the model keeps inputs and weights on the same device and avoids
# relying on a `device` variable that no cell of this notebook defines.
device = next(model.parameters()).device
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
# map_location lets a GPU-saved checkpoint be loaded on a CPU-only machine.
model.load_state_dict(torch.load('model.ckpt', map_location=device))
model.eval()

# Position (within valid_df) of the example to run inference on.
idx = 7

# Fetch the inference example.
context = valid_df['context'].values[idx]
question = valid_df['question'].values[idx]
answer = valid_df['answer'].values[idx]


# Run inference. GPT-2 has no PAD token, so generation pads with EOS.
model.generation_config.pad_token_id = tokenizer.eos_token_id
model_answer = inference(model, tokenizer, context, question, device)


# Print the context, the question, the reference answer (without its
# trailing EOS token) and the answer generated by the model.
print(f"{context}")
print(f"{question}")
print(f"{answer.split(tokenizer.eos_token)[0]}")
print("\n### Model Answer:\n" + model_answer)