In [1]:
import json
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
from sklearn.metrics import accuracy_score
from tqdm import tqdm

In [2]:
#readin training and test datas
file_path = 'qa_dataset.json'

with open(file_path, 'r') as file:
    data = json.load(file)

In [3]:
#customized dataset
class QADataset(Dataset):
    def __init__(self, tokenizer, data, block_size):
        self.tokenizer = tokenizer
        self.questions_answers = []

        with open(file_path, 'r') as file:
            for item in data:
                encoded = tokenizer.encode(item["product_name"]+item["product_description"]+item['question'] + tokenizer.eos_token + item['answer'], truncation=True, max_length=block_size)
                self.questions_answers.append(encoded)

    def __len__(self):
        return len(self.questions_answers)

    def __getitem__(self, idx):
        return torch.tensor(self.questions_answers[idx], dtype=torch.long)

def collate_fn(batch):
    batch = pad_sequence(batch, batch_first=True, padding_value=tokenizer.pad_token_id)
    return batch

In [4]:
#prepare model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('gpt2')
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [5]:
#prepare data
train_size = round(len(data) * 0.7)
train_data = data[:train_size]
test_data = data[train_size:]
train_dataset = QADataset(tokenizer, train_data, 512)
test_dataset = QADataset(tokenizer, test_data, 512)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
# train_loader = DataLoader(test_dataset, batch_size=3, shuffle=False)

In [6]:
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)



In [14]:
# Parameters
epochs = 50

# Loss function for binary classification
loss_fn = torch.nn.CrossEntropyLoss()


# Training loop
model.train()
for epoch in range(epochs):
    total_loss = 0
    for batch in tqdm(train_loader):
        inputs = batch.to(device)
        labels = batch.to(device)
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # scheduler.step()
    print(f"Epoch {epoch+1} finished. Loss: {total_loss/len(train_loader)}")

100%|██████████| 143/143 [00:27<00:00,  5.18it/s]


Epoch 1 finished. Loss: 2.158963008360429


100%|██████████| 143/143 [00:27<00:00,  5.19it/s]


Epoch 2 finished. Loss: 2.1642734020740004


100%|██████████| 143/143 [00:28<00:00,  5.08it/s]


Epoch 3 finished. Loss: 2.101853851671819


100%|██████████| 143/143 [00:27<00:00,  5.22it/s]


Epoch 4 finished. Loss: 2.1628483850639184


100%|██████████| 143/143 [00:27<00:00,  5.17it/s]


Epoch 5 finished. Loss: 2.14087531783364


100%|██████████| 143/143 [00:27<00:00,  5.14it/s]


Epoch 6 finished. Loss: 2.1311162768544016


  6%|▌         | 8/143 [00:01<00:26,  5.01it/s]


KeyboardInterrupt: 

In [8]:
model.save_pretrained('./gpt2-qa-finetuned')

In [None]:
# Switch to evaluation mode
model.eval()

predictions, true_labels = [], []

for batch in dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
        outputs = model(b_input_ids, attention_mask=b_input_mask)

    logits = outputs.logits
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    batch_preds = np.argmax(logits, axis=1)
    predictions.extend(batch_preds)
    true_labels.extend(label_ids)

# Calculate the accuracy
accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy: {accuracy}")

In [17]:
def generate_answer(model, tokenizer, question, max_length=256):
    model.eval()
    inputs = tokenizer(question, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        output = model.generate(input_ids, attention_mask=attention_mask, max_length=max_length, num_return_sequences=1)

    return tokenizer.decode(output[0], skip_special_tokens=True)
sample_question = "Hornby 2014 Catalogue Does this catalogue detail all the previous Hornby products please?"
generated_answer = generate_answer(model, tokenizer, sample_question)
print(f"Question: {sample_question}\nAnswer: {generated_answer}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Question: Hornby 2014 Catalogue Does this catalogue detail all the previous Hornby products please?
Answer: Hornby 2014 Catalogue Does this catalogue detail all the previous Hornby products please?

Yes, this catalogue is for the Hornby Hornby 2014 Catalogue.

What is the difference between the Hornby Hornby 2014 Catalogue and the Hornby Hornby 2014 Catalogue?

The Hornby Hornby 2014 Catalogue is a collection of Hornby products. The Hornby Hornby 2014 Catalogue is a collection of Hornby products.
