In [48]:
import json
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
from sklearn.metrics import accuracy_score
from tqdm import tqdm

In [49]:
#readin training and test datas
file_path = 'qa_dataset.json'

with open(file_path, 'r') as file:
    data = json.load(file)

In [50]:
#prepare model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('gpt2')
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [120]:
#customized dataset
class QADataset(Dataset):
    def __init__(self, tokenizer, data, block_size, train=True):
        self.train = True
        self.tokenizer = tokenizer
        self.questions_answers = []
        if not train:
            self.train = False
            self.answers = []

        with open(file_path, 'r') as file:
            for item in data:
                if train:
                    encoded = tokenizer.encode(f'Product: {item["product_name"]} Product Description: {item["product_description"]} Question: {item["question"]} Answer: {item["answer"]} {tokenizer.eos_token}', truncation=True, max_length=block_size, add_special_tokens=True)
                else:
                    encoded = tokenizer.encode(f'Product: {item["product_name"]} Product Description: {item["product_description"]} Question: {item["question"]}', truncation=True, max_length=block_size, add_special_tokens=False)
                    encoded2 = f'Product: {item["product_name"]} Product Description: {item["product_description"]} Question: {item["question"]} Answer: {item["answer"]}'
                    self.answers.append(encoded2)
                self.questions_answers.append(encoded)

    def __len__(self):
        return len(self.questions_answers)

    def __getitem__(self, idx):
        if self.train:
            return torch.tensor(self.questions_answers[idx], dtype=torch.long)
        else:
            return torch.tensor(self.questions_answers[idx], dtype=torch.long), self.answers[idx]

def collate_fn(batch):
    batch = pad_sequence(batch, batch_first=True, padding_value=tokenizer.pad_token_id)
    return batch

def collate_fn2(batch):
    questions, answers = zip(*batch)

    # Convert questions to tensors if they are not already
    questions_tensor = [torch.tensor(question) for question in questions]

    # Find the maximum length of the sequences in the batch
    max_len = max([q.size(0) for q in questions_tensor])

    # Manually pad each sequence on the left
    questions_padded = torch.stack([torch.cat([torch.full((max_len - q.size(0),), tokenizer.pad_token_id), q]) for q in questions_tensor])

    # Create attention masks for left-padded questions
    attention_masks = torch.stack([torch.cat([torch.zeros(max_len - q.size(0)), torch.ones(q.size(0))]) for q in questions_tensor])

    # Return the left-padded questions, their corresponding attention masks, and answers
    return questions_padded, attention_masks, answers


In [121]:
#prepare data
train_size = round(len(data) * 0.7)
train_data = data[:train_size]
test_data = data[train_size:]
train_dataset = QADataset(tokenizer, train_data, 512)
test_dataset = QADataset(tokenizer, test_data, 512, train=False)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=3, shuffle=False, collate_fn=collate_fn2)

In [66]:
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

In [67]:
# Parameters
epochs = 25

# Loss function for binary classification
loss_fn = torch.nn.CrossEntropyLoss()


# Training loop
model.train()
for epoch in range(epochs):
    total_loss = 0
    for batch in tqdm(train_loader):
        inputs = batch.to(device)
        labels = batch.to(device)
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # scheduler.step()
    print(f"Epoch {epoch+1} finished. Loss: {total_loss/len(train_loader)}")

100%|██████████| 143/143 [00:28<00:00,  4.97it/s]


Epoch 1 finished. Loss: 1.9753698431528532


100%|██████████| 143/143 [00:28<00:00,  4.95it/s]


Epoch 2 finished. Loss: 1.6165214964559862


100%|██████████| 143/143 [00:29<00:00,  4.82it/s]


Epoch 3 finished. Loss: 1.4270736837720537


100%|██████████| 143/143 [00:29<00:00,  4.82it/s]


Epoch 4 finished. Loss: 1.2839950898310521


100%|██████████| 143/143 [00:29<00:00,  4.80it/s]


Epoch 5 finished. Loss: 1.1583627995911179


100%|██████████| 143/143 [00:29<00:00,  4.85it/s]


Epoch 6 finished. Loss: 1.0521926892387283


100%|██████████| 143/143 [00:29<00:00,  4.78it/s]


Epoch 7 finished. Loss: 0.9500113946574551


100%|██████████| 143/143 [00:29<00:00,  4.83it/s]


Epoch 8 finished. Loss: 0.886465324805333


100%|██████████| 143/143 [00:29<00:00,  4.84it/s]


Epoch 9 finished. Loss: 0.8088926885511492


100%|██████████| 143/143 [00:29<00:00,  4.86it/s]


Epoch 10 finished. Loss: 0.756320019285162


100%|██████████| 143/143 [00:29<00:00,  4.84it/s]


Epoch 11 finished. Loss: 0.6957090847142093


100%|██████████| 143/143 [00:29<00:00,  4.81it/s]


Epoch 12 finished. Loss: 0.6473162655646985


100%|██████████| 143/143 [00:29<00:00,  4.80it/s]


Epoch 13 finished. Loss: 0.5946685510290253


100%|██████████| 143/143 [00:29<00:00,  4.81it/s]


Epoch 14 finished. Loss: 0.5537256621397458


100%|██████████| 143/143 [00:29<00:00,  4.82it/s]


Epoch 15 finished. Loss: 0.5180426467131901


100%|██████████| 143/143 [00:29<00:00,  4.89it/s]


Epoch 16 finished. Loss: 0.4819876163989514


100%|██████████| 143/143 [00:29<00:00,  4.79it/s]


Epoch 17 finished. Loss: 0.44196643195785845


100%|██████████| 143/143 [00:29<00:00,  4.77it/s]


Epoch 18 finished. Loss: 0.4105317572822104


100%|██████████| 143/143 [00:29<00:00,  4.78it/s]


Epoch 19 finished. Loss: 0.38904376192526385


100%|██████████| 143/143 [00:30<00:00,  4.75it/s]


Epoch 20 finished. Loss: 0.3555531578359904


100%|██████████| 143/143 [00:30<00:00,  4.70it/s]


Epoch 21 finished. Loss: 0.3229819705019464


100%|██████████| 143/143 [00:30<00:00,  4.69it/s]


Epoch 22 finished. Loss: 0.2957903680788887


100%|██████████| 143/143 [00:29<00:00,  4.88it/s]


Epoch 23 finished. Loss: 0.29023147900621377


100%|██████████| 143/143 [00:29<00:00,  4.77it/s]


Epoch 24 finished. Loss: 0.2617293029919371


100%|██████████| 143/143 [00:29<00:00,  4.77it/s]

Epoch 25 finished. Loss: 0.24846246682263753





In [76]:
model.save_pretrained('./gpt2-qa-finetuned-25-epoch')

In [123]:
from nltk.translate.bleu_score import sentence_bleu
tokenizer.padding_side = 'left'

# Switch to evaluation mode
model.eval()

# BLEU scores list
bleu_scores = []

for batch in tqdm(test_loader):
    encoded_questions, attention_masks, correct_answers = batch
    encoded_questions = encoded_questions.to(device)
    attention_masks = attention_masks.to(device)

    # Process each item in the batch
    for i in range(encoded_questions.size(0)):  # Iterate through each sequence in the batch
        with torch.no_grad():
            # Generate model output, using both encoded_questions and attention_masks
            output = model.generate(encoded_questions[i].unsqueeze(0), 
                                    attention_mask=attention_masks[i].unsqueeze(0),
                                    max_length=512)  # Set `your_max_length` appropriately
            generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

            # Prepare reference text and generated text for BLEU
            reference = correct_answers[i].split()
            candidate = generated_text.split()

            # Calculate BLEU score for this item and store it
            score = sentence_bleu([reference], candidate)
            bleu_scores.append(score)

# Calculate average BLEU score across all items
avg_bleu_score = sum(bleu_scores) / len(bleu_scores)
print(f"Average BLEU Score: {avg_bleu_score}")

  questions_tensor = [torch.tensor(question) for question in questions]
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  1%|          | 1/163 [00:01<03:36,  1.34s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  1%|          | 2/163 [00:03<04:41,  1.75s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  2%|▏         | 3/163 [00:04<03:38,  1.37s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generat

Average BLEU Score: 0.7754362933363205





In [124]:
print(f"Average BLEU Score: {avg_bleu_score}")

Average BLEU Score: 0.7754362933363205
