In [41]:
import json
import torch
import numpy as np
from transformers import GPT2ForSequenceClassification, GPT2Tokenizer, AdamW
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, RandomSampler, TensorDataset


In [42]:
# Location of the question/answer records (a JSON document).
file_path = 'qa_dataset.json'

# Decode explicitly as UTF-8: without an encoding argument, open() falls back to
# the platform locale, which can mis-decode or reject non-ASCII text.
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

In [43]:
# Load the pretrained 'gpt2' tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [44]:
# Register '[PAD]' as the tokenizer's padding token so batches can be padded.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Binary target per record: 1 when its answer_length exceeds 100, otherwise 0.
labels = torch.tensor([int(x['answer_length'] > 100) for x in data])

# One text per record: the question and the answer joined by a literal " [SEP] ".
qa_texts = [" [SEP] ".join((x['question'], x['answer'])) for x in data]

# Encode every text, truncating at 512 tokens and padding the batch to a
# common length; results come back as PyTorch tensors.
tokenized_data = tokenizer(
    qa_texts,
    max_length=512,
    truncation=True,
    padding=True,
    return_tensors="pt",
)

# Bundle token ids, attention masks and labels into one indexable dataset.
dataset = TensorDataset(
    tokenized_data['input_ids'],
    tokenized_data['attention_mask'],
    labels,
)


In [47]:
# Load pretrained GPT-2 wrapped with a sequence-classification head.
model = GPT2ForSequenceClassification.from_pretrained('gpt2')

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Kept as the cell's last expression: moves the weights to `device` and
# displays the module tree.
model.to(device)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)

In [None]:
# Hyperparameters
batch_size = 16
epochs = 3

# Draw the training examples in a fresh random order every epoch.
dataloader = DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=batch_size)

# The tokenizer has one more entry ('[PAD]') than the checkpoint's vocabulary,
# so grow the embedding matrix to match and record the padding id in the
# model config.
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = len(tokenizer)

# Cross-entropy over the two class logits.
loss_fn = torch.nn.CrossEntropyLoss()

# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are pinned to the values transformers.AdamW used by default so
# the update rule stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
# StepLR counts epochs: with step_size=1 the learning rate is multiplied by
# gamma (0.9) each time scheduler.step() is called.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in dataloader:
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

        # Clear gradients left over from the previous batch.
        optimizer.zero_grad()

        outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask)
        logits = outputs.logits

        loss = loss_fn(logits, b_labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    # Decay the learning rate once per epoch. Stepping the scheduler after
    # every batch would shrink it by 0.9 per batch and drive it toward zero
    # long before the first epoch finished.
    scheduler.step()

    print(f"Epoch {epoch + 1} finished. Loss: {total_loss / len(dataloader)}")


In [None]:
# Put the model in inference mode (dropout layers become no-ops).
model.eval()

predictions, true_labels = [], []

# NOTE(review): this iterates the same `dataloader` that the training cell
# uses, so the figure printed below is accuracy on the training examples,
# not on held-out data.
with torch.no_grad():
    for batch in dataloader:
        input_ids, attention_mask, targets = (t.to(device) for t in batch)

        batch_logits = model(input_ids, attention_mask=attention_mask).logits

        # Predicted class = index of the larger logit in each row.
        predictions.extend(np.argmax(batch_logits.detach().cpu().numpy(), axis=1))
        true_labels.extend(targets.to('cpu').numpy())

# Fraction of examples whose predicted class matches the label.
accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy: {accuracy}")