In [8]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

# Define the dataset class
class RankingDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        query = self.data[idx]["query"]
        document = self.data[idx]["document"]
        label = self.data[idx]["label"]

        input_text = f"Query: {query} , Document: {document} , Relevant:"

        encoding = self.tokenizer.encode_plus(
            input_text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("gpt2")

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training


# Assuming you're fine-tuning the BackpackGPT2 model
model = AutoModelForSequenceClassification.from_pretrained("gpt2", num_labels=2)


model.config.pad_token_id = model.config.eos_token_id


# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid

In [9]:

# Sample training data
data = [
    {"query": "What is AI?", "document": "Artificial Intelligence is...", "label": 1},
    {"query": "What is AI?", "document": "The sky is blue.", "label": 0},
    # Add more examples...
]

# Prepare the dataset
train_dataset = RankingDataset(data, tokenizer)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Start training
trainer.train()



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


Step,Training Loss,Validation Loss


TrainOutput(global_step=3, training_loss=1.023418664932251, metrics={'train_runtime': 27.1152, 'train_samples_per_second': 0.221, 'train_steps_per_second': 0.111, 'total_flos': 1567780503552.0, 'train_loss': 1.023418664932251, 'epoch': 3.0})

In [10]:

# Saving the model after training
trainer.save_model('/mnt/d/EE/Term6/Projects/LLM/BP/models/rankers/gpt2_ranking')

# Example of inference
def predict_relevance(query, document):
    model.eval()
    input_text = f"Query: {query} , Document: {document} , Relevant:"
    encoding = tokenizer.encode_plus(
        input_text,
        add_special_tokens=True,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors="pt"
    )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_label = torch.argmax(logits, dim=-1).item()

    return "true" if predicted_label == 1 else "false"

query = "What is AI?"
document = "Artificial Intelligence is a branch of computer science..."
print(f"Query: {query} , Document: {document} , Relevant: {predict_relevance(query, document)}")


Query: What is AI? , Document: Artificial Intelligence is a branch of computer science... , Relevant: true


In [13]:
query = "Can you talk about World War 2?"
document = "Artificial Intelligence is a branch of computer science."
print(f"Query: {query} , Document: {document} , Relevant: {predict_relevance(query, document)}")

Query: Can you talk about World War 2? , Document: Artificial Intelligence is a branch of computer science. , Relevant: true
