In [None]:
!pip install transformers[torch]

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu

In [None]:
!pip show accelerate

Name: accelerate
Version: 0.30.1
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: zach.mueller@huggingface.co
License: Apache
Location: /usr/local/lib/python3.10/dist-packages
Requires: huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch
Required-by: 


In [None]:
!pip install transformers --upgrade

Collecting transformers
  Downloading transformers-4.41.1-py3-none-any.whl (9.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.41.0
    Uninstalling transformers-4.41.0:
      Successfully uninstalled transformers-4.41.0
Successfully installed transformers-4.41.1


In [None]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=dcf1f771cddc0c4dc81617984a159a07e245977751b481bb3a7a70c0cb4fe9c7
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
import os

# Debugging aid, set before torch is imported below so it is in place when
# CUDA gets initialised. Per the CUDA documentation this makes kernel launches
# synchronous, which gives accurate stack traces for device-side errors.
# NOTE(review): it also slows GPU work down - consider removing for real runs.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Imports
from transformers import AutoTokenizer, BertModel, GPT2LMHeadModel, GPT2Tokenizer
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch
import math
import time
import sys
import json
import numpy as np
from tqdm import tqdm
import warnings
from rouge_score import rouge_scorer

# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and other
# actionable warnings raised by the libraries used below; consider narrowing
# it (e.g. by category or module) while debugging.
warnings.filterwarnings("ignore")

def _cut_after_answer_marker(token_ids, marker_variants):
    """Return ``token_ids`` truncated right after the last answer marker.

    ``marker_variants`` is a list of token-id lists (alternative encodings of
    the marker). The first variant that occurs in ``token_ids`` is used; the
    sequence is returned unchanged when none of them occurs.
    """
    for marker in marker_variants:
        width = len(marker)
        if width == 0:
            continue
        # Scan from the right so the *last* occurrence wins.
        for start in range(len(token_ids) - width, -1, -1):
            if token_ids[start:start + width] == marker:
                return token_ids[:start + width]
    return token_ids


def _left_pad(rows, pad_id):
    """Stack variable-length id lists into left-padded (input_ids, attention_mask) tensors."""
    width = max(1, max(len(row) for row in rows))
    input_ids = torch.full((len(rows), width), pad_id, dtype=torch.long)
    attention_mask = torch.zeros((len(rows), width), dtype=torch.long)
    for i, row in enumerate(rows):
        if row:
            input_ids[i, width - len(row):] = torch.tensor(row, dtype=torch.long)
            attention_mask[i, width - len(row):] = 1
    return input_ids, attention_mask


def _pick_label(generated_text, choice_labels, choice_texts):
    """Map generated text to a choice label.

    The first generated word is used directly when it is one of the choice
    labels; otherwise the choice whose text has the highest ROUGE-L F1 against
    the generated text is selected.
    """
    words = generated_text.split()
    if words:
        first_word = words[0].strip('.,:;()[]')
        if first_word in choice_labels:
            return first_word
    rouge_scores = [compute_rouge(generated_text, choice_text) for choice_text in choice_texts]
    return choice_labels[rouge_scores.index(max(rouge_scores))]


def evaluate(dataset_path, dataloader, model, device, tokenizer=None, max_new_tokens=50):
    """Generate an answer for every batch of ``dataloader`` and report accuracy.

    Args:
        dataset_path: JSON Lines file whose records provide
            ``question.choices`` (each with ``label`` and ``text``) and
            ``answerKey``; record ``i`` must correspond to the ``i``-th
            example yielded by ``dataloader``.
        dataloader: iterable of batches with ``input_ids`` and
            ``attention_mask`` tensors.
        model: model exposing ``eval()`` and ``generate()``.
        device: device the prompt tensors are moved to.
        tokenizer: tokenizer used to encode the marker and decode generations.
            Defaults to the module-level ``tokenizer`` for backward
            compatibility with existing four-argument calls.
        max_new_tokens: generation budget per example.

    Returns:
        Accuracy as a float in [0, 1] (0.0 for an empty dataset). The value is
        also printed as a percentage.

    Raises:
        NameError: no tokenizer was passed and no global ``tokenizer`` exists.
        ValueError: the number of generations differs from the number of
            dataset records.
    """
    if tokenizer is None:
        tokenizer = globals().get('tokenizer')
        if tokenizer is None:
            raise NameError("evaluate() needs a tokenizer: pass tokenizer=... or define a global `tokenizer`")

    with open(dataset_path, 'r') as file:
        dataset = [json.loads(line) for line in file if line.strip()]

    # The marker is encoded with and without the leading space because the
    # two forms tokenize differently.
    marker_variants = [tokenizer.encode(' [ANSWER]'), tokenizer.encode('[ANSWER]')]
    pad_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    model.eval()
    generated_texts = []
    with torch.no_grad():
        for batch in tqdm(dataloader):
            prompts = []
            for ids, mask in zip(batch['input_ids'].tolist(), batch['attention_mask'].tolist()):
                real_tokens = [tok for tok, keep in zip(ids, mask) if keep]
                # Drop everything after the answer marker so the gold answer
                # that the dataset appends to each example is not shown to
                # the model.
                prompts.append(_cut_after_answer_marker(real_tokens, marker_variants))

            # Decoder-only generation continues from the last position, so the
            # prompts are padded on the left rather than on the right.
            input_ids, attention_mask = _left_pad(prompts, pad_id)
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask,
                                     max_new_tokens=max_new_tokens, pad_token_id=tokenizer.eos_token_id)

            # Keep only the newly generated tokens: the prompt lists every
            # choice, so scoring it would match all choices equally well.
            continuations = outputs[:, input_ids.shape[1]:]
            generated_texts.extend(
                tokenizer.decode(row, skip_special_tokens=True) for row in continuations
            )

    if len(generated_texts) != len(dataset):
        raise ValueError(
            f"Got {len(generated_texts)} generations for {len(dataset)} dataset records; "
            "the dataloader and dataset_path must describe the same examples."
        )

    correct_predictions = 0
    total_predictions = 0
    for instance, generated_text in zip(dataset, generated_texts):
        choices = instance['question']['choices']
        choice_texts = [choice['text'] for choice in choices]
        choice_labels = [choice['label'] for choice in choices]

        if _pick_label(generated_text, choice_labels, choice_texts) == instance['answerKey']:
            correct_predictions += 1
        total_predictions += 1

    if total_predictions == 0:
        print("Accuracy: n/a (no examples)")
        return 0.0

    accuracy = correct_predictions / total_predictions
    print(f"Accuracy: {accuracy * 100:.2f}%")
    return accuracy

def load_answer_keys(file_path):
    """Collect the ``answerKey`` field of every record in a JSON Lines file.

    Args:
        file_path: path to a file holding one JSON object per line.

    Returns:
        A list with each record's ``answerKey`` value, in file order.
    """
    answer_keys = []
    with open(file_path, 'r') as handle:
        for raw_line in handle:
            record = json.loads(raw_line)
            answer_keys.append(record['answerKey'])
    return answer_keys

def compute_rouge(predicted_text, reference_text):
    """Return the ROUGE-L F1 score of ``predicted_text`` against ``reference_text``.

    Args:
        predicted_text: text produced by the model.
        reference_text: text it is compared with.

    Returns:
        The ``fmeasure`` of the ``rougeL`` score.
    """
    # Build the scorer once and reuse it; this function is called once per
    # choice per example during evaluation.
    scorer = getattr(compute_rouge, '_scorer', None)
    if scorer is None:
        scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
        compute_rouge._scorer = scorer

    # RougeScorer.score takes (target, prediction). Passing them by keyword
    # keeps precision/recall attributed to the right side; the F-measure
    # returned here is symmetric, so its value is the same either way.
    scores = scorer.score(target=reference_text, prediction=predicted_text)
    return scores['rougeL'].fmeasure

def generate_predictions(model, tokenizer, data, device, verbose=False):
    """Generate the text that follows ``[ANSWER]`` for each example.

    Args:
        model: model exposing ``to()``, ``eval()`` and ``generate()``.
        tokenizer: tokenizer used to encode prompts and decode generations.
        data: iterable of example strings; anything from the first
            ``[ANSWER]`` marker onwards is dropped before prompting.
        device: device the model and the prompt tensors are moved to.
        verbose: when true, print each prompt and its token ids (debugging).

    Returns:
        A list with, per example, the stripped text found after the last
        ``[ANSWER]`` marker of the decoded output.
    """
    model.to(device)
    model.eval()  # disable dropout while generating
    predictions = []
    with torch.no_grad():
        for item in tqdm(data):
            input_text = item.split('[ANSWER]')[0] + '[ANSWER]'
            # One prompt at a time needs no padding. Padding to max_length
            # would make the model continue after a run of pad tokens instead
            # of after the marker.
            inputs = tokenizer(input_text, return_tensors='pt', truncation=True, max_length=512).to(device)

            if verbose:
                print("Input text:", input_text)
                print("Tokenized input shape:", inputs['input_ids'].shape)
                print("Tokenized input IDs:", inputs['input_ids'])

            output = model.generate(**inputs, max_new_tokens=50, pad_token_id=tokenizer.eos_token_id)
            decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
            predictions.append(decoded_output.split('[ANSWER]')[-1].strip())
    return predictions

class GPT2GenerationDataset(Dataset):
    """Multiple-choice QA examples rendered as text for causal-LM fine-tuning.

    Every record becomes one string of the form
    ``[START] <fact> <stem> [<label>] <choice> ... [ANSWER] <answerKey>``.
    Each item exposes ``input_ids``, ``attention_mask`` and ``labels``, where
    ``labels`` is a copy of ``input_ids`` with every position up to and
    including the ``[ANSWER]`` marker, and every padding position, set to
    ``IGNORE_INDEX`` so that only the answer tokens carry a target.

    Attributes:
        data: list of rendered example strings.
        tokenizer: tokenizer passed to the constructor.
        tokenized_data: tokenizer output for ``data`` (padded to 512 tokens).
        label_data: dict whose ``'input_ids'`` entry holds the label tensor.
    """

    ANSWER_MARKER = '[ANSWER]'
    IGNORE_INDEX = -100

    def __init__(self, data, tokenizer):
        self.tokenizer = tokenizer
        self.data = self.preprocess_data(data)
        self.tokenized_data = self.tokenize_data(self.data)
        # The marker is not a single token, so it is located as a token
        # subsequence. Both encodings are kept because the marker tokenizes
        # differently with and without the space that precedes it.
        self._marker_variants = [
            self.tokenizer.encode(' ' + self.ANSWER_MARKER),
            self.tokenizer.encode(self.ANSWER_MARKER),
        ]
        self.label_data = {'input_ids': self._build_labels()}

    def preprocess_data(self, data):
        """Render each record as ``[START] fact stem [label] text ... [ANSWER] key``."""
        preprocessed_data = []
        for item in data:
            fact = item['fact1']
            stem = item['question']['stem']
            # Use each choice's own label instead of assuming exactly four
            # choices named A-D.
            rendered_choices = ' '.join(
                f"[{choice['label']}] {choice['text']}" for choice in item['question']['choices']
            )
            answer_key = item['answerKey']
            preprocessed_data.append(
                f"[START] {fact} {stem} {rendered_choices} {self.ANSWER_MARKER} {answer_key}"
            )
        return preprocessed_data

    def tokenize_data(self, data):
        """Tokenize a list of strings, padded/truncated to 512 tokens, as PyTorch tensors."""
        return self.tokenizer(data, padding='max_length',
                              truncation=True, max_length=512, return_tensors='pt')

    def _answer_start(self, token_ids):
        """Return the index just past the last answer marker, or None if it is absent."""
        for marker in self._marker_variants:
            width = len(marker)
            if width == 0:
                continue
            for start in range(len(token_ids) - width, -1, -1):
                if token_ids[start:start + width] == marker:
                    return start + width
        return None

    def _build_labels(self):
        """Build the label tensor: answer tokens kept, prompt and padding ignored."""
        input_ids = self.tokenized_data['input_ids']
        labels = input_ids.clone()  # never mutate the model inputs
        labels[self.tokenized_data['attention_mask'] == 0] = self.IGNORE_INDEX
        for row, token_ids in enumerate(input_ids.tolist()):
            answer_start = self._answer_start(token_ids)
            # If the marker is missing (e.g. cut off by truncation) the row
            # keeps full-sequence labels rather than being masked out entirely.
            if answer_start is not None:
                labels[row, :answer_start] = self.IGNORE_INDEX
        return labels

    def __len__(self):
        return len(self.tokenized_data['input_ids'])

    def __getitem__(self, idx):
        return {
            'input_ids': self.tokenized_data['input_ids'][idx],
            'attention_mask': self.tokenized_data['attention_mask'][idx],
            'labels': self.label_data['input_ids'][idx]
        }

if __name__ == "__main__":
    # Dataset files (JSON Lines), relative to the working directory. Defined
    # once so the loaders and the evaluation calls cannot drift apart.
    TRAIN_PATH = 'train_complete.jsonl'
    DEV_PATH = 'dev_complete.jsonl'
    TEST_PATH = 'test_complete.jsonl'
    SEED = 42

    # Seed torch before the model is built so that any random initialisation
    # done while resizing the embeddings is repeatable.
    torch.manual_seed(SEED)

    # Device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Tokenizer: GPT-2 ships without a padding token, so one is added.
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    # Model: the embedding matrix must grow to cover the added [PAD] token.
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)

    # Load the training data
    with open(TRAIN_PATH, 'r') as file:
        training_data = [json.loads(line) for line in file]

    training_dataset = GPT2GenerationDataset(training_data, tokenizer)

    # Load the validation data
    with open(DEV_PATH, 'r') as file:
        validation_data = [json.loads(line) for line in file]

    # The answer-key lists are not used further down in this cell; they are
    # kept available for interactive inspection.
    validation_answer_keys = load_answer_keys(DEV_PATH)
    validation_dataset = GPT2GenerationDataset(validation_data, tokenizer)
    # shuffle=False: evaluate() pairs the i-th generation with the i-th record of the file.
    validation_dataloader = DataLoader(validation_dataset, batch_size=8, shuffle=False)

    # Load the test data
    with open(TEST_PATH, 'r') as file:
        test_data = [json.loads(line) for line in file]

    test_answer_keys = load_answer_keys(TEST_PATH)
    test_dataset = GPT2GenerationDataset(test_data, tokenizer)
    test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

    # Training arguments
    training_args = TrainingArguments(
        output_dir='results',
        overwrite_output_dir=True,
        num_train_epochs=5,
        per_device_train_batch_size=8,
        save_steps=10_000,
        save_total_limit=2,
        prediction_loss_only=True,
        logging_steps=300,
        logging_dir='logs',
        seed=SEED
    )

    # Data collator
    # NOTE(review): with mlm=False this collator rebuilds `labels` from
    # `input_ids` (masking only padding), so the `labels` produced by the
    # dataset are replaced and the loss covers the whole sequence. To train on
    # the answer tokens only, the dataset's labels would have to reach the
    # model, which requires them to mask padding themselves.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=training_dataset
    )

    # Finetune the model on the training dataset
    trainer.train()

    # Evaluate on the validation set
    print("Validation Set Evaluation:")
    evaluate(DEV_PATH, validation_dataloader, model, device)

    # Evaluate on the test set (the path previously read 'est_complete.jsonl',
    # which raised FileNotFoundError after training had finished).
    print("Test Set Evaluation:")
    evaluate(TEST_PATH, test_dataloader, model, device)


Step,Training Loss
300,3.3776
600,2.3017
900,2.0554
1200,1.9951
1500,1.8538
1800,1.8152
2100,1.7269
2400,1.7138
2700,1.6614
3000,1.6367


Validation Set Evaluation:


100%|██████████| 63/63 [01:01<00:00,  1.02it/s]


Accuracy: 33.60%
Test Set Evaluation:


FileNotFoundError: [Errno 2] No such file or directory: 'est_complete.jsonl'

In [None]:
# Evaluate the fine-tuned model on the test split.
# Relies on kernel state created by the training cell above: `evaluate`,
# `test_dataloader`, `model` and `device` must already be defined.
print("Test Set Evaluation:")
evaluate(
    dataset_path='test_complete.jsonl',
    dataloader=test_dataloader,
    model=model,
    device=device,
)

Test Set Evaluation:


100%|██████████| 63/63 [01:01<00:00,  1.02it/s]


Accuracy: 32.80%
