In [None]:
from google.colab import drive
drive.mount('/content/drive')  # Mount Google Drive for data and model storage
%cd /content/drive/MyDrive/  # Change to the desired working directory

In [None]:
# Install and upgrade necessary libraries
!pip install transformers --upgrade
!pip install accelerate --upgrade
!pip install --upgrade --force-reinstall accelerate
!pip install --upgrade --force-reinstall transformers[torch]
!pip install torch transformers
!pip install transformers[torch]
!pip install accelerate -U

In [None]:
import json
import pandas as pd
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, Trainer, TrainingArguments, GPT2Config
from sklearn.model_selection import train_test_split
import torch

In [None]:
# Load the training questions from the JSON file.
# The encoding is given explicitly so that decoding does not depend on the
# locale of the machine running the notebook.
with open('/content/drive/MyDrive/00_PFE/DataSet/Visual_Question_Answering /FloodNet Challenge @ EARTHVISION 2021 - Track 2/Questions/Training Question.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [None]:
# Build a DataFrame from the parsed JSON. The constructor places the top-level
# keys on the columns, so the frame is flipped to get one row per top-level key.
df = pd.DataFrame(data).T

In [None]:
# Extract the question texts and their class labels from the DataFrame.
questions = df['Question'].tolist()  # List of question strings

# Encode the categorical question types as integer codes. The second return
# value is kept so a predicted code can be mapped back to its category:
# label_names[i] is the original 'Question_Type' value for label i.
# Note: factorize encodes missing values as -1.
labels, label_names = pd.factorize(df['Question_Type'])

In [None]:
# Load the pretrained GPT-2 tokenizer and give it a padding token.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Reuse the end-of-sequence token as the padding token so that padded batches
# can be produced by the tokenizer calls later in the notebook.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Configure GPT-2 for sequence classification: one output per distinct label,
# with the tokenizer's EOS id registered as the padding id so the model can
# tell padding apart from content.
model_config = GPT2Config.from_pretrained('gpt2', num_labels=len(set(labels)), pad_token_id=tokenizer.eos_token_id)

# Load the pretrained GPT-2 weights. Constructing the class directly from a
# config yields a randomly initialised network, i.e. training from scratch
# rather than fine-tuning. With from_pretrained only the classification head
# is newly initialised.
model = GPT2ForSequenceClassification.from_pretrained('gpt2', config=model_config)

In [None]:
# Define a function for tokenizing the text
def tokenize_function(texts, max_length=512, padding="max_length"):
    """Tokenize a batch of texts with the notebook-level ``tokenizer``.

    Args:
        texts: The text(s) to encode; passed straight to the tokenizer.
        max_length: Maximum number of tokens per sequence. Longer inputs are
            truncated to this length. Defaults to 512.
        padding: Padding strategy forwarded to the tokenizer. The default,
            ``"max_length"``, pads every sequence up to ``max_length``; for
            short inputs a smaller ``max_length`` or ``padding=True`` reduces
            the amount of padding the model has to process.

    Returns:
        Whatever the tokenizer returns for the given arguments.
    """
    return tokenizer(texts, padding=padding, truncation=True, max_length=max_length)

In [None]:
# Split the data into training (70%) and validation (30%) sets.
# A fixed random_state makes the split identical on every run, so results from
# different runs of the notebook are comparable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    questions, labels, test_size=0.3, random_state=42
)

In [None]:
# Encode both splits with the shared tokenize_function (training first, then validation).
train_encodings, val_encodings = map(tokenize_function, (train_texts, val_texts))

In [None]:
# Define a custom Dataset class for PyTorch
class VQADataset(torch.utils.data.Dataset):
    """Pairs tokenizer encodings with integer class labels.

    Each item is a dict holding one tensor per encoding field plus a
    ``labels`` entry.
    """

    def __init__(self, encodings, labels):
        """Store the inputs without copying or converting them.

        Args:
            encodings: Mapping from field name to a sequence indexable by
                sample position (it must support ``.items()``).
            labels: Sequence of integer class labels, one per sample.
        """
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the sample at position ``idx`` as a dict of tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Force int64: the tensor dtype would otherwise follow the dtype of the
        # label container (e.g. an int32 array gives int32 targets), while the
        # classification loss expects long targets.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Return the total number of samples (the number of labels)."""
        return len(self.labels)

In [None]:
# Wrap each split's encodings and labels in a VQADataset.
train_dataset = VQADataset(encodings=train_encodings, labels=train_labels)
val_dataset = VQADataset(encodings=val_encodings, labels=val_labels)

In [None]:
# Hyper-parameters and output locations for the Trainer.
training_args = TrainingArguments(
    # Where results and logs are written (both on the mounted Drive).
    output_dir='/content/drive/MyDrive/00_PFE/Question_Answering/GPT/Result/results',
    logging_dir='/content/drive/MyDrive/00_PFE/Question_Answering/GPT/Result/logs',
    logging_steps=50,  # emit a log entry every 50 steps
    # Schedule: a single pass over the training data, 4 samples per device per step.
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Optimisation: learning-rate warmup length and weight decay.
    warmup_steps=500,
    weight_decay=0.01,
)

In [None]:
# Bundle the model, its training configuration and both datasets into a Trainer.
trainer = Trainer(
    args=training_args,           # training configuration
    model=model,                  # GPT-2 sequence-classification model
    eval_dataset=val_dataset,     # validation split
    train_dataset=train_dataset,  # training split
)

In [None]:
# Run the training loop on train_dataset using the settings in training_args.
# As the last expression in the cell, the value returned by train() is shown
# as the cell output.
trainer.train()

In [None]:
# Save the fine-tuned model to the specified directory
model.save_pretrained("/content/drive/MyDrive/00_PFE/")
# Save the tokenizer (including its pad-token setting) next to the model so the
# checkpoint can be reloaded for inference without repeating the tokenizer
# setup by hand.
tokenizer.save_pretrained("/content/drive/MyDrive/00_PFE/")