# GPT-2 Fine-Tuning

In [None]:
from datasets import load_from_disk

# Read the previously tokenized dataset splits back from their directory under base_dir
tokenized_path = f"{base_dir}/tokenized_datasets"
tokenized_datasets = load_from_disk(tokenized_path)

In [None]:
# Inspect the tokenized training split; as the cell's last expression it is
# rendered as the Dataset summary (feature names and row count)
tokenized_datasets['train']

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 99357
})

In [None]:
# Inspect the tokenized validation split; as the cell's last expression it is
# rendered as the Dataset summary (feature names and row count)
tokenized_datasets['validation']

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 24840
})

In [None]:
# Importing the 'accelerate' library for streamlined hardware acceleration
import accelerate
import transformers
# Importing necessary class for setting up training arguments
from transformers import TrainingArguments
from transformers import DataCollatorForLanguageModeling

# Hyperparameters and bookkeeping options for the Trainer run, grouped by purpose
training_args = TrainingArguments(
    # --- Output locations ---
    output_dir=f'{base_dir}/results',  # Results and model checkpoints
    logging_dir=f'{base_dir}/logs',    # Log files

    # --- Batching ---
    # 4 samples per device per step, with gradients accumulated over 4 steps,
    # i.e. 16 samples per device for every optimizer update
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,

    # --- Optimization ---
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=True,                         # 16-bit mixed-precision training

    # --- Logging / evaluation / checkpoint cadence (in steps) ---
    logging_steps=100,
    evaluation_strategy="steps",       # Evaluate on a step schedule (see eval_steps)
    eval_steps=500,
    save_steps=500,
    save_total_limit=3,                # Keep at most 3 checkpoints on disk
)

# Collator configured for causal (non-masked) language modeling, since mlm=False
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [None]:
# Import necessary modules from the transformers library
from transformers import Trainer
import torch

# Define a custom collation function to handle batching of data
def collate_batch(data):
    """Collate tokenized examples into one batch for causal language modeling.

    Args:
        data: Sequence of examples, each a mapping with ``'input_ids'`` and
            ``'attention_mask'`` entries. All sequences in the batch must have
            the same length, otherwise ``torch.stack`` raises.

    Returns:
        dict: ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors,
        each of shape ``(len(data), sequence_length)``. ``labels`` is an
        independent copy of ``input_ids`` in which every position whose
        attention mask is 0 has been replaced by ``-100``.
    """
    # Stack the per-example sequences into (batch, seq_len) tensors
    input_ids = torch.stack([torch.as_tensor(item['input_ids']) for item in data])
    attention_mask = torch.stack([torch.as_tensor(item['attention_mask']) for item in data])

    # Causal language modeling: the labels are the inputs themselves (Hugging
    # Face causal-LM heads shift them internally). Padding positions
    # (attention_mask == 0) are set to -100, the index the loss ignores, so the
    # model is not trained on -- nor its loss diluted by -- padding tokens.
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100

    # Return a dictionary with keys corresponding to model input names
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }


In [None]:
# Set up the Trainer with the model, training arguments, datasets, and the custom data collator
import os

from transformers.trainer_utils import get_last_checkpoint

trainer = Trainer(
    model=model,                                        # Model to be trained
    args=training_args,                                 # Training arguments
    train_dataset=tokenized_datasets["train"],          # Training dataset
    eval_dataset=tokenized_datasets["validation"],      # Evaluation dataset
    data_collator=collate_batch                         # Custom function to form batches
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

# Resume from the newest checkpoint in the Trainer's own output directory when
# one exists; otherwise train from scratch. This keeps the cell runnable on a
# fresh environment and avoids a hard-coded absolute checkpoint path that can
# drift from `training_args.output_dir`.
checkpoint_dir = training_args.output_dir
last_checkpoint = get_last_checkpoint(checkpoint_dir) if os.path.isdir(checkpoint_dir) else None

# Start (or continue) the training process
trainer.train(resume_from_checkpoint=last_checkpoint)

Step,Training Loss,Validation Loss
18000,0.104,0.104319
18500,0.1055,0.104324


TrainOutput(global_step=18630, training_loss=0.006397193667073437, metrics={'train_runtime': 3023.2438, 'train_samples_per_second': 98.593, 'train_steps_per_second': 6.162, 'total_flos': 7.815362055399014e+16, 'train_loss': 0.006397193667073437, 'epoch': 3.0})

In [None]:
import math

# Validation loss at the last evaluation step (18500) of the training log above
validation_loss = 0.104324

# Perplexity is the exponential of the mean cross-entropy loss
perplexity = math.exp(validation_loss)
print("Perplexity:", perplexity)


Perplexity: 1.1099600237099743


In [None]:
# Run a full evaluation pass over the validation split; the Trainer returns
# its metrics as a dict
results = trainer.evaluate()

# Display the measured metrics (previously computed but never shown), together
# with the perplexity derived from the measured loss rather than a hand-copied value
{**results, "perplexity": math.exp(results["eval_loss"])}

# Save Model

In [None]:
# Write the model first, then the tokenizer, into the same directory so the
# pair can later be loaded from a single path
model_dir = "/content/drive/MyDrive/GPT2-finetune/Model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_dir)