## Import Essential Libraries

In [2]:
# Dependencies for the whole notebook, gathered ahead of any executable statement
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)

# Report the CUDA toolkit version torch was built with and whether a GPU is usable
cuda_version = torch.version.cuda
cuda_ready = torch.cuda.is_available()
print("CUDA Version:", cuda_version)
print("Is CUDA available:", cuda_ready)

CUDA Version: 12.1
Is CUDA available: True


## Preprocess Data

In [4]:
class JokesDataset(Dataset):
    """Jokes from a CSV file, tokenized once up front and served item by item.

    The CSV must contain a ``joke`` column. Every usable joke is tokenized in
    the constructor with ``padding=True``, so all items share the same width
    and can be stacked by a default ``DataLoader`` collate function.

    Args:
        file_path: Path of the CSV file to read with ``pandas.read_csv``.
        tokenizer: Callable invoked as
            ``tokenizer(list_of_str, return_tensors='pt', truncation=True,
            padding=True, max_length=max_length)``; its result must be
            indexable by ``'input_ids'`` and ``'attention_mask'``.
        num_samples: If truthy and smaller than the number of usable rows, a
            random subset of that many rows is drawn with a fixed seed (42).
            ``None`` (the default) or ``0`` keeps every row.
        max_length: Truncation limit forwarded to the tokenizer.

    Raises:
        KeyError: If the CSV has no ``joke`` column.
        ValueError: If no non-empty joke remains after dropping missing rows.
    """

    def __init__(self, file_path, tokenizer, num_samples=None, max_length=512):
        frame = pd.read_csv(file_path)
        # Empty CSV cells are parsed as NaN (a float), which a text tokenizer
        # cannot process. Drop them before sampling so that `num_samples`
        # counts usable jokes only. With no missing rows this is a no-op and
        # the seeded sample is the same as before.
        frame = frame.dropna(subset=["joke"])
        if num_samples and num_samples < len(frame):
            frame = frame.sample(n=num_samples, random_state=42)
        self.data = frame.reset_index(drop=True)

        # keep the raw texts around; astype(str) guards against non-string cells
        self.jokes = self.data["joke"].astype(str).tolist()
        if not self.jokes:
            raise ValueError(f"No jokes found in the 'joke' column of {file_path!r}")

        self.tokenizer = tokenizer
        # tokenize everything once; the padded tensors are indexed in __getitem__
        self.inputs = self.tokenizer(
            self.jokes,
            return_tensors='pt',
            truncation=True,
            padding=True,
            max_length=max_length,
        )

    def __len__(self):
        """Return the number of jokes held by the dataset."""
        return len(self.jokes)

    def __getitem__(self, idx):
        """Return the token ids and attention mask of the joke at ``idx``."""
        return {
            # tokenized text
            'input_ids': self.inputs['input_ids'][idx],
            # marks which positions hold real tokens and which hold padding
            'attention_mask': self.inputs['attention_mask'][idx],
        }

# Settings for this cell, named in one place so the data location and sizes
# can be changed without hunting through the calls below.
# NOTE: the path is machine-specific; point it at your own copy of Jokes.csv.
JOKES_CSV_PATH = "/mnt/d/Workspace/Prodigy_InfoTech_Internship/task1-Text_Genration_with_gpt2/data/Jokes.csv"
NUM_TRAIN_SAMPLES = 20000
LOADER_BATCH_SIZE = 2

# Load the tokenizer and reuse the end-of-sequence token as the pad token,
# so that the padded tokenization inside JokesDataset has a token to pad with
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
tokenizer.pad_token = tokenizer.eos_token

# Create a dataset instance (reads, samples and tokenizes the jokes)
dataset = JokesDataset(file_path=JOKES_CSV_PATH, tokenizer=tokenizer, num_samples=NUM_TRAIN_SAMPLES)

# Create a data loader that yields shuffled mini-batches from the dataset
data_loader = DataLoader(dataset, batch_size=LOADER_BATCH_SIZE, shuffle=True)



In [3]:
# Sanity check: pull a single batch from the loader and show the tensor shapes
batch = next(iter(data_loader), None)
if batch is not None:
    input_ids, attention_mask = batch['input_ids'], batch['attention_mask']
    print(input_ids.shape, attention_mask.shape)

torch.Size([2, 512]) torch.Size([2, 512])


In [4]:
# Load pretrained gpt2-medium (~355M parameters; the 124M figure belongs to
# the smaller base `gpt2` checkpoint)
gpt2_medium = GPT2LMHeadModel.from_pretrained('gpt2-medium')

# Initialize training arguments
training_args = TrainingArguments(
    output_dir='./results',           # checkpoints are written here
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=10_000,                # write a checkpoint every 10k optimizer steps
    save_total_limit=2,               # keep only the two most recent checkpoints
    logging_dir='./logs',
    logging_steps=200,                # report the training loss every 200 steps
    # fp16 mixed precision needs a CUDA device; fall back to full precision
    # when none is present so this cell still runs on a CPU-only machine
    fp16=torch.cuda.is_available(),
)

# Create data collator; mlm=False selects causal (next-token) language
# modeling rather than masked language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Initialize trainer with the model, the arguments above and the jokes dataset
trainer = Trainer(
    model=gpt2_medium,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [5]:
# Train model: runs fine-tuning with the settings held by `trainer`.
# The call is left as the last expression of the cell so that the returned
# TrainOutput (global step, average training loss, runtime metrics) is displayed.
trainer.train()

Step,Training Loss
200,3.72
400,3.4209
600,3.4825
800,3.3994
1000,3.4371
1200,3.3347
1400,3.4131
1600,3.4102
1800,3.3502
2000,3.4434


TrainOutput(global_step=30000, training_loss=2.641936144002279, metrics={'train_runtime': 9202.4373, 'train_samples_per_second': 6.52, 'train_steps_per_second': 3.26, 'total_flos': 5.572204167168e+16, 'train_loss': 2.641936144002279, 'epoch': 3.0})