In [None]:
from datasets import load_dataset, Dataset
import torch
from transformers import AutoTokenizer,GPT2LMHeadModel, get_scheduler 
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm

Parameters

In [None]:
# Training hyper-parameters: epoch count, AdamW learning rate, micro-batch size.
num_epochs, lr, batch_size = 1, 5e-5, 1

# Directory the tokenizer and model checkpoints are loaded from.
save_loc = '/home/arjun/Documents/ModelSaves/GPT2Alpaca'

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

Dataset preparation and tokenising

In [None]:
# Load the "tatsu-lab/alpaca" dataset through `datasets.load_dataset`.
dataset = load_dataset(path="tatsu-lab/alpaca")

In [None]:
# Keep only the "train" split.
# `load_dataset` without `split=` returns a dict-like mapping of split name ->
# dataset; once the split has been selected, `dataset` is no longer a mapping.
# The guard makes this cell safe to re-run instead of failing on the second run.
if isinstance(dataset, dict):
    dataset = dataset['train']
dataset

In [None]:
# Making dataset smaller for fast training.
# The subset size is capped at the dataset length so the cell also works when
# fewer than 100 examples are available (selecting a missing index would fail).
dataset = dataset.select(range(min(100, len(dataset))))

In [None]:
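# Alternative starting point, kept for reference: load the base "gpt2" weights
# and a tokenizer extended with explicit BOS / EOS / PAD special tokens.
# Presumably used for the very first run, before anything exists at `save_loc`
# (the next cell loads from `save_loc` instead) -- confirm before deleting.
# model = GPT2LMHeadModel.from_pretrained("gpt2")
# tokenizer = AutoTokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')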

In [None]:
# Restore the model and its matching tokenizer from the directory in `save_loc`.
model = GPT2LMHeadModel.from_pretrained(save_loc)
tokenizer = AutoTokenizer.from_pretrained(save_loc)

In [None]:
# Tokenise every example into a fixed-length sequence of 768 token ids.
# Each text has its '###' markers stripped and is wrapped in the
# start-of-text / end-of-text markers before being encoded; shorter texts are
# padded up to 768 tokens and longer ones are truncated.
encoded_examples = [
    tokenizer(
        '<|startoftext|>' + example['text'].replace('###', '') + '<|endoftext|>',
        truncation=True,
        max_length=768,
        padding="max_length",
    )
    for example in dataset
]

# Column-oriented dataset holding only what the model consumes.
new_dataset = Dataset.from_dict({
    'input_ids': [encoded['input_ids'] for encoded in encoded_examples],
    'attention_mask': [encoded['attention_mask'] for encoded in encoded_examples],
})
# Make indexing return torch tensors.
new_dataset.set_format("torch")

DataLoader

In [None]:
# Shuffled mini-batches of `batch_size` tokenised examples.
dataloader = DataLoader(
    dataset=new_dataset,
    batch_size=batch_size,
    shuffle=True,
)

Optimiser and scheduler

In [None]:
# Resize the embedding matrix to the tokenizer's vocabulary *before* building
# the optimizer. When the sizes differ, resizing allocates new embedding
# weights; an optimizer created earlier would keep references to the old
# tensors and never update the new ones. When the sizes already match this call
# changes nothing, so a later resize with the same size is harmless.
model.resize_token_embeddings(len(tokenizer))

optimizer = AdamW(model.parameters(), lr=lr)

# Number of micro-batches processed over the whole run.
num_training_steps = num_epochs * len(dataloader)

# Linear decay from `lr` to 0 over `num_training_steps` scheduler steps, no warm-up.
lr_scheduler = get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

In [None]:
# Make the embedding table as large as the tokenizer's vocabulary, then move
# the model to the selected device (the returned model is shown as cell output).
model.resize_token_embeddings(new_num_tokens=len(tokenizer))
model.to(device=device)

Gradient accumulation for training loop

In [None]:
# Gradient accumulation
# ---------------------
# Instead of one optimizer step per large batch (say 8 examples), run several
# small micro-batches (say batch_size=1), sum their gradients, and step once
# every `gradient_accumulation_steps` micro-batches.
#
# This reduces the memory footprint a lot while giving (almost) the same result
# as the larger batch, at the cost of a longer training time because the
# micro-batches are processed one after another.
#
# Probably most useful for very large LLMs.

from torch import nn

# Define the gradient accumulation step size
gradient_accumulation_steps = 8  # Choose an appropriate value based on your GPU memory

# The optimizer -- and therefore the LR scheduler -- steps once per accumulation
# window, not once per micro-batch. Size the schedule in optimizer updates;
# a schedule sized in micro-batches would barely decay before training ends.
updates_per_epoch = (len(dataloader) + gradient_accumulation_steps - 1) // gradient_accumulation_steps
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_epochs * updates_per_epoch,
)

# The bar advances once per micro-batch, i.e. `num_training_steps` times in total.
progress_bar = tqdm(total=num_training_steps, desc='Training', unit='steps')
model.train()
ep = 0
total_train_loss = 0
optimizer.zero_grad()  # Start from clean gradients; later resets happen after each optimizer step

for epoch in range(num_epochs):
    num_batches = len(dataloader)
    for i, batch in enumerate(dataloader):
        batch_data = batch['input_ids'].to(device)
        attention = batch['attention_mask'].to(device)

        # Sequences are padded to a fixed length, so most positions can be
        # padding. Label those positions -100 (the index the LM loss ignores)
        # so the model is not trained to predict padding tokens.
        labels = batch_data.masked_fill(attention == 0, -100)

        outputs = model(batch_data,
                        labels=labels,
                        attention_mask=attention,
                        )

        loss = outputs[0]
        batch_loss = loss.item()
        total_train_loss += batch_loss

        # Scale so that the summed gradients of a full window equal the
        # gradient of the window's mean loss, then accumulate.
        (loss / gradient_accumulation_steps).backward()

        progress_bar.update(1)

        window_complete = (i + 1) % gradient_accumulation_steps == 0
        last_batch = (i + 1) == num_batches
        # Also step on the last batch of the epoch: otherwise the gradients of
        # a trailing partial window would be dropped (or leak into the next
        # epoch). A partial window keeps the 1/gradient_accumulation_steps
        # scaling, so its update is proportionally smaller.
        if window_complete or last_batch:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

    avg_train_loss = total_train_loss / len(dataloader)
    ep = epoch + 1
    print('Epoch:', ep, 'Average training loss =', avg_train_loss)
    total_train_loss = 0  # Reset the total training loss for the next epoch

progress_bar.close()


In [None]:
question = 'Write a poem on cow'

# NOTE(review): training examples are wrapped in '<|startoftext|>' ... '<|endoftext|>'
# but this prompt is not -- confirm whether the prompt should start with the BOS marker.
prompt = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
 Instruction:{question}
 Response: """

# Switch off dropout for inference; the training cell leaves the model in train mode.
model.eval()

# Encode the prompt as a batch of one and keep the attention mask so that
# `generate` does not have to guess which positions are real tokens.
encoded_prompt = tokenizer(prompt, return_tensors="pt").to(device)
generated = encoded_prompt["input_ids"]

sample_outputs = model.generate(
                                generated,
                                attention_mask=encoded_prompt["attention_mask"],
                                do_sample=True,
                                top_k=100,
                                # Only `max_new_tokens` is passed: it bounds the generated
                                # continuation independently of the prompt length, and
                                # combining it with `max_length` is contradictory.
                                max_new_tokens=200,
                                top_p=.95,
                                num_return_sequences= 5,
                                temperature = .9,
                                pad_token_id=tokenizer.pad_token_id,
                                )

for i, sample_output in enumerate(sample_outputs):
    # Split once on the prompt's response marker; everything after it is the answer,
    # even if the generated text happens to contain the marker again.
    ans = tokenizer.decode(sample_output, skip_special_tokens=True).split('Response: ', 1)
    print("\n\n-------------------------------------------------------------------------------------------------------------------------------------------")
    if len(ans) > 1:
        print(f'<-{i+1}-> {ans[1]}')
    else:
        # The marker was not found in the decoded text.
        print(f'<-{i+1}-> ___No response___')