Imports

In [None]:
save_loc = '/home/arjun/Documents/ModelSaves/GPT2Alpaca'

In [None]:
from datasets import load_dataset, Dataset
import torch
from transformers import AutoTokenizer,GPT2LMHeadModel, get_scheduler 
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm

Dataset preparation and tokenising

In [None]:
dataset = load_dataset("tatsu-lab/alpaca")

In [None]:
dataset['train'].features

In [None]:
dataset = dataset['train']
dataset

In [None]:
dataset[0]

In [None]:
# Making dataset smaller for fast training
dataset = dataset.select(range(1000))

For loading model and tokeniser from scratch

In [None]:
# model = GPT2LMHeadModel.from_pretrained("gpt2")
# tokenizer = AutoTokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')

For loading tokeniser from save

In [None]:
tokenizer = AutoTokenizer.from_pretrained(save_loc)
model =   GPT2LMHeadModel.from_pretrained(save_loc)

In [None]:
new_dataset = {'input_ids':[], 'attention_mask':[]}

for example in dataset:
    input_text = example['text'].replace('\n', '').replace('###','\n')   
    encoded_data = tokenizer('<|startoftext|>' + input_text + '<|endoftext|>',truncation=True, max_length=768, padding="max_length")
    new_dataset['input_ids'].append(encoded_data['input_ids'])
    new_dataset['attention_mask'].append(encoded_data['attention_mask'])

new_dataset = Dataset.from_dict(new_dataset)
new_dataset.set_format("torch")

In [None]:
new_dataset

DataLoader

In [None]:
dataloader = DataLoader(new_dataset,shuffle=True, batch_size=8)

Optimiser and scheduler

In [None]:
num_epochs = 14

optimizer = AdamW(model.parameters(), lr=5e-5)
num_training_steps = num_epochs * len(dataloader)
lr_scheduler = get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

In [None]:
model.resize_token_embeddings(len(tokenizer))

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# device = torch.device("cpu")
model.to(device)
print(device)


In [18]:
progress_bar = tqdm(range(num_training_steps-1),desc='Training', unit='steps')
model.train()   # Some layers behave differently to training and inference. This sets all those 
                 # layers into training mode
ep = 0
for epoch in range(num_epochs):
    total_train_loss=0
    for batch in dataloader:
        # print(batch)
        batch_data = batch['input_ids'].to(device)
        attention = batch['attention_mask'].to(device)
        # batch_data = batch_data.flatten()  # Flatten the tensor to 1-dimensional
        # outputs = model.generate(batch_data.unsqueeze(0))
        model.zero_grad()  
        outputs = model(  batch_data,
                          labels=batch_data, 
                          attention_mask = attention,
                          token_type_ids=None
                        )

        # metric.add_batch(predictions=predictions, references=batch["input_ids"])
        loss = outputs[0]     # compute loss 
        batch_loss = loss.item()
        total_train_loss += batch_loss
        loss.backward()          # computes gradients
        optimizer.step()         # optimises
        lr_scheduler.step()      # updates lr according to schedule. Improves performance
        optimizer.zero_grad()    # resets the gradients
        progress_bar.update(1)   # updates progress bar by 1
    avg_train_loss = total_train_loss / len(dataloader)
    ep += 1
    print('Epoch:',ep,'Average training loss = ',avg_train_loss)


Training:   0%|          | 0/1749 [00:00<?, ?steps/s]

Epoch: 1 Average training loss =  0.27909325844049454
Epoch: 2 Average training loss =  0.23163198763132095
Epoch: 3 Average training loss =  0.21218926632404328
Epoch: 4 Average training loss =  0.1976565789580345
Epoch: 5 Average training loss =  0.18594333910942076
Epoch: 6 Average training loss =  0.17401826828718187
Epoch: 7 Average training loss =  0.16392649263143538
Epoch: 8 Average training loss =  0.15621806234121322
Epoch: 9 Average training loss =  0.14885319823026658
Epoch: 10 Average training loss =  0.14225157314538955
Epoch: 11 Average training loss =  0.13674755388498305
Epoch: 12 Average training loss =  0.13316831189393996
Epoch: 13 Average training loss =  0.1299300209879875
Epoch: 14 Average training loss =  0.1280406168103218


Saving model and tokeniser

In [19]:
model.save_pretrained(save_loc)
tokenizer.save_pretrained(save_loc)

('/home/arjun/Documents/ModelSaves/GPT2Alpaca.pt/tokenizer_config.json',
 '/home/arjun/Documents/ModelSaves/GPT2Alpaca.pt/special_tokens_map.json',
 '/home/arjun/Documents/ModelSaves/GPT2Alpaca.pt/vocab.json',
 '/home/arjun/Documents/ModelSaves/GPT2Alpaca.pt/merges.txt',
 '/home/arjun/Documents/ModelSaves/GPT2Alpaca.pt/added_tokens.json',
 '/home/arjun/Documents/ModelSaves/GPT2Alpaca.pt/tokenizer.json')

In [55]:
question = 'Write a poem on cow'

prompt = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
 Instruction:{question}
 Response: """
generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0).to(device)

sample_outputs = model.generate(
                                generated, 
                                do_sample=True,   
                                top_k=100,
                                max_length=100,
                                max_new_tokens=200,
                                top_p=.95, 
                                num_return_sequences= 5,
                                temperature = .9,
                                )

for i, sample_output in enumerate(sample_outputs):
    ans = tokenizer.decode(sample_output, skip_special_tokens=True).split('Response: ')
    print("\n\n-------------------------------------------------------------------------------------------------------------------------------------------")
    try:        print(f'<-{i+1}-> {ans[1]}')
    except:
        print(f'<-{i+1}-> ___No response___')

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=200) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)




-------------------------------------------------------------------------------------------------------------------------------------------
<-1->            ‘Duck is king                                                                                                                                                                                        


-------------------------------------------------------------------------------------------------------------------------------------------
<-2-> _______Do you remember when it was all sweet and sweet? When the cows came home, they were all so happy. _______But when they came home, they were all so sad. So why did they come back? They were not giving back what they had given them. And that's when they began to cry. They began to speak, telling stories, sharing food and companionship. Then they were like this: they were going to go back to their old ways and do what they loved. And that's when they were free to do whatever they wanted