In [None]:
from datasets import load_dataset, Dataset
import torch
from transformers import AutoTokenizer, GPT2LMHeadModel, get_scheduler
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
from torch.cuda.amp import autocast, GradScaler
torch.backends.cuda.matmul.allow_tf32 = True
import bitsandbytes as bnb

# Parameters
num_epochs = 20
lr = 5e-5
batch_size = 14
warmup_steps= 750
save_loc1 = '/home/arjun/Documents/ModelSaves/GPT2Alpaca'
save_loc2 = '/home/arjun/Documents/ModelSaves/GPT2AlpacaNew'
save_loc3 = '/home/arjun/Documents/ModelSaves/GPT2Alpaca-midEpoch'

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

In [None]:
dataset = load_dataset("tatsu-lab/alpaca")
dataset = dataset['train']

# Making dataset smaller for fast training
dataset = dataset.select(range(52000))

In [None]:
tokenizer = AutoTokenizer.from_pretrained(save_loc1)
model = GPT2LMHeadModel.from_pretrained(save_loc1)

optimizer = AdamW(model.parameters(), lr=lr)

In [4]:
new_dataset = {'input_ids': [], 'attention_mask': []}

for example in dataset:
    input_text = example['text'].replace('###', '')
    encoded_data = tokenizer('' + input_text + '', truncation=True, max_length=768, padding="max_length")
    new_dataset['input_ids'].append(encoded_data['input_ids'])
    new_dataset['attention_mask'].append(encoded_data['attention_mask'])

new_dataset = Dataset.from_dict(new_dataset)
new_dataset.set_format("torch")

# DataLoader
dataloader = DataLoader(new_dataset, shuffle=True, batch_size=batch_size, pin_memory=True)

In [5]:
# Optimizer and scheduler
num_training_steps = num_epochs * len(dataloader)
lr_scheduler = get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps)
model.resize_token_embeddings(len(tokenizer))
model.to(device)
model = torch.compile(model)
scaler = GradScaler()

In [6]:
# Training loop
progress_bar = tqdm(range(num_training_steps-1), desc='Training', unit='steps')
model.train()
ep = 0
prev_avg_train_loss = 200
for epoch in range(num_epochs):
    total_train_loss = 0
    for batch in dataloader:
        batch_data = batch['input_ids'].to(device)
        attention = batch['attention_mask'].to(device)

        optimizer.zero_grad()

        with autocast():
            outputs = model(batch_data,
                            labels=batch_data,
                            attention_mask=attention,
                            token_type_ids=None
                            )

            loss = outputs[0]
            batch_loss = loss.item()
            total_train_loss += batch_loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        lr_scheduler.step()
        progress_bar.update(1)

    avg_train_loss = total_train_loss / len(dataloader)
    ep += 1
    print('Epoch:', ep, 'Average training loss =', avg_train_loss)
    if abs(prev_avg_train_loss - avg_train_loss) < 0.0001:
        model.save_pretrained(save_loc2)
        tokenizer.save_pretrained(save_loc2)
        print("Loss is very small")
        break
    prev_avg_train_loss = avg_train_loss
    model.save_pretrained(save_loc3)
    tokenizer.save_pretrained(save_loc3)


Training:   0%|          | 0/74299 [00:00<?, ?steps/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av



Epoch: 1 Average training loss = 1.0319024256990863
Epoch: 2 Average training loss = 0.2395178459414242
Epoch: 3 Average training loss = 0.22346324833012204
Epoch: 4 Average training loss = 0.21141124311157705
Epoch: 5 Average training loss = 0.20068976198038535
Epoch: 6 Average training loss = 0.1915696698655832
Epoch: 7 Average training loss = 0.18382316657102604
Epoch: 8 Average training loss = 0.1772697814681328
Epoch: 9 Average training loss = 0.17136197922247415
Epoch: 10 Average training loss = 0.1662357897829881
Epoch: 11 Average training loss = 0.1615352763662749
Epoch: 12 Average training loss = 0.1574526953725282
Epoch: 13 Average training loss = 0.15371499086252613
Epoch: 14 Average training loss = 0.15039131903712555
Epoch: 15 Average training loss = 0.14750854182652479
Epoch: 16 Average training loss = 0.14495826907867057
Epoch: 17 Average training loss = 0.14282140334915183
Epoch: 18 Average training loss = 0.1410154115243202
Epoch: 19 Average training loss = 0.139578124

In [7]:
model.save_pretrained(save_loc2)
tokenizer.save_pretrained(save_loc2)

('/home/arjun/Documents/ModelSaves/GPT2AlpacaNew/tokenizer_config.json',
 '/home/arjun/Documents/ModelSaves/GPT2AlpacaNew/special_tokens_map.json',
 '/home/arjun/Documents/ModelSaves/GPT2AlpacaNew/vocab.json',
 '/home/arjun/Documents/ModelSaves/GPT2AlpacaNew/merges.txt',
 '/home/arjun/Documents/ModelSaves/GPT2AlpacaNew/added_tokens.json',
 '/home/arjun/Documents/ModelSaves/GPT2AlpacaNew/tokenizer.json')

In [8]:
question = 'Give me tips to be healthy'

prompt = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
 Instruction:{question}
 Response: """
generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0).to(device)

sample_outputs = model.generate(
                                generated, 
                                do_sample=True,   
                                top_k=100,
                                max_length=100,
                                max_new_tokens=200,
                                top_p=.95, 
                                num_return_sequences= 5,
                                temperature = .9,
                                )

for i, sample_output in enumerate(sample_outputs):
    ans = tokenizer.decode(sample_output, skip_special_tokens=True).split('Response: ')
    print("\n\n-------------------------------------------------------------------------------------------------------------------------------------------")
    try:        print(f'<-{i+1}-> {ans[1]}')
    except:
        print(f'<-{i+1}-> ___No response___')

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=200) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)




-------------------------------------------------------------------------------------------------------------------------------------------
<-1-> 
1. Exercise regularly: Eating a healthy diet and getting plenty of rest are essential for proper health. 
2. Exercise smarter: Exercise has several benefits, such as reducing inflammation, improving energy levels, and improving digestion. 
3. Eat more fruits and vegetables: A healthy diet is one of the most nutrient-dense and essential foods we can eat. 
4. Exercise at night: Taking the time to enjoy the night is essential for developing good habits and energizing muscles. 
5. Stay informed: Not eating too much food and getting plenty of sleep are important for physical and mental health. 
6. Exercise at home: Getting enough of both physical and mental exercise helps to maintain physical and mental health.


-------------------------------------------------------------------------------------------------------------------------------------