In [3]:
from datasets import load_dataset, Dataset
import torch
from transformers import AutoTokenizer, GPT2LMHeadModel, get_scheduler
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
from torch.cuda.amp import autocast, GradScaler
torch.backends.cuda.matmul.allow_tf32 = True

# Parameters
num_epochs = 10
lr = 5e-5
batch_size = 8
save_loc = '/home/arjun/Documents/ModelSaves/GPT2Alpaca'
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Dataset preparation and tokenizing
dataset = load_dataset("tatsu-lab/alpaca")
dataset = dataset['train']

# Making dataset smaller for fast training
dataset = dataset.select(range(100))

tokenizer = AutoTokenizer.from_pretrained(save_loc)
model = GPT2LMHeadModel.from_pretrained(save_loc)

new_dataset = {'input_ids': [], 'attention_mask': []}

for example in dataset:
    input_text = example['text'].replace('###', '')
    encoded_data = tokenizer('' + input_text + '', truncation=True, max_length=768, padding="max_length")
    new_dataset['input_ids'].append(encoded_data['input_ids'])
    new_dataset['attention_mask'].append(encoded_data['attention_mask'])

new_dataset = Dataset.from_dict(new_dataset)
new_dataset.set_format("torch")

# DataLoader
dataloader = DataLoader(new_dataset, shuffle=True, batch_size=batch_size)

# Optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=lr)
num_training_steps = num_epochs * len(dataloader)
lr_scheduler = get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
model.resize_token_embeddings(len(tokenizer))
model.to(device)

# Mixed precision training setup
scaler = GradScaler()

# Training loop
progress_bar = tqdm(range(num_training_steps-1), desc='Training', unit='steps')
model.train()
ep = 0

for epoch in range(num_epochs):
    total_train_loss = 0
    for batch in dataloader:
        batch_data = batch['input_ids'].to(device)
        attention = batch['attention_mask'].to(device)

        # Zero the gradients before forward pass
        optimizer.zero_grad()

        # Use autocast to perform forward pass in TF16 (mixed precision)
        with autocast():
            outputs = model(batch_data,
                            labels=batch_data,
                            attention_mask=attention,
                            token_type_ids=None
                            )

            loss = outputs[0]
            batch_loss = loss.item()
            total_train_loss += batch_loss

        # Use scaler to scale the loss and perform backward pass in TF32
        scaler.scale(loss).backward()

        # Update parameters using optimizer step and scaler
        scaler.step(optimizer)
        scaler.update()

        lr_scheduler.step()
        progress_bar.update(1)

    avg_train_loss = total_train_loss / len(dataloader)
    ep += 1
    print('Epoch:', ep, 'Average training loss =', avg_train_loss)
    total_train_loss = 0  # Reset the total training loss for the next epoch

# Rest of the code for text generation...


cuda


Using custom data configuration tatsu-lab--alpaca-2b32f0433506ef5f
Found cached dataset parquet (/home/arjun/.cache/huggingface/datasets/tatsu-lab___parquet/tatsu-lab--alpaca-2b32f0433506ef5f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Training:   0%|          | 0/129 [00:00<?, ?steps/s]

Epoch: 1 Average training loss = 57.84068159415172
Epoch: 2 Average training loss = 2.04199204995082
Epoch: 3 Average training loss = 0.7538362099574163
Epoch: 4 Average training loss = 0.3766096807443179
Epoch: 5 Average training loss = 0.3399999634577678
Epoch: 6 Average training loss = 0.3195021588068742
Epoch: 7 Average training loss = 0.3052640568751555
Epoch: 8 Average training loss = 0.3029276132583618
Epoch: 9 Average training loss = 0.2995922817633702
Epoch: 10 Average training loss = 0.286622547186338
