# Imports and Initializations

In [2]:
from datasets import load_dataset, Dataset
import torch
from transformers import AutoTokenizer, GPT2LMHeadModel, get_scheduler
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
from torch.cuda.amp import autocast, GradScaler
torch.backends.cuda.matmul.allow_tf32 = True
# import wandb
# Parameters
num_epochs = 20
lr = 5e-5                                           
batch_size = 8
warmup_steps= 750

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


# Initialising model

In [3]:
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')

optimizer = AdamW(model.parameters(), lr=lr)

# Dataset

In [4]:
dataset = load_dataset("MuskumPillerum/General-Knowledge")
dataset = dataset['train']

Downloading readme: 100%|██████████| 1.90k/1.90k [00:00<00:00, 10.9MB/s]
Downloading data: 100%|██████████| 16.2M/16.2M [00:06<00:00, 2.45MB/s]
Generating train split: 37635 examples [00:00, 335657.41 examples/s]


In [5]:
new_dataset = {'input_ids': [], 'attention_mask': []}

data_format = 'Question:' + 'Answer: '
ct = 0
for example in dataset:
    try:
        input_text =  'Question:'+ example['Question']+ ' Answer: ' + example['Answer']
        encoded_data = tokenizer('' + input_text + '', truncation=True, max_length=768, padding="max_length")
        new_dataset['input_ids'].append(encoded_data['input_ids'])
        new_dataset['attention_mask'].append(encoded_data['attention_mask'])
    except:
        ct += 1

new_dataset = Dataset.from_dict(new_dataset)
new_dataset.set_format("torch")

# DataLoader
dataloader = DataLoader(new_dataset, shuffle=True, batch_size=batch_size, pin_memory=True)
ct

12

In [6]:
num_training_steps = num_epochs * len(dataloader)
lr_scheduler = get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps)
model.resize_token_embeddings(len(tokenizer))
model.to(device)
model = torch.compile(model)
scaler = GradScaler()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

# Training loop

In [7]:
save_loc = 'Your save location'

In [8]:
progress_bar = tqdm(range(num_training_steps-1), desc='Training', unit='steps')
model.train()
ep = 0
prev_avg_train_loss = 999
for epoch in range(num_epochs):
    total_train_loss = 0
    for batch in dataloader:
        batch_data = batch['input_ids'].to(device)
        attention = batch['attention_mask'].to(device)

        optimizer.zero_grad()

        with autocast():
            outputs = model(batch_data,
                            labels=batch_data,
                            attention_mask=attention,
                            token_type_ids=None
                            )

            loss = outputs[0]
            batch_loss = loss.item()
            total_train_loss += batch_loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        lr_scheduler.step()
        progress_bar.update(1)

    avg_train_loss = total_train_loss / len(dataloader)
    ep += 1
    print('Epoch:', ep, 'Average training loss =', avg_train_loss)
    if abs(prev_avg_train_loss - avg_train_loss) < 0.0001:
        model.save_pretrained(save_loc)
        tokenizer.save_pretrained(save_loc)
        print("Loss is very small")
        break
    prev_avg_train_loss = avg_train_loss
    model.save_pretrained(save_loc)
    tokenizer.save_pretrained(save_loc)


Training:   0%|          | 96/94059 [00:46<4:08:19,  6.31steps/s] 

KeyboardInterrupt: 