In [None]:
from datasets import load_dataset, Dataset
import torch
from transformers import AutoTokenizer, GPT2LMHeadModel, get_scheduler
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
from torch.cuda.amp import autocast, GradScaler
torch.backends.cuda.matmul.allow_tf32 = True
import wandb
# Parameters
num_epochs = 30
lr = 5e-5
batch_size = 12
warmup_steps= 750
save_loc1 = '/home/arjun/Documents/ModelSaves/GPT2Alpaca-chat'
save_loc2 = '/home/arjun/Documents/ModelSaves/GPT2AlpacaNew-chat'
save_loc3 = '/home/arjun/Documents/ModelSaves/GPT2Alpaca-midEpoch-chat'

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

In [2]:
dataset = load_dataset("MuskumPillerum/General-Knowledge")
dataset = dataset['train']

# Making dataset smaller for fast training
dataset

Dataset({
    features: ['Question', 'Answer'],
    num_rows: 37635
})

In [3]:
# model = GPT2LMHeadModel.from_pretrained("gpt2")
# tokenizer = AutoTokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')

tokenizer = AutoTokenizer.from_pretrained(save_loc2)
model =   GPT2LMHeadModel.from_pretrained(save_loc2)

optimizer = AdamW(model.parameters(), lr=lr)

In [4]:
new_dataset = {'input_ids': [], 'attention_mask': []}

ct = 0
for example in dataset:
    try:
        input_text =  'Read the question and give an honest answer. Your answers should not include any unethical, racist, sexist, dangerous, or illegal content. If the question is wrong, or does not make sense, accept it instead of giving the wrong answer.\nQuestion:'+ example['Question']+ ' Answer: ' + example['Answer']
        encoded_data = tokenizer('' + input_text + '', truncation=True, max_length=768, padding="max_length")
        new_dataset['input_ids'].append(encoded_data['input_ids'])
        new_dataset['attention_mask'].append(encoded_data['attention_mask'])
    except:
        ct += 1

new_dataset = Dataset.from_dict(new_dataset)
new_dataset.set_format("torch")

# DataLoader
dataloader = DataLoader(new_dataset, shuffle=True, batch_size=batch_size, pin_memory=True)
ct

12

In [5]:
# Optimizer and scheduler
num_training_steps = num_epochs * len(dataloader)
lr_scheduler = get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps)
model.resize_token_embeddings(len(tokenizer))
model.to(device)
model = torch.compile(model)
scaler = GradScaler()

In [6]:
# Training loop
progress_bar = tqdm(range(num_training_steps-1), desc='Training', unit='steps')
model.train()
ep = 0
prev_avg_train_loss = 999
for epoch in range(num_epochs):
    total_train_loss = 0
    for batch in dataloader:
        batch_data = batch['input_ids'].to(device)
        attention = batch['attention_mask'].to(device)

        optimizer.zero_grad()

        with autocast():
            outputs = model(batch_data,
                            labels=batch_data,
                            attention_mask=attention,
                            token_type_ids=None
                            )

            loss = outputs[0]
            batch_loss = loss.item()
            total_train_loss += batch_loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        lr_scheduler.step()
        progress_bar.update(1)

    avg_train_loss = total_train_loss / len(dataloader)
    ep += 1
    print('Epoch:', ep, 'Average training loss =', avg_train_loss)
    if abs(prev_avg_train_loss - avg_train_loss) < 0.0001:
        model.save_pretrained(save_loc1)
        tokenizer.save_pretrained(save_loc1)
        print("Loss is very small")
        break
    prev_avg_train_loss = avg_train_loss
    model.save_pretrained(save_loc3)
    tokenizer.save_pretrained(save_loc3)


Training:   0%|          | 0/94079 [00:00<?, ?steps/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av



Epoch: 1 Average training loss = 0.15495460552320225
Epoch: 2 Average training loss = 0.1495365913130571
Epoch: 3 Average training loss = 0.14217235956445565
Epoch: 4 Average training loss = 0.13529891558395393
Epoch: 5 Average training loss = 0.12893614050224234
Epoch: 6 Average training loss = 0.12308988953009248
Epoch: 7 Average training loss = 0.11768362518371445
Epoch: 8 Average training loss = 0.11275354474402811
Epoch: 9 Average training loss = 0.1080466335727263
Epoch: 10 Average training loss = 0.10377367872896852
Epoch: 11 Average training loss = 0.09973031613852221
Epoch: 12 Average training loss = 0.09599598706581117
Epoch: 13 Average training loss = 0.09247008363516736
Epoch: 14 Average training loss = 0.08935188474988907
Epoch: 15 Average training loss = 0.08631168904224867
Epoch: 16 Average training loss = 0.08360245769453824
Epoch: 17 Average training loss = 0.08107709017230616
Epoch: 18 Average training loss = 0.07864251712453552
Epoch: 19 Average training loss = 0.076

In [9]:
model.save_pretrained(save_loc2)
tokenizer.save_pretrained(save_loc2)

('/home/arjun/Documents/ModelSaves/GPT2AlpacaNew-chat/tokenizer_config.json',
 '/home/arjun/Documents/ModelSaves/GPT2AlpacaNew-chat/special_tokens_map.json',
 '/home/arjun/Documents/ModelSaves/GPT2AlpacaNew-chat/vocab.json',
 '/home/arjun/Documents/ModelSaves/GPT2AlpacaNew-chat/merges.txt',
 '/home/arjun/Documents/ModelSaves/GPT2AlpacaNew-chat/added_tokens.json',
 '/home/arjun/Documents/ModelSaves/GPT2AlpacaNew-chat/tokenizer.json')

In [8]:
bug

NameError: name 'bug' is not defined

In [1]:
import torch
from transformers import AutoTokenizer, GPT2LMHeadModel
# torch.backends.cuda.matmul.allow_tf32 = True
save_loc2 = '/home/arjun/Documents/ModelSaves/GPT2AlpacaNew-chat'

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(save_loc2)
model = GPT2LMHeadModel.from_pretrained(save_loc2)
model.to(device)
print(device)



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/arjun/NewPytorchEnv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so
ERROR: /home/arjun/NewPytorchEnv/bin/python3.10: undefined symbol: cudaRuntimeGetVersion
CUDA SETUP: libcudart.so path is None
CUDA SETUP: Is seems that your cuda installation is not in your path. See https://github.com/TimDettmers/bitsandbytes/issues/85 for more information.
CUDA SETUP: CUDA version lower than 11 are currently not supported for LLM.int8(). You will be only to use 8-bit optimizers and quantization routines!!
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 00
CUDA SETUP: Loading binary /home/arjun/NewPytorchEnv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


cuda


In [2]:
qn = 'Who is the king of the jungle?'
prompt = 'Read the question and give an honest answer. Your answers should not include any unethical, racist, sexist, dangerous, or illegal content.If the question is wrong, or does not make sence, accept it instead of giving wrong answer.\nQuestion: '+qn+ ' Answer: ' 
print(prompt)
generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0).to(device)

sample_outputs = model.generate(
                                generated, 
                                do_sample=True,   
                                top_k=8,
                                max_length=500,
                                top_p=.3, 
                                num_return_sequences= 5,
                                temperature = .9,
                                )

for i, sample_output in enumerate(sample_outputs):
    ans = tokenizer.decode(sample_output, skip_special_tokens=True).split('Response: ')
    print("\n\n-------------------------------------------------------------------------------------------------------------------------------------------")
    try:        print(f'<-{i+1}-> {ans[1]}')
    except:
        print(f'<-{i+1}-> ___No response___')

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Read the question and give an honest answer. Your answers should not include any unethical, racist, sexist, dangerous, or illegal content.If the question is wrong, or does not make sence, accept it instead of giving wrong answer.
Question: Who is the king of the jungle? Answer: 


-------------------------------------------------------------------------------------------------------------------------------------------
<-1-> ___No response___


-------------------------------------------------------------------------------------------------------------------------------------------
<-2-> ___No response___


-------------------------------------------------------------------------------------------------------------------------------------------
<-3-> ___No response___


-------------------------------------------------------------------------------------------------------------------------------------------
<-4-> ___No response___


------------------------------------------------------