In [None]:
import torch
import re
import polars as pl
from datasets import Dataset, NamedSplit, DatasetDict
from peft import LoraConfig, get_peft_model
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the FLAN-T5 small checkpoint and its tokenizer, then register the
# reasoning/answer markers as additional vocabulary entries.
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
tokenizer.add_tokens(['<think>', '</think>', '<answer>', '</answer>'])

# Show the input embedding layer before and after resizing it to the
# enlarged vocabulary so the change in size is visible.
print(model.get_input_embeddings())
model.resize_token_embeddings(len(tokenizer))
print(model.get_input_embeddings())

After applying the LoRA config, the number of trainable parameters drops to 86,016. I also convert the dtype of the parameters from float32 to bfloat16.\
**Training the model using float32 params runs out of GPU memory when using mps.**

Total params = 86,016\
bfloat16 = 2 bytes\
Params space = 86,016 x 2 = 172,032 bytes

In [None]:
# flan-t5 is an encoder-decoder model, so the adapter has to be created with the
# seq2seq task type. "QUESTION_ANS" selects PEFT's extractive-QA wrapper, whose
# forward is built around start/end positions rather than decoder labels.
lora_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    r=2,
    target_modules=["q", "v"])
LoRA_model = get_peft_model(model, lora_config)
# print_trainable_parameters() prints its own report and returns None, so
# wrapping it in print() only added a stray "None" line.
LoRA_model.print_trainable_parameters()

print("\n--- Changing Dtype of Trainable Parameters to bfloat16 ---")
# .to(dtype) casts the floating-point parameters of the whole wrapped model,
# i.e. the frozen base weights as well as the LoRA matrices.
LoRA_model = LoRA_model.to(torch.bfloat16)

# List only the parameters that will actually receive gradients.
for name, param in LoRA_model.named_parameters():
    if param.requires_grad:
        print(f"Parameter: {name}, Dtype: {param.dtype}, Requires Grad: {param.requires_grad}")
        

In [None]:
# Move the LoRA-wrapped model onto the Metal (MPS) device and report how much
# GPU memory the process holds afterwards.
mpsDevice = torch.device("mps")
LoRA_model.to(mpsDevice)

driver_bytes = torch.mps.driver_allocated_memory()
print(f"Total GPU memory allocated by Metal driver for the process: {driver_bytes} bytes")
tensor_bytes = torch.mps.current_allocated_memory()
print(f"Current GPU memory occupied by tensors: {tensor_bytes} bytes")

In [None]:
# Quick look at what the tokenizer produces for an arbitrary string: the full
# encoding (printed) and the bare list of ids (cell output).
sample_text = "amish kakka "
print(tokenizer(sample_text))
tokenizer.encode(sample_text)

In [None]:
# Build one hand-written (question, answer) pair as tensors on the MPS device
# so the model can be smoke-tested before any dataset is loaded.
text = "Answer the Question: President of USA?"
answer = "It is Donald Trump"

# With return_tensors='pt' the tokenizer already returns batched integer
# tensors, so they are moved to the device directly instead of being rebuilt
# through the legacy float-biased torch.Tensor(...) constructor.
encodedText = tokenizer(text, return_tensors='pt', padding=True)
tensorInput = encodedText['input_ids'].to(dtype=torch.long, device=mpsDevice)
tensorMask = encodedText['attention_mask'].to(dtype=torch.long, device=mpsDevice)

encodedAns = tokenizer(answer, return_tensors='pt', padding=True)
tensorAnswer = encodedAns['input_ids'].to(dtype=torch.long, device=mpsDevice)

# Labels for the seq2seq loss are the target token ids (padding replaced by
# -100 so it is ignored). The previous version stored the answer's attention
# mask here, which is a tensor of ones rather than token ids.
tensorLabel = tensorAnswer.clone()
tensorLabel[tensorLabel == tokenizer.pad_token_id] = -100

print(tensorInput)
print(tensorMask)
print(tensorAnswer)
print(tensorLabel)

In [None]:
# Smoke-test a forward pass on the single hand-written example.
# eval() is called on the PEFT wrapper (which contains the base model) so the
# adapter layers are switched to inference mode too.
LoRA_model.eval()
LoRA_model.to(device=mpsDevice)

# Only `labels` is supplied: T5 builds the right-shifted decoder inputs from
# the labels itself. Passing the unshifted answer ids as decoder_input_ids
# would let the decoder see the token it is asked to predict, and the labels
# must be the answer token ids, not the answer's attention mask.
with torch.no_grad():  # no gradients are needed for a sanity check
    tensorOutput = LoRA_model(input_ids=tensorInput,
                              attention_mask=tensorMask,
                              labels=tensorAnswer)
print(tensorOutput)
# tokenizer.decode(tensorOutput[0].tolist(), skip_special_tokens=True)

In [None]:
'''
    Open your terminal and login to huggingface using the command :- huggingface-cli login
    Enter your generated token from Hugging Face.
'''

# Relative parquet path of every split inside the dataset repository.
splits = {'train': 'data/train-00000-of-00001.parquet',
          'validation': 'data/validation-00000-of-00001.parquet',
          'test': 'data/test-00000-of-00001.parquet'}

# Read the three splits, in order, into polars DataFrames.
sciq_root = 'hf://datasets/allenai/sciq/'
df, val_df, test_df = (
    pl.read_parquet(sciq_root + splits[split_name])
    for split_name in ('train', 'validation', 'test')
)

In [None]:
# Preview the first five rows of the training split.
df.head(5)

In [None]:
def restructureData(example):
  '''
    Turn one raw row into a (prompt, target) pair of strings.

    Args:
      example: A single row of the original dataset, read by position
        (fields 0 through 5 are used).

    Returns:
      A tuple (re_input, re_output). re_input is field 0 followed by
      "Options:" and fields 1-4, one per line. re_output wraps field 5 in
      <think> tags and field 4 in <answer> tags, separated by a newline.
  '''
  # NOTE(review): field 4 is used both as the last listed option and as the
  # answer, so the answer always appears last in the prompt. Confirm this is intended.
  prompt_lines = [
      f"{example[0]}",
      f"Options: {example[1]}",
      f"{example[2]}",
      f"{example[3]}",
      f"{example[4]}",
  ]
  reasoning = f"<think>{example[5]}</think>"
  verdict = f"<answer>{example[4]}</answer>"
  return "\n".join(prompt_lines), "\n".join((reasoning, verdict))

# Apply the row formatter to every split; each result has two columns
# (prompt, target), one row per original row.
newData = df.map_rows(restructureData)
valData = val_df.map_rows(restructureData)
testData = test_df.map_rows(restructureData)

In [None]:
# Give the two generated columns the same readable names in every split.
for frame in (newData, valData, testData):
  frame.columns = ['questions', 'answers']

In [None]:
# Dataset.from_polars() was not usable here, so each polars frame is first
# converted to pandas. The next cell turns every pandas frame into its own
# Dataset (one per split) and combines them into a single DatasetDict.
Data_pd, valData_pd, testData_pd = (
    frame.to_pandas() for frame in (newData, valData, testData)
)

# Spot-check one formatted prompt.
Data_pd['questions'].iloc[2000]

In [None]:
# Wrap each pandas frame in a Dataset tagged with its split name and collect
# the three of them in one DatasetDict.
split_frames = {'train': Data_pd, 'validation': valData_pd, 'test': testData_pd}
dataset_dict = DatasetDict({
    split_name: Dataset.from_pandas(frame, split=NamedSplit(split_name))
    for split_name, frame in split_frames.items()
})

# Keep the per-split handles available under their usual names.
train_dataset = dataset_dict['train']
val_dataset = dataset_dict['validation']
test_dataset = dataset_dict['test']

dataset_dict

In [None]:
def getLengthInfo(data, column):
  '''
    Print length statistics for one column and show a histogram of the lengths.

    Args:
      data: A pandas DataFrame (accessed through ``data.iloc``).
      column: Positional index of the column whose entries are measured
        with ``len``.

    Prints the maximum, minimum, mean and standard deviation of the lengths
    and displays a plotly histogram. Returns None.
  '''
  lengths = [len(i) for i in data.iloc[:, column]]
  if not lengths:
    # max()/min() raise on an empty sequence; report and stop instead.
    print("No rows to measure.")
    return

  print("Max length: ", max(lengths))
  print("Min length: ", min(lengths))

  # float32 keeps integer lengths exact; bfloat16 has only 8 bits of precision
  # and rounds values above 256 (e.g. 1001 becomes 1000), which skews the
  # mean and standard deviation.
  length_pt = torch.tensor(lengths, dtype=torch.float32)
  print(length_pt.mean())
  print(length_pt.std())

  # Imported here so plotly is only needed when the histogram is requested.
  import plotly.express as px
  fig = px.histogram(lengths, nbins=500)
  fig.show()

# print('Length info for questions in train dataset:')
# getLengthInfo(Data_pd, column=0)
# print('Length info for answers in train dataset:')
# getLengthInfo(Data_pd, column=1)

# print('Length info for questions in test dataset:')
# getLengthInfo(data=testData_pd, column=0)
# print('Length info for answers in test dataset:')
# getLengthInfo(data=testData_pd, column=1)

In [None]:
def tokenize(examples, max_length=None):
  '''
    Tokenize a batch of question/answer pairs for seq2seq training.

    Args:
      examples: A batch with 'questions' and 'answers' entries, each a list
        of strings (as passed by ``Dataset.map(..., batched=True)``).
      max_length: Length every sequence is padded/truncated to. ``None``
        (the default) keeps the tokenizer's own maximum length.

    Returns:
      The encoding of the questions ('input_ids', 'attention_mask') with the
      answer token ids added under 'labels'. Padding positions in 'labels'
      still hold the pad token id.
  '''
  # Calling the tokenizer directly replaces the deprecated batch_encode_plus.
  model_inputs = tokenizer(examples['questions'], padding='max_length', truncation=True,
                           max_length=max_length, return_tensors='pt')
  print("Input tokenized!")
  labels = tokenizer(examples['answers'], padding='max_length', truncation=True,
                     max_length=max_length, return_tensors='pt')
  print("Labels tokenized!")
  # The encodings are already integer tensors; re-wrapping them in
  # torch.tensor(...) only made a copy and raised a copy-construct warning.
  model_inputs['labels'] = labels['input_ids']
  return model_inputs

In [None]:
# Tokenize every split in batches; the result mirrors dataset_dict with the
# tokenizer outputs added as extra columns.
tokenized_dataset = dataset_dict.map(function=tokenize, batched=True)
tokenized_dataset

# Now, we define the Rewards to be given to the Model during training.


In [None]:
# Pull one formatted training example to try the reward functions on.
sample_row = Data_pd.iloc[100]
s1 = sample_row['questions']
s2 = sample_row['answers']

print(s1,"\n",s2)

In [None]:
def ResponseStructureReward(completions):
  '''
    Score each completion on whether it follows the expected tag structure.

    Args:
      completions: An iterable of completion strings.

    Returns:
      A list with one float per completion: 1.0 when the completion starts
      with a <think>...</think> block and also contains an
      <answer>...</answer> block, 0.5 otherwise.
  '''
  # re.DOTALL lets '.' span newlines, so multi-line reasoning or answers still
  # count as well-formed. The '^' anchor requires <think> at the very start.
  think_pattern = re.compile(r"^<think>.*?</think>", re.DOTALL)
  answer_pattern = re.compile(r"<answer>.*?</answer>", re.DOTALL)

  rewards = []
  for completion in completions:
    has_think = think_pattern.search(completion) is not None
    has_answer = answer_pattern.search(completion) is not None
    if has_think and has_answer:
      rewards.append(1.0)
    else:
      # Fallback reward for any completion missing either tag pair.
      rewards.append(0.5)
  return rewards

# ResponseStructureReward(["The energy is sun. Correct answer: sun"])
# ResponseStructureReward(["<think>The nu#$%^&*([]/[;'/mber of molecular orbi875676rbtals </> produced is the same as the number. <answer>energy </answer>"])
# Try the structure reward on the sample target taken from the training data.
ResponseStructureReward(completions=[s2])


def ResponseLengthReward(completions):
  '''
    Score each completion by its length in characters.

    Args:
      completions: An iterable of completion strings.

    Returns:
      A list with one float per completion:
        more than 400 characters -> 1.0
        more than 200 characters -> 0.5
        more than 100 characters -> 0.25
        otherwise                -> 0.0
  '''
  rewards = []
  for completion in completions:
    length = len(completion)
    # Thresholds are checked from largest to smallest; testing "> 200" before
    # "> 400" made the 1.0 tier unreachable.
    if length > 400:
      rewards.append(1.0)
    elif length > 200:
      rewards.append(0.5)
    elif length > 100:
      rewards.append(0.25)
    else:
      rewards.append(0.0)
  return rewards

# Try the length reward on the same sample target.
ResponseLengthReward(completions=[s2])

# Here, we create batches of input_ids, attention_mask, and labels.
**Batch size = 8, for loading data along with the model on GPU**

-------------------------------

1 long value = 8 bytes of memory \
512 long values = 1 tensor in our case \
3 such tensors at each input instance = 3 x 512 x 8 = 12,288 bytes

For a single batch, \
8 instances = 8 x 12,288 = 98,304 bytes 

Memory for a single batch during training = 98.3 KB 

-------------------------------

1 bfloat value = 2 bytes of memory \
512 bfloat values = 1 tensor in our case \
3 such tensors at each input instance = 3 x 512 x 2 = 3072 bytes

For a single batch, \
8 instances = 8 x 3072 = 24,576 bytes

Memory for a single batch during training = 24.6 KB

In [None]:
DataLoader = torch.utils.data.DataLoader


def _split_tensor_dataset(split_name):
  '''Pack one tokenized split's input_ids, attention_mask and labels into a TensorDataset.'''
  split = tokenized_dataset[split_name]
  return torch.utils.data.TensorDataset(torch.tensor(split['input_ids']),
                                        torch.tensor(split['attention_mask']),
                                        torch.tensor(split['labels']))


# Training batches hold 8 examples; validation and test batches hold 10.
data = _split_tensor_dataset('train')
batched_data = DataLoader(dataset=data, batch_size=8)

val_data = _split_tensor_dataset('validation')
batched_val_data = DataLoader(dataset=val_data, batch_size=10)

test_data = _split_tensor_dataset('test')
batched_test_data = DataLoader(dataset=test_data, batch_size=10)

**Define the optimizer we are going to use over the LoRA model's parameters, plus the lists that collect the training loss and metrics. The model was already moved to the MPS device above.**

In [None]:
# Running records filled in during training.
training_loss, metrics = [], []

# Adam over the LoRA-wrapped model's parameters.
optimizer = torch.optim.Adam(params=LoRA_model.parameters(), lr=1e-3, eps=1e-4)

In [None]:
def SupervisedTraining(epochs, model_dir='', max_batches=None, log_every=100):
  '''
    Fine-tune LoRA_model on the batches in ``batched_data`` with teacher forcing.

    Args:
      epochs: Number of passes over ``batched_data``.
      model_dir: File path the model's state_dict is saved to after training.
        An empty string skips saving.
      max_batches: Optional cap on the number of batches per epoch (useful
        for a quick memory/smoke test). ``None`` runs the full epoch.
      log_every: Print the running step loss every this many batches;
        0 or None disables step logging.

    Side effects:
      Updates LoRA_model through ``optimizer``, appends the mean loss of each
      epoch (a Python float) to the global ``training_loss`` list, and writes
      ``model_dir`` when it is set.
  '''
  pad_id = tokenizer.pad_token_id  # the padding *token* id, not pad_token_type_id

  for epoch in range(epochs):
    print(f"Epoch: {epoch}")
    epoch_losses = []
    LoRA_model.train()

    # Walk over every batch; previously only the first batch of the loader
    # was consumed per epoch.
    for step, (input_id, attn_mask, target) in enumerate(batched_data):
      if max_batches is not None and step >= max_batches:
        break

      # Padding positions are set to -100 so the loss ignores them.
      labels = target.clone()
      labels[target == pad_id] = -100

      # attention_mask receives the encoder mask (it used to receive the
      # target ids). decoder_input_ids is left out so the model derives the
      # right-shifted decoder inputs from the labels.
      output = LoRA_model(input_ids=input_id.to(device=mpsDevice),
                          attention_mask=attn_mask.to(device=mpsDevice),
                          labels=labels.to(device=mpsDevice))
      loss = output.loss

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      # .item() stores a plain float instead of a tensor that keeps the
      # autograd graph (and its device memory) alive.
      epoch_losses.append(loss.item())
      if log_every and (step + 1) % log_every == 0:
        print(f"Step {step + 1} loss: {epoch_losses[-1]:.4f}")

    if not epoch_losses:
      print("No batches were processed in this epoch.")
      continue

    mean_loss = sum(epoch_losses) / len(epoch_losses)
    print(f"Epoch loss: {mean_loss}")
    training_loss.append(mean_loss)

  # training_loss is a list, so format its latest entry rather than the list
  # itself (formatting the list with ':.4f' raises TypeError).
  if training_loss:
    print(f'Training loss: {training_loss[-1]:.4f}')
  if model_dir:
    torch.save(LoRA_model.state_dict(), model_dir)

SupervisedTraining(epochs=1, model_dir='T5_Model1.pt')

In [None]:
# LoRA_model.eval()
# for i, m, o in batched_test_data:
#   outputs = LoRA_model.generate(input_ids=i, attention_mask=m)
#   break
# print(outputs)