In [1]:
import torch
import re
import polars as pl
from datasets import Dataset, NamedSplit, DatasetDict
from peft import LoraConfig, get_peft_model
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
tokenizer.add_tokens(['<think>', '</think>', '<answer>', '</answer>'])
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
print(model.get_input_embeddings())
model.resize_token_embeddings(len(tokenizer))
print("After adding new tokens: ", model.get_input_embeddings())

Embedding(32128, 512)
After adding new tokens:  Embedding(32104, 512)


Parameters after creating a LoRA config get decreased to 86,016 parameters. Also, i convert the dtype of parameters from float32 to bfloat16.\
**Training the model using float32 params runs out of GPU memory when using mps.**

Total params = 86,016\
bfloat16 = 2 bytes\
Params space = 86,016 x 2 = 132,032 bytes \
Also, the same amount of memory will be required during backpropogation to store the differentiated value of all the variable parameters. \
Toal Params space = 132,032 x 2 = 264,064 bytes = **264 KB**

In [2]:
lora_config = LoraConfig(
    task_type="QUESTION_ANS",
    r=2,
    target_modules=["q", "v"])
LoRA_model = get_peft_model(model, lora_config)
print(LoRA_model.print_trainable_parameters())

print("\n--- Changing Dtype of Trainable Parameters to bfloat16 ---")
LoRA_model = LoRA_model.to(torch.bfloat16)

for name, param in LoRA_model.named_parameters():
    if param.requires_grad:
        print(f"Parameter: {name}, Dtype: {param.dtype}, Requires Grad: {param.requires_grad}")
        

'NoneType' object has no attribute 'cadam32bit_grad_fp32'
trainable params: 86,016 || all params: 77,022,592 || trainable%: 0.1117
None

--- Changing Dtype of Trainable Parameters to bfloat16 ---


  warn("The installed version of bitsandbytes was compiled without GPU support. "


Parameter: base_model.model.encoder.block.0.layer.0.SelfAttention.q.lora_A.default.weight, Dtype: torch.bfloat16, Requires Grad: True
Parameter: base_model.model.encoder.block.0.layer.0.SelfAttention.q.lora_B.default.weight, Dtype: torch.bfloat16, Requires Grad: True
Parameter: base_model.model.encoder.block.0.layer.0.SelfAttention.v.lora_A.default.weight, Dtype: torch.bfloat16, Requires Grad: True
Parameter: base_model.model.encoder.block.0.layer.0.SelfAttention.v.lora_B.default.weight, Dtype: torch.bfloat16, Requires Grad: True
Parameter: base_model.model.encoder.block.1.layer.0.SelfAttention.q.lora_A.default.weight, Dtype: torch.bfloat16, Requires Grad: True
Parameter: base_model.model.encoder.block.1.layer.0.SelfAttention.q.lora_B.default.weight, Dtype: torch.bfloat16, Requires Grad: True
Parameter: base_model.model.encoder.block.1.layer.0.SelfAttention.v.lora_A.default.weight, Dtype: torch.bfloat16, Requires Grad: True
Parameter: base_model.model.encoder.block.1.layer.0.SelfAttent

In [3]:
mpsDevice = torch.device("mps")
LoRA_model.to(device=mpsDevice)
print(f"Total GPU memory allocated by Metal driver for the process: {torch.mps.driver_allocated_memory()} bytes")
print(f"Current GPU memory occupied by tensors: {torch.mps.current_allocated_memory()} bytes")

Total GPU memory allocated by Metal driver for the process: 1166426112 bytes
Current GPU memory occupied by tensors: 154045440 bytes


In [4]:
print(tokenizer("amish kakka "))
tokenizer.encode("amish kakka ")

{'input_ids': [183, 1273, 3, 1258, 8511, 9, 3, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}


[183, 1273, 3, 1258, 8511, 9, 3, 1]

In [50]:
text = "Answer the Question: President of USA?"
answer = "It is Donald Trump"
encodedText = tokenizer(text, return_tensors='pt', padding=True)
tensorInput = torch.Tensor(encodedText['input_ids']).view(1,-1)
tensorInput = tensorInput.to(dtype=torch.long,
                             device=mpsDevice)
tensorMask = torch.Tensor(encodedText['attention_mask']).view(1,-1)
tensorMask = tensorMask.to(dtype=torch.long,
                           device=mpsDevice)
encodedAns = tokenizer(answer, return_tensors='pt', padding=True)
tensorAnswer = torch.Tensor(encodedAns['input_ids']).view(1,-1)
tensorAnswer = tensorAnswer.to(dtype=torch.long,
                             device=mpsDevice)
tensorLabel = torch.Tensor(encodedAns['attention_mask']).view(1,-1)
tensorLabel = tensorLabel.to(dtype=torch.long,
                           device=mpsDevice)
print(tensorInput)
print(tensorMask)
print(tensorAnswer)
print(tensorLabel)

tensor([[11801,     8, 11860,    10,  1661,    13,  2312,    58,     1]],
       device='mps:0')
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]], device='mps:0')
tensor([[  94,   19, 7459, 2523,    1]], device='mps:0')
tensor([[1, 1, 1, 1, 1]], device='mps:0')


In [62]:
LoRA_model.eval()
outputs = LoRA_model.generate(input_ids=tensorInput, 
                              attention_mask=tensorMask)
print(tokenizer.decode(outputs[0].tolist(), skip_special_tokens=True))

john d. kennedy


In [57]:
model.eval()
model.to(device=mpsDevice)
tensorOutput = model(input_ids=tensorInput,
                     attention_mask=tensorMask,
                     decoder_input_ids=tensorAnswer)

In [7]:
logits = tensorOutput.logits
logits = logits.view(5, -1)
print(logits.size())
softmax = torch.nn.Softmax(dim=1)
probs = softmax(logits)
print(probs)
maxProbs = torch.argmax(probs, dim=1)
print(maxProbs)
tokenizer.decode(maxProbs)

torch.Size([5, 32104])
tensor([[7.5687e-23, 5.8984e-01, 2.0599e-03,  ..., 9.7607e-23, 9.7607e-23,
         7.5687e-23],
        [7.9823e-23, 5.9375e-01, 1.5411e-03,  ..., 7.9823e-23, 7.9823e-23,
         7.9823e-23],
        [1.8281e-22, 6.7188e-01, 3.1853e-04,  ..., 2.3492e-22, 1.8281e-22,
         1.8281e-22],
        [1.4641e-22, 4.3945e-01, 8.8215e-05,  ..., 1.8860e-22, 1.8860e-22,
         1.8860e-22],
        [1.7448e-25, 7.9102e-02, 1.3769e-05,  ..., 1.7448e-25, 1.7448e-25,
         1.3652e-25]], device='mps:0', dtype=torch.bfloat16,
       grad_fn=<SoftmaxBackward0>)
tensor([  1,   1,   1,   1, 233], device='mps:0')


'</s></s></s></s>...'

In [8]:
'''
    Open your terminal and login to huggingface using the command :- huggingface-cli login
    Enter your generated token from Hugging Face.
'''

splits = {'train': 'data/train-00000-of-00001.parquet',
          'validation': 'data/validation-00000-of-00001.parquet',
          'test': 'data/test-00000-of-00001.parquet'}

df = pl.read_parquet('hf://datasets/allenai/sciq/' + splits['train'])
val_df = pl.read_parquet('hf://datasets/allenai/sciq/' + splits['validation'])
test_df = pl.read_parquet('hf://datasets/allenai/sciq/' + splits['test'])

In [9]:
(df.head(5))

question,distractor3,distractor1,distractor2,correct_answer,support
str,str,str,str,str,str
"""What type of organism is commo…","""viruses""","""protozoa""","""gymnosperms""","""mesophilic organisms""","""Mesophiles grow best in modera…"
"""What phenomenon makes global w…","""tropical effect""","""muon effect""","""centrifugal effect""","""coriolis effect""","""Without Coriolis Effect the gl…"
"""Changes from a less-ordered st…","""endothermic""","""unbalanced""","""reactive""","""exothermic""","""Summary Changes of state are e…"
"""What is the least dangerous ra…","""zeta decay""","""beta decay""","""gamma decay""","""alpha decay""","""All radioactive decay is dange…"
"""Kilauea in hawaii is the world…","""magma""","""greenhouse gases""","""carbon and smog""","""smoke and ash""","""Example 3.5 Calculating Projec…"


In [95]:
def restructureData(example):
  '''
    The purpose of "restructuring" is to have a clear format for the input and output data for the model.
    Args:
      example: A single row of the original dataset to format.
  '''
  re_input = f"{example[0]}\nOptions: {example[1]}\n{example[2]}\n{example[3]}\n{example[4]}"
  re_reason = f"<think>{example[5]}</think>"
  re_output = f"{example[4]}"
  return re_input, re_reason, re_output

newData = df.map_rows(lambda x: restructureData(x))
valData = val_df.map_rows(lambda x: restructureData(x))
testData = test_df.map_rows(lambda x: restructureData(x))

In [96]:
newData.columns = ['questions', 'reasoning', 'answers']
valData.columns = ['questions', 'reasoning', 'answers']
testData.columns = ['questions', 'reasoning', 'answers']

In [97]:
# Interestingly enough Dataset.from_polars() is not supported now.
# So, I convert the newly formed polars dataset to pandas, and then in the cell below into an distinct Dataset object based on the splits.
# Then, combining them into a master 'dataset_dict' object.
Data_pd = newData.to_pandas()
valData_pd = valData.to_pandas()
testData_pd = testData.to_pandas()

print(Data_pd.iloc[10000]['questions'])
print(Data_pd.iloc[10000]['reasoning'])
print(Data_pd.iloc[10000]['answers'])

Why does water infiltrate the ground?
Options: run-off from flooding
gravity
prolonged drought conditions
because soil and rocks are porous
<think>Water infiltrates the ground because soil and rock are porous. Between the grains are pores, or tiny holes. Since water can move through this rock it is permeable. Eventually, the water reaches a layer of rock that is not porous and so is impermeable. Water stops moving downward when it reaches this layer of rock.</think>
because soil and rocks are porous


In [98]:
train_dataset = Dataset.from_pandas(Data_pd, split=NamedSplit('train'))
val_dataset = Dataset.from_pandas(valData_pd, split=NamedSplit('validation'))
test_dataset = Dataset.from_pandas(testData_pd, split=NamedSplit('test'))

dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

dataset_dict

DatasetDict({
    train: Dataset({
        features: ['questions', 'reasoning', 'answers'],
        num_rows: 11679
    })
    validation: Dataset({
        features: ['questions', 'reasoning', 'answers'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['questions', 'reasoning', 'answers'],
        num_rows: 1000
    })
})

From the below charts, we see that almost 10% of the questions have reasoning part with just 10-20 words.

In [100]:
def getLengthInfo(data, column, bins=500):
  lengths = [len(i) for i in data.iloc[:, column]]
  print("Max length: ", max(lengths))
  print("Min length: ", min(lengths))

  length_pt = torch.tensor(lengths, dtype=torch.bfloat16)
  print(length_pt.mean())
  print(length_pt.std())

  import plotly.express as px
  fig = px.histogram(lengths, nbins=bins)
  fig.show()

print('Length info for questions in train dataset:')
getLengthInfo(data=Data_pd, column=0)
print('Length info for reasoning in train dataset:')
getLengthInfo(data=Data_pd, column=1, bins=700)

print('Length info for questions in test dataset:')
getLengthInfo(data=testData_pd, column=0)
print('Length info for reasoning in test dataset:')
getLengthInfo(data=testData_pd, column=1, bins=700)

Length info for questions in train dataset:
Max length:  480
Min length:  46
tensor(132., dtype=torch.bfloat16)
tensor(37.2500, dtype=torch.bfloat16)


Length info for reasoning in train dataset:
Max length:  3574
Min length:  15
tensor(432., dtype=torch.bfloat16)
tensor(446., dtype=torch.bfloat16)


Length info for questions in test dataset:
Max length:  312
Min length:  51
tensor(130., dtype=torch.bfloat16)
tensor(36., dtype=torch.bfloat16)


Length info for reasoning in test dataset:
Max length:  2955
Min length:  15
tensor(438., dtype=torch.bfloat16)
tensor(450., dtype=torch.bfloat16)


In [101]:
def tokenize(examples):
  model_inputs = tokenizer.batch_encode_plus(examples['questions'], padding='max_length', truncation=True, return_tensors='pt')
  print("Input questions tokenized!")
  reasoning = tokenizer.batch_encode_plus(examples['reasoning'], padding='max_length', truncation=True, return_tensors='pt')
  print("Reasoning parts tokenized!")
  labels = tokenizer.batch_encode_plus(examples['answers'], padding='max_length', truncation=True, return_tensors='pt')
  print("Labels tokenized!")
  model_inputs['input_ids'] = torch.tensor(model_inputs['input_ids'], dtype=torch.long)
  model_inputs['labels'] = torch.tensor(labels['input_ids'], dtype=torch.long)
  model_inputs['reasoning'] = torch.tensor(reasoning['input_ids'], dtype=torch.long)
  return model_inputs

In [102]:
tokenized_dataset = dataset_dict.map(tokenize, batched=True)
tokenized_dataset

Map:   0%|          | 0/11679 [00:00<?, ? examples/s]

Input questions tokenized!
Reasoning parts tokenized!
Labels tokenized!



To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).


To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).


To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).



Input questions tokenized!
Reasoning parts tokenized!
Labels tokenized!
Input questions tokenized!
Reasoning parts tokenized!
Labels tokenized!
Input questions tokenized!
Reasoning parts tokenized!
Labels tokenized!
Input questions tokenized!
Reasoning parts tokenized!
Labels tokenized!
Input questions tokenized!
Reasoning parts tokenized!
Labels tokenized!
Input questions tokenized!
Reasoning parts tokenized!
Labels tokenized!
Input questions tokenized!
Reasoning parts tokenized!
Labels tokenized!
Input questions tokenized!
Reasoning parts tokenized!
Labels tokenized!
Input questions tokenized!
Reasoning parts tokenized!
Labels tokenized!
Input questions tokenized!
Reasoning parts tokenized!
Labels tokenized!
Input questions tokenized!
Reasoning parts tokenized!
Labels tokenized!


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Input questions tokenized!
Reasoning parts tokenized!
Labels tokenized!


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Input questions tokenized!
Reasoning parts tokenized!
Labels tokenized!


DatasetDict({
    train: Dataset({
        features: ['questions', 'reasoning', 'answers', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 11679
    })
    validation: Dataset({
        features: ['questions', 'reasoning', 'answers', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['questions', 'reasoning', 'answers', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
})

# Now, we define the Rewards to be given to the Model during training.


In [108]:
s1 = Data_pd.iloc[100]['questions']
s2 = Data_pd.iloc[100]['reasoning']

print(s1,"\n",s2)

In a chemical reaction, the amounts of reactants and products will be constant when what state is attained?
Options: homogeneity
saturation
peak
equilibrium 
 <think>The amount of reactants and products do not have to be equal. However, after equilibrium is attained, the amounts of reactants and products will be constant.</think>


In [109]:
def ResponseStructureReward(completions):
  rewards = []
  for completion in completions:
    match1 = re.search(r"^<think>.*?</think>.*?", completion)
    match2 = re.search(r"<answer>.*?</answer>.*?", completion)
    # if match1:
    #   print("match1:", match1.group(0))
    #   print(len(match1.group(0).split(" ")))
    # if match2:
    #   print("match2:", match2.group(0),"\n")
    #   print(len(match2.group(0).split(" ")))
    if match1 and match2:
      rewards.append(1.0)
    else:
      rewards.append(0.5)  # Partial reward for correct format but limited content
  # print(rewards)
  return rewards

# ResponseStructureReward(["The energy is sun. Correct answer: sun"])
# ResponseStructureReward(["<think>The nu#$%^&*([]/[;'/mber of molecular orbi875676rbtals </> produced is the same as the number. <answer>energy </answer>"])
ResponseStructureReward([s2])


def ResponseLengthReward(completions):
  rewards = []
  for completion in completions:
    if len(completion) > 100:
      if len(completion) > 200:
        rewards.append(0.5)
      elif len(completion) > 400:
        rewards.append(1.0)
      else:
        rewards.append(0.25)
    else:
      rewards.append(0.0)
  # print(rewards)
  return rewards

ResponseLengthReward([s2])

[0.25]

# Here, we create batches of input_ids, attention_mask, and labels.
**Batch size = 8, for loading data along with the model on GPU**

-------------------------------

1 long value = 8 bytes of memory \
512 long values = 1 tensor in our case \
3 such tensors at each input instance = 3 x 512 x 8 = 12,288 bytes

For a single batch, \
8 instances = 8 x 12,288 = 98,304 bytes 

Memory for a single batch during training = 98.3 KB 

-------------------------------

1 bfloat value = 2 bytes of memory \
512 bfloat values = 1 tensor in our case \
3 such tensors at each input instance = 3 x 512 x 2 = 3072 bytes

For a single batch, \
8 instances = 8 x 3072 = 24,576 bytes

Memory for a single batch during training = 24.5 KB

In [110]:
DataLoader = torch.utils.data.DataLoader
data = torch.utils.data.TensorDataset(torch.tensor(tokenized_dataset['train']['input_ids']),
                                      torch.tensor(tokenized_dataset['train']['attention_mask']),
                                      torch.tensor(tokenized_dataset['train']['reasoning']),
                                      torch.tensor(tokenized_dataset['train']['labels']))
batched_data = DataLoader(dataset=data,
                          batch_size=8)

val_data = torch.utils.data.TensorDataset(torch.tensor(tokenized_dataset['validation']['input_ids']),
                                      torch.tensor(tokenized_dataset['validation']['attention_mask']),
                                      torch.tensor(tokenized_dataset['validation']['reasoning']),
                                      torch.tensor(tokenized_dataset['validation']['labels']))
batched_val_data = DataLoader(dataset=val_data,
                          batch_size=10)

test_data = torch.utils.data.TensorDataset(torch.tensor(tokenized_dataset['test']['input_ids']),
                                      torch.tensor(tokenized_dataset['test']['attention_mask']),
                                      torch.tensor(tokenized_dataset['test']['reasoning']),
                                      torch.tensor(tokenized_dataset['test']['labels']))
batched_test_data = DataLoader(dataset=test_data,
                          batch_size=10)

In [None]:
LoRA_model.eval()
input_id, attn_mask, _, target = next(iter(batched_data))
mps1 = input_id.to(mpsDevice)
mps2 = attn_mask.to(mpsDevice)
mps3 = target.to(mpsDevice)
outputs = LoRA_model.generate(input_ids=mps1,
                              attention_mask=mps2)
questions = tokenizer.batch_decode(input_id, skip_special_tokens=True)
answers = tokenizer.batch_decode(target, skip_special_tokens=True)

for q, a, output in zip(questions, answers, outputs):
    print("Question: ", q)
    print("Actual answer: ", a)
    print("Predicted output: ", tokenizer.decode(output, skip_special_tokens=True))

Question:  What type of organism is commonly used in preparation of foods such as cheese and yogurt? Options: viruses protozoa gymnosperms mesophilic organisms
Actual answer:  mesophilic organisms
Predicted output:  fungi
Question:  What phenomenon makes global winds blow northeast to southwest or the reverse in the northern hemisphere and northwest to southeast or the reverse in the southern hemisphere? Options: tropical effect muon effect centrifugal effect coriolis effect
Actual answer:  coriolis effect
Predicted output:  tropical effect muon effect
Question:  Changes from a less-ordered state to a more-ordered state (such as a liquid to a solid) are always what? Options: endothermic unbalanced reactive exothermic
Actual answer:  exothermic
Predicted output:  unbalanced
Question:  What is the least dangerous radioactive decay? Options: zeta decay beta decay gamma decay alpha decay
Actual answer:  alpha decay
Predicted output:  Gamma decay is the least dangerous radioactive decay. Th

**Define the optimizer we are gonna use and the parameters for LoRA model. Also, moving the model to cuda**

In [21]:
optimizer = torch.optim.Adam(LoRA_model.parameters(), lr=1e-3, eps=1e-4)
training_loss = []
metrics = []

In [None]:
def SupervisedTraining(epochs, model_dir=''):
  for epoch in range(epochs):
    print(f"Epoch: {epoch}")
    epoch_losses = []
    b = 0
    train_iterator = iter(batched_data)
    val_iterator = iter(batched_val_data)
    
    input_id, attn_mask, target = next(train_iterator)
    target_ids = target.contiguous()
    labels = target.clone().detach()
    labels[target[:, :] == tokenizer.pad_token_type_id] = -100
    
    mps_input_ids = input_id.to(device=mpsDevice)
    mps_target_ids = target_ids.to(device=mpsDevice)
    mps_attnMask_ids = attn_mask.to(device=mpsDevice)
    mps_labels_ids = labels.to(device=mpsDevice)
    print("Data into MPS.")
    
    LoRA_model.train()
    output = LoRA_model.generate(input_ids=mps_input_ids,
                        attention_mask=mps_target_ids,
                        decoder_input_ids=mps_attnMask_ids,
                        labels=mps_labels_ids)
    loss = output[0]
    print(output)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    epoch_losses.append(loss)
    print(f"Epoch loss: {sum(epoch_losses)/len(epoch_losses)}")
    training_loss.append(sum(epoch_losses)/len(epoch_losses))
    
  print(f'Training loss: {training_loss:.4f}')
  torch.save(LoRA_model.state_dict(), model_dir)
  print("Model saved.")

SupervisedTraining(epochs=1, model_dir='./T5_Model1.pt')

In [None]:
# LoRA_model.eval()
# for i, m, o in batched_test_data:
#   outputs = LoRA_model.generate(input_ids=i, attention_mask=m)
#   break
# print(outputs)