In [1]:
import torch
import random
import re
import polars as pl
from datasets import Dataset, NamedSplit, DatasetDict
from peft import LoraConfig, get_peft_model
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
tokenizer.add_tokens(['<think>', '</think>', '<answer>', '</answer>'])
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small",
                                              torch_dtype=torch.bfloat16)
print(model.get_input_embeddings())
model.resize_token_embeddings(len(tokenizer))
print("After adding new tokens: ", model.get_input_embeddings())

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Embedding(32128, 512)
After adding new tokens:  Embedding(32104, 512)


Parameters after creating a LoRA config get decreased to 86,016 parameters. Also, i convert the dtype of parameters from float32 to bfloat16.\
**Training the model using float32 params runs out of GPU memory when using mps.**

Total params = 86,016\
bfloat16 = 2 bytes\
Params space = 86,016 x 2 = 132,032 bytes \
Also, the same amount of memory will be required during backpropogation to store the differentiated value of all the variable parameters. \
Toal Params space = 132,032 x 2 = 264,064 bytes = **264 KB**

In [2]:
lora_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    r=2,
    target_modules=["q", "v"])

LoRA_model = get_peft_model(model, lora_config)
print(LoRA_model.print_trainable_parameters())

LoRA_model = LoRA_model.to(torch.bfloat16)
for name, param in LoRA_model.named_parameters():
    if param.requires_grad:
        print(f"Parameter: {name}, Dtype: {param.dtype}, Requires Grad: {param.requires_grad}")


trainable params: 86,016 || all params: 77,022,592 || trainable%: 0.1117
None
Parameter: base_model.model.encoder.block.0.layer.0.SelfAttention.q.lora_A.default.weight, Dtype: torch.bfloat16, Requires Grad: True
Parameter: base_model.model.encoder.block.0.layer.0.SelfAttention.q.lora_B.default.weight, Dtype: torch.bfloat16, Requires Grad: True
Parameter: base_model.model.encoder.block.0.layer.0.SelfAttention.v.lora_A.default.weight, Dtype: torch.bfloat16, Requires Grad: True
Parameter: base_model.model.encoder.block.0.layer.0.SelfAttention.v.lora_B.default.weight, Dtype: torch.bfloat16, Requires Grad: True
Parameter: base_model.model.encoder.block.1.layer.0.SelfAttention.q.lora_A.default.weight, Dtype: torch.bfloat16, Requires Grad: True
Parameter: base_model.model.encoder.block.1.layer.0.SelfAttention.q.lora_B.default.weight, Dtype: torch.bfloat16, Requires Grad: True
Parameter: base_model.model.encoder.block.1.layer.0.SelfAttention.v.lora_A.default.weight, Dtype: torch.bfloat16, Requ

In [3]:
mpsDevice = torch.device("cuda")
LoRA_model.to(device=mpsDevice)
# print(f"Total GPU memory allocated by Metal driver for the process: {torch.mps.driver_allocated_memory()} bytes")
print(f"Current GPU memory occupied by tensors: {torch.cuda.memory_summary()} bytes")

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      | 151763 KiB | 151763 KiB | 151763 KiB |      0 B   |
|       from large pool |  65536 KiB |  65536 KiB |  65536 KiB |      0 B   |
|       from small pool |  86227 KiB |  86227 KiB |  86227 KiB |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         | 151763 KiB | 151763 KiB | 151763 KiB |      0 B   |
|       from large pool |  65536 KiB |  65536 KiB |  65536 KiB |      0 B   |
|       from small pool |  86227 KiB |  86227 KiB |  86227 KiB |      0 B   |
|---------------------------------------------------------------

In [4]:
print(tokenizer("amish kakka "))
tokenizer.encode("amish kakka ")

{'input_ids': [183, 1273, 3, 1258, 8511, 9, 3, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}


[183, 1273, 3, 1258, 8511, 9, 3, 1]

In [5]:
text = "Answer the Question: President of USA?"
answer = "It is Donald Trump"
encodedText = tokenizer(text, return_tensors='pt', padding=True)
tensorInput = torch.Tensor(encodedText['input_ids']).view(1,-1)
tensorInput = tensorInput.to(dtype=torch.long,
                             device=mpsDevice)
tensorMask = torch.Tensor(encodedText['attention_mask']).view(1,-1)
tensorMask = tensorMask.to(dtype=torch.long,
                           device=mpsDevice)
encodedAns = tokenizer(answer, return_tensors='pt', padding=True)
tensorAnswer = torch.Tensor(encodedAns['input_ids']).view(1,-1)
tensorAnswer = tensorAnswer.to(dtype=torch.long,
                             device=mpsDevice)
tensorLabel = torch.Tensor(encodedAns['attention_mask']).view(1,-1)
tensorLabel = tensorLabel.to(dtype=torch.long,
                           device=mpsDevice)
print(tensorInput)
print(tensorMask)
print(tensorAnswer)
print(tensorLabel)

tensor([[11801,     8, 11860,    10,  1661,    13,  2312,    58,     1]],
       device='cuda:0')
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')
tensor([[  94,   19, 7459, 2523,    1]], device='cuda:0')
tensor([[1, 1, 1, 1, 1]], device='cuda:0')


In [6]:
LoRA_model.eval()
outputs = LoRA_model.generate(input_ids=tensorInput,
                              attention_mask=tensorMask)
print(tokenizer.decode(outputs[0].tolist(), skip_special_tokens=True))

john d. kennedy


In [7]:
'''
    Open your terminal and login to huggingface using the command :- huggingface-cli login
    Enter your generated token from Hugging Face.
'''

splits = {'train': 'data/train-00000-of-00001.parquet',
          'validation': 'data/validation-00000-of-00001.parquet',
          'test': 'data/test-00000-of-00001.parquet'}

df = pl.read_parquet('hf://datasets/allenai/sciq/' + splits['train'])
val_df = pl.read_parquet('hf://datasets/allenai/sciq/' + splits['validation'])
test_df = pl.read_parquet('hf://datasets/allenai/sciq/' + splits['test'])

In [8]:
(df.head(5))

question,distractor3,distractor1,distractor2,correct_answer,support
str,str,str,str,str,str
"""What type of organism is commo…","""viruses""","""protozoa""","""gymnosperms""","""mesophilic organisms""","""Mesophiles grow best in modera…"
"""What phenomenon makes global w…","""tropical effect""","""muon effect""","""centrifugal effect""","""coriolis effect""","""Without Coriolis Effect the gl…"
"""Changes from a less-ordered st…","""endothermic""","""unbalanced""","""reactive""","""exothermic""","""Summary Changes of state are e…"
"""What is the least dangerous ra…","""zeta decay""","""beta decay""","""gamma decay""","""alpha decay""","""All radioactive decay is dange…"
"""Kilauea in hawaii is the world…","""magma""","""greenhouse gases""","""carbon and smog""","""smoke and ash""","""Example 3.5 Calculating Projec…"


In [9]:
def restructureData(example):
  '''
    The purpose of "restructuring" is to have a clear format for the input and output data for the model.
    Args:
      example: A single row of the original dataset to format.
  '''
  options = [f"{example[i]}" for i in range(1, 5)]
  random.shuffle(options)
  re_input = f"{example[0]}\nOptions: {options[0]}\n{options[1]}\n{options[2]}\n{options[3]}"
  re_reason = f"<think>{example[5]}</think>"
  re_output = f"{example[4]}"
  return re_input, re_reason, re_output

newData = df.map_rows(lambda x: restructureData(x))
valData = val_df.map_rows(lambda x: restructureData(x))
testData = test_df.map_rows(lambda x: restructureData(x))

In [10]:
newData.columns = ['questions', 'reasoning', 'answers']
valData.columns = ['questions', 'reasoning', 'answers']
testData.columns = ['questions', 'reasoning', 'answers']

In [11]:
# Interestingly enough Dataset.from_polars() is not supported now.
# So, I convert the newly formed polars dataset to pandas, and then in the cell below into an distinct Dataset object based on the splits.
# Then, combining them into a master 'dataset_dict' object.
Data_pd = newData.to_pandas()
valData_pd = valData.to_pandas()
testData_pd = testData.to_pandas()

print(Data_pd.iloc[0]['questions'])
print(Data_pd.iloc[0]['reasoning'])
print(Data_pd.iloc[0]['answers'])

What type of organism is commonly used in preparation of foods such as cheese and yogurt?
Options: protozoa
gymnosperms
mesophilic organisms
viruses
<think>Mesophiles grow best in moderate temperature, typically between 25°C and 40°C (77°F and 104°F). Mesophiles are often found living in or on the bodies of humans or other animals. The optimal growth temperature of many pathogenic mesophiles is 37°C (98°F), the normal human body temperature. Mesophilic organisms have important uses in food preparation, including cheese, yogurt, beer and wine.</think>
mesophilic organisms


In [12]:
train_dataset = Dataset.from_pandas(Data_pd, split=NamedSplit('train'))
val_dataset = Dataset.from_pandas(valData_pd, split=NamedSplit('validation'))
test_dataset = Dataset.from_pandas(testData_pd, split=NamedSplit('test'))

dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

dataset_dict

DatasetDict({
    train: Dataset({
        features: ['questions', 'reasoning', 'answers'],
        num_rows: 11679
    })
    validation: Dataset({
        features: ['questions', 'reasoning', 'answers'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['questions', 'reasoning', 'answers'],
        num_rows: 1000
    })
})

From the below charts, we see that almost 10% of the questions have reasoning part with just 10-20 words.

In [13]:
def getLengthInfo(data, column, bins=500):
  lengths = [len(i) for i in data.iloc[:, column]]
  print("Max length: ", max(lengths))
  print("Min length: ", min(lengths))

  length_pt = torch.tensor(lengths, dtype=torch.bfloat16)
  print(length_pt.mean())
  print(length_pt.std())

  import plotly.express as px
  fig = px.histogram(lengths, nbins=bins)
  fig.show()

print('Length info for questions in train dataset:')
getLengthInfo(data=Data_pd, column=0)
print('Length info for reasoning in train dataset:')
getLengthInfo(data=Data_pd, column=1, bins=700)

print('Length info for questions in test dataset:')
getLengthInfo(data=testData_pd, column=0)
print('Length info for reasoning in test dataset:')
getLengthInfo(data=testData_pd, column=1, bins=700)

Length info for questions in train dataset:
Max length:  480
Min length:  46
tensor(132., dtype=torch.bfloat16)
tensor(37.2500, dtype=torch.bfloat16)


Length info for reasoning in train dataset:
Max length:  3574
Min length:  15
tensor(432., dtype=torch.bfloat16)
tensor(446., dtype=torch.bfloat16)


Length info for questions in test dataset:
Max length:  312
Min length:  51
tensor(130., dtype=torch.bfloat16)
tensor(36., dtype=torch.bfloat16)


Length info for reasoning in test dataset:
Max length:  2955
Min length:  15
tensor(438., dtype=torch.bfloat16)
tensor(450., dtype=torch.bfloat16)


In [14]:
def tokenize(examples):
  model_inputs = tokenizer.batch_encode_plus(examples['questions'], padding='max_length', truncation=True, return_tensors='pt')
  print("Input questions tokenized!")
  reasoning = tokenizer.batch_encode_plus(examples['reasoning'], padding='max_length', truncation=True, return_tensors='pt')
  print("Reasoning parts tokenized!")
  labels = tokenizer.batch_encode_plus(examples['answers'], padding='max_length', truncation=True, return_tensors='pt')
  print("Labels tokenized!")
  model_inputs['input_ids'] = torch.tensor(model_inputs['input_ids'], dtype=torch.long)
  model_inputs['labels'] = torch.tensor(labels['input_ids'], dtype=torch.long)
  model_inputs['reasoning'] = torch.tensor(reasoning['input_ids'], dtype=torch.long)
  return model_inputs

In [15]:
tokenized_dataset = dataset_dict.map(tokenize, batched=True)
tokenized_dataset

Map:   0%|          | 0/11679 [00:00<?, ? examples/s]

Input questions tokenized!
Reasoning parts tokenized!
Labels tokenized!



To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).


To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).


To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).



Input questions tokenized!
Reasoning parts tokenized!
Labels tokenized!
Input questions tokenized!
Reasoning parts tokenized!
Labels tokenized!
Input questions tokenized!
Reasoning parts tokenized!
Labels tokenized!
Input questions tokenized!
Reasoning parts tokenized!
Labels tokenized!
Input questions tokenized!
Reasoning parts tokenized!
Labels tokenized!
Input questions tokenized!
Reasoning parts tokenized!
Labels tokenized!
Input questions tokenized!
Reasoning parts tokenized!
Labels tokenized!
Input questions tokenized!
Reasoning parts tokenized!
Labels tokenized!
Input questions tokenized!
Reasoning parts tokenized!
Labels tokenized!
Input questions tokenized!
Reasoning parts tokenized!
Labels tokenized!
Input questions tokenized!
Reasoning parts tokenized!
Labels tokenized!


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Input questions tokenized!
Reasoning parts tokenized!
Labels tokenized!


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Input questions tokenized!
Reasoning parts tokenized!
Labels tokenized!


DatasetDict({
    train: Dataset({
        features: ['questions', 'reasoning', 'answers', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 11679
    })
    validation: Dataset({
        features: ['questions', 'reasoning', 'answers', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['questions', 'reasoning', 'answers', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
})

# Now, we define the Rewards to be given to the Model during training.


In [16]:
s1 = Data_pd.iloc[100]['questions']
s2 = Data_pd.iloc[100]['reasoning']

print(s1,"\n",s2)

In a chemical reaction, the amounts of reactants and products will be constant when what state is attained?
Options: saturation
equilibrium
peak
homogeneity 
 <think>The amount of reactants and products do not have to be equal. However, after equilibrium is attained, the amounts of reactants and products will be constant.</think>


In [17]:
def ResponseStructureReward(completions):
  rewards = []
  for completion in completions:
    match1 = re.search(r"^<think>.*?</think>.*?", completion)
    match2 = re.search(r"<answer>.*?</answer>.*?", completion)
    # if match1:
    #   print("match1:", match1.group(0))
    #   print(len(match1.group(0).split(" ")))
    # if match2:
    #   print("match2:", match2.group(0),"\n")
    #   print(len(match2.group(0).split(" ")))
    if match1 and match2:
      rewards.append(1.0)
    else:
      rewards.append(0.5)  # Partial reward for correct format but limited content
  # print(rewards)
  return rewards

# ResponseStructureReward(["The energy is sun. Correct answer: sun"])
# ResponseStructureReward(["<think>The nu#$%^&*([]/[;'/mber of molecular orbi875676rbtals </> produced is the same as the number. <answer>energy </answer>"])
ResponseStructureReward([s2])


def ResponseLengthReward(completions):
  rewards = []
  for completion in completions:
    if len(completion) > 100:
      if len(completion) > 200:
        rewards.append(0.5)
      elif len(completion) > 400:
        rewards.append(1.0)
      else:
        rewards.append(0.25)
    else:
      rewards.append(0.0)
  # print(rewards)
  return rewards

ResponseLengthReward([s2])

[0.25]

# Here, we create batches of input_ids, attention_mask, and labels.
**Batch size = 8, for loading data along with the model on GPU**

-------------------------------

1 long value = 8 bytes of memory \
512 long values = 1 tensor in our case \
3 such tensors at each input instance = 3 x 512 x 8 = 12,288 bytes

For a single batch, \
8 instances = 8 x 12,288 = 98,304 bytes

Memory for a single batch during training = 98.3 KB

-------------------------------

1 bfloat value = 2 bytes of memory \
512 bfloat values = 1 tensor in our case \
3 such tensors at each input instance = 3 x 512 x 2 = 3072 bytes

For a single batch, \
8 instances = 8 x 3072 = 24,576 bytes

Memory for a single batch during training = 24.5 KB

In [18]:
DataLoader = torch.utils.data.DataLoader
data = torch.utils.data.TensorDataset(torch.tensor(tokenized_dataset['train']['input_ids']),
                                      torch.tensor(tokenized_dataset['train']['attention_mask']),
                                      torch.tensor(tokenized_dataset['train']['reasoning']),
                                      torch.tensor(tokenized_dataset['train']['labels']))
batched_data = DataLoader(dataset=data,
                          batch_size=8)

val_data = torch.utils.data.TensorDataset(torch.tensor(tokenized_dataset['validation']['input_ids']),
                                      torch.tensor(tokenized_dataset['validation']['attention_mask']),
                                      torch.tensor(tokenized_dataset['validation']['reasoning']),
                                      torch.tensor(tokenized_dataset['validation']['labels']))
batched_val_data = DataLoader(dataset=val_data,
                          batch_size=10)

test_data = torch.utils.data.TensorDataset(torch.tensor(tokenized_dataset['test']['input_ids']),
                                      torch.tensor(tokenized_dataset['test']['attention_mask']),
                                      torch.tensor(tokenized_dataset['test']['reasoning']),
                                      torch.tensor(tokenized_dataset['test']['labels']))
batched_test_data = DataLoader(dataset=test_data,
                          batch_size=10)

**Define the optimizer we are gonna use and the parameters for LoRA model. Also, moving the model to cuda**

In [19]:
optimizer = torch.optim.Adam(LoRA_model.parameters(), lr=1e-3, eps=1e-4)
training_loss = []
metrics = []

In [20]:
def SupervisedTraining(epochs, model_dir=''):
  for epoch in range(epochs):
    print(f"Epoch: {epoch}")
    epoch_losses = []
    b = 0
    train_iterator = iter(batched_data)
    # val_iterator = iter(batched_val_data)

    for i, (input_id, attn_mask, _, target) in enumerate(train_iterator):
      labels = target.clone().detach()
      labels[labels == tokenizer.pad_token_type_id] = -100

      mps_input_ids = input_id.to(device=mpsDevice)
      mps_attnMask_ids = attn_mask.to(device=mpsDevice)
      mps_labels = labels.to(device=mpsDevice)

      LoRA_model.train()
      outputs = LoRA_model(input_ids=mps_input_ids,
                          attention_mask=mps_attnMask_ids,
                          labels=mps_labels)
      loss = outputs.loss
      if i%50 == 0:
        print(f"Batch {i} loss: ", loss)

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
      epoch_losses.append(loss)
    print(f"Epoch loss: {sum(epoch_losses)/len(epoch_losses)}")
    torch.cuda.empty_cache()

  torch.save(LoRA_model.state_dict(), model_dir)
  print("Model saved.")

SupervisedTraining(epochs=1, model_dir='./T5_Model1.pt')

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch: 0
Batch 0 loss:  tensor(1.2734, device='cuda:0', dtype=torch.bfloat16,
       grad_fn=<NllLossBackward0>)
Batch 50 loss:  tensor(0.8125, device='cuda:0', dtype=torch.bfloat16,
       grad_fn=<NllLossBackward0>)
Batch 100 loss:  tensor(0.8086, device='cuda:0', dtype=torch.bfloat16,
       grad_fn=<NllLossBackward0>)
Batch 150 loss:  tensor(0.6406, device='cuda:0', dtype=torch.bfloat16,
       grad_fn=<NllLossBackward0>)
Batch 200 loss:  tensor(0.6094, device='cuda:0', dtype=torch.bfloat16,
       grad_fn=<NllLossBackward0>)
Batch 250 loss:  tensor(0.6172, device='cuda:0', dtype=torch.bfloat16,
       grad_fn=<NllLossBackward0>)
Batch 300 loss:  tensor(0.5898, device='cuda:0', dtype=torch.bfloat16,
       grad_fn=<NllLossBackward0>)
Batch 350 loss:  tensor(0.8867, device='cuda:0', dtype=torch.bfloat16,
       grad_fn=<NllLossBackward0>)
Batch 400 loss:  tensor(0.7031, device='cuda:0', dtype=torch.bfloat16,
       grad_fn=<NllLossBackward0>)
Batch 450 loss:  tensor(0.4551, device='

In [32]:
LoRA_model.eval()
input_id, attn_mask, _, target = next(iter(batched_val_data))
labels = target.clone().detach()
labels[labels == tokenizer.pad_token_type_id] = -100
mps1 = input_id.to(mpsDevice)
mps2 = attn_mask.to(mpsDevice)
mps3 = labels.to(mpsDevice)

outputs = LoRA_model.generate(input_ids=mps1,
                        attention_mask=mps2,
                        labels=mps3)

questions = tokenizer.batch_decode(input_id, skip_special_tokens=True)
answers = tokenizer.batch_decode(target, skip_special_tokens=True)
print(outputs)
for q, a, output in zip(questions, answers, outputs):
    print("Question: ", q)
    print("Actual answer: ", a)
    print("Predicted output: ", tokenizer.decode(output, skip_special_tokens=True))

torch.cuda.empty_cache()

tensor([[    0,   649,  3757,     1,     0,     0,     0,     0,     0],
        [    0,     3, 18053,     1,     0,     0,     0,     0,     0],
        [    0, 13485,     1,     0,     0,     0,     0,     0,     0],
        [    0,    62,    17,    40,   232,     1,     0,     0,     0],
        [    0,     8,  1997,     1,     0,     0,     0,     0,     0],
        [    0,  1717, 12979,     1,     0,     0,     0,     0,     0],
        [    0,  1712,  9008,  2176,    11,    46,  9008,  2176,     1],
        [    0, 27140,     3,  3198,     1,     0,     0,     0,     0],
        [    0,  2241,     1,     0,     0,     0,     0,     0,     0],
        [    0,  1552, 14612,    17,     1,     0,     0,     0,     0]],
       device='cuda:0')
Question:  Who proposed the theory of evolution by natural selection? Options: shaw Scopes Linnaeus darwin
Actual answer:  darwin
Predicted output:  darwin
Question:  Each specific polypeptide has a unique linear sequence of which acids? Options