In [1]:
# setting up the directory where we want to store the models
import os

print("Setting up the current working directory as the place where to host the transformers models downloaded from hugging face - " + os.getcwd())
os.environ['HF_HOME'] = "/home/aalla4"
# os.environ['TRANSFORMERS_CACHE'] = '/home/aalla4'

Setting up the current working directory as the place where to host the transformers models downloaded from hugging face - /home/aalla4/SML


In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, TrainerCallback
from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import T5Config
import torch
import random

# getting the CPU and GPU count

print("Total number of logical cores = " + str(os.cpu_count()))  # This shows logical cores not the physical cores
LOGICAL_CORES = os.cpu_count()
USABLE_CPU_CORES = LOGICAL_CORES - 1    # YOU CAN CHANGE THIS ACCORDING TO THE CPU AVAILABILITIES

print("CUDA available: ", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")


2025-12-15 18:13:37.402286: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Total number of logical cores = 48
CUDA available:  True
GPU name: NVIDIA A100-SXM4-80GB


In [3]:
# Load the tokenizer 

# model_name = "t5-base" # ---------------------------------------------------------------------------------------------
# model_name = "t5-large" 
model_name = "google/flan-t5-base"

tokenizer = T5Tokenizer.from_pretrained(model_name, legacy=False)

tokenizer_input_max_length = 512
tokenizer_label_max_length = 250


In [4]:
# Making some special tokens as placeholders/seperators for the input 

# we don't waste any additional tokens in this process to seperate the inputs into sections
special_tokens_description = {
    "<extra_id_99>": "[CONTEXT]",      # Represents the context section of the input
    "<extra_id_98>": "[DIFFICULTY]",   # Represents the difficulty section of the input - 'easy'/ 'medium'/ 'hard'
    "<extra_id_97>": "[TAG]"           # Represents the tag section of the input
}

print("Explanation of special tokens used:")
for token, description in special_tokens_description.items():
    print(f"- {token} is used as a placeholder for {description}")

# if we can simply use the existing tokens we don't wanna increase additional special tokens
# special_tokens = {'additional_special_tokens': ['[CONTEXT]', '[DIFFICULTY]', '[TAG]']}

# Get current additional special tokens from the tokenizer
# existing_special_tokens = tokenizer.special_tokens_map.get('additional_special_tokens', [])

# tokenizer.add_special_tokens(special_tokens)
# model.resize_token_embeddings(len(tokenizer))

Explanation of special tokens used:
- <extra_id_99> is used as a placeholder for [CONTEXT]
- <extra_id_98> is used as a placeholder for [DIFFICULTY]
- <extra_id_97> is used as a placeholder for [TAG]


In [5]:
import json

# loading the custom datasets
# Loading the dataset
# Function to load the data from a JSON file

filepath_descriptive = "./dataset/descriptive.json" 
filepath_mcq = "./dataset/mcq.json" 
filepath_tf = "./dataset/true_false.json" 

def load_json_file(file_path):
    with open(file_path, "r") as file:
        return json.load(file)                    # This should be a list of dicts

        
descriptive_data = load_json_file(filepath_descriptive)
mcq_data = load_json_file(filepath_mcq)
tf_data = load_json_file(filepath_tf)

# Convert list of dicts to Hugging Face Dataset
dataset_descriptive = Dataset.from_list(descriptive_data)
dataset_mcq = Dataset.from_list(mcq_data)
dataset_tf = Dataset.from_list(tf_data)


print("Completed loading the datasets ........")

Completed loading the datasets ........


In [6]:
# preprocessing steps

# Preprocessing function to tokenize the input and target text
def preprocess_descriptive(example):
    '''<extra_id_97>short answer question <extra_id_98>easy <extra_id_99>Drinking enough water each day helps regulate body temperature, keep joints lubricated, prevent infections, and keep organs functioning properly. Proper hydration also improves sleep quality, cognition, and mood. 
       List two ways drinking water benefits the human body.'''
    tag = example["tag"]
    difficulty = example["difficulty"]
    context = example["context"]
    question = example["question"]

    input_text = f"<extra_id_97>descriptive question ({tag}) <extra_id_98>{difficulty} <extra_id_99>{context}"
    target_text = question

    # Tokenize
    model_inputs = tokenizer(input_text, max_length=tokenizer_input_max_length, padding="max_length", truncation=True)
    labels = tokenizer(target_text, max_length=tokenizer_label_max_length, padding="max_length", truncation=True)

    model_inputs["labels"] = labels["input_ids"]

    # print(input_text, "\n", target_text)
    
    return model_inputs

def preprocess_mcq(example):
    '''there has been a mistake in the dataset so we have considered all the mcq to be of medium difficulty'''
    '''<extra_id_97>multiple choice question <extra_id_98>medium <extra_id_99>Rainforests are essential to Earth’s ecosystem. They produce oxygen, absorb carbon dioxide, and help regulate the global climate. Rainforests are also home to more than half of the world’s plant and animal species. Despite their importance, they are being destroyed at an alarming rate due to logging, agriculture, and mining. When rainforests are cleared, biodiversity is lost, and carbon is released into the atmosphere, contributing to global warming. Indigenous people who depend on these forests are also displaced. Preserving rainforests is vital for maintaining environmental balance and protecting wildlife. 
       Which of the following is a consequence of rainforest destruction? [C. Global warming] (A. Improved biodiversity; B. Carbon absorption; C. Global warming; D. Increased rainfall)'''
    tag = example["difficulty"]            # there is a problem with the dataset so had to keep it like this
    difficulty = "medium"                  # there is a problem with the dataset so had to keep this like it
    context = example["context"]
    question = example["question"]
    options = example["options"]
    answer = example["answer"]

    # Prepare formatted options
    # option_labels = ['A', 'B', 'C', 'D']
    # formatted_options = [f"{label}. {opt}" for label, opt in zip(option_labels, options)]
    # correct_index = options.index(answer)
    # correct_option = f"{option_labels[correct_index]}. {answer}"

    # don't use any options here as this might confuse the model; instead just give the answer and other options
    # Get ONLY the other options (no A/B/C/D labels)
    formatted_options = [opt for opt in options if opt != answer]
    correct_option = f"{answer}"

    # Prepare raw input and label strings
    input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>{context}"
    target_text = f"{question}; answer:[{correct_option}]; wrong options:({'; '.join(formatted_options)})"

    # Tokenize input and target
    model_inputs = tokenizer(input_text, max_length=tokenizer_input_max_length, padding="max_length", truncation=True)
    labels = tokenizer(target_text, max_length=tokenizer_label_max_length, padding="max_length", truncation=True)

    # Add labels
    model_inputs["labels"] = labels["input_ids"]
    
    # print(input_text, "\n", target_text)

    return model_inputs

def preprocess_tf(example):
    '''<extra_id_97>true or false question <extra_id_98>easy <extra_id_99>Photosynthesis is the process by which green plants and some other organisms use sunlight to synthesize foods from carbon dioxide and water. The process typically occurs in the chloroplasts of plant cells and releases oxygen as a byproduct. Chlorophyll, the green pigment in plants, plays a crucial role in capturing light energy. This energy is then used to convert water and carbon dioxide into glucose, which serves as the plant’s food source. 
        Photosynthesis releases oxygen as a byproduct. [true]'''
    tag = example["tag"]
    difficulty = example["difficulty"]
    context = example["context"]
    question = example["question"]
    answer = example["answer"].lower()  # ensure it's "true" or "false"

    # Format the input and target
    input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>{context}"
    target_text = f"{question}; answer: [{answer}]"

    # Tokenize
    model_inputs = tokenizer(input_text, max_length=tokenizer_input_max_length, padding="max_length", truncation=True)
    labels = tokenizer(target_text, max_length=tokenizer_label_max_length, padding="max_length", truncation=True)

    model_inputs["labels"] = labels["input_ids"]

    # print(input_text, "\n", target_text)

    return model_inputs



In [7]:

processed_descriptive_dataset = dataset_descriptive.map(preprocess_descriptive, batched=False, num_proc=USABLE_CPU_CORES, remove_columns=dataset_descriptive.column_names)
processed_mcq_dataset = dataset_mcq.map(preprocess_mcq, batched=False, num_proc=USABLE_CPU_CORES, remove_columns=dataset_mcq.column_names)
processed_tf_dataset = dataset_tf.map(preprocess_tf, batched=False, num_proc=USABLE_CPU_CORES, remove_columns=dataset_tf.column_names)


Map (num_proc=47):   0%|          | 0/110 [00:00<?, ? examples/s]

Map (num_proc=47):   0%|          | 0/108 [00:00<?, ? examples/s]

Map (num_proc=47):   0%|          | 0/110 [00:00<?, ? examples/s]

In [8]:
processed_descriptive_dataset = processed_descriptive_dataset.select(range(110))
processed_mcq_dataset = processed_mcq_dataset.select(range(100))
processed_tf_dataset = processed_tf_dataset.select(range(110))

In [9]:
dataset = processed_descriptive_dataset
print(len(dataset))
# print(dataset[0])

110


In [10]:
dataset = processed_mcq_dataset
print(len(dataset))

100


In [11]:
dataset = processed_tf_dataset
print(len(dataset))

110


In [12]:
# Proportionally sample per dataset every batch

datasets_dict = {
    "descriptive": processed_descriptive_dataset,
    "mcq": processed_mcq_dataset,
    "tf": processed_tf_dataset,
}

proportions = {
    "descriptive": 11/32,
    "mcq": 10/32,
    "tf": 11/32    
}




In [13]:
import math
from torch.utils.data import Sampler

# class ProportionalBatchSampler(Sampler):
#     def __init__(self, datasets_dict, proportions, batch_size):
#         self.datasets_dict = datasets_dict
#         self.proportions = proportions
#         self.batch_size = batch_size

#         # Pre-calc: how many samples per dataset per batch
#         self.samples_per_dataset = {
#             k: max(1, int(batch_size * p))
#             for k, p in proportions.items()
#         }

#         # Make sure total == batch_size
#         diff = batch_size - sum(self.samples_per_dataset.values())
#         if diff > 0:
#             largest_key = max(self.samples_per_dataset, key=lambda x: self.samples_per_dataset[x])
#             self.samples_per_dataset[largest_key] += diff

#         # Convert dataset to list of indices
#         self.index_pools = {
#             k: list(range(len(ds)))
#             for k, ds in datasets_dict.items()
#         }

#         # Shuffle each dataset's index pool
#         for k in self.index_pools:
#             random.shuffle(self.index_pools[k])

#         # Total batches = smallest number of batches any dataset can support
#         self.total_batches = min([
#             len(v) // self.samples_per_dataset[k]
#             for k, v in self.index_pools.items()
#         ])

#     def __iter__(self):
#         for _ in range(self.total_batches):
#             batch_indices = []
#             for k in self.index_pools:
#                 take = self.samples_per_dataset[k]
#                 batch_indices.extend(self.index_pools[k][:take])
#                 del self.index_pools[k][:take]
#             random.shuffle(batch_indices)
#             yield batch_indices

#     def __len__(self):
#         return self.total_batches

class ProportionalBatchSampler(Sampler):
    def __init__(self, datasets_dict, proportions, batch_size):
        self.datasets_dict = datasets_dict
        self.proportions = proportions
        self.batch_size = batch_size

        self.samples_per_dataset = {
            k: max(1, int(batch_size * p))
            for k, p in proportions.items()
        }

        diff = batch_size - sum(self.samples_per_dataset.values())
        if diff > 0:
            largest = max(self.samples_per_dataset, key=self.samples_per_dataset.get)
            self.samples_per_dataset[largest] += diff

        self.full_index_pools = {
            k: list(range(len(ds)))
            for k, ds in datasets_dict.items()
        }

        self.index_pools = {
            k: pool.copy()
            for k, pool in self.full_index_pools.items()
        }

        for k in self.index_pools:
            random.shuffle(self.index_pools[k])

        # Trainer controls total steps, not us
        self.total_batches = 10**12  # effectively infinite

    def __iter__(self):
        while True:
            batch = []

            for k in self.index_pools:
                take = self.samples_per_dataset[k]

                if len(self.index_pools[k]) < take:
                    # Refill + reshuffle
                    self.index_pools[k] = self.full_index_pools[k].copy()
                    random.shuffle(self.index_pools[k])

                batch.extend(self.index_pools[k][:take])
                del self.index_pools[k][:take]

            random.shuffle(batch)
            yield batch

    def __len__(self):
        return self.total_batches




In [14]:
from torch.utils.data import DataLoader

class ProportionalDataLoader(DataLoader):
    def __init__(self, datasets_dict, proportions, batch_size):
        self.datasets_dict = datasets_dict
        merged_dataset = concatenate_datasets(list(datasets_dict.values()))
        sampler = ProportionalBatchSampler(datasets_dict, proportions, batch_size)
        super().__init__(
            merged_dataset,
            batch_sampler=sampler,
            collate_fn=trainer.data_collator
        )


In [15]:
# Initiating the model

In [16]:
# Load the model and tokenizer 

# model_name = "t5-base"                            # -----------------------------------------------------------------------------------------------------------------
# model_name = "t5-large" \
model_name = "google/flan-t5-base"


save_path = "./T5flanbase_Question_Generation"           # ---------------------------------------------------------------------------------------------------------------

config = T5Config.from_pretrained(
    model_name,
    dropout_rate=0.1,           # encoder/decoder FFN dropout
    attention_dropout_rate=0.1, # self-attention dropout
)

# tokenizer = T5Tokenizer.from_pretrained(model_name, legacy=False)
model = T5ForConditionalGeneration.from_pretrained(model_name, config = config)

print(f"Max positional embeddings supported by model - {model_name}: ", model.config.n_positions)

tokenizer_input_max_length = 512
tokenizer_label_max_length = 250

Max positional embeddings supported by model - google/flan-t5-base:  512


In [17]:
# Choosing the dataset and Loading the Tokenized Dataset if Preprocessing is already done


In [18]:

# dataset paths - choosing the dataset


#  ----------------------------------------------------------------------------------------------
# Choose Training set

# custom dataset 
# tokenized_trainset_path = "./dataset/tokenized_custom_trainset.parquet"

# 5 datasets mixed
# tokenized_trainset_path = "./dataset/tokenized_trainset.parquet"

# 5 datasets mixed proportionally
tokenized_trainset_path = "using 3 datasets mixed proportionally"

# Load from Parquet ---------- uncomment this
# tokenized_train_dataset = Dataset.from_parquet(tokenized_trainset_path)



# ----------------------------------------------------------------------------------------------
# Validation set


# you only use the val set when needed but it can always be loaded up from the already preprocessed parquet file
# tokenized_valset_path = "./dataset/tokenized_valset.parquet"
# tokenized_val_dataset = Dataset.from_parquet(tokenized_valset_path)


# ----------------------------------------------------------------------------------------------

In [19]:
# prarmeters to change when you want to test out and experiment with various values depending on the dataset which you are using

num_of_epochs = 20
batch_size = 32

learning_rate = 1e-5
warmup_steps = 10
weight_decay = 1e-3

# for the model saving strategy
save_steps = 100

logging_steps = 10

print(f" Using the dataset =  {tokenized_trainset_path}\n Using the below parameters for training:\n epochs = {num_of_epochs} \n batch size = {batch_size} \n learning rate = {learning_rate} \n warmup steps = {warmup_steps} \n weight decay = {weight_decay}\n logging steps = {logging_steps}")

# print(proportions)



 Using the dataset =  using 3 datasets mixed proportionally
 Using the below parameters for training:
 epochs = 20 
 batch size = 32 
 learning rate = 1e-05 
 warmup steps = 10 
 weight decay = 0.001
 logging steps = 10


In [20]:
# Defning the Training args 

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print("This is the current device: ", device)
print("Device updated to:", model.device)


training_args = TrainingArguments(
    output_dir=save_path,              # directory where the training logs, checkpoints, and evaluation results (like metrics) are saved during the training process   
    
    learning_rate=learning_rate,
    warmup_steps=warmup_steps,
    num_train_epochs=num_of_epochs,                                     
    weight_decay=1e-3,                          

    # generally Batch size = per_device_train_batch_size * gradient_accumulation_steps * number of devices
    # -----------------------------------------------------------------------
    # per_device_train_batch_size=batch_size,                    # ****** if using GPU's = 16; if using CPU's = 1 or 2
    # # gradient_accumulation_steps=1,                     # Increase this if you need to simulate larger batch sizes, without running into 'Out or Memory' errors when memory is limited
    # per_device_eval_batch_size=batch_size,
    # ---------------------------------------------------------------------

    # this is for the t5 large model on the custom dataset
    # per_device_train_batch_size = 2,
    # gradient_accumulation_steps = 4,

    # this is not useful for CPU based training as hugging face trainer handles multi-core utilization automatically based on the system configuration
    dataloader_num_workers= USABLE_CPU_CORES,          # ****** for optimal use of CPU and not wasting GPU time [ this helps in loading the next batch of data into the VRAM ]

    # Print validation loss every epoch
    # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
    # eval_strategy="epoch",                                                                                                   # need to change this for every new experiment
    # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

    # Print and logs the training loss of the training data
    logging_strategy="steps",   
    logging_steps=logging_steps,                                   # ****** if using GPU = 100; if using CPU = 1 or 2 

    # saves model at the end of every epoch
    
    # save_strategy="epoch",  
    save_strategy="steps",
    save_steps=save_steps,
    
    save_total_limit=1,
    # save_total_limit=2,

    # report_to="none",  # Disable default logging
    
    logging_dir= save_path + "/logs",           # save logs to a directory
    report_to="tensorboard",                    # Reports to TensorBoard
    log_level='info',                           # Set logging level to 'info' to see the logs in the terminal
    # run this command in your terminal ~ tensorboard --logdir=./output_dir/runs
    # and open 'http://localhost:6006/' to monitor the logs [loss over the training]

    fp16=False                                   # ***** Mixed precision for faster training on A100; this won't work on CPU's
    
)

This is the current device:  cuda
Device updated to: cuda:0


In [21]:
# Logging in the Terminal
class LogCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            print(f"Step {state.global_step}: {logs}")



In [22]:
# Trainer setup


# --------------------------------------------------------------------------------------------------------------------
# # for training on the custom dataset
# trainer = Trainer(
#     model=model,
#     args=training_args,
    
#     train_dataset=tokenized_train_dataset,            # your test set
#     # eval_dataset=tokenized_val_dataset,               # your validation set

#     # train_dataset=tokenized_train_dataset.select(range(100)),    
#     # eval_dataset=tokenized_val_dataset.select(range(100)),
    
#     # data_collator=data_collator, ----------------------------------------------------------------
    
#     callbacks=[LogCallback()]                   # *** to print the logs in the terminal
# )


# --------------------------------------------------------------------------------------------------------------------
# for training on the 5 different datasets mixed together and loaded from the tokenized parquet files
# trainer = Trainer(
#     model=model,
#     args=training_args,
    
#     train_dataset=tokenized_train_dataset,            # your test set
#     eval_dataset=tokenized_val_dataset,               # your validation set

#     # train_dataset=tokenized_train_dataset.select(range(100)),    
#     # eval_dataset=tokenized_val_dataset.select(range(100)),
    
#     callbacks=[LogCallback()]                   # *** to print the logs in the terminal
# )


# --------------------------------------------------------------------------------------------------------------------
# for training on the 5 different datasets mixed by proportions specified for each of the batch
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset= None,                     # overridden
    # eval_dataset= tokenized_val_dataset,
    callbacks=[LogCallback()]
)

# Override the train dataloader
trainer.get_train_dataloader = lambda: ProportionalDataLoader(
    datasets_dict,
    proportions,
    training_args.per_device_train_batch_size
)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [23]:
# Training the Model

# Train
trainer.train()


# Save the model
trainer.save_model(save_path)                           

tokenizer.save_pretrained(save_path)                    
        


***** Running training *****
  Num examples = 320
  Num Epochs = 20
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 20,000,000,000,000
  Number of trainable parameters = 247,577,856
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
10,45.0949
20,39.2034
30,33.9411
40,30.4721
50,27.6411
60,25.1332
70,22.8238
80,20.8494
90,18.4301
100,15.1496


Step 10: {'loss': 45.0949, 'grad_norm': 403.60491943359375, 'learning_rate': 9e-06, 'epoch': 1e-11}
Step 20: {'loss': 39.2034, 'grad_norm': 969.8291625976562, 'learning_rate': 9.999999999995501e-06, 'epoch': 2e-11}
Step 30: {'loss': 33.9411, 'grad_norm': 236.54627990722656, 'learning_rate': 9.9999999999905e-06, 'epoch': 3e-11}
Step 40: {'loss': 30.4721, 'grad_norm': 280.5981750488281, 'learning_rate': 9.999999999985501e-06, 'epoch': 4e-11}
Step 50: {'loss': 27.6411, 'grad_norm': 261.7852478027344, 'learning_rate': 9.9999999999805e-06, 'epoch': 5e-11}
Step 60: {'loss': 25.1332, 'grad_norm': 165.44955444335938, 'learning_rate': 9.999999999975501e-06, 'epoch': 6e-11}
Step 70: {'loss': 22.8238, 'grad_norm': 143.09695434570312, 'learning_rate': 9.9999999999705e-06, 'epoch': 7e-11}
Step 80: {'loss': 20.8494, 'grad_norm': 150.2639923095703, 'learning_rate': 9.999999999965501e-06, 'epoch': 8e-11}
Step 90: {'loss': 18.4301, 'grad_norm': 156.84312438964844, 'learning_rate': 9.999999999960502e-06

Saving model checkpoint to ./T5flanbase_Question_Generation/checkpoint-100
Configuration saved in ./T5flanbase_Question_Generation/checkpoint-100/config.json
Configuration saved in ./T5flanbase_Question_Generation/checkpoint-100/generation_config.json


Step 100: {'loss': 15.1496, 'grad_norm': 240.167236328125, 'learning_rate': 9.999999999955501e-06, 'epoch': 1e-10}


Model weights saved in ./T5flanbase_Question_Generation/checkpoint-100/model.safetensors


Step 110: {'loss': 12.2796, 'grad_norm': 74.86565399169922, 'learning_rate': 9.9999999999505e-06, 'epoch': 1.1e-10}
Step 120: {'loss': 8.7983, 'grad_norm': 183.2331085205078, 'learning_rate': 9.999999999945501e-06, 'epoch': 1.2e-10}
Step 130: {'loss': 6.2523, 'grad_norm': 38.465328216552734, 'learning_rate': 9.9999999999405e-06, 'epoch': 1.3e-10}
Step 140: {'loss': 5.2658, 'grad_norm': 18.886783599853516, 'learning_rate': 9.999999999935501e-06, 'epoch': 1.4e-10}
Step 150: {'loss': 4.7481, 'grad_norm': 13.156275749206543, 'learning_rate': 9.9999999999305e-06, 'epoch': 1.5e-10}
Step 160: {'loss': 4.4888, 'grad_norm': 12.477185249328613, 'learning_rate': 9.999999999925501e-06, 'epoch': 1.6e-10}
Step 170: {'loss': 4.3623, 'grad_norm': 15.485424995422363, 'learning_rate': 9.999999999920502e-06, 'epoch': 1.7e-10}
Step 180: {'loss': 4.1832, 'grad_norm': 20.9342098236084, 'learning_rate': 9.9999999999155e-06, 'epoch': 1.8e-10}
Step 190: {'loss': 4.0388, 'grad_norm': 21.470834732055664, 'learni

Saving model checkpoint to ./T5flanbase_Question_Generation/checkpoint-200
Configuration saved in ./T5flanbase_Question_Generation/checkpoint-200/config.json
Configuration saved in ./T5flanbase_Question_Generation/checkpoint-200/generation_config.json


Step 200: {'loss': 3.8337, 'grad_norm': 19.889909744262695, 'learning_rate': 9.9999999999055e-06, 'epoch': 2e-10}


Model weights saved in ./T5flanbase_Question_Generation/checkpoint-200/model.safetensors
Deleting older checkpoint [T5flanbase_Question_Generation/checkpoint-100] due to args.save_total_limit


Step 210: {'loss': 3.657, 'grad_norm': 21.905519485473633, 'learning_rate': 9.9999999999005e-06, 'epoch': 2.1e-10}
Step 220: {'loss': 3.4409, 'grad_norm': 23.796180725097656, 'learning_rate': 9.9999999998955e-06, 'epoch': 2.2e-10}
Step 230: {'loss': 3.1994, 'grad_norm': 23.600038528442383, 'learning_rate': 9.9999999998905e-06, 'epoch': 2.3e-10}
Step 240: {'loss': 3.0095, 'grad_norm': 22.47574234008789, 'learning_rate': 9.9999999998855e-06, 'epoch': 2.4e-10}
Step 250: {'loss': 2.8378, 'grad_norm': 28.07978630065918, 'learning_rate': 9.999999999880501e-06, 'epoch': 2.5e-10}
Step 260: {'loss': 2.7066, 'grad_norm': 22.416032791137695, 'learning_rate': 9.9999999998755e-06, 'epoch': 2.6e-10}
Step 270: {'loss': 2.4256, 'grad_norm': 27.663860321044922, 'learning_rate': 9.999999999870501e-06, 'epoch': 2.7e-10}
Step 280: {'loss': 2.2848, 'grad_norm': 22.29022979736328, 'learning_rate': 9.9999999998655e-06, 'epoch': 2.8e-10}
Step 290: {'loss': 2.061, 'grad_norm': 23.251192092895508, 'learning_rat

Saving model checkpoint to ./T5flanbase_Question_Generation/checkpoint-300
Configuration saved in ./T5flanbase_Question_Generation/checkpoint-300/config.json
Configuration saved in ./T5flanbase_Question_Generation/checkpoint-300/generation_config.json


Step 300: {'loss': 1.8604, 'grad_norm': 20.980995178222656, 'learning_rate': 9.9999999998555e-06, 'epoch': 3e-10}


Model weights saved in ./T5flanbase_Question_Generation/checkpoint-300/model.safetensors
Deleting older checkpoint [T5flanbase_Question_Generation/checkpoint-200] due to args.save_total_limit


Step 310: {'loss': 1.6944, 'grad_norm': 18.09907341003418, 'learning_rate': 9.999999999850501e-06, 'epoch': 3.1e-10}
Step 320: {'loss': 1.4756, 'grad_norm': 17.670705795288086, 'learning_rate': 9.9999999998455e-06, 'epoch': 3.2e-10}
Step 330: {'loss': 1.3718, 'grad_norm': 17.549856185913086, 'learning_rate': 9.999999999840501e-06, 'epoch': 3.3e-10}
Step 340: {'loss': 1.2147, 'grad_norm': 16.011398315429688, 'learning_rate': 9.999999999835502e-06, 'epoch': 3.4e-10}
Step 350: {'loss': 1.0643, 'grad_norm': 12.636578559875488, 'learning_rate': 9.999999999830501e-06, 'epoch': 3.5e-10}
Step 360: {'loss': 0.9452, 'grad_norm': 11.420974731445312, 'learning_rate': 9.9999999998255e-06, 'epoch': 3.6e-10}
Step 370: {'loss': 0.8475, 'grad_norm': 11.727056503295898, 'learning_rate': 9.999999999820501e-06, 'epoch': 3.7e-10}
Step 380: {'loss': 0.7415, 'grad_norm': 8.582765579223633, 'learning_rate': 9.9999999998155e-06, 'epoch': 3.8e-10}
Step 390: {'loss': 0.6487, 'grad_norm': 8.17969799041748, 'learn

Saving model checkpoint to ./T5flanbase_Question_Generation/checkpoint-400
Configuration saved in ./T5flanbase_Question_Generation/checkpoint-400/config.json
Configuration saved in ./T5flanbase_Question_Generation/checkpoint-400/generation_config.json


Step 400: {'loss': 0.5938, 'grad_norm': 16.61419677734375, 'learning_rate': 9.9999999998055e-06, 'epoch': 4e-10}


Model weights saved in ./T5flanbase_Question_Generation/checkpoint-400/model.safetensors
Deleting older checkpoint [T5flanbase_Question_Generation/checkpoint-300] due to args.save_total_limit


Step 410: {'loss': 0.5347, 'grad_norm': 6.523571014404297, 'learning_rate': 9.9999999998005e-06, 'epoch': 4.1e-10}
Step 420: {'loss': 0.473, 'grad_norm': 5.449945449829102, 'learning_rate': 9.999999999795502e-06, 'epoch': 4.2e-10}
Step 430: {'loss': 0.4156, 'grad_norm': 5.68875789642334, 'learning_rate': 9.9999999997905e-06, 'epoch': 4.3e-10}
Step 440: {'loss': 0.392, 'grad_norm': 4.312360763549805, 'learning_rate': 9.999999999785502e-06, 'epoch': 4.4e-10}
Step 450: {'loss': 0.353, 'grad_norm': 3.449847936630249, 'learning_rate': 9.9999999997805e-06, 'epoch': 4.5e-10}
Step 460: {'loss': 0.3208, 'grad_norm': 2.99045991897583, 'learning_rate': 9.9999999997755e-06, 'epoch': 4.6e-10}
Step 470: {'loss': 0.2947, 'grad_norm': 2.332852602005005, 'learning_rate': 9.9999999997705e-06, 'epoch': 4.7e-10}
Step 480: {'loss': 0.2853, 'grad_norm': 3.0802416801452637, 'learning_rate': 9.999999999765501e-06, 'epoch': 4.8e-10}
Step 490: {'loss': 0.2756, 'grad_norm': 1.5654637813568115, 'learning_rate': 9

Saving model checkpoint to ./T5flanbase_Question_Generation/checkpoint-500
Configuration saved in ./T5flanbase_Question_Generation/checkpoint-500/config.json
Configuration saved in ./T5flanbase_Question_Generation/checkpoint-500/generation_config.json


Step 500: {'loss': 0.2595, 'grad_norm': 1.8341302871704102, 'learning_rate': 9.999999999755501e-06, 'epoch': 5e-10}


Model weights saved in ./T5flanbase_Question_Generation/checkpoint-500/model.safetensors
Deleting older checkpoint [T5flanbase_Question_Generation/checkpoint-400] due to args.save_total_limit


Step 510: {'loss': 0.2209, 'grad_norm': 1.5108212232589722, 'learning_rate': 9.9999999997505e-06, 'epoch': 5.1e-10}
Step 520: {'loss': 0.2256, 'grad_norm': 1.2204900979995728, 'learning_rate': 9.999999999745501e-06, 'epoch': 5.2e-10}
Step 530: {'loss': 0.2035, 'grad_norm': 1.5909045934677124, 'learning_rate': 9.9999999997405e-06, 'epoch': 5.3e-10}
Step 540: {'loss': 0.1921, 'grad_norm': 0.980563759803772, 'learning_rate': 9.999999999735501e-06, 'epoch': 5.4e-10}
Step 550: {'loss': 0.1833, 'grad_norm': 0.9370376467704773, 'learning_rate': 9.9999999997305e-06, 'epoch': 5.5e-10}
Step 560: {'loss': 0.1636, 'grad_norm': 0.7660162448883057, 'learning_rate': 9.999999999725501e-06, 'epoch': 5.6e-10}
Step 570: {'loss': 0.1668, 'grad_norm': 0.9291651248931885, 'learning_rate': 9.9999999997205e-06, 'epoch': 5.7e-10}
Step 580: {'loss': 0.1614, 'grad_norm': 0.6746436953544617, 'learning_rate': 9.999999999715501e-06, 'epoch': 5.8e-10}
Step 590: {'loss': 0.157, 'grad_norm': 0.6578576564788818, 'learn

Saving model checkpoint to ./T5flanbase_Question_Generation/checkpoint-600
Configuration saved in ./T5flanbase_Question_Generation/checkpoint-600/config.json
Configuration saved in ./T5flanbase_Question_Generation/checkpoint-600/generation_config.json


Step 600: {'loss': 0.1395, 'grad_norm': 0.6668218970298767, 'learning_rate': 9.999999999705501e-06, 'epoch': 6e-10}


Model weights saved in ./T5flanbase_Question_Generation/checkpoint-600/model.safetensors
Deleting older checkpoint [T5flanbase_Question_Generation/checkpoint-500] due to args.save_total_limit


Step 610: {'loss': 0.1383, 'grad_norm': 0.6324518322944641, 'learning_rate': 9.9999999997005e-06, 'epoch': 6.1e-10}
Step 620: {'loss': 0.1484, 'grad_norm': 0.6359052062034607, 'learning_rate': 9.999999999695501e-06, 'epoch': 6.2e-10}
Step 630: {'loss': 0.1287, 'grad_norm': 0.5455908179283142, 'learning_rate': 9.9999999996905e-06, 'epoch': 6.3e-10}
Step 640: {'loss': 0.1363, 'grad_norm': 0.46710920333862305, 'learning_rate': 9.999999999685501e-06, 'epoch': 6.4e-10}
Step 650: {'loss': 0.1199, 'grad_norm': 0.47697532176971436, 'learning_rate': 9.9999999996805e-06, 'epoch': 6.5e-10}
Step 660: {'loss': 0.1223, 'grad_norm': 0.5000445246696472, 'learning_rate': 9.9999999996755e-06, 'epoch': 6.6e-10}
Step 670: {'loss': 0.1235, 'grad_norm': 0.46192026138305664, 'learning_rate': 9.999999999670502e-06, 'epoch': 6.7e-10}
Step 680: {'loss': 0.1111, 'grad_norm': 0.42847028374671936, 'learning_rate': 9.9999999996655e-06, 'epoch': 6.8e-10}
Step 690: {'loss': 0.1236, 'grad_norm': 0.41707828640937805, '

Saving model checkpoint to ./T5flanbase_Question_Generation/checkpoint-700
Configuration saved in ./T5flanbase_Question_Generation/checkpoint-700/config.json
Configuration saved in ./T5flanbase_Question_Generation/checkpoint-700/generation_config.json


Step 700: {'loss': 0.1192, 'grad_norm': 0.4100814759731293, 'learning_rate': 9.9999999996555e-06, 'epoch': 7e-10}


Model weights saved in ./T5flanbase_Question_Generation/checkpoint-700/model.safetensors
Deleting older checkpoint [T5flanbase_Question_Generation/checkpoint-600] due to args.save_total_limit


Step 710: {'loss': 0.124, 'grad_norm': 0.48507651686668396, 'learning_rate': 9.9999999996505e-06, 'epoch': 7.1e-10}
Step 720: {'loss': 0.1028, 'grad_norm': 0.4859665334224701, 'learning_rate': 9.9999999996455e-06, 'epoch': 7.2e-10}
Step 730: {'loss': 0.114, 'grad_norm': 0.45177146792411804, 'learning_rate': 9.999999999640501e-06, 'epoch': 7.3e-10}
Step 740: {'loss': 0.1075, 'grad_norm': 0.44294846057891846, 'learning_rate': 9.9999999996355e-06, 'epoch': 7.4e-10}
Step 750: {'loss': 0.1067, 'grad_norm': 0.5272119045257568, 'learning_rate': 9.999999999630501e-06, 'epoch': 7.5e-10}
Step 760: {'loss': 0.0991, 'grad_norm': 0.44872206449508667, 'learning_rate': 9.9999999996255e-06, 'epoch': 7.6e-10}
Step 770: {'loss': 0.1121, 'grad_norm': 0.7051324844360352, 'learning_rate': 9.999999999620501e-06, 'epoch': 7.7e-10}
Step 780: {'loss': 0.1033, 'grad_norm': 0.7304367423057556, 'learning_rate': 9.9999999996155e-06, 'epoch': 7.8e-10}
Step 790: {'loss': 0.0962, 'grad_norm': 0.3796403706073761, 'lea

Saving model checkpoint to ./T5flanbase_Question_Generation/checkpoint-800
Configuration saved in ./T5flanbase_Question_Generation/checkpoint-800/config.json
Configuration saved in ./T5flanbase_Question_Generation/checkpoint-800/generation_config.json


Step 800: {'loss': 0.0957, 'grad_norm': 0.40661147236824036, 'learning_rate': 9.9999999996055e-06, 'epoch': 8e-10}


Model weights saved in ./T5flanbase_Question_Generation/checkpoint-800/model.safetensors
Deleting older checkpoint [T5flanbase_Question_Generation/checkpoint-700] due to args.save_total_limit


Step 810: {'loss': 0.0871, 'grad_norm': 0.43349704146385193, 'learning_rate': 9.999999999600501e-06, 'epoch': 8.1e-10}
Step 820: {'loss': 0.0949, 'grad_norm': 0.419904887676239, 'learning_rate': 9.9999999995955e-06, 'epoch': 8.2e-10}
Step 830: {'loss': 0.0984, 'grad_norm': 0.3862306773662567, 'learning_rate': 9.999999999590501e-06, 'epoch': 8.3e-10}
Step 840: {'loss': 0.0929, 'grad_norm': 0.28797274827957153, 'learning_rate': 9.999999999585502e-06, 'epoch': 8.4e-10}
Step 850: {'loss': 0.0938, 'grad_norm': 0.48715779185295105, 'learning_rate': 9.999999999580501e-06, 'epoch': 8.5e-10}
Step 860: {'loss': 0.0984, 'grad_norm': 0.36179718375205994, 'learning_rate': 9.9999999995755e-06, 'epoch': 8.6e-10}
Step 870: {'loss': 0.0873, 'grad_norm': 0.374920129776001, 'learning_rate': 9.999999999570501e-06, 'epoch': 8.7e-10}
Step 880: {'loss': 0.0872, 'grad_norm': 0.35798391699790955, 'learning_rate': 9.9999999995655e-06, 'epoch': 8.8e-10}
Step 890: {'loss': 0.0832, 'grad_norm': 0.27132055163383484

Saving model checkpoint to ./T5flanbase_Question_Generation/checkpoint-900
Configuration saved in ./T5flanbase_Question_Generation/checkpoint-900/config.json
Configuration saved in ./T5flanbase_Question_Generation/checkpoint-900/generation_config.json


Step 900: {'loss': 0.0859, 'grad_norm': 0.27623653411865234, 'learning_rate': 9.999999999555502e-06, 'epoch': 9e-10}


Model weights saved in ./T5flanbase_Question_Generation/checkpoint-900/model.safetensors
Deleting older checkpoint [T5flanbase_Question_Generation/checkpoint-800] due to args.save_total_limit


Step 910: {'loss': 0.086, 'grad_norm': 0.3675633370876312, 'learning_rate': 9.9999999995505e-06, 'epoch': 9.1e-10}
Step 920: {'loss': 0.0872, 'grad_norm': 0.4046382009983063, 'learning_rate': 9.999999999545502e-06, 'epoch': 9.2e-10}
Step 930: {'loss': 0.0769, 'grad_norm': 0.2977201044559479, 'learning_rate': 9.9999999995405e-06, 'epoch': 9.3e-10}
Step 940: {'loss': 0.0813, 'grad_norm': 0.39227205514907837, 'learning_rate': 9.999999999535502e-06, 'epoch': 9.4e-10}
Step 950: {'loss': 0.0724, 'grad_norm': 0.3691612780094147, 'learning_rate': 9.9999999995305e-06, 'epoch': 9.5e-10}
Step 960: {'loss': 0.0822, 'grad_norm': 0.3463331460952759, 'learning_rate': 9.9999999995255e-06, 'epoch': 9.6e-10}
Step 970: {'loss': 0.078, 'grad_norm': 0.34928029775619507, 'learning_rate': 9.9999999995205e-06, 'epoch': 9.7e-10}
Step 980: {'loss': 0.0723, 'grad_norm': 0.39660942554473877, 'learning_rate': 9.999999999515501e-06, 'epoch': 9.8e-10}
Step 990: {'loss': 0.0796, 'grad_norm': 0.35483866930007935, 'lea

Saving model checkpoint to ./T5flanbase_Question_Generation/checkpoint-1000
Configuration saved in ./T5flanbase_Question_Generation/checkpoint-1000/config.json
Configuration saved in ./T5flanbase_Question_Generation/checkpoint-1000/generation_config.json


Step 1000: {'loss': 0.0734, 'grad_norm': 0.2984829246997833, 'learning_rate': 9.999999999505501e-06, 'epoch': 1e-09}


Model weights saved in ./T5flanbase_Question_Generation/checkpoint-1000/model.safetensors
Deleting older checkpoint [T5flanbase_Question_Generation/checkpoint-900] due to args.save_total_limit


Step 1010: {'loss': 0.0747, 'grad_norm': 0.5894613265991211, 'learning_rate': 9.9999999995005e-06, 'epoch': 1.01e-09}
Step 1020: {'loss': 0.0742, 'grad_norm': 0.4583534300327301, 'learning_rate': 9.999999999495501e-06, 'epoch': 1.02e-09}


KeyboardInterrupt: 

In [None]:
# Testing the model performance

In [27]:

# the finetuned model name
new_model = "T5flanbase_Question_Generation"

# Load model and tokenizer
tokenizer = T5Tokenizer.from_pretrained(new_model)
model = T5ForConditionalGeneration.from_pretrained(new_model)

loading file spiece.model from cache at /home/aalla4/hub/models--google--flan-t5-base/snapshots/7bcac572ce56db69c1ea7c8af255c5d7c9672fc2/spiece.model
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /home/aalla4/hub/models--google--flan-t5-base/snapshots/7bcac572ce56db69c1ea7c8af255c5d7c9672fc2/special_tokens_map.json
loading file tokenizer_config.json from cache at /home/aalla4/hub/models--google--flan-t5-base/snapshots/7bcac572ce56db69c1ea7c8af255c5d7c9672fc2/tokenizer_config.json
loading file tokenizer.json from cache at /home/aalla4/hub/models--google--flan-t5-base/snapshots/7bcac572ce56db69c1ea7c8af255c5d7c9672fc2/tokenizer.json
loading file chat_template.jinja from cache at None
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Model config T5Config {
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "dense_act_fn": "relu",
  "dropou

OSError: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory T5flanbase_Question_Generation.

In [28]:
# Running the finetuned question generation model with a sample context


def get_question(tag, difficulty, context, answer="", num_questions=3, max_length=150):
    """
    Generate questions using the fine-tuned T5 model.
    
    Parameters:
    - tag: Type of question (e.g., "short answer", "multiple choice question", "true or false question")
    - difficulty: "easy", "medium", "hard"
    - context: Supporting context or passage
    - answer: Optional — if you want targeted question generation
    - num_questions: Number of diverse questions to generate
    - max_length: Max token length of generated output
    
    Returns:
    - List of generated questions as strings
    """
    # Format input text based on whether answer is provided
    answer_part = f"[{answer}]" if answer else ""
    input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>{answer_part} {context}"

    # Tokenize
    features = tokenizer([input_text], return_tensors='pt')

    # Generate questions
    output = model.generate(
        input_ids=features['input_ids'],
        attention_mask=features['attention_mask'],
        max_length=max_length,
        num_return_sequences=num_questions,
        do_sample=True,
        top_p=0.95,
        top_k=50
    )

    # Decode generated questions
    return [tokenizer.decode(out, skip_special_tokens=True) for out in output]

In [29]:
# short answer question
print(get_question(
    tag="short answer",
    difficulty="medium",
    context="Cadmium chloride is a hygroscopic solid that is highly soluble in water and slightly soluble in alcohol. Ethanol, also called alcohol, ethyl alcohol, and drinking alcohol, is a compound and simple alcohol with the chemical formula C2H5OH."
))

RuntimeError: Expected all tensors to be on the same device, but got index is on cpu, different from other tensors on cuda:0 (when checking argument in method wrapper_CUDA__index_select)

In [None]:
# true or false question
print(get_question(
    tag="true or false question",
    difficulty="medium",
    context="Ethanol, also known as drinking alcohol, is a clear, colorless liquid that is flammable and is produced by the fermentation of sugars by yeast. It has the chemical formula C2H5OH and is used both recreationally and industrially."
))

print(get_question(
    tag="true or false question",
    difficulty="easy",
    context="Ethanol, also known as drinking alcohol."
))

In [None]:
# multiple choice question
print(get_question(
    tag="multiple choice question",
    difficulty="medium",
    context="Ethanol is used as a recreational beverage, as a solvent, and as a fuel additive. It is a volatile, flammable, colorless liquid with a slight characteristic odor, and its chemical formula is C2H5OH."
))

In [None]:
# short answer question
print(get_question(
    tag="short answer",
    difficulty="medium",
    context="A computer is an electronic device that takes in data (input), processes it using hardware and software, stores it, and then produces information (output). At its core, it manipulates binary code (1s and 0s) through transistors and logic gates, performing complex tasks incredibly fast, making it efficient for everything from simple calculations to running complex applications"
))