In [None]:
# Point the Hugging Face cache (HF_HOME) at the current working directory.
# Keep this cell before the `transformers` / `datasets` imports so the cache
# location is already set when those libraries are loaded.
import os

# Use the directory that the message below reports instead of a hard-coded,
# user-specific absolute path, so the notebook also runs on other machines.
HF_CACHE_DIR = os.getcwd()
os.environ['HF_HOME'] = HF_CACHE_DIR

print("Setting up the current working directory as the place where to host the transformers models downloaded from hugging face - " + HF_CACHE_DIR)

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, TrainerCallback
from datasets import load_dataset, Dataset, concatenate_datasets
import torch
import random
import json

# getting the CPU and GPU count

# os.cpu_count() reports logical cores (not physical ones) and returns None
# when the count cannot be determined, so fall back to a single core.
LOGICAL_CORES = os.cpu_count() or 1
print("Total number of logical cores = " + str(LOGICAL_CORES))

# Leave two cores free for the rest of the system, but always keep at least
# one: a zero or negative process count is not usable on small machines.
USABLE_CPU_CORES = max(1, LOGICAL_CORES - 2)    # YOU CAN CHANGE THIS ACCORDING TO THE CPU AVAILABILITIES

print("CUDA available: ", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")

In [None]:
# Load the model and tokenizer

# Local checkpoint directory to load from.
# NOTE(review): presumably an earlier fine-tuned version; confirm the directory exists before running.
model_name = "./T5base_Question_Generation_v6" 

# Output directory name for the next model version.
save_path = "./T5base_Question_Generation_v7"
# Parquet file that the tokenized training set is exported to at the end of preprocessing.
tokenized_trainset_save_path =  "./dataset/custom_dataset/tokenized_trainset.parquet"

# `tokenizer` is the object the preprocess_* functions call for both inputs and labels.
tokenizer = T5Tokenizer.from_pretrained(model_name, legacy=False)
model = T5ForConditionalGeneration.from_pretrained(model_name)

print(f"Max positional embeddings supported by model - {model_name}: ", model.config.n_positions)

# Fixed lengths (in tokens) that the preprocess_* functions pad/truncate to.
tokenizer_input_max_length = 512
tokenizer_label_max_length = 250

In [None]:
# Loading the dataset
# Function to load the data from a JSON file
import json

# One JSON file per question type; each one is read with load_json_file.
filepath_descriptive = "./dataset/custom_dataset/descriptive.json" 
filepath_mcq = "./dataset/custom_dataset/mcq.json" 
filepath_tf = "./dataset/custom_dataset/true_false.json" 

def load_json_file(file_path):
    """Read a JSON file and return its parsed content.

    Parameters:
    - file_path: Path of the JSON file to read.

    Returns:
    - The decoded JSON value; the dataset cells expect a list of dicts.

    Raises:
    - OSError if the file cannot be opened.
    - json.JSONDecodeError if the content is not valid JSON.
    """
    # Decode explicitly as UTF-8 so that non-ASCII text (e.g. typographic
    # quotes in the contexts) loads the same way regardless of the platform's
    # default locale encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)                    # This should be a list of dicts

        
# Read the three raw question sets (each file should hold a list of dicts).
descriptive_data, mcq_data, tf_data = (
    load_json_file(json_path)
    for json_path in (filepath_descriptive, filepath_mcq, filepath_tf)
)

# Wrap each list of dicts in a Hugging Face Dataset.
dataset_descriptive, dataset_mcq, dataset_tf = (
    Dataset.from_list(records)
    for records in (descriptive_data, mcq_data, tf_data)
)


In [None]:

# Preprocessing function to tokenize the input and target text
# Label value that the loss ignores; padded label positions are set to it so
# that padding does not contribute to the training loss.
LABEL_IGNORE_INDEX = -100


def _tokenize_pair(input_text, target_text):
    """Tokenize one (input, target) text pair into fixed-length model features.

    Both texts are padded/truncated to tokenizer_input_max_length and
    tokenizer_label_max_length respectively. Padding positions of the target
    are replaced by LABEL_IGNORE_INDEX; the input keeps its regular padding
    (the attention mask already marks it).

    Parameters:
    - input_text: Prompt fed to the encoder.
    - target_text: Text the model should generate.

    Returns:
    - The tokenizer output for the input, with an added "labels" entry.
    """
    model_inputs = tokenizer(input_text, max_length=tokenizer_input_max_length, padding="max_length", truncation=True)
    labels = tokenizer(target_text, max_length=tokenizer_label_max_length, padding="max_length", truncation=True)

    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        LABEL_IGNORE_INDEX if token_id == pad_id else token_id
        for token_id in labels["input_ids"]
    ]
    return model_inputs


def preprocess_descriptive(example):
    """Tokenize one descriptive (free-text answer) example.

    Input format:
        <extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>{context}
        e.g. <extra_id_97>short answer question <extra_id_98>easy <extra_id_99>Drinking enough water each day ...
    Target format:
        {question}
        e.g. List two ways drinking water benefits the human body.

    Parameters:
    - example: Dict with the keys "tag", "difficulty", "context" and "question".

    Returns:
    - Tokenized features including "labels".
    """
    tag = example["tag"]
    difficulty = example["difficulty"]
    context = example["context"]
    question = example["question"]

    input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>{context}"
    target_text = question

    return _tokenize_pair(input_text, target_text)


def preprocess_mcq(example):
    """Tokenize one multiple-choice example.

    Input format:
        <extra_id_97>{tag} <extra_id_98>medium <extra_id_99>{context}
        e.g. <extra_id_97>multiple choice question <extra_id_98>medium <extra_id_99>Rainforests are essential ...
    Target format:
        {question} [{correct option}] ({all options separated by "; "})
        e.g. Which of the following is a consequence of rainforest destruction? [C. Global warming] (A. Improved biodiversity; B. Carbon absorption; C. Global warming; D. Increased rainfall)

    Parameters:
    - example: Dict with the keys "difficulty", "context", "question",
      "options" (list of option texts) and "answer" (text of the correct option).

    Returns:
    - Tokenized features including "labels".

    Raises:
    - ValueError if "answer" is not one of "options".
    """
    # There has been a mistake in the dataset, so all MCQs are treated as
    # medium difficulty and the tag is read from the "difficulty" field.
    # NOTE(review): presumably that field holds the question type in mcq.json - confirm.
    tag = example["difficulty"]
    difficulty = "medium"
    context = example["context"]
    question = example["question"]
    options = example["options"]
    answer = example["answer"]

    # Prepare formatted options (only the first four options receive a label)
    option_labels = ['A', 'B', 'C', 'D']
    formatted_options = [f"{label}. {opt}" for label, opt in zip(option_labels, options)]
    correct_index = options.index(answer)
    correct_option = f"{option_labels[correct_index]}. {answer}"

    # Prepare raw input and label strings
    input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>{context}"
    target_text = f"{question} [{correct_option}] ({'; '.join(formatted_options)})"

    return _tokenize_pair(input_text, target_text)


def preprocess_tf(example):
    """Tokenize one true/false example.

    Input format:
        <extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>{context}
        e.g. <extra_id_97>true or false question <extra_id_98>easy <extra_id_99>Photosynthesis is the process ...
    Target format:
        {statement} [{answer in lower case}]
        e.g. Photosynthesis releases oxygen as a byproduct. [true]

    Parameters:
    - example: Dict with the keys "tag", "difficulty", "context", "question"
      and "answer" (a string).

    Returns:
    - Tokenized features including "labels".
    """
    tag = example["tag"]
    difficulty = example["difficulty"]
    context = example["context"]
    question = example["question"]
    answer = example["answer"].lower()  # ensure it's "true" or "false"

    # Format the input and target
    input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>{context}"
    target_text = f"{question} [{answer}]"

    return _tokenize_pair(input_text, target_text)



In [None]:

# Number of processes handed to Dataset.map.
num_proc = 1

# Tokenize every question type with its own preprocessing function, one
# example at a time, and drop the raw columns so only model features remain.
processed_descriptive_dataset, processed_mcq_dataset, processed_tf_dataset = (
    raw_split.map(
        preprocess_fn,
        batched=False,
        num_proc=num_proc,
        remove_columns=raw_split.column_names,
    )
    for raw_split, preprocess_fn in (
        (dataset_descriptive, preprocess_descriptive),
        (dataset_mcq, preprocess_mcq),
        (dataset_tf, preprocess_tf),
    )
)


In [None]:
# Every label sequence is padded to tokenizer_label_max_length, so its raw
# length is always that constant. Count only the real tokens instead, which
# shows the actual spread of label lengths.
# Padding is either the tokenizer's pad id or -100 (if it was already masked for the loss).
ignored_label_ids = {tokenizer.pad_token_id, -100}
label_lengths = [
    sum(1 for token_id in example['labels'] if token_id not in ignored_label_ids)
    for example in processed_descriptive_dataset
]
print(f"Min label length: {min(label_lengths)}, Max label length: {max(label_lengths)}")

# Labels that fill the whole budget have most likely been truncated.
count_250 = sum(1 for length in label_lengths if length == tokenizer_label_max_length)
print(f"Number of examples with label length {tokenizer_label_max_length}: {count_250}")

In [None]:
# print(processed_descriptive_dataset)
# print(processed_mcq_dataset)
# print(processed_tf_dataset)

# Dataset({
#     features: ['input_ids', 'attention_mask', 'labels'],
#     num_rows: 5
# })

# Merge the three tokenized question types into one training set and shuffle
# it with a fixed seed so the example order is reproducible.
tokenized_train_dataset = concatenate_datasets([
    processed_descriptive_dataset,
    processed_mcq_dataset,
    processed_tf_dataset
]).shuffle(seed=92)

In [None]:

# NOTE(review): same value as the assignment in the model-setup cell; this line repeats it.
tokenized_trainset_save_path = "./dataset/custom_dataset/tokenized_trainset.parquet"

# Export the tokenized training set to a Parquet file.
tokenized_train_dataset.to_parquet(tokenized_trainset_save_path)

In [1]:
# Point the Hugging Face cache (HF_HOME) at the current working directory.
# Keep this cell before the `transformers` / `datasets` imports so the cache
# location is already set when those libraries are loaded.
import os

# Use the directory that the message below reports instead of a hard-coded,
# user-specific absolute path, so the notebook also runs on other machines.
HF_CACHE_DIR = os.getcwd()
os.environ['HF_HOME'] = HF_CACHE_DIR

print("Setting up the current working directory as the place where to host the transformers models downloaded from hugging face - " + HF_CACHE_DIR)

Setting up the current working directory as the place where to host the transformers models downloaded from hugging face - /home/aalla4


In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, TrainerCallback
from datasets import load_dataset, Dataset, concatenate_datasets
import torch
import random

In [3]:
# getting the CPU and GPU count

# os.cpu_count() reports logical cores (not physical ones) and returns None
# when the count cannot be determined, so fall back to a single core.
LOGICAL_CORES = os.cpu_count() or 1
print("Total number of logical cores = " + str(LOGICAL_CORES))

# Leave one core free for the main process, but always keep at least one
# worker so the value stays usable on single-core machines.
USABLE_CPU_CORES = max(1, LOGICAL_CORES - 1)    # YOU CAN CHANGE THIS ACCORDING TO THE CPU AVAILABILITIES

print("CUDA available: ", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")

Total number of logical cores = 48
CUDA available:  True
GPU name: NVIDIA A100-SXM4-80GB


In [4]:
# Load the model and tokenizer

# Base checkpoint that is fine-tuned in this notebook.
model_name = "t5-base"

# Directory that receives the checkpoints, the logs and the final model/tokenizer.
save_path = "./T5base_Question_Generation_v0_custom_dataset"
# Parquet file with the already tokenized training set.
tokenized_trainset_path = "./dataset/custom_dataset/tokenized_trainset.parquet"

tokenizer = T5Tokenizer.from_pretrained(model_name, legacy=False)
model = T5ForConditionalGeneration.from_pretrained(model_name)

print(f"Max positional embeddings supported by model - {model_name}: ", model.config.n_positions)

# NOTE(review): not used by the training cells; the dataset loaded from Parquet is already tokenized.
tokenizer_input_max_length = 512
tokenizer_label_max_length = 250

Max positional embeddings supported by model - t5-base:  512


In [5]:
# Load the Tokenized Dataset if Preprocessing is already done

# Load from Parquet
tokenized_train_dataset = Dataset.from_parquet(tokenized_trainset_path)

In [6]:
# Show the dataset summary (column names and number of rows).
print(tokenized_train_dataset)

# Expected structure (num_rows depends on the dataset that was tokenized):
# Dataset({
#     features: ['input_ids', 'attention_mask', 'labels'],
#     num_rows: <number of training examples>
# })


Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 328
})


In [7]:

# print(tokenized_train_dataset)  # Should be [batch_size, 512] or similar

# print(len(tokenized_train_dataset[0]['input_ids']))
# print(len(tokenized_train_dataset[0]['labels']))


# label_lengths = [len(example['labels']) for example in tokenized_train_dataset]
# print(f"Min label length: {min(label_lengths)}, Max label length: {max(label_lengths)}")

# # Count how many have length exactly 250
# count_250 = sum(1 for length in label_lengths if length == 250)
# print(f"Number of examples with label length 250: {count_250}")

# Check the length of a few tokenized examples
# for i in range(5):
#     print(len(tokenized_train_dataset[i]['input_ids']))

In [8]:
# Defining the training args

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print("This is the current device: ", device)
print("Device updated to:", model.device)


training_args = TrainingArguments(
    output_dir=save_path,              # directory where the checkpoints and other training outputs are saved during the training process

    learning_rate=1e-5,
    warmup_steps=10,
    num_train_epochs=7,
    weight_decay=1e-3,

    # total train batch size = per_device_train_batch_size * number of devices * gradient_accumulation_steps
    per_device_train_batch_size=3,                       # ****** if using GPU's = 16; if using CPU's = 1 or 2
    # gradient_accumulation_steps=2,                     # Increase this if you need to simulate larger batch sizes, without running into 'Out of Memory' errors when memory is limited
    # per_device_eval_batch_size=8,

    # number of worker processes the DataLoader uses to prepare the next batches
    # NOTE(review): this is every usable core (47 on the recorded 48-core run) for a 328-row, already tokenized dataset - a small value is probably enough
    dataloader_num_workers= USABLE_CPU_CORES,          # ****** for optimal use of CPU and not wasting GPU time

    # Print validation loss every epoch
    # eval_strategy="epoch",

    # Print and log the training loss
    logging_strategy="steps",
    logging_steps=5,                                   # ****** if using GPU = 100; if using CPU = 1 or 2

    # saves a checkpoint at the end of every epoch
    save_strategy="epoch",
    # save_total_limit=2,
    save_total_limit=1,                         # older checkpoints are deleted, only the most recent one is kept

    # report_to="none",  # Disable default logging

    logging_dir= save_path + "/logs",           # save logs to a directory
    report_to="tensorboard",                    # Reports to TensorBoard
    log_level='info',                           # Set logging level to 'info' to see the logs in the terminal
    # run this command in your terminal ~ tensorboard --logdir=<save_path>/logs
    # and open 'http://localhost:6006/' to monitor the logs [loss over the training]

    fp16=False                                   # ***** Mixed precision is disabled; enabling it requires a GPU (it won't work on CPU's)

)

This is the current device:  cuda
Device updated to: cuda:0


In [9]:
# Logging in the Terminal
class LogCallback(TrainerCallback):
    """Trainer callback that echoes every log record to standard output."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Print the current global step together with the logged values.

        Nothing is printed when `logs` is None.
        """
        if logs is None:
            return
        print(f"Step {state.global_step}: {logs}")

In [10]:
# Trainer setup (training only: no evaluation dataset is passed)
trainer = Trainer(
    model=model,
    args=training_args,
    
    train_dataset=tokenized_train_dataset,            # your training set
    # eval_dataset=tokenized_val_dataset,               # your validation set

    # Smaller subsets for a quick trial run:
    # train_dataset=tokenized_train_dataset.select(range(100)),    
    # eval_dataset=tokenized_val_dataset.select(range(100)),
    
    callbacks=[LogCallback()]                   # *** to print the logs in the terminal
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [11]:
# Training the Model

# Train
trainer.train()

# Save the final model weights and configuration to save_path
trainer.save_model(save_path)                           

# Save the tokenizer files next to the model so both can be loaded from save_path
tokenizer.save_pretrained(save_path)     

***** Running training *****
  Num examples = 328
  Num Epochs = 7
  Instantaneous batch size per device = 3
  Total train batch size (w. parallel, distributed & accumulation) = 3
  Gradient Accumulation steps = 1
  Total optimization steps = 770
  Number of trainable parameters = 222,903,552
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
5,19.6844
10,19.2324
15,16.7283
20,15.2924
25,13.6819
30,12.2475
35,10.8149
40,9.1736
45,8.4071
50,7.4858


Step 5: {'loss': 19.6844, 'grad_norm': 123.0082778930664, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.045454545454545456}
Step 10: {'loss': 19.2324, 'grad_norm': 120.02975463867188, 'learning_rate': 9e-06, 'epoch': 0.09090909090909091}
Step 15: {'loss': 16.7283, 'grad_norm': 97.46022033691406, 'learning_rate': 9.947368421052632e-06, 'epoch': 0.13636363636363635}
Step 20: {'loss': 15.2924, 'grad_norm': 206.221435546875, 'learning_rate': 9.881578947368422e-06, 'epoch': 0.18181818181818182}
Step 25: {'loss': 13.6819, 'grad_norm': 67.22552490234375, 'learning_rate': 9.815789473684212e-06, 'epoch': 0.22727272727272727}
Step 30: {'loss': 12.2475, 'grad_norm': 70.75728607177734, 'learning_rate': 9.75e-06, 'epoch': 0.2727272727272727}
Step 35: {'loss': 10.8149, 'grad_norm': 50.66968536376953, 'learning_rate': 9.68421052631579e-06, 'epoch': 0.3181818181818182}
Step 40: {'loss': 9.1736, 'grad_norm': 55.1870231628418, 'learning_rate': 9.61842105263158e-06, 'epoch': 0.36363636363636365}
Ste

Saving model checkpoint to ./T5base_Question_Generation_v0_custom_dataset/checkpoint-110
Configuration saved in ./T5base_Question_Generation_v0_custom_dataset/checkpoint-110/config.json
Configuration saved in ./T5base_Question_Generation_v0_custom_dataset/checkpoint-110/generation_config.json


Step 110: {'loss': 0.5892, 'grad_norm': 3.8145999908447266, 'learning_rate': 8.697368421052633e-06, 'epoch': 1.0}


Model weights saved in ./T5base_Question_Generation_v0_custom_dataset/checkpoint-110/model.safetensors


Step 115: {'loss': 0.4605, 'grad_norm': 2.281625747680664, 'learning_rate': 8.631578947368422e-06, 'epoch': 1.0454545454545454}
Step 120: {'loss': 0.5103, 'grad_norm': 1.49209725856781, 'learning_rate': 8.565789473684213e-06, 'epoch': 1.0909090909090908}
Step 125: {'loss': 0.4734, 'grad_norm': 2.4539108276367188, 'learning_rate': 8.5e-06, 'epoch': 1.1363636363636362}
Step 130: {'loss': 0.4409, 'grad_norm': 1.1102465391159058, 'learning_rate': 8.43421052631579e-06, 'epoch': 1.1818181818181819}
Step 135: {'loss': 0.493, 'grad_norm': 1.6525719165802002, 'learning_rate': 8.36842105263158e-06, 'epoch': 1.2272727272727273}
Step 140: {'loss': 0.44, 'grad_norm': 1.2360838651657104, 'learning_rate': 8.302631578947369e-06, 'epoch': 1.2727272727272727}
Step 145: {'loss': 0.3651, 'grad_norm': 1.86359441280365, 'learning_rate': 8.236842105263158e-06, 'epoch': 1.3181818181818181}
Step 150: {'loss': 0.3832, 'grad_norm': 1.121274471282959, 'learning_rate': 8.171052631578949e-06, 'epoch': 1.36363636363

Saving model checkpoint to ./T5base_Question_Generation_v0_custom_dataset/checkpoint-220
Configuration saved in ./T5base_Question_Generation_v0_custom_dataset/checkpoint-220/config.json
Configuration saved in ./T5base_Question_Generation_v0_custom_dataset/checkpoint-220/generation_config.json


Step 220: {'loss': 0.2993, 'grad_norm': 1.1355339288711548, 'learning_rate': 7.25e-06, 'epoch': 2.0}


Model weights saved in ./T5base_Question_Generation_v0_custom_dataset/checkpoint-220/model.safetensors
Deleting older checkpoint [T5base_Question_Generation_v0_custom_dataset/checkpoint-110] due to args.save_total_limit


Step 225: {'loss': 0.3539, 'grad_norm': 1.1638354063034058, 'learning_rate': 7.18421052631579e-06, 'epoch': 2.0454545454545454}
Step 230: {'loss': 0.2982, 'grad_norm': 0.702418327331543, 'learning_rate': 7.11842105263158e-06, 'epoch': 2.090909090909091}
Step 235: {'loss': 0.3494, 'grad_norm': 0.7854982018470764, 'learning_rate': 7.052631578947369e-06, 'epoch': 2.1363636363636362}
Step 240: {'loss': 0.3152, 'grad_norm': 0.9248342514038086, 'learning_rate': 6.986842105263158e-06, 'epoch': 2.1818181818181817}
Step 245: {'loss': 0.281, 'grad_norm': 0.6607226729393005, 'learning_rate': 6.921052631578948e-06, 'epoch': 2.227272727272727}
Step 250: {'loss': 0.267, 'grad_norm': 0.8154204487800598, 'learning_rate': 6.855263157894737e-06, 'epoch': 2.2727272727272725}
Step 255: {'loss': 0.2863, 'grad_norm': 0.7409448027610779, 'learning_rate': 6.789473684210527e-06, 'epoch': 2.3181818181818183}
Step 260: {'loss': 0.266, 'grad_norm': 0.7227839827537537, 'learning_rate': 6.723684210526316e-06, 'epoc

Saving model checkpoint to ./T5base_Question_Generation_v0_custom_dataset/checkpoint-330
Configuration saved in ./T5base_Question_Generation_v0_custom_dataset/checkpoint-330/config.json
Configuration saved in ./T5base_Question_Generation_v0_custom_dataset/checkpoint-330/generation_config.json


Step 330: {'loss': 0.2527, 'grad_norm': 1.2695918083190918, 'learning_rate': 5.802631578947368e-06, 'epoch': 3.0}


Model weights saved in ./T5base_Question_Generation_v0_custom_dataset/checkpoint-330/model.safetensors
Deleting older checkpoint [T5base_Question_Generation_v0_custom_dataset/checkpoint-220] due to args.save_total_limit


Step 335: {'loss': 0.2872, 'grad_norm': 0.7799136638641357, 'learning_rate': 5.736842105263158e-06, 'epoch': 3.0454545454545454}
Step 340: {'loss': 0.2573, 'grad_norm': 0.9460180997848511, 'learning_rate': 5.671052631578948e-06, 'epoch': 3.090909090909091}
Step 345: {'loss': 0.2619, 'grad_norm': 0.781097948551178, 'learning_rate': 5.605263157894737e-06, 'epoch': 3.1363636363636362}
Step 350: {'loss': 0.2288, 'grad_norm': 0.8546037077903748, 'learning_rate': 5.5394736842105266e-06, 'epoch': 3.1818181818181817}
Step 355: {'loss': 0.2706, 'grad_norm': 0.731988251209259, 'learning_rate': 5.4736842105263165e-06, 'epoch': 3.227272727272727}
Step 360: {'loss': 0.2253, 'grad_norm': 0.6366394758224487, 'learning_rate': 5.407894736842106e-06, 'epoch': 3.2727272727272725}
Step 365: {'loss': 0.3068, 'grad_norm': 0.7625462412834167, 'learning_rate': 5.342105263157895e-06, 'epoch': 3.3181818181818183}
Step 370: {'loss': 0.2598, 'grad_norm': 0.6551209688186646, 'learning_rate': 5.276315789473685e-06,

Saving model checkpoint to ./T5base_Question_Generation_v0_custom_dataset/checkpoint-440
Configuration saved in ./T5base_Question_Generation_v0_custom_dataset/checkpoint-440/config.json
Configuration saved in ./T5base_Question_Generation_v0_custom_dataset/checkpoint-440/generation_config.json


Step 440: {'loss': 0.2425, 'grad_norm': 2.2097840309143066, 'learning_rate': 4.3552631578947375e-06, 'epoch': 4.0}


Model weights saved in ./T5base_Question_Generation_v0_custom_dataset/checkpoint-440/model.safetensors
Deleting older checkpoint [T5base_Question_Generation_v0_custom_dataset/checkpoint-330] due to args.save_total_limit


Step 445: {'loss': 0.2365, 'grad_norm': 0.8604252934455872, 'learning_rate': 4.289473684210527e-06, 'epoch': 4.045454545454546}
Step 450: {'loss': 0.2212, 'grad_norm': 0.6776068806648254, 'learning_rate': 4.223684210526316e-06, 'epoch': 4.090909090909091}
Step 455: {'loss': 0.2182, 'grad_norm': 0.7054225206375122, 'learning_rate': 4.157894736842106e-06, 'epoch': 4.136363636363637}
Step 460: {'loss': 0.2631, 'grad_norm': 0.8247942328453064, 'learning_rate': 4.092105263157895e-06, 'epoch': 4.181818181818182}
Step 465: {'loss': 0.2268, 'grad_norm': 0.5994593501091003, 'learning_rate': 4.026315789473684e-06, 'epoch': 4.2272727272727275}
Step 470: {'loss': 0.2505, 'grad_norm': 0.8202710151672363, 'learning_rate': 3.960526315789474e-06, 'epoch': 4.2727272727272725}
Step 475: {'loss': 0.2397, 'grad_norm': 1.394645094871521, 'learning_rate': 3.894736842105263e-06, 'epoch': 4.318181818181818}
Step 480: {'loss': 0.2009, 'grad_norm': 1.5548375844955444, 'learning_rate': 3.828947368421053e-06, 'ep

Saving model checkpoint to ./T5base_Question_Generation_v0_custom_dataset/checkpoint-550
Configuration saved in ./T5base_Question_Generation_v0_custom_dataset/checkpoint-550/config.json
Configuration saved in ./T5base_Question_Generation_v0_custom_dataset/checkpoint-550/generation_config.json


Step 550: {'loss': 0.2431, 'grad_norm': 1.2193659543991089, 'learning_rate': 2.907894736842106e-06, 'epoch': 5.0}


Model weights saved in ./T5base_Question_Generation_v0_custom_dataset/checkpoint-550/model.safetensors
Deleting older checkpoint [T5base_Question_Generation_v0_custom_dataset/checkpoint-440] due to args.save_total_limit


Step 555: {'loss': 0.2556, 'grad_norm': 1.8953415155410767, 'learning_rate': 2.842105263157895e-06, 'epoch': 5.045454545454546}
Step 560: {'loss': 0.216, 'grad_norm': 1.850293517112732, 'learning_rate': 2.776315789473684e-06, 'epoch': 5.090909090909091}
Step 565: {'loss': 0.2127, 'grad_norm': 0.6361673474311829, 'learning_rate': 2.710526315789474e-06, 'epoch': 5.136363636363637}
Step 570: {'loss': 0.2699, 'grad_norm': 0.7525569200515747, 'learning_rate': 2.644736842105263e-06, 'epoch': 5.181818181818182}
Step 575: {'loss': 0.2175, 'grad_norm': 0.7845690846443176, 'learning_rate': 2.578947368421053e-06, 'epoch': 5.2272727272727275}
Step 580: {'loss': 0.2264, 'grad_norm': 0.7046638131141663, 'learning_rate': 2.5131578947368423e-06, 'epoch': 5.2727272727272725}
Step 585: {'loss': 0.2188, 'grad_norm': 0.8054198622703552, 'learning_rate': 2.447368421052632e-06, 'epoch': 5.318181818181818}
Step 590: {'loss': 0.2323, 'grad_norm': 0.8321883082389832, 'learning_rate': 2.381578947368421e-06, 'ep

Saving model checkpoint to ./T5base_Question_Generation_v0_custom_dataset/checkpoint-660
Configuration saved in ./T5base_Question_Generation_v0_custom_dataset/checkpoint-660/config.json
Configuration saved in ./T5base_Question_Generation_v0_custom_dataset/checkpoint-660/generation_config.json


Step 660: {'loss': 0.2426, 'grad_norm': 1.5828449726104736, 'learning_rate': 1.460526315789474e-06, 'epoch': 6.0}


Model weights saved in ./T5base_Question_Generation_v0_custom_dataset/checkpoint-660/model.safetensors
Deleting older checkpoint [T5base_Question_Generation_v0_custom_dataset/checkpoint-550] due to args.save_total_limit


Step 665: {'loss': 0.1788, 'grad_norm': 0.7968388199806213, 'learning_rate': 1.394736842105263e-06, 'epoch': 6.045454545454546}
Step 670: {'loss': 0.2228, 'grad_norm': 0.826279878616333, 'learning_rate': 1.3289473684210526e-06, 'epoch': 6.090909090909091}
Step 675: {'loss': 0.1887, 'grad_norm': 0.8203238844871521, 'learning_rate': 1.2631578947368422e-06, 'epoch': 6.136363636363637}
Step 680: {'loss': 0.1954, 'grad_norm': 0.7113037109375, 'learning_rate': 1.1973684210526317e-06, 'epoch': 6.181818181818182}
Step 685: {'loss': 0.2101, 'grad_norm': 0.6687749624252319, 'learning_rate': 1.1315789473684213e-06, 'epoch': 6.2272727272727275}
Step 690: {'loss': 0.2253, 'grad_norm': 1.1908609867095947, 'learning_rate': 1.0657894736842106e-06, 'epoch': 6.2727272727272725}
Step 695: {'loss': 0.2224, 'grad_norm': 2.305074453353882, 'learning_rate': 1.0000000000000002e-06, 'epoch': 6.318181818181818}
Step 700: {'loss': 0.2064, 'grad_norm': 0.7772131562232971, 'learning_rate': 9.342105263157895e-07, '

Saving model checkpoint to ./T5base_Question_Generation_v0_custom_dataset/checkpoint-770
Configuration saved in ./T5base_Question_Generation_v0_custom_dataset/checkpoint-770/config.json
Configuration saved in ./T5base_Question_Generation_v0_custom_dataset/checkpoint-770/generation_config.json


Step 770: {'loss': 0.2058, 'grad_norm': 1.4775458574295044, 'learning_rate': 1.3157894736842106e-08, 'epoch': 7.0}


Model weights saved in ./T5base_Question_Generation_v0_custom_dataset/checkpoint-770/model.safetensors
Deleting older checkpoint [T5base_Question_Generation_v0_custom_dataset/checkpoint-660] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)




Step 770: {'train_runtime': 218.3065, 'train_samples_per_second': 10.517, 'train_steps_per_second': 3.527, 'total_flos': 1398167316725760.0, 'train_loss': 1.2984254661318544, 'epoch': 7.0}


Saving model checkpoint to ./T5base_Question_Generation_v0_custom_dataset
Configuration saved in ./T5base_Question_Generation_v0_custom_dataset/config.json
Configuration saved in ./T5base_Question_Generation_v0_custom_dataset/generation_config.json
Model weights saved in ./T5base_Question_Generation_v0_custom_dataset/model.safetensors
tokenizer config file saved in ./T5base_Question_Generation_v0_custom_dataset/tokenizer_config.json
Special tokens file saved in ./T5base_Question_Generation_v0_custom_dataset/special_tokens_map.json
added tokens file saved in ./T5base_Question_Generation_v0_custom_dataset/added_tokens.json


('./T5base_Question_Generation_v0_custom_dataset/tokenizer_config.json',
 './T5base_Question_Generation_v0_custom_dataset/special_tokens_map.json',
 './T5base_Question_Generation_v0_custom_dataset/spiece.model',
 './T5base_Question_Generation_v0_custom_dataset/added_tokens.json')

In [17]:
# Running the finetuned question generation model with a sample context

# the finetuned model name (the directory the training cell saved the model and tokenizer to)
new_model = "./T5base_Question_Generation_v0_custom_dataset"


# Load model and tokenizer
# These replace the `tokenizer` / `model` objects used for training; get_question reads them as globals.
# NOTE(review): training loaded the tokenizer with legacy=False; presumably the saved tokenizer config carries that setting - confirm.
tokenizer = T5Tokenizer.from_pretrained(new_model)
model = T5ForConditionalGeneration.from_pretrained(new_model)


def get_question(tag, difficulty, context, answer="", num_questions=3, max_length=150):
    """
    Generate questions using the fine-tuned T5 model, print them and return them.

    Uses the module-level `tokenizer` and `model`.

    Parameters:
    - tag: Type of question (e.g., "short answer question", "multiple choice question", "true or false question")
    - difficulty: "easy", "medium", "hard"
    - context: Supporting context or passage
    - answer: Optional — if you want targeted question generation
    - num_questions: Number of diverse questions to generate
    - max_length: Max token length of generated output

    Returns:
    - List of generated questions as strings (each one is also printed)
    """
    # Build the prompt with the same template that the training examples use:
    # "<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>{context}".
    # Without an answer nothing (not even a space) is placed before the
    # context; with an answer it is prepended as "[answer] ".
    prefix = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>"
    if answer:
        input_text = f"{prefix}[{answer}] {context}"
    else:
        input_text = f"{prefix}{context}"

    # Tokenize and move the tensors to the device the model lives on
    features = tokenizer([input_text], return_tensors='pt').to(model.device)

    # Generate questions
    output = model.generate(
        input_ids=features['input_ids'],
        attention_mask=features['attention_mask'],
        max_length=max_length,

        # Beam Search - just prints only one question
        # num_beams = 5,
        # early_stopping=True,              # to stop when the first beam is finished

        # Sampling (gives num_questions different outputs)
        num_return_sequences=num_questions,
        do_sample=True,
        top_p=0.95,
        top_k=50
    )

    # Decode generated questions
    questions = [tokenizer.decode(out, skip_special_tokens=True) for out in output]
    for i, question in enumerate(questions):
        print(f"Question {i+1}: {question}")

    print("------------------------------------------------")

    return questions
    

loading file spiece.model
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading file tokenizer.json
loading file chat_template.jinja
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
loading configuration file ./T5base_Question_Generation_v0_custom_dataset/config.json
Model config T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  

In [18]:
# Giving the context and difficulty
# Both values are shared by the get_question calls in the following cells.


context = "Reinforcement Learning (RL) is a dynamic area of machine learning where agents are trained to make a sequence of decisions by interacting with an environment. Each interaction leads to a new state and a scalar reward, which indicates the quality of the action taken. The agent’s objective is to learn an optimal policy that maximizes the total accumulated reward over time. This is different from supervised learning, which requires labeled datasets. In RL, learning is driven by experience and the agent often learns from delayed rewards, making the credit assignment problem a central challenge. The environment is often modeled as a Markov Decision Process (MDP), characterized by states, actions, transition dynamics, and rewards. Algorithms such as Q-learning, SARSA, and Policy Gradient methods are used to find optimal policies. Modern applications employ deep learning to approximate complex functions, giving rise to Deep Reinforcement Learning. Techniques like Deep Q-Networks (DQN), Proximal Policy Optimization (PPO), and Actor-Critic methods have demonstrated state-of-the-art performance in domains ranging from game playing (e.g., Atari, Go) to robotics and recommendation systems. Exploration-exploitation trade-offs, sample efficiency, and generalization are ongoing challenges in the field. RL has significant potential in real-world decision-making systems."


# One of "easy", "medium", "hard"
difficulty = "hard"





In [19]:
# long answer question
# NOTE(review): the descriptive training example shown in preprocess_descriptive uses the tag
# "short answer question"; confirm that "long answer question" also occurs in the training data.

get_question(
    tag="long answer question",
    difficulty=difficulty,
    context=context
)

Question 1: 
Question 2: 
Question 3: 
------------------------------------------------


In [15]:
# true or false question
# NOTE(review): the recorded output shows a single question although num_questions defaults to 3,
# so it appears to come from an earlier version of get_question - re-run this cell.

get_question(
    tag="true or false question",
    difficulty=difficulty,
    context=context
)

Question 1: 
------------------------------------------------


In [16]:
# multiple choice question
# NOTE(review): preprocess_mcq trains every multiple choice example with difficulty "medium",
# while this call uses the shared `difficulty` value ("hard") - confirm this is intended.

get_question(
    tag="multiple choice question",
    difficulty=difficulty,
    context=context
)

Question 1: 
------------------------------------------------
