In [1]:
# setting up the directory where we want to store the models
import os

# HF_HOME is the environment variable that tells the Hugging Face libraries where to keep downloaded files,
# so it is set here, before `transformers` / `datasets` are imported in the next cell.
# NOTE(review): the path is machine-specific; change HF_CACHE_DIR when running on another machine.
HF_CACHE_DIR = "/home/aalla4"
os.environ['HF_HOME'] = HF_CACHE_DIR
# os.environ['TRANSFORMERS_CACHE'] = '/home/aalla4'

# Report the directory that is actually configured. The previous message printed os.getcwd(),
# which is not the directory HF_HOME points to.
print("Hugging Face downloads will be stored under HF_HOME = " + os.environ['HF_HOME'] + " (current working directory: " + os.getcwd() + ")")

Setting up the current working directory as the place where to host the transformers models downloaded from hugging face - /home/aalla4/SML


In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, TrainerCallback
from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import T5Config
import torch
import random

# Getting the CPU and GPU count

# os.cpu_count() reports logical cores (not physical ones) and returns None when the count cannot be determined
LOGICAL_CORES = os.cpu_count() or 1
print("Total number of logical cores = " + str(LOGICAL_CORES))
# Keep one core free for the rest of the system, but never go below one worker
# (a single-core machine would otherwise end up with 0 workers).
USABLE_CPU_CORES = max(1, LOGICAL_CORES - 1)    # YOU CAN CHANGE THIS ACCORDING TO THE CPU AVAILABILITIES

print("CUDA available: ", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")


2025-12-14 08:46:29.207519: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Total number of logical cores = 48
CUDA available:  True
GPU name: NVIDIA A100-SXM4-80GB


In [3]:
# Tokenizer setup: choose the checkpoint, fix the sequence budgets, then load the tokenizer

# model_name = "t5-base"
model_name = "t5-large"

# Maximum number of tokens kept when encoding the model inputs and the target labels
tokenizer_input_max_length, tokenizer_label_max_length = 512, 250

tokenizer = T5Tokenizer.from_pretrained(model_name, legacy=False)


In [4]:
# Using some special tokens as placeholders/separators for the input

# no additional tokens are spent in this process to separate the input into sections
# Mapping from the existing tokenizer token that is reused to the input section it marks
special_tokens_description = dict([
    ("<extra_id_99>", "[CONTEXT]"),      # context section of the input
    ("<extra_id_98>", "[DIFFICULTY]"),   # difficulty section of the input - 'easy'/ 'medium'/ 'hard'
    ("<extra_id_97>", "[TAG]"),          # tag section of the input
])

print("Explanation of special tokens used:")
print("\n".join(
    f"- {token} is used as a placeholder for {description}"
    for token, description in special_tokens_description.items()
))

# if we can simply use the existing tokens we don't wanna increase additional special tokens
# special_tokens = {'additional_special_tokens': ['[CONTEXT]', '[DIFFICULTY]', '[TAG]']}

# Get current additional special tokens from the tokenizer
# existing_special_tokens = tokenizer.special_tokens_map.get('additional_special_tokens', [])

# tokenizer.add_special_tokens(special_tokens)
# model.resize_token_embeddings(len(tokenizer))

Explanation of special tokens used:
- <extra_id_99> is used as a placeholder for [CONTEXT]
- <extra_id_98> is used as a placeholder for [DIFFICULTY]
- <extra_id_97> is used as a placeholder for [TAG]


In [None]:
# Preprocessing steps for 5 datasets combined in certain proportions

# only run these below cells if you want to train your model on the 5 mixed datasets following a certain proportion

In [None]:
# Loading the datasets
#
# Sources: SQuAD v1, HotpotQA, OpenBookQA, DROP and BoolQ.
# Each call returns a DatasetDict; the "train" and "validation" splits are the ones used later on.

print("Loading the datasets ........")

squad = load_dataset("squad")                              # SQuAD (v1)
hotpotqa = load_dataset(                                   # HotpotQA, "distractor" configuration
    "hotpot_qa",
    "distractor",
    trust_remote_code=True,    # NOTE(review): presumably lets the dataset's own loading script run locally - confirm it is trusted
)
openbookqa = load_dataset("openbookqa", "additional")      # OpenBookQA, "additional" configuration
drop = load_dataset("drop")                                # DROP
boolq = load_dataset("boolq")                              # BoolQ

print("Completed loading the datasets ........")


In [None]:
# Preprocessing functions of the datasets

# Preprocessing, combining and shuffling datasets only uses LOGICAL_CORES/ CPU's and not done on GPU's
print("Starting Preprocesing; Available CPU Cores to use - "+ str(USABLE_CPU_CORES))

def preprocess_hotpotqa(example):
    """
    hotpotqa - used for short answer question generation, with a focus on multi-hop questions
    rather than questions formed from just a single line of the context.

    Parameters:
    - example: one dataset row; the fields read are 'answer', 'level', 'question',
      'supporting_facts' (its 'title' list) and 'context' (its 'title' and 'sentences' lists)

    Returns:
    - The tokenizer output for the input text (padded/truncated to tokenizer_input_max_length)
      with an added "labels" entry: the target token ids padded/truncated to
      tokenizer_label_max_length, with padding positions replaced by -100
    """
    answer = example['answer']
    tag = "very short answer" if len(answer.split()) < 12 else "short answer"
    difficulty = example['level']                   # Extract difficulty - 'easy'/ 'medium'/ 'hard'

    # All sentences of every supporting paragraph are kept (not only the supporting sentences),
    # because the context would otherwise be too short.
    supporting_titles = set(example["supporting_facts"]['title'])   # set: the same title can be listed more than once
    context_titles = example['context']['title']
    context_sentences = example['context']['sentences']

    supporting_sentences = []
    for idx, title in enumerate(context_titles):
        if title in supporting_titles:
            sentence_block = " ".join(f"{sent}" for sent in context_sentences[idx])
            supporting_sentences.append(f"{title}, {sentence_block}")

    short_context = " ".join(supporting_sentences)

    # Include the answer in only ~15% of the inputs, since the instructor will usually not provide one
    include_answer = random.choices([True, False], weights=[15, 85], k=1)[0]

    if include_answer:
        input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>[{answer}] {short_context}"     # Prepare the model input
    else:
        input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>{short_context}"

    target_text = f"{example['question']}"                                                                       # Prepare the target output

    # Tokenize both input and output
    model_inputs = tokenizer(input_text, max_length=tokenizer_input_max_length, padding="max_length", truncation=True)
    labels = tokenizer(target_text, max_length=tokenizer_label_max_length, padding="max_length", truncation=True)

    # Attach labels. Padding positions are set to -100 so the training loss ignores them;
    # leaving the pad ids in place makes the model get trained on predicting padding.
    pad_token_id = tokenizer.pad_token_id
    model_inputs["labels"] = [token_id if token_id != pad_token_id else -100 for token_id in labels["input_ids"]]

    return model_inputs


def preprocess_squad(example):
    """
    squad - used for short answer question generation, with multiple questions generated from the same context.

    Parameters:
    - example: one dataset row; the fields read are 'answers' (its 'text' list), 'context' and 'question'

    Returns:
    - The tokenizer output for the input text (padded/truncated to tokenizer_input_max_length)
      with an added "labels" entry: the target token ids padded/truncated to
      tokenizer_label_max_length, with padding positions replaced by -100
    """
    answer_texts = example['answers']['text']
    answer = answer_texts[0] if answer_texts else ""     # only the first listed answer is used
    tag = "very short answer" if len(answer.split()) < 12 else "short answer"
    difficulty = "easy"
    context = example['context']

    # Include the answer in only ~15% of the inputs; never emit an empty "[]" marker when there is no answer
    include_answer = random.choices([True, False], weights=[15, 85], k=1)[0]
    if include_answer and answer:
        input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>[{answer}] {context}"
    else:
        input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>{context}"

    target_text = example["question"]

    # Tokenize
    model_inputs = tokenizer(input_text, max_length=tokenizer_input_max_length, padding="max_length", truncation=True)
    labels = tokenizer(target_text, max_length=tokenizer_label_max_length, padding="max_length", truncation=True)

    # Padding positions in the labels are set to -100 so the training loss ignores them
    pad_token_id = tokenizer.pad_token_id
    model_inputs["labels"] = [token_id if token_id != pad_token_id else -100 for token_id in labels["input_ids"]]

    return model_inputs


def preprocess_openbookqa(example):
    """
    openbookqa - used for multiple choice question generation.

    Parameters:
    - example: one dataset row; the fields read are 'clarity', 'humanScore' (or 'human_score'),
      'fact1', 'question_stem', 'answerKey' and 'choices' (its 'text' and 'label' lists)

    Returns:
    - The tokenizer output for the input text (padded/truncated to tokenizer_input_max_length)
      with two added entries: "labels" (target token ids padded/truncated to
      tokenizer_label_max_length, padding replaced by -100) and "clarity" (kept so that
      low-clarity rows can be filtered out after mapping)
    """
    # difficulty is calculated based on the human score and clarity
    clarity = example.get("clarity", 0)
    # The score is looked up as "humanScore" first and as "human_score" second; reading only
    # "human_score" always fell back to the default of 1, so "hard" was never assigned.
    # NOTE(review): "humanScore" is presumably the field name in the "additional" configuration - confirm against the dataset card.
    human_score = example.get("humanScore", example.get("human_score", 1))

    # Low-clarity rows are not dropped here; the caller filters on the returned "clarity" value.

    # Assign difficulty
    if clarity > 1.8 and human_score < 1:
        difficulty = "hard"
    else:
        difficulty = "medium"

    tag = "multiple choice question"

    # Build the context using fact1 (from additional); .get() avoids a KeyError if the key is missing
    fact1 = example.get("fact1", "")

    # Construct the multiple choice question format with the answer marked
    question_stem = example["question_stem"]
    answer_key = example["answerKey"]  # 'A', 'B', 'C', 'D'
    choices = example["choices"]       # dict with "text" and "label"
    choices_text = choices["text"]

    # Locate the correct answer through the label list when it is available;
    # fall back to the alphabetical position of the key otherwise.
    choice_labels = list(choices["label"]) if "label" in choices else []
    if answer_key in choice_labels:
        idx = choice_labels.index(answer_key)
    else:
        idx = ord(answer_key) - ord('A')
    answer_text = choices_text[idx]

    # Every option except the correct one becomes a "wrong option"
    other_options = [opt for i, opt in enumerate(choices_text) if i != idx]
    choices_str = "; ".join(other_options)

    target_text = f"{question_stem}; answer:[{answer_text}]; wrong options:({choices_str})"

    input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>{fact1}"

    # Tokenization
    model_inputs = tokenizer(input_text, max_length=tokenizer_input_max_length, padding="max_length", truncation=True)
    labels = tokenizer(target_text, max_length=tokenizer_label_max_length, padding="max_length", truncation=True)

    # Padding positions in the labels are set to -100 so the training loss ignores them
    pad_token_id = tokenizer.pad_token_id
    model_inputs["labels"] = [token_id if token_id != pad_token_id else -100 for token_id in labels["input_ids"]]
    model_inputs["clarity"] = clarity                 # Add clarity for later filtering
    return model_inputs


def preprocess_boolq(example):
    """
    boolq - used for true/ false question generation.

    Parameters:
    - example: one dataset row; the fields read are 'question', 'answer' (a bool) and 'passage'

    Returns:
    - The tokenizer output for the input text (padded/truncated to tokenizer_input_max_length)
      with an added "labels" entry: the target token ids padded/truncated to
      tokenizer_label_max_length, with padding positions replaced by -100
    """
    # difficulty tagging is not available in BoolQ, so we default it
    difficulty = "medium"
    tag = "true or false question"

    # Extract question and answer
    question = example["question"]
    answer = example["answer"]          # this is a bool value: True / False
    passage = example["passage"]

    target_text = f"{question}; answer:[{'true' if answer else 'false'}]"
    input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>{passage}"

    # Tokenization
    model_inputs = tokenizer(input_text, max_length=tokenizer_input_max_length, padding="max_length", truncation=True)
    labels = tokenizer(target_text, max_length=tokenizer_label_max_length, padding="max_length", truncation=True)

    # Padding positions in the labels are set to -100 so the training loss ignores them
    pad_token_id = tokenizer.pad_token_id
    model_inputs["labels"] = [token_id if token_id != pad_token_id else -100 for token_id in labels["input_ids"]]

    return model_inputs


def preprocess_drop(example):
    """
    drop - used for very short answer question generation, with multiple questions generated from the
    same context; it also includes some math calculations that have to be done by the model.

    Parameters:
    - example: one dataset row; the fields read are 'question', 'passage' and
      'answers_spans' (its 'spans' list)

    Returns:
    - The tokenizer output for the input text (padded/truncated to tokenizer_input_max_length)
      with an added "labels" entry: the target token ids padded/truncated to
      tokenizer_label_max_length, with padding positions replaced by -100
    - None when the row has no answer span
    """
    tag = "one word answer"
    difficulty = "medium"
    question = example.get("question", "")
    passage = example.get("passage", "")

    # Use the first available answer from 'answers_spans'
    answers = example.get("answers_spans", {}).get("spans", [])
    if not answers:
        # NOTE(review): the intent is to skip the row, but Dataset.map() may not drop a row for which
        # the function returns None - confirm, and filter such rows out before mapping if they occur.
        return None

    answer = answers[0]

    target_text = f"{question}; answer:[{answer}]"
    input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>{passage}"

    # Tokenization
    model_inputs = tokenizer(input_text, max_length=tokenizer_input_max_length, padding="max_length", truncation=True)
    labels = tokenizer(target_text, max_length=tokenizer_label_max_length, padding="max_length", truncation=True)

    # Padding positions in the labels are set to -100 so the training loss ignores them
    pad_token_id = tokenizer.pad_token_id
    model_inputs["labels"] = [token_id if token_id != pad_token_id else -100 for token_id in labels["input_ids"]]

    return model_inputs



In [None]:
# Splitting the datasets into train and validation & Preprocessing the datasets individually

# Preprocess both splits separately for both datasets
# put the .select(range(5)) steps for testing out the input and output results and printing them ******
# hotpotqa_train = hotpotqa["train"].select(range(5)).map(preprocess_hotpotqa, batched=False, num_proc=USABLE_CPU_CORES)
# hotpotqa_train = hotpotqa["train"].select(range(5)).map(preprocess_hotpotqa)

# trial runs on a handful of rows
# drop_train = drop["train"].select(range(5)).map(preprocess_drop)
# drop_val = drop["validation"].select(range(5)).map(preprocess_drop)
# boolq_train = boolq["train"].select(range(5)).map(preprocess_boolq)

In [None]:
# Tokenize the HotpotQA train and validation splits (in that order); the raw columns are removed from the result
hotpotqa_train, hotpotqa_val = (
    hotpotqa[split].map(
        preprocess_hotpotqa,
        batched=False,
        num_proc=USABLE_CPU_CORES,
        remove_columns=hotpotqa[split].column_names,
    )
    for split in ("train", "validation")
)

In [None]:
# Tokenize the SQuAD train and validation splits (in that order); the raw columns are removed from the result
squad_train, squad_val = (
    squad[split].map(
        preprocess_squad,
        batched=False,
        num_proc=USABLE_CPU_CORES,
        remove_columns=squad[split].column_names,
    )
    for split in ("train", "validation")
)

In [None]:
# Tokenize the OpenBookQA train and validation splits (in that order), keep only rows whose
# clarity is above 1, then drop the helper "clarity" column again
openbookqa_train, openbookqa_val = (
    openbookqa[split]
    .map(
        preprocess_openbookqa,
        batched=False,
        num_proc=USABLE_CPU_CORES,
        remove_columns=openbookqa[split].column_names,
    )
    .filter(lambda x: (x.get("clarity", 0) > 1))
    .remove_columns(["clarity"])
    for split in ("train", "validation")
)

In [None]:
# Tokenize the BoolQ train and validation splits (in that order); the raw columns are removed from the result
boolq_train, boolq_val = (
    boolq[split].map(
        preprocess_boolq,
        batched=False,
        num_proc=USABLE_CPU_CORES,
        remove_columns=boolq[split].column_names,
    )
    for split in ("train", "validation")
)

In [None]:
# Tokenize the DROP train and validation splits (in that order); the raw columns are removed from the result
drop_train, drop_val = (
    drop[split].map(
        preprocess_drop,
        batched=False,
        num_proc=USABLE_CPU_CORES,
        remove_columns=drop[split].column_names,
    )
    for split in ("train", "validation")
)

In [None]:
# Proportionally sample per dataset every batch

# Tokenized training splits, keyed by the short names that `proportions` also uses
datasets_dict = dict(
    hotpot=hotpotqa_train,
    squad=squad_train,
    openbook=openbookqa_train,
    boolq=boolq_train,
    drop=drop_train,
)

# Share of every batch that each dataset should contribute (same keys as `datasets_dict`)
proportions = dict(
    hotpot=0.25,
    squad=0.15,
    openbook=0.30,
    boolq=0.25,
    drop=0.05,
)


In [None]:
import math
from torch.utils.data import Sampler

class ProportionalBatchSampler(Sampler):
    """
    Batch sampler that takes a fixed number of examples from every dataset for each batch.

    The yielded indices address the concatenation of the datasets in the insertion order of
    `datasets_dict` (dataset 0 occupies [0, len0), dataset 1 occupies [len0, len0 + len1), ...).

    Parameters:
    - datasets_dict: mapping name -> dataset (anything that supports len())
    - proportions: mapping name -> fraction of the batch taken from that dataset
    - batch_size: target number of examples per batch

    Notes:
    - Every dataset contributes at least one example per batch, so a batch is larger than
      `batch_size` when `batch_size` is smaller than what the per-dataset minimums add up to.
    - One pass yields `total_batches` batches: as many as the scarcest dataset can fill.
    - The sampler can be iterated repeatedly; every pass uses a fresh shuffle.
    """

    def __init__(self, datasets_dict, proportions, batch_size):
        self.datasets_dict = datasets_dict
        self.proportions = proportions
        self.batch_size = batch_size

        # Pre-calc: how many samples per dataset per batch (at least one each)
        self.samples_per_dataset = {
            k: max(1, int(batch_size * p))
            for k, p in proportions.items()
        }

        # int() rounds down, so give any shortfall to the dataset with the largest share
        diff = batch_size - sum(self.samples_per_dataset.values())
        if diff > 0:
            largest_key = max(self.samples_per_dataset, key=lambda x: self.samples_per_dataset[x])
            self.samples_per_dataset[largest_key] += diff

        # Position of each dataset's first row inside the concatenated dataset.
        # Without these offsets every dataset's indices would point into the first rows of the
        # concatenation, i.e. almost exclusively into the first dataset.
        self.offsets = {}
        running_total = 0
        for k, ds in datasets_dict.items():
            self.offsets[k] = running_total
            running_total += len(ds)

        # Per-dataset pools of local indices, shuffled once up front
        self.index_pools = {
            k: list(range(len(ds)))
            for k, ds in datasets_dict.items()
        }
        for k in self.index_pools:
            random.shuffle(self.index_pools[k])

        # Total batches = smallest number of batches any dataset can support
        self.total_batches = min(
            len(v) // self.samples_per_dataset[k]
            for k, v in self.index_pools.items()
        )

    def __iter__(self):
        # Work on reshuffled copies so the stored pools are not consumed: the previous in-place
        # deletion left nothing to sample from after the first epoch.
        epoch_pools = {k: random.sample(v, len(v)) for k, v in self.index_pools.items()}

        for batch_no in range(self.total_batches):
            batch_indices = []
            for k, pool in epoch_pools.items():
                take = self.samples_per_dataset[k]
                start = batch_no * take
                offset = self.offsets[k]
                # Translate local indices into positions of the concatenated dataset
                batch_indices.extend(offset + local_idx for local_idx in pool[start:start + take])
            random.shuffle(batch_indices)
            yield batch_indices

    def __len__(self):
        return self.total_batches


In [None]:
from torch.utils.data import DataLoader

class ProportionalDataLoader(DataLoader):
    """
    DataLoader over the concatenation of several datasets, batched by ProportionalBatchSampler.

    Parameters:
    - datasets_dict: mapping name -> dataset; the datasets are concatenated in insertion order
    - proportions: mapping name -> fraction of the batch taken from that dataset
    - batch_size: target number of examples per batch
    - collate_fn: optional function that turns a list of examples into a batch; when omitted,
      the data collator of the notebook-level `trainer` is used, as before
    """

    def __init__(self, datasets_dict, proportions, batch_size, collate_fn=None):
        self.datasets_dict = datasets_dict
        merged_dataset = concatenate_datasets(list(datasets_dict.values()))
        sampler = ProportionalBatchSampler(datasets_dict, proportions, batch_size)
        if collate_fn is None:
            # Backward-compatible default. This needs a global `trainer` to exist already,
            # so pass `collate_fn` explicitly to avoid that hidden dependency.
            collate_fn = trainer.data_collator
        super().__init__(
            merged_dataset,
            batch_sampler=sampler,
            collate_fn=collate_fn
        )


In [None]:
# Initiating the model

In [5]:
# Load the model (the tokenizer is loaded in an earlier cell)

# model_name = "t5-base"

model_name = "t5-large"

save_path = "./T5large_Question_Generation"           # where checkpoints, logs and the final model are written

# T5Config has a single `dropout_rate` that applies to all of the model's dropout layers.
# The `attention_dropout_rate=0.1` argument that used to be passed here is not a T5Config
# field and was silently ignored, so it has been removed.
config = T5Config.from_pretrained(
    model_name,
    dropout_rate=0.1,
)

# tokenizer = T5Tokenizer.from_pretrained(model_name, legacy=False)
model = T5ForConditionalGeneration.from_pretrained(model_name, config = config)

print(f"Max positional embeddings supported by model - {model_name}: ", model.config.n_positions)

# Maximum number of tokens kept when encoding the model inputs and the target labels
tokenizer_input_max_length = 512
tokenizer_label_max_length = 250

Max positional embeddings supported by model - t5-large:  512


In [None]:
# Choosing the dataset and Loading the Tokenized Dataset if Preprocessing is already done


In [6]:

# Dataset paths - choose which pre-tokenized training set to use


#  ----------------------------------------------------------------------------------------------
# Choose the training set: keep exactly one `tokenized_trainset_path` assignment uncommented

# custom dataset
tokenized_trainset_path = "./dataset/tokenized_custom_trainset.parquet"

# 5 datasets mixed
# tokenized_trainset_path = "./dataset/tokenized_trainset.parquet"

# 5 datasets mixed proportionally (a label only, not a file path: in that mode the data comes from `datasets_dict`)
# tokenized_trainset_path = "using 5 datasets mixed proportionally"

# Load the tokenized training set from Parquet.
# NOTE(review): comment this line out when the "mixed proportionally" label above is selected, since that value is not a file.
tokenized_train_dataset = Dataset.from_parquet(tokenized_trainset_path)



# ----------------------------------------------------------------------------------------------
# Validation set


# The validation set is only loaded when needed; it can always be read from the already preprocessed parquet file
# tokenized_valset_path = "./dataset/tokenized_valset.parquet"
# tokenized_val_dataset = Dataset.from_parquet(tokenized_valset_path)


# ----------------------------------------------------------------------------------------------

In [18]:
# parameters to change when you want to experiment with different values, depending on the dataset you are using

# Training length and batch size
num_of_epochs, batch_size = 30, 8

# Optimizer settings
learning_rate, warmup_steps, weight_decay = 1e-5, 10, 1e-3

# A checkpoint is saved every `save_steps` steps, the loss is logged every `logging_steps` steps
save_steps, logging_steps = 500, 10

# Echo the chosen configuration so that it is stored with the cell output
print(
    f" Using the dataset =  {tokenized_trainset_path}\n"
    " Using the below parameters for training:\n"
    f" epochs = {num_of_epochs} \n"
    f" batch size = {batch_size} \n"
    f" learning rate = {learning_rate} \n"
    f" warmup steps = {warmup_steps} \n"
    f" weight decay = {weight_decay}\n"
    f" logging steps = {logging_steps}"
)

# print(proportions)



 Using the dataset =  ./dataset/tokenized_custom_trainset.parquet
 Using the below parameters for training:
 epochs = 30 
 batch size = 8 
 learning rate = 1e-05 
 warmup steps = 10 
 weight decay = 0.001
 logging steps = 10


In [20]:
# Defining the Training args

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print("This is the current device: ", device)
print("Device updated to:", model.device)

# Effective batch size per optimizer step = per_device_train_batch_size * gradient_accumulation_steps (* number of devices).
# Micro-batches of at most 2 examples are accumulated until `batch_size` examples have been seen, so the
# batch size printed in the hyper-parameter cell is the one actually used (batch_size = 8 -> 2 x 4, as before).
per_device_batch_size = max(1, min(2, batch_size))
accumulation_steps = max(1, batch_size // per_device_batch_size)

training_args = TrainingArguments(
    output_dir=save_path,              # directory where the training logs, checkpoints, and evaluation results (like metrics) are saved during the training process

    learning_rate=learning_rate,
    warmup_steps=warmup_steps,
    num_train_epochs=num_of_epochs,
    weight_decay=weight_decay,         # use the configured value instead of a second, hard-coded 1e-3

    per_device_train_batch_size=per_device_batch_size,
    gradient_accumulation_steps=accumulation_steps,    # simulates a larger batch without running into 'Out of Memory' errors
    # per_device_eval_batch_size=batch_size,

    # worker processes that prepare upcoming batches so the GPU does not wait for data
    dataloader_num_workers=USABLE_CPU_CORES,

    # Print validation loss every epoch (enable together with an eval_dataset)
    # eval_strategy="epoch",

    # Print and log the training loss every `logging_steps` steps
    logging_strategy="steps",
    logging_steps=logging_steps,

    # Save a checkpoint every `save_steps` steps and keep only the most recent one
    # save_strategy="epoch",
    save_strategy="steps",
    save_steps=save_steps,
    save_total_limit=1,
    # save_total_limit=2,

    # report_to="none",  # Disable default logging

    logging_dir=save_path + "/logs",            # save logs to a directory
    report_to="tensorboard",                    # Reports to TensorBoard
    log_level='info',                           # Set logging level to 'info' to see the logs in the terminal
    # run this command in your terminal ~ tensorboard --logdir=<save_path>/logs
    # and open 'http://localhost:6006/' to monitor the logs [loss over the training]

    fp16=False                                  # mixed precision is switched off; fp16 would not work on CPU's anyway
)

PyTorch: setting up devices


This is the current device:  cuda
Device updated to: cuda:0


In [21]:
# Logging in the Terminal
class LogCallback(TrainerCallback):
    """Trainer callback that prints every log record, prefixed with the current global step."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        # Nothing to print when the event arrives without a payload
        if logs is None:
            return
        print(f"Step {state.global_step}: {logs}")



In [22]:
# Trainer setup


# --------------------------------------------------------------------------------------------------------------------
# Trainer for training on the custom dataset (no validation set is attached in this configuration)
trainer = Trainer(
    model=model,
    args=training_args,
    
    train_dataset=tokenized_train_dataset,            # the tokenized TRAINING set loaded from parquet (not a test set)
    # eval_dataset=tokenized_val_dataset,               # the validation set (enable together with eval_strategy)

    # quick smoke test on the first 100 rows only:
    # train_dataset=tokenized_train_dataset.select(range(100)),    
    # eval_dataset=tokenized_val_dataset.select(range(100)),
    
    # no data_collator is passed, so the Trainer falls back to its default collator
    # data_collator=data_collator, ----------------------------------------------------------------
    
    callbacks=[LogCallback()]                   # *** to print the logs in the terminal
)


# --------------------------------------------------------------------------------------------------------------------
# for training on the 5 different datasets mixed together and loaded from the tokenized parquet files
# trainer = Trainer(
#     model=model,
#     args=training_args,
    
#     train_dataset=tokenized_train_dataset,            # your test set
#     eval_dataset=tokenized_val_dataset,               # your validation set

#     # train_dataset=tokenized_train_dataset.select(range(100)),    
#     # eval_dataset=tokenized_val_dataset.select(range(100)),
    
#     callbacks=[LogCallback()]                   # *** to print the logs in the terminal
# )


# --------------------------------------------------------------------------------------------------------------------
# # for training on the 5 different datasets mixed by proportions specified for each of the batch
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset= None,                     # overridden
#     eval_dataset= tokenized_val_dataset,
#     callbacks=[LogCallback()]
# )

# # Override the train dataloader
# trainer.get_train_dataloader = lambda: ProportionalDataLoader(
#     datasets_dict,
#     proportions,
#     training_args.per_device_train_batch_size
# )


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
# Training the Model

# Run the fine-tuning loop with the arguments and dataset given to the Trainer
trainer.train()


# Save the fine-tuned model to `save_path`
trainer.save_model(save_path)

# Save the tokenizer into the same directory, so the test section can load both from one place
tokenizer.save_pretrained(save_path)
        


***** Running training *****
  Num examples = 328
  Num Epochs = 30
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 4
  Total optimization steps = 1,230
  Number of trainable parameters = 737,668,096


Step,Training Loss
10,30.3158
20,27.961
30,25.3641
40,22.7522
50,20.638
60,16.3215
70,9.9247
80,6.1906
90,3.421
100,1.8583


Step 10: {'loss': 30.3158, 'grad_norm': 81.26324462890625, 'learning_rate': 9e-06, 'epoch': 0.24390243902439024}
Step 20: {'loss': 27.961, 'grad_norm': 77.45418548583984, 'learning_rate': 9.926229508196722e-06, 'epoch': 0.4878048780487805}
Step 30: {'loss': 25.3641, 'grad_norm': 138.82952880859375, 'learning_rate': 9.844262295081968e-06, 'epoch': 0.7317073170731707}
Step 40: {'loss': 22.7522, 'grad_norm': 390.9015808105469, 'learning_rate': 9.762295081967213e-06, 'epoch': 0.975609756097561}
Step 50: {'loss': 20.638, 'grad_norm': 58.933746337890625, 'learning_rate': 9.68032786885246e-06, 'epoch': 1.2195121951219512}
Step 60: {'loss': 16.3215, 'grad_norm': 100.96739959716797, 'learning_rate': 9.598360655737707e-06, 'epoch': 1.4634146341463414}
Step 70: {'loss': 9.9247, 'grad_norm': 67.94738006591797, 'learning_rate': 9.516393442622952e-06, 'epoch': 1.7073170731707317}
Step 80: {'loss': 6.1906, 'grad_norm': 41.0464973449707, 'learning_rate': 9.434426229508199e-06, 'epoch': 1.9512195121951

In [None]:
# Testing the model performance

In [5]:

# Directory holding the fine-tuned model (the same location the training section saves to)
new_model = "./T5large_Question_Generation"


# Restore the fine-tuned weights and the tokenizer stored alongside them
model = T5ForConditionalGeneration.from_pretrained(new_model)
tokenizer = T5Tokenizer.from_pretrained(new_model)

In [6]:
# Running the finetuned question generation model with a sample context


def get_question(tag, difficulty, context, answer="", num_questions=3, max_length=150, max_input_length=512):
    """
    Generate questions using the fine-tuned T5 model.

    Parameters:
    - tag: Type of question (e.g., "short answer", "multiple choice question", "true or false question")
    - difficulty: "easy", "medium", "hard"
    - context: Supporting context or passage
    - answer: Optional — if you want targeted question generation
    - num_questions: Number of diverse questions to generate
    - max_length: Max token length of generated output
    - max_input_length: Max token length of the encoded input; longer inputs are truncated
      (512 is the input length used during preprocessing)

    Returns:
    - List of generated questions as strings
    """
    # Build the prompt exactly like the training preprocessors do. Without an answer the context
    # follows <extra_id_99> directly; the previous version inserted an extra space there, which
    # is a format the model was not trained on.
    if answer:
        input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>[{answer}] {context}"
    else:
        input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>{context}"

    # Tokenize, truncating over-long contexts, and move the tensors to the device the model lives on
    features = tokenizer([input_text], return_tensors='pt', truncation=True, max_length=max_input_length)
    features = features.to(model.device)

    # Generate questions (sampling, so repeated calls return different questions)
    output = model.generate(
        input_ids=features['input_ids'],
        attention_mask=features['attention_mask'],
        max_length=max_length,
        num_return_sequences=num_questions,
        do_sample=True,
        top_p=0.95,
        top_k=50
    )

    # Decode generated questions
    return [tokenizer.decode(out, skip_special_tokens=True) for out in output]

In [7]:
# short answer question
short_answer_context = "Cadmium chloride is a hygroscopic solid that is highly soluble in water and slightly soluble in alcohol. Ethanol, also called alcohol, ethyl alcohol, and drinking alcohol, is a compound and simple alcohol with the chemical formula C2H5OH."
short_answer_questions = get_question(tag="short answer", difficulty="medium", context=short_answer_context)
print(short_answer_questions)

['What is the chemical formula of the ethyl alcohol that gives rise to the chemical name alcohol?', 'What is the chemical formula of ethanol?; answer:[C2H5O]; wrong options:(Water, Acid, Air, Food, Fuel, etc.)', 'What is the chemical formula of alcohol?; answer:[Ethyl alcohol]; wrong options:(Alcohol; Water; Food)']


In [8]:
# true or false question: one longer passage at medium difficulty, one minimal passage at easy difficulty
true_false_cases = [
    ("medium", "Ethanol, also known as drinking alcohol, is a clear, colorless liquid that is flammable and is produced by the fermentation of sugars by yeast. It has the chemical formula C2H5OH and is used both recreationally and industrially."),
    ("easy", "Ethanol, also known as drinking alcohol."),
]
for case_difficulty, case_context in true_false_cases:
    print(get_question(tag="true or false question", difficulty=case_difficulty, context=case_context))

['What is a common misconception about ethanol?; answer:[false]', 'Alcohol is used to help prevent drunkenness and to fight crime.; answer: [true]', 'How do various different types of ethanol differ in terms of their properties?']
['Ethanol is a type of sugar.; answer: [false]', 'The amount of ethanol produced by the US is quite small in comparison to that of the European Union.; answer: [true]', 'How does alcohol differ from ethanol?']


In [9]:
# multiple choice question
multiple_choice_context = "Ethanol is used as a recreational beverage, as a solvent, and as a fuel additive. It is a volatile, flammable, colorless liquid with a slight characteristic odor, and its chemical formula is C2H5OH."
multiple_choice_questions = get_question(tag="multiple choice question", difficulty="medium", context=multiple_choice_context)
print(multiple_choice_questions)

['What is a chemical reaction of ethanol?; answer:[Volatile fluid, flammable, characteristic odor]; wrong options:(Material, cooking; Fuel additive)', 'What uses are made of ethanol?; answer:[Recreational beverage]; wrong options:(Fuel additive; Fuel additive)', 'The basic chemical formula is C2H5.; answer: [false]']


In [12]:
# short answer question on a second, unrelated passage
computer_context = "A computer is an electronic device that takes in data (input), processes it using hardware and software, stores it, and then produces information (output). At its core, it manipulates binary code (1s and 0s) through transistors and logic gates, performing complex tasks incredibly fast, making it efficient for everything from simple calculations to running complex applications"
computer_questions = get_question(tag="short answer", difficulty="medium", context=computer_context)
print(computer_questions)

['What is a computer?', 'What is a computer?; answer:[Input]; wrong options:(Survey data; Record data; Store data)', 'What is a computer?; answer:[Really, a computer]; wrong options:(Really, a computer; Storage capacity; Operation speed)']
