In [1]:
# Choose the directory where Hugging Face stores downloaded models.
# This is set before the Hugging Face libraries are imported further down,
# so the setting is already in place when they load.
import os

HF_CACHE_DIR = "/home/aalla4"
os.environ['HF_HOME'] = HF_CACHE_DIR
# os.environ['TRANSFORMERS_CACHE'] = '/home/aalla4'

# Report the directory that is actually used as the cache (it is a fixed path,
# not the current working directory), plus the working directory for reference.
print("Setting up " + HF_CACHE_DIR + " as the place where to host the transformers models downloaded from hugging face (current working directory - " + os.getcwd() + ")")

Setting up the current working directory as the place where to host the transformers models downloaded from hugging face - /home/aalla4/SML


In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, TrainerCallback
from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import T5Config
import torch
import random

# CPU and GPU inventory; USABLE_CPU_CORES sizes the preprocessing worker pool.

# os.cpu_count() reports logical cores (not physical cores) and may return None
# when the count cannot be determined, so fall back to a single core.
LOGICAL_CORES = os.cpu_count() or 1
print("Total number of logical cores = " + str(LOGICAL_CORES))
# Use all but one logical core, but never fewer than one worker.
USABLE_CPU_CORES = max(1, LOGICAL_CORES - 1)    # YOU CAN CHANGE THIS ACCORDING TO THE CPU AVAILABILITIES

print("CUDA available: ", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")


2025-12-12 20:00:09.343708: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Total number of logical cores = 48
CUDA available:  True
GPU name: NVIDIA A100-SXM4-80GB


In [3]:
# Load the pretrained T5 checkpoint, its configuration and its tokenizer.

model_name = "t5-base"

save_path = "./T5base_Question_Generation"

# `dropout_rate` is the dropout setting exposed by T5Config.
# The `attention_dropout_rate` keyword that used to be passed here is not a
# T5Config attribute, so it was dropped silently and had no effect; it has been
# removed to avoid suggesting a separate attention-dropout setting exists.
config = T5Config.from_pretrained(
    model_name,
    dropout_rate=0.1,
)

tokenizer = T5Tokenizer.from_pretrained(model_name, legacy=False)
model = T5ForConditionalGeneration.from_pretrained(model_name, config=config)

print(f"Max positional embeddings supported by model - {model_name}: ", model.config.n_positions)

# Token budgets used by the preprocessing functions in this notebook:
# inputs are padded/truncated to the first value, targets to the second.
tokenizer_input_max_length = 512
tokenizer_label_max_length = 250




Max positional embeddings supported by model - t5-base:  512


In [4]:
# Placeholder/separator tokens that split the model input into sections.

# Each separator is a single <extra_id_N> token, so marking the sections does
# not spend additional tokens of the input budget.
special_tokens_description = dict(
    [
        ("<extra_id_99>", "[CONTEXT]"),      # context section of the input
        ("<extra_id_98>", "[DIFFICULTY]"),   # difficulty section - 'easy'/ 'medium'/ 'hard'
        ("<extra_id_97>", "[TAG]"),          # tag section of the input
    ]
)

print("Explanation of special tokens used:")
for token, description in special_tokens_description.items():
    print("- " + token + " is used as a placeholder for " + description)

# if we can simply reuse the existing tokens, we don't want to add additional special tokens
# special_tokens = {'additional_special_tokens': ['[CONTEXT]', '[DIFFICULTY]', '[TAG]']}

# Get current additional special tokens from the tokenizer
# existing_special_tokens = tokenizer.special_tokens_map.get('additional_special_tokens', [])

# tokenizer.add_special_tokens(special_tokens)
# model.resize_token_embeddings(len(tokenizer))

Explanation of special tokens used:
- <extra_id_99> is used as a placeholder for [CONTEXT]
- <extra_id_98> is used as a placeholder for [DIFFICULTY]
- <extra_id_97> is used as a placeholder for [TAG]


In [None]:
# Preprocessing custom dataset

In [None]:
import json

# loading the custom datasets
# Loading the dataset
# Function to load the data from a JSON file

# Locations of the custom JSON datasets, one file per question type.
filepath_descriptive, filepath_mcq, filepath_tf = (
    "./dataset/descriptive.json",
    "./dataset/mcq.json",
    "./dataset/true_false.json",
)

def load_json_file(file_path):
    """Read a JSON file and return its parsed content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value (the callers in this notebook expect a list of dicts).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 so non-ASCII text (e.g. curly apostrophes)
    # is read the same way regardless of the platform's default encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)                    # This should be a list of dicts

        
# Parse the three JSON files (descriptive, multiple choice, true/false).
descriptive_data, mcq_data, tf_data = (
    load_json_file(json_path)
    for json_path in (filepath_descriptive, filepath_mcq, filepath_tf)
)

# Wrap each parsed list of dicts in a Hugging Face Dataset.
dataset_descriptive, dataset_mcq, dataset_tf = (
    Dataset.from_list(records)
    for records in (descriptive_data, mcq_data, tf_data)
)


In [None]:
# preprocessing steps

# Preprocessing function to tokenize the input and target text
def preprocess_descriptive(example):
    """Tokenize one descriptive record into model inputs and labels.

    Reads the "tag", "difficulty", "context" and "question" fields of `example`.

    Example input text:
        <extra_id_97>short answer question <extra_id_98>easy <extra_id_99>Drinking enough water each day helps regulate body temperature, ...
    Example target text:
        List two ways drinking water benefits the human body.

    Returns:
        The tokenizer output for the input text, with the token ids of the
        question stored under "labels".
    """
    source_text = "<extra_id_97>{} <extra_id_98>{} <extra_id_99>{}".format(
        example["tag"], example["difficulty"], example["context"]
    )
    question_text = example["question"]

    # Both sides are padded/truncated to their fixed maximum lengths.
    encoded = tokenizer(source_text, max_length=tokenizer_input_max_length, padding="max_length", truncation=True)
    encoded_target = tokenizer(question_text, max_length=tokenizer_label_max_length, padding="max_length", truncation=True)

    # NOTE(review): padded label positions keep the pad token id; confirm the
    # training code masks them so padding does not contribute to the loss.
    encoded["labels"] = encoded_target["input_ids"]
    return encoded

def preprocess_mcq(example):
    """Tokenize one multiple choice record into model inputs and labels.

    Because of a mistake in the dataset, the tag is read from the "difficulty"
    field and every MCQ is treated as 'medium' difficulty.

    Example input text:
        <extra_id_97>multiple choice question <extra_id_98>medium <extra_id_99>Rainforests are essential to Earth's ecosystem. ...
    Target text layout (no A/B/C/D labels are used):
        <question>; answer:[<correct option>]; wrong options:(<other option>; <other option>; ...)

    Returns:
        The tokenizer output for the input text, with the token ids of the
        target text stored under "labels".
    """
    tag = example["difficulty"]            # dataset problem: the tag lives in this field
    difficulty = "medium"                  # dataset problem: fixed difficulty for all MCQs
    context = example["context"]
    question = example["question"]
    options = example["options"]
    answer = example["answer"]

    # Keep only the options that differ from the answer; option letters are
    # left out on purpose so they do not confuse the model.
    distractors = []
    for option in options:
        if option != answer:
            distractors.append(option)
    wrong_options = "; ".join(distractors)

    input_text = "<extra_id_97>{} <extra_id_98>{} <extra_id_99>{}".format(tag, difficulty, context)
    target_text = "{}; answer:[{}]; wrong options:({})".format(question, answer, wrong_options)

    encoded = tokenizer(input_text, max_length=tokenizer_input_max_length, padding="max_length", truncation=True)
    encoded_target = tokenizer(target_text, max_length=tokenizer_label_max_length, padding="max_length", truncation=True)

    # NOTE(review): padded label positions keep the pad token id; confirm the
    # training code masks them so padding does not contribute to the loss.
    encoded["labels"] = encoded_target["input_ids"]
    return encoded

def preprocess_tf(example):
    """Tokenize one true/false record into model inputs and labels.

    Reads the "tag", "difficulty", "context", "question" and "answer" fields
    of `example`.

    Example input text:
        <extra_id_97>true or false question <extra_id_98>easy <extra_id_99>Photosynthesis is the process by which green plants ...
    Example target text:
        Photosynthesis releases oxygen as a byproduct.; answer:[true]

    Returns:
        The tokenizer output for the input text, with the token ids of the
        target text stored under "labels".
    """
    tag = example["tag"]
    difficulty = example["difficulty"]
    context = example["context"]
    question = example["question"]
    # str() first so a JSON boolean (true/false) is handled as well as a
    # "True"/"False" string; the result is always lower-case text.
    answer = str(example["answer"]).lower()

    # Format the input and target. The target uses "answer:[...]" without a
    # space, matching the layout used by the other preprocessing functions
    # in this notebook that emit an answer.
    input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>{context}"
    target_text = f"{question}; answer:[{answer}]"

    # Tokenize; both sides are padded/truncated to their fixed maximum lengths.
    model_inputs = tokenizer(input_text, max_length=tokenizer_input_max_length, padding="max_length", truncation=True)
    labels = tokenizer(target_text, max_length=tokenizer_label_max_length, padding="max_length", truncation=True)

    # NOTE(review): padded label positions keep the pad token id; confirm the
    # training code masks them so padding does not contribute to the loss.
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs



In [None]:

# Tokenize each custom dataset example by example across the worker processes.
# All original columns are removed, so only the preprocessing output remains.
processed_descriptive_dataset = dataset_descriptive.map(
    preprocess_descriptive,
    batched=False,
    num_proc=USABLE_CPU_CORES,
    remove_columns=dataset_descriptive.column_names,
)
processed_mcq_dataset = dataset_mcq.map(
    preprocess_mcq,
    batched=False,
    num_proc=USABLE_CPU_CORES,
    remove_columns=dataset_mcq.column_names,
)
processed_tf_dataset = dataset_tf.map(
    preprocess_tf,
    batched=False,
    num_proc=USABLE_CPU_CORES,
    remove_columns=dataset_tf.column_names,
)


In [None]:
# print(processed_descriptive_dataset)
# print(processed_mcq_dataset)
# print(processed_tf_dataset)

# Stack the three tokenized custom datasets, then shuffle with a fixed seed.
tokenized_train_dataset = concatenate_datasets(
    [processed_descriptive_dataset, processed_mcq_dataset, processed_tf_dataset]
)
tokenized_train_dataset = tokenized_train_dataset.shuffle(seed=92)

In [None]:

# Persist the tokenized custom training set as a parquet file.

tokenized_trainset_save_path = "./dataset/tokenized_custom_trainset.parquet"

tokenized_train_dataset.to_parquet(tokenized_trainset_save_path)

print(" Saved the custom training dataset to " + tokenized_trainset_save_path)

In [5]:
# Preprocessing the 5 mixed datasets - not according to proportions

In [6]:
# Load the five public datasets: SQuAD v1, HotpotQA, OpenBookQA, DROP and BoolQ.
# Each load returns a DatasetDict with "train" and "validation" splits.

print("Loading the datasets ........")

# HotpotQA, "distractor" configuration
hotpotqa = load_dataset("hotpot_qa", name="distractor", trust_remote_code=True)

# SQuAD (v1)
squad = load_dataset("squad")

# OpenBookQA, "additional" configuration
openbookqa = load_dataset("openbookqa", name="additional")

# BoolQ
boolq = load_dataset("boolq")

# DROP
drop = load_dataset("drop")

print("Completed loading the datasets ........")


Loading the datasets ........
Completed loading the datasets ........


In [7]:
# Preprocessing functions of the datasets

# Preprocessing, combining and shuffling the datasets run on the CPU cores only, not on the GPU
print(f"Starting Preprocesing; Available CPU Cores to use - {USABLE_CPU_CORES}")

def preprocess_hotpotqa(example):
    """Turn one HotpotQA record into a tokenized short-answer question-generation example.

    HotpotQA is used for short answer question generation with a focus on
    multi-hop questions rather than questions formed from a single line of
    the context.

    - tag: "very short answer" when the answer has fewer than 12
      whitespace-separated words, otherwise "short answer".
    - difficulty: taken from the record's "level" field ('easy'/'medium'/'hard').
    - context: for every context paragraph whose title appears in the
      supporting facts, the title followed by all of that paragraph's sentences.
    - the answer is prefixed to the context (in square brackets) for about 15%
      of the examples, because an instructor will not always supply an answer.

    The include/exclude decision is drawn from a generator seeded with the
    record's "id", so re-running the preprocessing yields the same dataset.
    If a record has no "id", the global `random` module is used instead.

    Returns:
        The tokenizer output for the input text, with the token ids of the
        question stored under "labels".
    """
    answer = example['answer']
    tag = "very short answer" if len(answer.split()) < 12 else "short answer"
    difficulty = example['level']                   # 'easy'/ 'medium'/ 'hard'

    # Only the supporting sentences would give too short a context, so every
    # sentence under a supporting title is kept.
    supporting_titles = set(example["supporting_facts"]['title'])   # set: drop duplicate titles
    context_titles = example['context']['title']
    context_sentences = example['context']['sentences']

    supporting_sentences = [
        f"{title}, {' '.join(sentences)}"
        for title, sentences in zip(context_titles, context_sentences)
        if title in supporting_titles
    ]
    short_context = " ".join(supporting_sentences)

    # Deterministic 15/85 draw per example: a str seed gives the same sequence
    # on every run, independent of worker count or processing order.
    example_id = example.get("id")
    rng = random if example_id is None else random.Random(str(example_id))
    include_answer = rng.choices([True, False], weights=[15, 85], k=1)[0]

    if include_answer:
        input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>[{answer}] {short_context}"
    else:
        input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>{short_context}"

    target_text = f"{example['question']}"

    # Tokenize both input and output to their fixed maximum lengths.
    model_inputs = tokenizer(input_text, max_length=tokenizer_input_max_length, padding="max_length", truncation=True)
    labels = tokenizer(target_text, max_length=tokenizer_label_max_length, padding="max_length", truncation=True)

    # NOTE(review): padded label positions keep the pad token id; confirm the
    # training code masks them so padding does not contribute to the loss.
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs


def preprocess_squad(example):
    """Turn one SQuAD record into a tokenized short-answer question-generation example.

    SQuAD is used for short answer question generation, with multiple
    questions generated from the same context.

    - tag: "very short answer" when the first answer text has fewer than 12
      whitespace-separated words (an absent answer counts as empty),
      otherwise "short answer".
    - difficulty: always "easy".
    - the answer is prefixed to the context (in square brackets) for about 15%
      of the examples; an empty answer is never prefixed.

    The include/exclude decision is drawn from a generator seeded with the
    record's "id", so re-running the preprocessing yields the same dataset.
    If a record has no "id", the global `random` module is used instead.

    Returns:
        The tokenizer output for the input text, with the token ids of the
        question stored under "labels".
    """
    answer_texts = example['answers']['text']
    answer = answer_texts[0] if answer_texts else ""
    tag = "very short answer" if len(answer.split()) < 12 else "short answer"
    difficulty = "easy"
    context = example['context']

    # Deterministic 15/85 draw per example: a str seed gives the same sequence
    # on every run, independent of worker count or processing order.
    example_id = example.get("id")
    rng = random if example_id is None else random.Random(str(example_id))
    # Skip the prefix when there is no answer text, to avoid an empty "[]" marker.
    include_answer = bool(answer) and rng.choices([True, False], weights=[15, 85], k=1)[0]

    if include_answer:
        input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>[{answer}] {context}"
    else:
        input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>{context}"

    target_text = example["question"]

    # Tokenize both input and output to their fixed maximum lengths.
    model_inputs = tokenizer(input_text, max_length=tokenizer_input_max_length, padding="max_length", truncation=True)
    labels = tokenizer(target_text, max_length=tokenizer_label_max_length, padding="max_length", truncation=True)

    # NOTE(review): padded label positions keep the pad token id; confirm the
    # training code masks them so padding does not contribute to the loss.
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs


def preprocess_openbookqa(example):
    """Turn one OpenBookQA record into a tokenized multiple-choice question-generation example.

    - difficulty: "hard" when clarity > 1.8 and the human score is below 1,
      otherwise "medium".
    - context: the record's "fact1" (empty string when the field is missing).
    - target layout:
        <question stem>; answer:[<correct choice>]; wrong options:(<other>; <other>; ...)

    The human score is read from "humanScore", with "human_score" kept as a
    fallback key; when neither is present it defaults to 1, which can never
    produce "hard".

    Returns:
        The tokenizer output for the input text with the target token ids under
        "labels", plus the record's "clarity" value so the caller can filter
        low-clarity examples afterwards.
    """
    # Difficulty is derived from the human score and the clarity rating.
    clarity = example.get("clarity", 0)
    human_score = example.get("humanScore", example.get("human_score", 1))

    # Low-clarity examples are not discarded here; "clarity" is returned below
    # so they can be filtered after mapping.

    if clarity > 1.8 and human_score < 1:
        difficulty = "hard"
    else:                             # clarity <= 1.8, or clarity > 1.8 with human_score >= 1
        difficulty = "medium"

    tag = "multiple choice question"

    # .get() supplies a default instead of raising KeyError when 'fact1' is missing.
    fact1 = example.get("fact1", "")

    question_stem = example["question_stem"]
    answer_key = example["answerKey"]  # e.g. 'A', 'B', 'C', 'D'
    choices = example["choices"]       # dict with "text" and "label"

    choices_text = choices["text"]

    # Locate the correct choice through the label list, so any labelling scheme
    # works; fall back to the letter offset when the key is not among the labels.
    choice_labels = list(choices["label"]) if "label" in choices else []
    if answer_key in choice_labels:
        idx = choice_labels.index(answer_key)
    else:
        idx = ord(answer_key) - ord('A')
    answer_text = choices_text[idx]

    # Every choice except the correct one becomes a wrong option.
    other_options = [opt for i, opt in enumerate(choices_text) if i != idx]
    choices_str = "; ".join(other_options)

    target_text = f"{question_stem}; answer:[{answer_text}]; wrong options:({choices_str})"

    input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>{fact1}"

    # Tokenize both input and output to their fixed maximum lengths.
    model_inputs = tokenizer(input_text, max_length=tokenizer_input_max_length, padding="max_length", truncation=True)
    labels = tokenizer(target_text, max_length=tokenizer_label_max_length, padding="max_length", truncation=True)
    # NOTE(review): padded label positions keep the pad token id; confirm the
    # training code masks them so padding does not contribute to the loss.
    model_inputs["labels"] = labels["input_ids"]
    model_inputs["clarity"] = clarity                 # Add clarity for later filtering
    return model_inputs


def preprocess_boolq(example):
    """Turn one BoolQ record into a tokenized true/false question-generation example.

    BoolQ carries no difficulty information, so the difficulty is fixed to
    "medium". The target has the layout `<question>; answer:[true|false]`.

    Returns:
        The tokenizer output for the input text, with the token ids of the
        target text stored under "labels".
    """
    tag, difficulty = "true or false question", "medium"

    question = example["question"]
    answer = example["answer"]          # bool value: True / False
    passage = example["passage"]

    if answer:
        answer_word = "true"
    else:
        answer_word = "false"

    target_text = "{}; answer:[{}]".format(question, answer_word)
    input_text = "<extra_id_97>{} <extra_id_98>{} <extra_id_99>{}".format(tag, difficulty, passage)

    # Both sides are padded/truncated to their fixed maximum lengths.
    encoded = tokenizer(input_text, max_length=tokenizer_input_max_length, padding="max_length", truncation=True)
    encoded_target = tokenizer(target_text, max_length=tokenizer_label_max_length, padding="max_length", truncation=True)
    # NOTE(review): padded label positions keep the pad token id; confirm the
    # training code masks them so padding does not contribute to the loss.
    encoded["labels"] = encoded_target["input_ids"]

    return encoded


def preprocess_drop(example):
    """Turn one DROP record into a tokenized question-generation example.

    DROP is used for very short answer question generation, with multiple
    questions per context; some of them require calculations. The tag is fixed
    to "one word answer" and the difficulty to "medium". The target has the
    layout `<question>; answer:[<first answer span>]`.

    Returns:
        The tokenizer output for the input text with the target token ids under
        "labels", or None when the record has no answer span.
    """
    tag, difficulty = "one word answer", "medium"
    question = example.get("question", "")
    passage = example.get("passage", "")

    # Use the first available span from 'answers_spans'.
    spans = example.get("answers_spans", {}).get("spans", [])
    if not spans:
        # NOTE(review): a None return does not appear to drop the row in
        # `Dataset.map`; confirm rows without spans are filtered before mapping.
        return None

    first_span = spans[0]

    target_text = "{}; answer:[{}]".format(question, first_span)
    input_text = "<extra_id_97>{} <extra_id_98>{} <extra_id_99>{}".format(tag, difficulty, passage)

    # Both sides are padded/truncated to their fixed maximum lengths.
    encoded = tokenizer(input_text, max_length=tokenizer_input_max_length, padding="max_length", truncation=True)
    encoded_target = tokenizer(target_text, max_length=tokenizer_label_max_length, padding="max_length", truncation=True)
    # NOTE(review): padded label positions keep the pad token id; confirm the
    # training code masks them so padding does not contribute to the loss.
    encoded["labels"] = encoded_target["input_ids"]

    return encoded



Starting Preprocesing; Available CPU Cores to use - 47


In [8]:
# Splitting the datasets into train and validation & Preprocessing the datasets individually

# Preprocess both splits separately for both datasets
# put the .select(range(5)) steps for testing out the input and output results and printing them ******
# hotpotqa_train = hotpotqa["train"].select(range(5)).map(preprocess_hotpotqa, batched=False, num_proc=USABLE_CPU_CORES)
# hotpotqa_train = hotpotqa["train"].select(range(5)).map(preprocess_hotpotqa)

# trial
# drop_train = drop["train"].select(range(5)).map(preprocess_drop)
# drop_val = drop["validation"].select(range(5)).map(preprocess_drop)
# boolq_train = boolq["train"].select(range(5)).map(preprocess_boolq)

In [9]:
# Tokenize both HotpotQA splits; the original columns are dropped.
hotpotqa_train = hotpotqa["train"].map(
    preprocess_hotpotqa,
    batched=False,
    num_proc=USABLE_CPU_CORES,
    remove_columns=hotpotqa["train"].column_names,
)
hotpotqa_val = hotpotqa["validation"].map(
    preprocess_hotpotqa,
    batched=False,
    num_proc=USABLE_CPU_CORES,
    remove_columns=hotpotqa["validation"].column_names,
)

In [10]:
# Tokenize both SQuAD splits; the original columns are dropped.
squad_train = squad["train"].map(
    preprocess_squad,
    batched=False,
    num_proc=USABLE_CPU_CORES,
    remove_columns=squad["train"].column_names,
)
squad_val = squad["validation"].map(
    preprocess_squad,
    batched=False,
    num_proc=USABLE_CPU_CORES,
    remove_columns=squad["validation"].column_names,
)

In [11]:
# Tokenize both OpenBookQA splits, keep only examples whose clarity is above 1,
# then drop the helper "clarity" column that was carried along for this filter.
openbookqa_train = (
    openbookqa["train"]
    .map(preprocess_openbookqa, batched=False, num_proc=USABLE_CPU_CORES, remove_columns=openbookqa["train"].column_names)
    .filter(lambda row: row.get("clarity", 0) > 1)
    .remove_columns(["clarity"])
)
openbookqa_val = (
    openbookqa["validation"]
    .map(preprocess_openbookqa, batched=False, num_proc=USABLE_CPU_CORES, remove_columns=openbookqa["validation"].column_names)
    .filter(lambda row: row.get("clarity", 0) > 1)
    .remove_columns(["clarity"])
)

Map (num_proc=47):   0%|          | 0/4957 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4957 [00:00<?, ? examples/s]

Map (num_proc=47):   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

In [12]:
# Tokenize both BoolQ splits; the original columns are dropped.
boolq_train = boolq["train"].map(
    preprocess_boolq,
    batched=False,
    num_proc=USABLE_CPU_CORES,
    remove_columns=boolq["train"].column_names,
)
boolq_val = boolq["validation"].map(
    preprocess_boolq,
    batched=False,
    num_proc=USABLE_CPU_CORES,
    remove_columns=boolq["validation"].column_names,
)

In [13]:
# Tokenize both DROP splits.
# `preprocess_drop` returns None for a record without answer spans, and a None
# return from the mapped function does not remove the row. Such records are
# therefore filtered out first, so only records with at least one span reach map().
drop_train = (
    drop["train"]
    .filter(lambda row: bool((row.get("answers_spans") or {}).get("spans")))
    .map(preprocess_drop, batched=False, num_proc=USABLE_CPU_CORES, remove_columns=drop["train"].column_names)
)
drop_val = (
    drop["validation"]
    .filter(lambda row: bool((row.get("answers_spans") or {}).get("spans")))
    .map(preprocess_drop, batched=False, num_proc=USABLE_CPU_CORES, remove_columns=drop["validation"].column_names)
)

In [14]:
# combining the datasets and shuffling them

# Combine and shuffle the datasets
# Stack the five tokenized training splits and shuffle them with a fixed seed.
tokenized_train_dataset = concatenate_datasets(
    [hotpotqa_train, squad_train, openbookqa_train, boolq_train, drop_train]
).shuffle(seed=42)

# Same for the validation splits. Shuffling the validation set is not needed,
# but it is shuffled here with the same seed.
tokenized_val_dataset = concatenate_datasets(
    [hotpotqa_val, squad_val, openbookqa_val, boolq_val, drop_val]
).shuffle(seed=42)


In [15]:
# Save to Parquet (efficient for large datasets)

# Output locations of the combined training and validation sets.
tokenized_trainset_save_path, tokenized_valset_save_path = (
    "./dataset/tokenized_trainset.parquet",
    "./dataset/tokenized_valset.parquet",
)

# Write the training set first; the validation write stays the last expression
# of the cell, so its return value is what the cell displays.
tokenized_train_dataset.to_parquet(tokenized_trainset_save_path)
tokenized_val_dataset.to_parquet(tokenized_valset_save_path)

Creating parquet from Arrow format:   0%|          | 0/270 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/32 [00:00<?, ?ba/s]

142916148