In [None]:
# Point the Hugging Face cache at the current working directory.
# This has to run before `transformers` / `datasets` are imported so that the
# libraries pick up the HF_HOME environment variable.
import os

# Use the working directory itself (as the message below states) instead of a
# hard-coded, user-specific absolute path, so the notebook is portable.
hf_home = os.getcwd()
print("Setting up the current working directory as the place where to host the transformers models downloaded from hugging face - " + hf_home)
os.environ['HF_HOME'] = hf_home
# os.environ['TRANSFORMERS_CACHE'] = hf_home

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, TrainerCallback
from datasets import load_dataset, Dataset, concatenate_datasets
import torch
import random

# getting the CPU and GPU count

# os.cpu_count() reports logical (not physical) cores and may return None when
# the count cannot be determined, so fall back to a single core in that case.
LOGICAL_CORES = os.cpu_count() or 1
print("Total number of logical cores = " + str(LOGICAL_CORES))

# Leave two cores free for the rest of the system, but never go below one worker:
# a value <= 0 is not a valid process count for the multiprocessing `map` calls.
USABLE_CPU_CORES = max(1, LOGICAL_CORES - 2)    # YOU CAN CHANGE THIS ACCORDING TO THE CPU AVAILABILITIES

print("CUDA available: ", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")

In [None]:
# Base checkpoint, output locations and tokenization limits used by the preprocessing cells

model_name = "t5-base"

# where the fine-tuned model and the tokenized splits are written
save_path = "./T5base_Question_Generation"
tokenized_trainset_save_path, tokenized_valset_save_path = (
    "./dataset/tokenized_trainset.parquet",
    "./dataset/tokenized_valset.parquet",
)

# maximum token lengths for the encoder input (prompt) and the decoder target (label)
tokenizer_input_max_length, tokenizer_label_max_length = 512, 250

tokenizer = T5Tokenizer.from_pretrained(model_name, legacy=False)
model = T5ForConditionalGeneration.from_pretrained(model_name)

print(f"Max positional embeddings supported by model - {model_name}: ", model.config.n_positions)

In [None]:
# Download / load the five source datasets: HotpotQA, SQuAD v1, OpenBookQA, BoolQ and DROP.
# Each call returns a DatasetDict; the later cells read its "train" and "validation" splits.

# num_training_examples = 4500       # to maintain uniformity when we mix all the various datasets, to get the model to understand the structure of different questions like mcq, t/f, short ans questions

print("Loading the datasets ........")

hotpotqa, squad, openbookqa, boolq, drop = (
    load_dataset("hotpot_qa", "distractor", trust_remote_code=True),   # HotpotQA (distractor)
    load_dataset("squad"),                                              # SQuAD (v1)
    load_dataset("openbookqa", "additional"),                           # OpenBookQA (additional)
    load_dataset("boolq"),                                              # BoolQ
    load_dataset("drop"),                                               # DROP
)

print("Completed loading the datasets ........")

In [None]:
# Preprocessing functions for the datasets

# Preprocessing, combining and shuffling run on the CPU worker processes only; the GPU is not involved.
print(f"Starting Preprocesing; Available CPU Cores to use - {USABLE_CPU_CORES}")

def preprocess_hotpotqa(example):
    """Turn one HotpotQA row into a tokenized (prompt -> question) training example.

    HotpotQA is used for short answer question generation with a focus on multi-hop
    questions, i.e. questions that need more than a single line of the context.

    Prompt format (the "[answer] " part is only present for ~15% of the rows):
        <extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>[{answer}] {context}
    Example:
        input - <extra_id_97>very short answer <extra_id_98>medium <extra_id_99>Arthur's Magazine, Arthur's Magazine (1844-1846) was an American literary periodical ... First for Women, First for Women is a woman's magazine ...
        label - Which magazine was started first Arthur's Magazine or First for Women?

    Parameters:
    - example: one dataset row; reads 'answer', 'level', 'question',
      'supporting_facts' (with 'title') and 'context' (with 'title' and 'sentences')

    Returns:
    - the tokenizer output for the prompt, padded/truncated to tokenizer_input_max_length,
      with an extra "labels" entry holding the question token ids
      (padded/truncated to tokenizer_label_max_length, padding positions set to -100)
    """
    answer = example['answer']
    tag = "very short answer" if len(answer.split()) < 12 else "short answer"
    difficulty = example['level']                   # Extract difficulty - 'easy'/ 'medium'/ 'hard'

    # thought of just keeping the supporting sentences instead of all the sentences in the supporting titles,
    # but the context length is too short for this approach
    supporting_titles = set(example["supporting_facts"]['title'])          # set avoids duplicates and gives fast lookup
    context_titles = example['context']['title']
    context_sentences = example['context']['sentences']

    # One "title, sentence sentence ..." block per paragraph whose title is a supporting title
    supporting_sentences = []
    for title, sentences in zip(context_titles, context_sentences):
        if title in supporting_titles:
            sentence_block = " ".join(sentences)
            supporting_sentences.append(f"{title}, {sentence_block}")

    short_context = " ".join(supporting_sentences)

    # Randomly decide whether to include the answer or not as not always the answer will be likely to be provided by the instructor
    include_answer = random.choices([True, False], weights=[15, 85], k=1)[0]

    if include_answer:
        input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>[{answer}] {short_context}"     # Prepare the model input
    else:
        input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>{short_context}"

    target_text = f"{example['question']}"                                                                       # Prepare the target output

    # Tokenize both input and output
    model_inputs = tokenizer(input_text, max_length=tokenizer_input_max_length, padding="max_length", truncation=True)
    labels = tokenizer(target_text, max_length=tokenizer_label_max_length, padding="max_length", truncation=True)

    # Attach labels. Padding positions are replaced by -100 so that the loss ignores them;
    # otherwise the model is also trained to predict the padding that fills most of the label.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [token_id if token_id != pad_id else -100 for token_id in labels["input_ids"]]

    return model_inputs


def preprocess_squad(example):
    """Turn one SQuAD row into a tokenized (prompt -> question) training example.

    SQuAD is used for short answer question generation, with multiple questions
    generated from the same context. The difficulty is always tagged "easy".

    Prompt format (the "[answer] " part is only present for ~15% of the rows):
        <extra_id_97>{tag} <extra_id_98>easy <extra_id_99>[{answer}] {context}

    Parameters:
    - example: one dataset row; reads 'answers' (with 'text'), 'context' and 'question'

    Returns:
    - the tokenizer output for the prompt, padded/truncated to tokenizer_input_max_length,
      with an extra "labels" entry holding the question token ids
      (padded/truncated to tokenizer_label_max_length, padding positions set to -100)
    """
    answer_texts = example['answers']['text']
    answer = answer_texts[0] if answer_texts else ""     # first reference answer, or "" when none is given
    tag = "very short answer" if len(answer.split()) < 12 else "short answer"
    difficulty = "easy"
    context = example['context']

    # Only ~15% of the prompts carry the answer; never emit an empty "[]" marker
    include_answer = random.choices([True, False], weights=[15, 85], k=1)[0]
    if include_answer and answer:
        input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>[{answer}] {context}"
    else:
        input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>{context}"

    target_text = example["question"]

    # Tokenize
    model_inputs = tokenizer(input_text, max_length=tokenizer_input_max_length, padding="max_length", truncation=True)
    labels = tokenizer(target_text, max_length=tokenizer_label_max_length, padding="max_length", truncation=True)

    # Padding positions are replaced by -100 so that the loss ignores them
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [token_id if token_id != pad_id else -100 for token_id in labels["input_ids"]]

    return model_inputs


def preprocess_openbookqa(example):
    """Turn one OpenBookQA row into a tokenized (prompt -> multiple choice question) example.

    Example:
        input - <extra_id_97>multiple choice question <extra_id_98>medium <extra_id_99>the sun is the source of energy for physical cycles on Earth
        label - The sun is responsible for: [plants sprouting, blooming and wilting] (A. puppies learning new tricks; B. children growing up and getting old; C. flowers wilting in a vase; D. plants sprouting, blooming and wilting)

    Parameters:
    - example: one dataset row; reads 'question_stem', 'answerKey', 'choices'
      (with 'text' and 'label') and, when present, 'fact1', 'clarity', 'human_score'

    Returns:
    - the tokenizer output for the prompt with a "labels" entry (padding positions set to -100),
      plus two helper entries, "clarity" and "answer", that the caller uses for filtering
    """
    # difficulty is calculated based on the human score and clarity
    clarity = example.get("clarity", 0)
    human_score = example.get("human_score", 1)

    # Low clarity rows are not dropped here: "clarity" is attached to the output below
    # so that they can be filtered out after the map.

    # Assign difficulty: "hard" only for clear questions that humans still got wrong,
    # every other combination of clarity / human_score is "medium"
    if clarity > 1.8 and human_score < 1:
        difficulty = "hard"
    else:
        difficulty = "medium"

    tag = "multiple choice question"

    # Build the context using fact1 (from additional); .get() gives "" instead of a KeyError when the key is missing
    fact1 = example.get("fact1", "")

    # Construct the multiple choice question format with answer marked
    question_stem = example["question_stem"]
    answer_key = example["answerKey"]              # 'A', 'B', 'C', or 'D'
    choices = example["choices"]["text"]
    options = example["choices"]["label"]            # ['A', 'B', 'C', 'D']
    choices_str = "; ".join([f"{option}. {text}" for option, text in zip(options, choices)])

    # Look the answer up through the label list itself rather than assuming the labels are 'A'..'D' in order
    answer_text = choices[options.index(answer_key)]

    target_text = f"{question_stem}: [{answer_text}] ({choices_str})"

    input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>{fact1}"

    # Tokenization
    model_inputs = tokenizer(input_text, max_length=tokenizer_input_max_length, padding="max_length", truncation=True)
    labels = tokenizer(target_text, max_length=tokenizer_label_max_length, padding="max_length", truncation=True)

    # Padding positions are replaced by -100 so that the loss ignores them
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [token_id if token_id != pad_id else -100 for token_id in labels["input_ids"]]
    model_inputs["clarity"] = clarity                 # Add clarity for later filtering
    model_inputs["answer"] = answer_text

    return model_inputs


def preprocess_boolq(example):
    """Turn one BoolQ row into a tokenized (prompt -> true/false question) example.

    Example:
        input - <extra_id_97>true or false question <extra_id_98>medium <extra_id_99>Good Samaritan laws offer legal protection to people who give reasonable assistance to those who are, or who they believe to be, injured, ill, in peril, or otherwise incapacitated. ...
        label - do good samaritan laws protect those who help at an accident [true]

    Parameters:
    - example: one dataset row; reads 'question', 'answer' (bool) and 'passage'

    Returns:
    - the tokenizer output for the prompt, padded/truncated to tokenizer_input_max_length,
      with an extra "labels" entry holding the target token ids
      (padded/truncated to tokenizer_label_max_length, padding positions set to -100)
    """
    # difficulty tagging is not available in BoolQ, so we default it
    difficulty = "medium"
    tag = "true or false question"

    # Extract question and answer
    question = example["question"]
    answer = example["answer"]          # this is a bool value: True / False
    passage = example["passage"]

    answer_word = 'true' if answer else 'false'
    target_text = f"{question} [{answer_word}]"
    input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>{passage}"

    # Tokenization
    model_inputs = tokenizer(input_text, max_length=tokenizer_input_max_length, padding="max_length", truncation=True)
    labels = tokenizer(target_text, max_length=tokenizer_label_max_length, padding="max_length", truncation=True)

    # Padding positions are replaced by -100 so that the loss ignores them
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [token_id if token_id != pad_id else -100 for token_id in labels["input_ids"]]

    return model_inputs


def preprocess_drop(example):
    """Turn one DROP row into a tokenized (prompt -> question [answer]) example.

    DROP is used here with the "one word answer" tag and a fixed "medium" difficulty;
    several questions share the same passage and some of them need a calculation.

    Prompt format:
        <extra_id_97>one word answer <extra_id_98>medium <extra_id_99>{passage}
    Target format:
        {question} [{first answer span}]

    Parameters:
    - example: one dataset row; reads 'question', 'passage' and 'answers_spans' (with 'spans')

    Returns:
    - the tokenizer output for the prompt with a "labels" entry (padding positions set to -100),
      or None when the row has no answer span.
      NOTE(review): returning None from a non-batched `map` presumably does not drop the row -
      confirm, and prefer filtering rows without spans before mapping.
    """
    tag = "one word answer"
    difficulty = "medium"
    question = example.get("question", "")
    passage = example.get("passage", "")

    # Retrieve the first available answer from 'answers_spans' (list of strings)
    answers = example.get("answers_spans", {}).get("spans", [])
    if not answers:
        return None                                           # no answer available for this row

    answer = answers[0]

    target_text = f"{question} [{answer}]"
    input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>{passage}"

    # Tokenization
    model_inputs = tokenizer(input_text, max_length=tokenizer_input_max_length, padding="max_length", truncation=True)
    labels = tokenizer(target_text, max_length=tokenizer_label_max_length, padding="max_length", truncation=True)

    # Padding positions are replaced by -100 so that the loss ignores them
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [token_id if token_id != pad_id else -100 for token_id in labels["input_ids"]]

    return model_inputs


In [None]:
# Splitting the datasets into train and validation & Preprocessing the datasets individually

# Preprocess both splits separately for both datasets
# put the .select(range(5)) steps for testing out the input and output results and printing them ******
# hotpotqa_train = hotpotqa["train"].select(range(5)).map(preprocess_hotpotqa, batched=False, num_proc=USABLE_CPU_CORES)
# hotpotqa_train = hotpotqa["train"].select(range(5)).map(preprocess_hotpotqa)

# trial
# drop_train = drop["train"].select(range(5)).map(preprocess_drop)
# drop_val = drop["validation"].select(range(5)).map(preprocess_drop)
# boolq_train = boolq["train"].select(range(5)).map(preprocess_boolq)

In [None]:
# Tokenize both HotpotQA splits in parallel; the raw columns are dropped so only the tokenizer output remains
hotpotqa_train, hotpotqa_val = (
    hotpotqa[split_name].map(
        preprocess_hotpotqa,
        batched=False,
        num_proc=USABLE_CPU_CORES,
        remove_columns=hotpotqa[split_name].column_names,
    )
    for split_name in ("train", "validation")
)

In [None]:
# Tokenize both SQuAD splits in parallel; the raw columns are dropped so only the tokenizer output remains
squad_train, squad_val = (
    squad[split_name].map(
        preprocess_squad,
        batched=False,
        num_proc=USABLE_CPU_CORES,
        remove_columns=squad[split_name].column_names,
    )
    for split_name in ("train", "validation")
)

In [None]:
# Tokenize both OpenBookQA splits, keep only rows with clarity > 1 and a non-empty answer,
# then drop the two helper columns that were only needed for this filter
openbookqa_train, openbookqa_val = (
    openbookqa[split_name]
    .map(
        preprocess_openbookqa,
        batched=False,
        num_proc=USABLE_CPU_CORES,
        remove_columns=openbookqa[split_name].column_names,
    )
    .filter(lambda row: (row.get("clarity", 0) > 1) and bool(row.get("answer")))
    .remove_columns(["clarity", "answer"])
    for split_name in ("train", "validation")
)

In [None]:
# Tokenize both BoolQ splits in parallel; the raw columns are dropped so only the tokenizer output remains
boolq_train, boolq_val = (
    boolq[split_name].map(
        preprocess_boolq,
        batched=False,
        num_proc=USABLE_CPU_CORES,
        remove_columns=boolq[split_name].column_names,
    )
    for split_name in ("train", "validation")
)

In [None]:
# Tokenize both DROP splits.
# Rows without any answer span are removed first: preprocess_drop returns None for them,
# and a None result from a non-batched `map` presumably does not drop the row (confirm against
# the installed `datasets` version), so the skipping is done explicitly with `filter`.
drop_train = (
    drop["train"]
    .filter(lambda row: len(row["answers_spans"]["spans"]) > 0)
    .map(preprocess_drop, batched=False, num_proc=USABLE_CPU_CORES, remove_columns=drop["train"].column_names)
)
drop_val = (
    drop["validation"]
    .filter(lambda row: len(row["answers_spans"]["spans"]) > 0)
    .map(preprocess_drop, batched=False, num_proc=USABLE_CPU_CORES, remove_columns=drop["validation"].column_names)
)

In [None]:
# Merge the five tokenized datasets into one training set and one validation set,
# then shuffle each with a fixed seed so the mix is reproducible

tokenized_train_dataset = concatenate_datasets(
    [hotpotqa_train, squad_train, openbookqa_train, boolq_train, drop_train]
)
tokenized_train_dataset = tokenized_train_dataset.shuffle(seed=92)

tokenized_val_dataset = concatenate_datasets(
    [hotpotqa_val, squad_val, openbookqa_val, boolq_val, drop_val]
)
tokenized_val_dataset = tokenized_val_dataset.shuffle(seed=42)

# tokenized_train_dataset = hotpotqa_train
# tokenized_val_dataset = hotpotqa_val


In [None]:
# Save to Parquet (efficient for large datasets)

# Make sure the target directories exist before writing; nothing else in the notebook
# creates them (harmless if the writer would have created them anyway).
for _parquet_path in (tokenized_trainset_save_path, tokenized_valset_save_path):
    os.makedirs(os.path.dirname(_parquet_path) or ".", exist_ok=True)

tokenized_train_dataset.to_parquet(tokenized_trainset_save_path)
tokenized_val_dataset.to_parquet(tokenized_valset_save_path)

In [None]:
# If you already have the preprocessed dataset then run the below code

In [1]:
# Point the Hugging Face cache at the current working directory.
# This has to run before `transformers` / `datasets` are imported so that the
# libraries pick up the HF_HOME environment variable.
import os

# Use the working directory itself (as the message below states) instead of a
# hard-coded, user-specific absolute path, so the notebook is portable.
hf_home = os.getcwd()
print("Setting up the current working directory as the place where to host the transformers models downloaded from hugging face - " + hf_home)
os.environ['HF_HOME'] = hf_home
# os.environ['TRANSFORMERS_CACHE'] = hf_home

Setting up the current working directory as the place where to host the transformers models downloaded from hugging face - /home/aalla4


In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, TrainerCallback
from datasets import load_dataset, Dataset, concatenate_datasets
import torch
import random

In [3]:
# getting the CPU and GPU count

# os.cpu_count() reports logical (not physical) cores and may return None when
# the count cannot be determined, so fall back to a single core in that case.
LOGICAL_CORES = os.cpu_count() or 1
print("Total number of logical cores = " + str(LOGICAL_CORES))

# Leave one core free for the main process, but never go below one worker:
# this value is later used as the number of dataloader workers.
USABLE_CPU_CORES = max(1, LOGICAL_CORES - 1)    # YOU CAN CHANGE THIS ACCORDING TO THE CPU AVAILABILITIES

print("CUDA available: ", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")

Total number of logical cores = 48
CUDA available:  True
GPU name: NVIDIA A100-SXM4-80GB


In [None]:
# Base checkpoint, output location, tokenized-dataset locations and tokenization limits for training

model_name = "t5-base"

# with extra openbookqa
save_path = "./T5base_Question_Generation"

# parquet files written by the preprocessing part of the notebook
tokenized_trainset_path, tokenized_valset_path = (
    "./dataset/tokenized_trainset.parquet",
    "./dataset/tokenized_valset.parquet",
)

# maximum token lengths for the encoder input (prompt) and the decoder target (label)
tokenizer_input_max_length, tokenizer_label_max_length = 512, 250

tokenizer = T5Tokenizer.from_pretrained(model_name, legacy=False)
model = T5ForConditionalGeneration.from_pretrained(model_name)

print(f"Max positional embeddings supported by model - {model_name}: ", model.config.n_positions)

Max positional embeddings supported by model - ./T5base_Question_Generation_v5_mcq:  512


In [5]:
# Reload the already tokenized train / validation sets from their Parquet files
# (use this instead of re-running the preprocessing cells)
tokenized_train_dataset, tokenized_val_dataset = (
    Dataset.from_parquet(parquet_file)
    for parquet_file in (tokenized_trainset_path, tokenized_valset_path)
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Defining the training arguments

# Move the model to the GPU when one is available, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
print("This is the current device: ", device)
print("Device updated to:", model.device)


training_args = TrainingArguments(
    # --- output locations ---
    output_dir=save_path,                       # checkpoints and trainer state are written here
    logging_dir=save_path + "/logs",            # TensorBoard event files are written here

    # --- optimisation ---
    learning_rate=1e-5,
    warmup_steps=1000,
    weight_decay=1e-3,
    num_train_epochs=3,
    fp16=False,                                 # ***** mixed precision is left off: grad_norm became NaN when it was enabled; it also does not work on CPU's

    # --- batching ---
    # effective batch size = per_device_train_batch_size * number of devices * gradient_accumulation_steps
    per_device_train_batch_size=8,              # ****** lower this (1 or 2) when testing on CPU's
    per_device_eval_batch_size=8,
    # gradient_accumulation_steps=2,            # raise this to simulate a larger batch when memory is limited
    dataloader_num_workers=USABLE_CPU_CORES,    # ****** worker processes that prepare the next batches while the GPU is busy

    # --- evaluation / checkpointing: both happen once per epoch ---
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,                         # limit on the number of checkpoints kept on disk

    # --- logging of the training loss ---
    logging_strategy="steps",
    logging_steps=1000,                         # ****** use a small value (1 or 2) for quick CPU tests
    log_level='info',                           # 'info' makes the trainer logs visible in the terminal
    report_to="tensorboard",                    # use "none" to disable reporting
    # to monitor the loss run ~ tensorboard --logdir=<save_path>/logs
    # and open 'http://localhost:6006/'
)

This is the current device:  cuda
Device updated to: cuda:0


In [7]:
# Logging in the Terminal
class LogCallback(TrainerCallback):
    """Trainer callback that prints every log record together with the current global step."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Print the logged values; does nothing when no `logs` are passed."""
        if logs is None:
            return
        print(f"Step {state.global_step}: {logs}")

In [8]:
# Build the Trainer from the model, the training arguments and the two tokenized splits
trainer = Trainer(
    args=training_args,
    model=model,
    callbacks=[LogCallback()],                        # *** prints the logs in the terminal

    # for a quick smoke test pass small slices instead, e.g. tokenized_train_dataset.select(range(100))
    # and tokenized_val_dataset.select(range(100))
    train_dataset=tokenized_train_dataset,            # training set
    eval_dataset=tokenized_val_dataset,               # validation set
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
# Training the Model
# Runs the fine-tuning with the Trainer configured above, then writes the final
# model and the tokenizer to `save_path` so both can be reloaded together later.

# Train
trainer.train()

# Save the model
trainer.save_model(save_path)

# Save the tokenizer next to the model weights
tokenizer.save_pretrained(save_path)

***** Running training *****
  Num examples = 90,447
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 2
  Total optimization steps = 2,826
  Number of trainable parameters = 222,903,552
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss


In [None]:
# Reload the fine-tuned question generation model (and its tokenizer) for inference

# directory the fine-tuned model and tokenizer were saved to
new_model = "./T5base_Question_Generation"

model = T5ForConditionalGeneration.from_pretrained(new_model)
tokenizer = T5Tokenizer.from_pretrained(new_model)


def get_question(tag, difficulty, context, answer="", num_questions=3, max_length=150, max_input_length=512):
    """
    Generate questions using the fine-tuned T5 model, print them and return them.

    Parameters:
    - tag: Type of question (e.g., "short answer", "multiple choice question", "true or false question")
    - difficulty: "easy", "medium", "hard"
    - context: Supporting context or passage
    - answer: Optional — if you want targeted question generation
    - num_questions: Number of diverse questions to generate
    - max_length: Max token length of generated output
    - max_input_length: Max token length of the prompt; longer prompts are truncated
      (512 is the input length used when the training data was tokenized)

    Returns:
    - List of generated questions as strings
    """
    # Build the prompt exactly like the training prompts: "[answer] context" when an
    # answer is given, otherwise the bare context with no space after <extra_id_99>
    answer_part = f"[{answer}] " if answer else ""
    input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>{answer_part}{context}"

    # Tokenize (truncated to the training input length) and move the tensors to the model's device
    features = tokenizer([input_text], max_length=max_input_length, truncation=True, return_tensors='pt')
    features = features.to(model.device)

    # Generate questions
    output = model.generate(
        input_ids=features['input_ids'],
        attention_mask=features['attention_mask'],
        max_length=max_length,

        # Beam Search
        # just prints only one question
        # num_beams = 5,
        # early_stopping=True,              # to stop when the first beam is finished

        # Sampling
        num_return_sequences=num_questions,
        do_sample=True,
        top_p=0.95,
        top_k=50
    )

    # Decode generated questions
    questions = [tokenizer.decode(out, skip_special_tokens=True) for out in output]
    for i, question in enumerate(questions, start=1):
        print(f"Question {i}: {question}")

    print("------------------------------------------------")

    return questions
    

In [None]:
# Sample passage and difficulty level passed to get_question() in the demo cells below


context = "Reinforcement Learning (RL) is a dynamic area of machine learning where agents are trained to make a sequence of decisions by interacting with an environment. Each interaction leads to a new state and a scalar reward, which indicates the quality of the action taken. The agent’s objective is to learn an optimal policy that maximizes the total accumulated reward over time. This is different from supervised learning, which requires labeled datasets. In RL, learning is driven by experience and the agent often learns from delayed rewards, making the credit assignment problem a central challenge. The environment is often modeled as a Markov Decision Process (MDP), characterized by states, actions, transition dynamics, and rewards. Algorithms such as Q-learning, SARSA, and Policy Gradient methods are used to find optimal policies. Modern applications employ deep learning to approximate complex functions, giving rise to Deep Reinforcement Learning. Techniques like Deep Q-Networks (DQN), Proximal Policy Optimization (PPO), and Actor-Critic methods have demonstrated state-of-the-art performance in domains ranging from game playing (e.g., Atari, Go) to robotics and recommendation systems. Exploration-exploitation trade-offs, sample efficiency, and generalization are ongoing challenges in the field. RL has significant potential in real-world decision-making systems."


difficulty = "hard"





In [None]:
# Demo: generate short answer questions from the sample context
get_question(tag="short answer", difficulty=difficulty, context=context)

In [None]:
# Demo: generate true or false questions from the sample context
get_question(tag="true or false question", difficulty=difficulty, context=context)

In [None]:
# Demo: generate multiple choice questions from the sample context
get_question(tag="multiple choice question", difficulty=difficulty, context=context)