In [None]:
# setting up the directory where we want to store the models
import os

print("Setting up the current working directory as the place where to host the transformers models downloaded from hugging face - " + os.getcwd())
os.environ['HF_HOME'] = "/home/aalla4"
# os.environ['TRANSFORMERS_CACHE'] = '/home/aalla4'

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, TrainerCallback
from datasets import load_dataset, Dataset, concatenate_datasets
import torch
import random

# getting the CPU and GPU count

print("Total number of logical cores = " + str(os.cpu_count()))  # This shows logical cores not the physical cores
LOGICAL_CORES = os.cpu_count()
USABLE_CPU_CORES = LOGICAL_CORES - 2    # YOU CAN CHANGE THIS ACCORDING TO THE CPU AVAILABILITIES

print("CUDA available: ", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")

In [None]:
# Load the model and tokenizer 

model_name = "google/flan-t5-base" 

save_path = "./T5base_Question_Generation_v2"
tokenized_trainset_save_path = "./dataset/individual/boolq/tokenized_trainset.parquet"
tokenized_valset_save_path = "./dataset/individual/boolq/tokenized_valset.parquet"

tokenizer = T5Tokenizer.from_pretrained(model_name, legacy=False)
model = T5ForConditionalGeneration.from_pretrained(model_name)

print(f"Max positional embeddings supported by model - {model_name}: ", model.config.n_positions)

tokenizer_input_max_length = 512
tokenizer_label_max_length = 250

In [None]:
# Loading the dataset

# num_training_examples = 4500       # to maintain uniformity when we mix all the various datasets, to get the model to understand the structure of different questions like mcq, t/f, short ans questions

# datasets used are squad v1, hotpotqa, openbookqa, drop, boolq
# the datasets returns a DatasetDict with "train" and "validation" splits

print("Loading the datasets ........")

# Load HotpotQA (distractor)
hotpotqa = load_dataset("hotpot_qa", "distractor",  trust_remote_code=True)

# SQUAD (V1)
squad = load_dataset("squad")

# OpenBookQA (additional)
openbookqa = load_dataset("openbookqa", "additional")

# Boolq dataset
boolq = load_dataset("boolq")

# Drop dataset
drop = load_dataset("drop")

print("Completed loading the datasets ........")

In [None]:
# Preprocessing functions of the datasets

# Preprocessing, combining and shuffling datasets only uses LOGICAL_CORES/ CPU's and not done on GPU's
print("Starting Preprocesing; Available CPU Cores to use - "+ str(USABLE_CPU_CORES))

def preprocess_hotpotqa(example):  
    "hotpotq - used for short answer question generation, with focus on multi-hop sentences rater than forming the question with just a single line in the whole context"      

    answer = example['answer']
    if len(answer.split()) < 12:                    
        tag = "very short answer"
    else:
        tag = "short answer"
    difficulty = example['level']                   # Extract difficulty - 'easy'/ 'medium'/ 'hard'

    # thought of just keeping the supporting sentences instead of the all the sentences in the supporting titles, but the context length is too short for this approach           
    supporting_facts = example["supporting_facts"]
    supporting_titles = set([t for t in supporting_facts['title']])          # Use set to avoid duplicates
    context_titles = example['context']['title']
    context_sentences = example['context']['sentences']

    supporting_sentences = []

    for idx, title in enumerate(context_titles):
        if title in supporting_titles:
            # Add all sentences under this title
            req_sentences = []
            for sent in context_sentences[idx]:
                req_sentences.append(f"{sent}")
            sentence_block = " ".join(req_sentences)  
            supporting_sentences.append(f"{title}, {sentence_block}")

    short_context = " ".join(supporting_sentences)
    
    # Randomly decide whether to include the answer or not as not always the answer will be likely to be provided by the instructor
    include_answer = random.choices([True, False], weights=[15, 85], k=1)[0]

    if include_answer:
        input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>[{answer}] {short_context}"     # Prepare the model input
        input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>[{answer}] {short_context}"
    else:
        input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>{short_context}"
       
    target_text = f"{example['question']}"                                                                       # Prepare the target output

    # print(input_text, "\n", target_text) 
    
    # Tokenize both input and output
    model_inputs = tokenizer(input_text, max_length=tokenizer_input_max_length, padding="max_length", truncation=True)
    labels = tokenizer(target_text, max_length=tokenizer_label_max_length, padding="max_length", truncation=True)
    
    # Attach labels
    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs


def preprocess_squad(example):
    "squad - used for short answer question generation, with multiple questions generated with the same context"
    
    answer = example['answers']['text'][0] if example['answers']['text'] else ""
    if len(answer.split()) < 12:
        tag = "very short answer"
    else:
        tag = "short answer"
    difficulty = "easy"
    context = example['context']

    include_answer = random.choices([True, False], weights=[15, 85], k=1)[0]
    if include_answer:
        input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>[{answer}] {context}"
    else:
        input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>{context}"

    target_text = example["question"]

    # print(input_text, "\n", target_text)

    # Tokenize
    model_inputs = tokenizer(input_text, max_length=tokenizer_input_max_length, padding="max_length", truncation=True)
    labels = tokenizer(target_text, max_length=tokenizer_label_max_length, padding="max_length", truncation=True)
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs


def preprocess_openbookqa(example):
    "openbookqa - used for multiple choice question generation"
   
    # difficulty is calculated based on the human score and clarity
    clarity = example.get("clarity", 0)
    human_score = example.get("human_score", 1)
    
    # Filter out low clarity examples
    # if clarity <= 1:
    #     return None       # discard low-clarity / low-quality examples
        # return {}

    # Assign difficulty
    if clarity > 1.8 and human_score < 1:
        difficulty = "hard" 
    else:                             # [ 1 < clarity <= 1.8 ] and if [ clarity > 1.8 and human_score > 1 ]
        difficulty = "medium"

    tag = "multiple choice question"
    
    # Build the context using fact1 (from additional)
    fact1 = example.get("fact1", "")                # example.get("fact1", "") is safe — it gives you a default value if the key is missing, example["fact1"] will raise a KeyError if 'fact1' is missing.
    
    # Construct the multiple choice question format with answer marked
    question_stem = example["question_stem"]
    answer_key = example["answerKey"]              # 'A', 'B', 'C', or 'D'
    choices = example["choices"]["text"]
    options = example["choices"]["label"]            # ['A', 'B', 'C', 'D']
    # choices_str = "; ".join(example["choices"]["text"])
    choices_str = "; ".join([f"{option}. {text}" for option, text in zip(options, choices)])
    # answer_text = choices["text"][int(ord(answer_key) - ord('A'))]
    answer_text = choices[int(ord(answer_key) - ord('A'))]

    target_text = f"{question_stem}: [{answer_text}] ({choices_str})"

    input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>{fact1}"

    # print(input_text, "\n", target_text)

    # Tokenization
    model_inputs = tokenizer(input_text, max_length=tokenizer_input_max_length, padding="max_length", truncation=True)
    labels = tokenizer(target_text, max_length=tokenizer_label_max_length, padding="max_length", truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    model_inputs["clarity"] = clarity                 # Add clarity for later filtering
    model_inputs["answer"] = answer_text
    
    return model_inputs


def preprocess_boolq(example):
    "boolq - used for true/ false question generation"

    # difficulty tagging is not available in BoolQ, so we default it
    difficulty = "medium"
    tag = "true or false question"

    # Extract question and answer
    question = example["question"]
    answer = example["answer"]          # this is a bool value: True / False
    passage = example["passage"]

    target_text = f"{question} [{'true' if answer else 'false'}]"
    input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>{passage}"

    # print(input_text, "\n", target_text)

    # Tokenization
    model_inputs = tokenizer(input_text, max_length=tokenizer_input_max_length, padding="max_length", truncation=True)
    labels = tokenizer(target_text, max_length=tokenizer_label_max_length, padding="max_length", truncation=True)
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs


def preprocess_drop(example):
    "drop - used for very short answer question generation, with multiple questions generated with the same context, it  also includes some math calculations that has to be done by the model"

    tag = "one word answer"
    difficulty = "medium"
    question = example.get("question", "")
    passage = example.get("passage", "")
    
    # Retrieve the first available answer from 'answers_spans' (string or list of strings)
    answers = example.get("answers_spans", {}).get("spans", [])
    if not answers:
        return None                                           # Skip if no answer available

    answer = answers[0]      
    
    target_text = f"{question} [{answer}]"
    input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>{passage}"

    # print(input_text, "\n", target_text)

    # Tokenization
    model_inputs = tokenizer(input_text, max_length=tokenizer_input_max_length, padding="max_length", truncation=True)
    labels = tokenizer(target_text, max_length=tokenizer_label_max_length, padding="max_length", truncation=True)
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs


In [None]:
# Splitting the datasets into train and validation & Preprocessing the datasets individually

# Preprocess both splits separately for both datasets
# put the .select(range(5)) steps for testing out the input and output results and printing them ******
# hotpotqa_train = hotpotqa["train"].select(range(5)).map(preprocess_hotpotqa, batched=False, num_proc=USABLE_CPU_CORES)
# hotpotqa_train = hotpotqa["train"].select(range(5)).map(preprocess_hotpotqa)

# trail
# drop_train = drop["train"].select(range(5)).map(preprocess_drop)
# drop_val = drop["validation"].select(range(5)).map(preprocess_drop)
# boolq_train = boolq["train"].select(range(5)).map(preprocess_boolq)

In [None]:
hotpotqa_train = hotpotqa["train"].select(range(5)).map(preprocess_hotpotqa, batched=False, num_proc=USABLE_CPU_CORES, remove_columns=hotpotqa["train"].column_names)
hotpotqa_val = hotpotqa["validation"].map(preprocess_hotpotqa, batched=False, num_proc=USABLE_CPU_CORES, remove_columns=hotpotqa["validation"].column_names)

In [None]:
squad_train = squad["train"].select(range(num_training_examples)).map(preprocess_squad, batched=False, num_proc=USABLE_CPU_CORES, remove_columns=squad["train"].column_names)
squad_val = squad["validation"].map(preprocess_squad, batched=False, num_proc=USABLE_CPU_CORES, remove_columns=squad["validation"].column_names)

In [None]:
openbookqa_train = openbookqa["train"].select(range(num_training_examples)).map(preprocess_openbookqa, batched=False, num_proc=USABLE_CPU_CORES, remove_columns=openbookqa["train"].column_names).filter( lambda x: (x.get("clarity", 0) > 1) and bool(x.get("answer"))).remove_columns(["clarity", "answer"])
openbookqa_val = openbookqa["validation"].map(preprocess_openbookqa, batched=False, num_proc=USABLE_CPU_CORES, remove_columns=openbookqa["validation"].column_names).filter( lambda x: (x.get("clarity", 0) > 1) and bool(x.get("answer"))).remove_columns(["clarity", "answer"])

In [None]:
boolq_train = boolq["train"].map(preprocess_boolq, batched=False, num_proc=USABLE_CPU_CORES, remove_columns=boolq["train"].column_names)
boolq_val = boolq["validation"].map(preprocess_boolq, batched=False, num_proc=USABLE_CPU_CORES, remove_columns=boolq["validation"].column_names)

In [None]:
drop_train = drop["train"].select(range(num_training_examples)).map(preprocess_drop, batched=False, num_proc=USABLE_CPU_CORES, remove_columns=drop["train"].column_names)
drop_val = drop["validation"].map(preprocess_drop, batched=False, num_proc=USABLE_CPU_CORES, remove_columns=drop["validation"].column_names)

In [None]:
# combining the datasets and shuffling them

# Combine and shuffle the datasets
# tokenized_train_dataset = concatenate_datasets([
#     hotpotqa_train,
#     squad_train,
#     openbookqa_train,
#     boolq_train,
#     drop_train
# ]).shuffle(seed=92)

# tokenized_val_dataset = concatenate_datasets([
#     hotpotqa_val,
#     squad_val,
#     openbookqa_val,
#     boolq_val,
#     drop_val
# ]).shuffle(seed=42)

tokenized_train_dataset = boolq_train
tokenized_val_dataset = boolq_val


In [None]:
# Save to Parquet (efficient for large datasets)

tokenized_train_dataset.to_parquet(tokenized_trainset_save_path)
tokenized_val_dataset.to_parquet(tokenized_valset_save_path)

In [1]:
# setting up the directory where we want to store the models
import os

print("Setting up the current working directory as the place where to host the transformers models downloaded from hugging face - " + os.getcwd())
os.environ['HF_HOME'] = "/home/aalla4"
# os.environ['TRANSFORMERS_CACHE'] = '/home/aalla4'

Setting up the current working directory as the place where to host the transformers models downloaded from hugging face - /home/aalla4


In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, TrainerCallback
from datasets import load_dataset, Dataset, concatenate_datasets
from peft import get_peft_model, LoraConfig, TaskType
import torch
import random

In [3]:
# getting the CPU and GPU count

print("Total number of logical cores = " + str(os.cpu_count()))  # This shows logical cores not the physical cores
LOGICAL_CORES = os.cpu_count()
USABLE_CPU_CORES = LOGICAL_CORES - 1    # YOU CAN CHANGE THIS ACCORDING TO THE CPU AVAILABILITIES

print("CUDA available: ", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")

Total number of logical cores = 48
CUDA available:  True
GPU name: NVIDIA A100-SXM4-80GB


In [4]:
# Load the model and tokenizer 

model_name = "google/flan-t5-base" 

# new model save path
save_path = "./T5base-flan-lora-adapter-custom-dataset"

tokenized_trainset_path = "./dataset/custom_dataset/tokenized_trainset.parquet"
# tokenized_valset_path = "./dataset/individual/hotpotqa/tokenized_valset.parquet"

tokenizer = T5Tokenizer.from_pretrained(model_name, legacy=False, device_map="auto")
model = T5ForConditionalGeneration.from_pretrained(model_name)

print(f"Max positional embeddings supported by model - {model_name}: ", model.config.n_positions)

tokenizer_input_max_length = 512
tokenizer_label_max_length = 250

Max positional embeddings supported by model - google/flan-t5-base:  512


In [5]:
# Load the Tokenized Dataset if Preprocessing is already done

# Load from Parquet
tokenized_train_dataset = Dataset.from_parquet(tokenized_trainset_path)
# tokenized_val_dataset = Dataset.from_parquet(tokenized_valset_path)

In [6]:
# Prepare LoRA configuration
lora_config = LoraConfig(
    r=8,                         # LoRA rank
    lora_alpha=16,               # LoRA scaling factor
    target_modules=["q", "v"],   # Typically for T5: q and v projections
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

# Apply LoRA to model
model = get_peft_model(model, lora_config)

In [10]:
# Defning the Training args 

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print("This is the current device: ", device)
print("Device updated to:", model.device)


training_args = TrainingArguments(
    output_dir=save_path,              # directory where the training logs, checkpoints, and evaluation results (like metrics) are saved during the training process   
    
    learning_rate=1e-5,
    warmup_steps=10,
    num_train_epochs=5,                         
    weight_decay=1e-3,                          

    # generally Batch size = per_device_train_batch_size * per_device_train_batch_size
    per_device_train_batch_size=4,                       # ****** if using GPU's = 16; if using CPU's = 1 or 2
    # per_device_eval_batch_size=8,
    # gradient_accumulation_steps=2,                     # Increase this if you need to simulate larger batch sizes, without running into 'Out or Memory' errors when memory is limited
    

    # this is not useful for CPU based training as hugging face trainer handles multi-core utilization automatically based on the system configuration
    dataloader_num_workers= USABLE_CPU_CORES,          # ****** for optimal use of CPU and not wasting GPU time [ this helps in loading the next batch of data into the VRAM ]

    # Print validation loss every epoch
    # eval_strategy="epoch",            

    # Print and logs the training loss of the training data
    logging_strategy="steps",   
    logging_steps=10,                                   # ****** if using GPU = 100; if using CPU = 1 or 2 

    # saves model at the end of every epoch
    save_strategy="epoch",            
    save_total_limit=1,
    # save_total_limit=2,

    # report_to="none",  # Disable default logging
    
    logging_dir= save_path + "/logs",           # save logs to a directory
    report_to="tensorboard",                    # Reports to TensorBoard
    log_level='info',                           # Set logging level to 'info' to see the logs in the terminal
    # run this command in your terminal ~ tensorboard --logdir=./output_dir/runs
    # and open 'http://localhost:6006/' to monitor the logs [loss over the training]

    fp16=False                                   # ***** Mixed precision for faster training on A100; this won't work on CPU's
    
)

This is the current device:  cuda
Device updated to: cuda:0


In [11]:
# Logging in the Terminal
class LogCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            print(f"Step {state.global_step}: {logs}")

In [12]:
# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    
    train_dataset=tokenized_train_dataset,            # your test set
    # eval_dataset=tokenized_val_dataset,               # your validation set

    # train_dataset=tokenized_train_dataset.select(range(100)),    
    # eval_dataset=tokenized_val_dataset.select(range(100)),
    
    callbacks=[LogCallback()]                   # *** to print the logs in the terminal
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [13]:
# Training the Model

# Train
trainer.train()

# Save the model
trainer.save_model(save_path)                           

tokenizer.save_pretrained(save_path)     

***** Running training *****
  Num examples = 328
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 410
  Number of trainable parameters = 884,736
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
10,47.2601
20,46.8314
30,45.1818
40,45.7363
50,46.4363
60,46.5481
70,46.0279
80,45.7577
90,45.7576
100,44.8286


Step 10: {'loss': 47.2601, 'grad_norm': 19.015579223632812, 'learning_rate': 9e-06, 'epoch': 0.12195121951219512}
Step 20: {'loss': 46.8314, 'grad_norm': 15.072036743164062, 'learning_rate': 9.775e-06, 'epoch': 0.24390243902439024}
Step 30: {'loss': 45.1818, 'grad_norm': 29.005718231201172, 'learning_rate': 9.525000000000001e-06, 'epoch': 0.36585365853658536}
Step 40: {'loss': 45.7363, 'grad_norm': 27.39539337158203, 'learning_rate': 9.275e-06, 'epoch': 0.4878048780487805}
Step 50: {'loss': 46.4363, 'grad_norm': 16.72978973388672, 'learning_rate': 9.025e-06, 'epoch': 0.6097560975609756}
Step 60: {'loss': 46.5481, 'grad_norm': 23.206083297729492, 'learning_rate': 8.775e-06, 'epoch': 0.7317073170731707}
Step 70: {'loss': 46.0279, 'grad_norm': 23.724437713623047, 'learning_rate': 8.525e-06, 'epoch': 0.8536585365853658}
Step 80: {'loss': 45.7577, 'grad_norm': 40.15237045288086, 'learning_rate': 8.275000000000001e-06, 'epoch': 0.975609756097561}


Saving model checkpoint to ./T5base-flan-lora-adapter-custom-dataset/checkpoint-82
loading configuration file config.json from cache at /home/aalla4/hub/models--google--flan-t5-base/snapshots/7bcac572ce56db69c1ea7c8af255c5d7c9672fc2/config.json
Model config T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty

Step 90: {'loss': 45.7576, 'grad_norm': 14.941641807556152, 'learning_rate': 8.025e-06, 'epoch': 1.0975609756097562}
Step 100: {'loss': 44.8286, 'grad_norm': 41.72953796386719, 'learning_rate': 7.775000000000001e-06, 'epoch': 1.2195121951219512}
Step 110: {'loss': 44.7675, 'grad_norm': 115.38069915771484, 'learning_rate': 7.525e-06, 'epoch': 1.3414634146341464}
Step 120: {'loss': 43.9772, 'grad_norm': 35.55817413330078, 'learning_rate': 7.275000000000001e-06, 'epoch': 1.4634146341463414}
Step 130: {'loss': 44.7124, 'grad_norm': 19.518259048461914, 'learning_rate': 7.0250000000000005e-06, 'epoch': 1.5853658536585367}
Step 140: {'loss': 43.0448, 'grad_norm': 23.692852020263672, 'learning_rate': 6.775e-06, 'epoch': 1.7073170731707317}
Step 150: {'loss': 43.8796, 'grad_norm': 19.552507400512695, 'learning_rate': 6.525e-06, 'epoch': 1.8292682926829267}
Step 160: {'loss': 44.6079, 'grad_norm': 32.34619140625, 'learning_rate': 6.275e-06, 'epoch': 1.951219512195122}


Saving model checkpoint to ./T5base-flan-lora-adapter-custom-dataset/checkpoint-164
loading configuration file config.json from cache at /home/aalla4/hub/models--google--flan-t5-base/snapshots/7bcac572ce56db69c1ea7c8af255c5d7c9672fc2/config.json
Model config T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalt

Step 170: {'loss': 44.3425, 'grad_norm': 30.581275939941406, 'learning_rate': 6.025000000000001e-06, 'epoch': 2.073170731707317}
Step 180: {'loss': 44.9565, 'grad_norm': 461.8547668457031, 'learning_rate': 5.775000000000001e-06, 'epoch': 2.1951219512195124}
Step 190: {'loss': 44.3882, 'grad_norm': 15.751220703125, 'learning_rate': 5.5250000000000005e-06, 'epoch': 2.317073170731707}
Step 200: {'loss': 42.0071, 'grad_norm': 45.706363677978516, 'learning_rate': 5.275e-06, 'epoch': 2.4390243902439024}
Step 210: {'loss': 42.2767, 'grad_norm': 15.913215637207031, 'learning_rate': 5.025e-06, 'epoch': 2.5609756097560976}
Step 220: {'loss': 43.873, 'grad_norm': 57.85839080810547, 'learning_rate': 4.775e-06, 'epoch': 2.682926829268293}
Step 230: {'loss': 42.9164, 'grad_norm': 36.830379486083984, 'learning_rate': 4.525000000000001e-06, 'epoch': 2.8048780487804876}
Step 240: {'loss': 43.0955, 'grad_norm': 22.85969352722168, 'learning_rate': 4.2750000000000006e-06, 'epoch': 2.926829268292683}


Saving model checkpoint to ./T5base-flan-lora-adapter-custom-dataset/checkpoint-246
loading configuration file config.json from cache at /home/aalla4/hub/models--google--flan-t5-base/snapshots/7bcac572ce56db69c1ea7c8af255c5d7c9672fc2/config.json
Model config T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalt

Step 250: {'loss': 42.0319, 'grad_norm': 46.067501068115234, 'learning_rate': 4.0250000000000004e-06, 'epoch': 3.048780487804878}
Step 260: {'loss': 44.5769, 'grad_norm': 35.68975067138672, 'learning_rate': 3.7750000000000003e-06, 'epoch': 3.1707317073170733}
Step 270: {'loss': 42.709, 'grad_norm': 24.773191452026367, 'learning_rate': 3.525e-06, 'epoch': 3.292682926829268}
Step 280: {'loss': 44.4552, 'grad_norm': 56.44078063964844, 'learning_rate': 3.2750000000000004e-06, 'epoch': 3.4146341463414633}
Step 290: {'loss': 42.2936, 'grad_norm': 47.7620849609375, 'learning_rate': 3.0250000000000003e-06, 'epoch': 3.5365853658536586}
Step 300: {'loss': 42.5762, 'grad_norm': 28.907278060913086, 'learning_rate': 2.7750000000000005e-06, 'epoch': 3.658536585365854}
Step 310: {'loss': 43.0203, 'grad_norm': 23.486928939819336, 'learning_rate': 2.5250000000000004e-06, 'epoch': 3.7804878048780486}
Step 320: {'loss': 43.6126, 'grad_norm': 63.90212631225586, 'learning_rate': 2.2750000000000002e-06, 'ep

Saving model checkpoint to ./T5base-flan-lora-adapter-custom-dataset/checkpoint-328
loading configuration file config.json from cache at /home/aalla4/hub/models--google--flan-t5-base/snapshots/7bcac572ce56db69c1ea7c8af255c5d7c9672fc2/config.json
Model config T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalt

Step 330: {'loss': 41.1653, 'grad_norm': 18.159597396850586, 'learning_rate': 2.025e-06, 'epoch': 4.024390243902439}
Step 340: {'loss': 42.5773, 'grad_norm': 21.777467727661133, 'learning_rate': 1.7750000000000002e-06, 'epoch': 4.146341463414634}
Step 350: {'loss': 41.805, 'grad_norm': 13.325637817382812, 'learning_rate': 1.525e-06, 'epoch': 4.2682926829268295}
Step 360: {'loss': 42.1247, 'grad_norm': 16.249387741088867, 'learning_rate': 1.275e-06, 'epoch': 4.390243902439025}
Step 370: {'loss': 42.7134, 'grad_norm': 43.47506332397461, 'learning_rate': 1.025e-06, 'epoch': 4.512195121951219}
Step 380: {'loss': 42.0563, 'grad_norm': 26.96151351928711, 'learning_rate': 7.750000000000001e-07, 'epoch': 4.634146341463414}
Step 390: {'loss': 43.2065, 'grad_norm': 28.312908172607422, 'learning_rate': 5.250000000000001e-07, 'epoch': 4.7560975609756095}
Step 400: {'loss': 41.5389, 'grad_norm': 13.661434173583984, 'learning_rate': 2.75e-07, 'epoch': 4.878048780487805}


Saving model checkpoint to ./T5base-flan-lora-adapter-custom-dataset/checkpoint-410
loading configuration file config.json from cache at /home/aalla4/hub/models--google--flan-t5-base/snapshots/7bcac572ce56db69c1ea7c8af255c5d7c9672fc2/config.json
Model config T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalt

Step 410: {'loss': 42.6645, 'grad_norm': 22.196388244628906, 'learning_rate': 2.5000000000000002e-08, 'epoch': 5.0}


Deleting older checkpoint [T5base-flan-lora-adapter-custom-dataset/checkpoint-328] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)




Step 410: {'train_runtime': 72.3832, 'train_samples_per_second': 22.657, 'train_steps_per_second': 5.664, 'total_flos': 1127459428761600.0, 'train_loss': 43.86192314334032, 'epoch': 5.0}


Saving model checkpoint to ./T5base-flan-lora-adapter-custom-dataset
loading configuration file config.json from cache at /home/aalla4/hub/models--google--flan-t5-base/snapshots/7bcac572ce56db69c1ea7c8af255c5d7c9672fc2/config.json
Model config T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      

('./T5base-flan-lora-adapter-custom-dataset/tokenizer_config.json',
 './T5base-flan-lora-adapter-custom-dataset/special_tokens_map.json',
 './T5base-flan-lora-adapter-custom-dataset/spiece.model',
 './T5base-flan-lora-adapter-custom-dataset/added_tokens.json')

In [18]:
# Running the finetuned question generation model with a sample context

from peft import LoraConfig, PeftModel
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# the finetuned model name
base_model_name = "google/flan-t5-base"
adapter_path = "./T5base-flan-lora-adapter-custom-dataset"

# Load model and tokenizer
# tokenizer = T5Tokenizer.from_pretrained(base_model_name)
# base_model = T5ForConditionalGeneration.from_pretrained(base_model_name)


# Load the LoRA config from adapter
lora_config = LoraConfig.from_pretrained(adapter_path)

# Load base model (T5 base in this case)
base_model = AutoModelForSeq2SeqLM.from_pretrained(lora_config.base_model_name_or_path)

# Apply LoRA adapter
model = PeftModel.from_pretrained(base_model, adapter_path)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(lora_config.base_model_name_or_path)
# peft doesn't change the model's tokenizer but why do we do this here, this guarantees consistency — the tokenizer used during LoRA fine-tuning and during inference remains the same.

# Set to eval mode
model.eval()

def get_question(tag, difficulty, context, answer="", num_questions=3, max_length=150):
    """
    Generate questions using the fine-tuned T5 model.
    
    Parameters:
    - tag: Type of question (e.g., "short answer", "multiple choice question", "true or false question")
    - difficulty: "easy", "medium", "hard"
    - context: Supporting context or passage
    - answer: Optional — if you want targeted question generation
    - num_questions: Number of diverse questions to generate
    - max_length: Max token length of generated output
    
    Returns:
    - List of generated questions as strings
    """
    # Format input text based on whether answer is provided
    answer_part = f"[{answer}]" if answer else ""
    input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99>{answer_part} {context}"

    # Tokenize
    features = tokenizer([input_text], return_tensors='pt')

    # Generate questions
    output = model.generate(
        input_ids=features['input_ids'],
        attention_mask=features['attention_mask'],
        max_length=max_length,
        num_return_sequences=num_questions,
        do_sample=True,
        top_p=0.95,
        top_k=50
    )

    # Decode generated questions
    for i, out in enumerate(output):
        question = tokenizer.decode(out, skip_special_tokens=True)
        print(f"Question {i+1}: {question}")
    
    print("------------------------------------------------")
    

loading configuration file config.json from cache at /home/aalla4/hub/models--google--flan-t5-base/snapshots/7bcac572ce56db69c1ea7c8af255c5d7c9672fc2/config.json
Model config T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_siz

In [19]:
# Giving the context and difficulty


context = "Reinforcement Learning (RL) is a dynamic area of machine learning where agents are trained to make a sequence of decisions by interacting with an environment. Each interaction leads to a new state and a scalar reward, which indicates the quality of the action taken. The agent’s objective is to learn an optimal policy that maximizes the total accumulated reward over time. This is different from supervised learning, which requires labeled datasets. In RL, learning is driven by experience and the agent often learns from delayed rewards, making the credit assignment problem a central challenge. The environment is often modeled as a Markov Decision Process (MDP), characterized by states, actions, transition dynamics, and rewards. Algorithms such as Q-learning, SARSA, and Policy Gradient methods are used to find optimal policies. Modern applications employ deep learning to approximate complex functions, giving rise to Deep Reinforcement Learning. Techniques like Deep Q-Networks (DQN), Proximal Policy Optimization (PPO), and Actor-Critic methods have demonstrated state-of-the-art performance in domains ranging from game playing (e.g., Atari, Go) to robotics and recommendation systems. Exploration-exploitation trade-offs, sample efficiency, and generalization are ongoing challenges in the field. RL has significant potential in real-world decision-making systems."


difficulty = "hard"





In [20]:
# short answer question 

get_question(
    tag="short answer",
    difficulty=difficulty,
    context=context
)

Question 1: Reinforcement Learning
Question 2: Reinforcement Learning is a dynamic area of machine learning where agents are trained to make a sequence of decisions by interacting with an environment.
Question 3: Reinforcement learning
------------------------------------------------


In [None]:
# true or false question 

get_question(
    tag="true or false question",
    difficulty=difficulty,
    context=context
)

In [None]:
# multiple choice question 

get_question(
    tag="multiple choice question",
    difficulty=difficulty,
    context=context
)