In [None]:
# Install the third-party packages imported by later cells
# (datasets, peft, sentence-transformers).
!pip install datasets peft sentence-transformers

# uncomment the following only if you face version issues (e.g., CUDA errors, model not loading)
# !pip uninstall torch torchvision -y
# !pip install torch==2.1.2+cu121 torchvision==0.16.2+cu121 --extra-index-url https://download.pytorch.org/whl/cu121
# !pip install --force-reinstall transformers peft



Run the cell below only if your dataset is a single JSON array whose records are in a raw format like:

```json
{"input": "your question", "output": "your answer"}
```
This cell will convert it to the required JSON Lines format (one object per line):
```json
{"prompt": "<task> DepartmentQA: your question </s>", "target": "your answer"}
```

In [None]:
import json

input_file = "/content/MainDataset.json"
output_file = "deptqa.jsonl"

# Task-specific prefix and suffix wrapped around every question
PREFIX = "<task> DepartmentQA: "
SUFFIX = " </s>"


def _clean(value):
    """Return `value` as stripped text; a missing or null field becomes ""."""
    # str(None) would yield the literal text "None", so handle it explicitly.
    return "" if value is None else str(value).strip()


# Read the entire JSON array
with open(input_file, 'r', encoding='utf-8') as fin:
    data = json.load(fin)

if not isinstance(data, list):
    raise ValueError(
        f"{input_file} must contain a JSON array of records, "
        f"got {type(data).__name__}"
    )

written = 0
skipped = 0
with open(output_file, 'w', encoding='utf-8') as fout:
    for record in data:
        # Anything that is not an object cannot carry input/output fields.
        if not isinstance(record, dict):
            skipped += 1
            continue

        user_input = _clean(record.get("input"))
        user_output = _clean(record.get("output"))

        # A pair with an empty question or answer is useless as a training
        # example and as a lookup entry, so leave it out of the output.
        if not user_input or not user_output:
            skipped += 1
            continue

        prompt_text = f"{PREFIX}{user_input}{SUFFIX}"
        target_text = user_output

        new_record = {
            "prompt": prompt_text,
            "target": target_text
        }

        # One JSON object per line (JSON Lines); keep non-ASCII text readable.
        fout.write(json.dumps(new_record, ensure_ascii=False) + "\n")
        written += 1

print(f"Converted dataset saved to {output_file}")
if skipped:
    print(f"Wrote {written} records; skipped {skipped} incomplete or malformed records")


Converted dataset saved to deptqa.jsonl


In [None]:
# Mount Google Drive at /content/drive; ADAPTER_SAVE_DIR (training) and
# ADAPTER_DIR (inference) in later cells both point into this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainer as Trainer,
    Seq2SeqTrainingArguments as TrainingArguments,
    DataCollatorForSeq2Seq,
)
from peft import LoraConfig, get_peft_model, TaskType

# Turn off Weights & Biases logging via its environment switch
# (the training arguments additionally set report_to="none").
os.environ["WANDB_DISABLED"] = "true"
MODEL_NAME       = "google/flan-t5-base"  # checkpoint loaded for both the tokenizer and the base model
ADAPTER_SAVE_DIR = "/content/drive/MyDrive/flan_t5_dept_lora_small_final"  # used as output_dir and as the final save path
DATA_PATH        = "deptqa.jsonl" # Replace this with the path to your own JSONL dataset. A sample dataset format is provided in the GitHub repo.


In [None]:
# 1. Tokenizer & Base Model
# Load the tokenizer and register the "<task>" marker as an extra special token.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
task_tokens = {"additional_special_tokens": ["<task>"]}
tokenizer.add_special_tokens(task_tokens)

# Load the base model and resize its token embeddings to the tokenizer length,
# so the newly added token has an embedding row.
base_model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
vocab_size = len(tokenizer)
base_model.resize_token_embeddings(vocab_size)

Embedding(32101, 768)

In [None]:
# 2. LoRA Configuration
# Collect the adapter hyper-parameters in one place, then build the config.
lora_settings = dict(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    # LoRA is applied to modules named "q" and "v".
    target_modules=["q", "v"],
    # These modules are listed to be trained and saved in full alongside the
    # adapter (the embeddings were resized for the added "<task>" token).
    modules_to_save=["embed_tokens", "lm_head"],
)
lora_cfg = LoraConfig(**lora_settings)

# Wrap the base model and report how many parameters are trainable.
model = get_peft_model(base_model, lora_cfg)
model.print_trainable_parameters()

trainable params: 75,730,176 || all params: 323,266,560 || trainable%: 23.4265


In [None]:
# 3. Dataset & Tokenization
# Load the file at DATA_PATH with the "json" loader, registered under the
# "train" split name, and take that split.
ds = load_dataset("json", data_files={"train": DATA_PATH})["train"]

def tokenize_fn(batch, max_source_len=256, max_target_len=64, tok=None):
    """Tokenize one batch of prompt/target pairs for seq2seq training.

    Args:
        batch: Mapping with "prompt" and "target" keys, each holding a list of
            strings (what `Dataset.map(..., batched=True)` passes in).
        max_source_len: Prompts are truncated to at most this many tokens.
        max_target_len: Targets are truncated to at most this many tokens.
        tok: Tokenizer to use; defaults to the notebook-level `tokenizer`.

    Returns:
        The prompt encoding with an added "labels" entry holding the target
        token ids. Sequences are left unpadded: the DataCollatorForSeq2Seq
        used by the trainer pads each batch (labels with -100), which avoids
        padding every example to the maximum length up front.
    """
    tok = tokenizer if tok is None else tok

    # The tokenizer appends EOS itself, so a literal trailing "</s>" in the
    # text is dropped rather than added; otherwise the labels would end with
    # a duplicated EOS token.
    targets = []
    for text in batch["target"]:
        text = text.strip()
        if text.endswith("</s>"):
            text = text[: -len("</s>")].rstrip()
        targets.append(text)

    enc = tok(
        batch["prompt"],
        truncation=True,
        max_length=max_source_len,
    )
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context.
    label_enc = tok(
        text_target=targets,
        truncation=True,
        max_length=max_target_len,
    )
    enc["labels"] = label_enc["input_ids"]
    return enc

# Tokenize the whole dataset in batches of 32, then hold out 10% for
# evaluation (fixed seed so the split is the same on every run).
tok_ds = ds.map(tokenize_fn, batched=True, batch_size=32)
split = tok_ds.train_test_split(test_size=0.1, seed=42)
train_ds, eval_ds = split["train"], split["test"]
# len(ds) is the size before the split, so label it as the total and report
# the actual train/eval sizes next to it.
print("Total samples:", len(ds), "| train:", len(train_ds), "| eval:", len(eval_ds))

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/2823 [00:00<?, ? examples/s]



Total training samples: 2823


In [None]:
# 4. Training Arguments
import torch

# T5 checkpoints are prone to overflow under fp16 mixed precision, which
# yields non-finite losses; that is consistent with a training loss logged as
# 0.0 at every step. Use bf16 on GPUs with native support (compute capability
# >= 8.0) and fall back to full precision everywhere else.
use_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8

training_args = TrainingArguments(
    output_dir=ADAPTER_SAVE_DIR,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,  # 8 x 4 = 32 examples per optimizer step per device
    num_train_epochs=15,
    learning_rate=1e-4,
    weight_decay=0.01,
    optim="adamw_torch",
    fp16=False,
    bf16=use_bf16,
    gradient_checkpointing=True,
    save_strategy="epoch",
    load_best_model_at_end=False,  # Set to False since there's no eval to pick a best model
    report_to="none",
    logging_steps=100,
    label_smoothing_factor=0.1,
    # The PEFT wrapper hides the base model's arguments, so the Trainer cannot
    # infer the label column on its own; name it explicitly.
    label_names=["labels"],
)


In [None]:
# 5. Data Collator & Trainer
# The collator pads each batch to a multiple of 8 and fills label padding
# with -100.
collator_kwargs = dict(
    tokenizer=tokenizer,
    model=model,
    pad_to_multiple_of=8,
    label_pad_token_id=-100,
)
data_collator = DataCollatorForSeq2Seq(**collator_kwargs)

# Wire the model, arguments, both dataset splits, and the collator together.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer,
)


  trainer = Trainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# 6. Quick Gradient Test (optional)
# Take one collated batch, run a forward and backward pass, and report whether
# any parameter whose name contains "lora_" received a gradient.
batch = next(iter(trainer.get_train_dataloader()))
device_batch = {}
for key, tensor in batch.items():
    device_batch[key] = tensor.to(model.device)

outputs = model(**device_batch)
loss = outputs.loss
loss.backward()

lora_has_grad = False
for name, param in model.named_parameters():
    if "lora_" in name and param.grad is not None:
        lora_has_grad = True
        break
print("Gradient on LoRA adapters:", lora_has_grad)

# Clear the gradients produced by this check before real training starts.
model.zero_grad()


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Gradient on LoRA adapters: True


In [None]:
# 7. Train & Save
# Run the training loop configured by training_args.
trainer.train()
# Save the trainer's model to ADAPTER_SAVE_DIR (the same path used as output_dir).
trainer.save_model(ADAPTER_SAVE_DIR)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
100,0.0
200,0.0
300,0.0
400,0.0
500,0.0
600,0.0
700,0.0
800,0.0
900,0.0
1000,0.0




In [None]:
import json
import torch
import re
import numpy as np
from transformers import AutoTokenizer, T5ForConditionalGeneration
from peft import PeftModel
from sentence_transformers import SentenceTransformer

# ———————————————————————————————————————————————
# EXAMPLE INFERENCE / TESTING
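# • The questions below come from my dataset and were answered correctly; note that dataset questions are normally resolved by the lookup steps in answer_question (exact or semantic match), not by model generation.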
# • Replace these with your own test queries to see how your fine‑tuned model performs.
# ———————————————————————————————————————————————

# Canonical form used for the prompt lookup keys
def normalize(s: str) -> str:
    """Lower-case `s`, collapse each whitespace run to one space, and trim both ends."""
    lowered = s.lower()
    collapsed = re.sub(r'\s+', ' ', lowered)
    return collapsed.strip()

# Build the lookup table: normalized prompt -> stripped target.
# Each line of the JSONL file is parsed as one record; a later duplicate
# prompt overwrites an earlier one.
mapping = {}
with open("/content/deptqa.jsonl", 'r', encoding='utf-8') as fin:
    mapping.update(
        (normalize(rec["prompt"]), rec["target"].strip())
        for rec in map(json.loads, fin)
    )

# Load tokenizer, base model, and adapter
BASE_MODEL_NAME = "google/flan-t5-base"
ADAPTER_DIR     = "/content/drive/MyDrive/flan_t5_dept_lora_small_final"

# Register "<task>" again so the tokenizer length used for the resize below
# includes the added token.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
extra_tokens = {"additional_special_tokens": ["<task>"]}
tokenizer.add_special_tokens(extra_tokens)

base_model = T5ForConditionalGeneration.from_pretrained(BASE_MODEL_NAME)
base_model.resize_token_embeddings(len(tokenizer))

# Attach the saved adapter, switch to eval mode, and use the GPU when present.
model = PeftModel.from_pretrained(base_model, ADAPTER_DIR)
model = model.eval()
if torch.cuda.is_available():
    model = model.to("cuda")

# Semantic‑search setup
# Embed every known (normalized) prompt once. The embeddings are requested
# normalized, and answer_question compares them with a dot product.
sem_model = SentenceTransformer("all-MiniLM-L6-v2")
prompts = [*mapping]
prompt_embeddings = sem_model.encode(prompts, normalize_embeddings=True)

def answer_question(question: str, max_length=128, sim_threshold=0.75):
    """Answer a question by lookup first and by model generation as a fallback.

    Steps, in order:
      1. Exact match of the normalized prompt against `mapping`.
      2. Semantic match: the stored prompt with the highest embedding dot
         product is used when its score is at least `sim_threshold`.
      3. Beam-search generation with the fine-tuned model.

    Args:
        question: The user's question, without the task prefix.
        max_length: `max_length` passed to `model.generate`.
        sim_threshold: Minimum similarity for accepting a semantic match.

    Returns:
        The answer text: a stored target for steps 1-2, or the decoded
        generation for step 3.
    """
    prefix = "<task> DepartmentQA: "
    norm_p = normalize(prefix + question)
    full_p = prefix + question.strip() + " </s>"

    # 1) Exact match
    # The lookup keys are normalized *full* prompts, which include the " </s>"
    # suffix when the data comes from the conversion cell. Try that form first,
    # then the suffix-free form for datasets whose prompts carry no suffix.
    for key in (normalize(full_p), norm_p):
        if key in mapping:
            return mapping[key]

    # 2) Semantic match (skipped for an empty lookup table, where argmax
    # would raise on an empty array)
    if len(prompts) > 0:
        q_emb = sem_model.encode(norm_p, normalize_embeddings=True)
        sims = np.dot(prompt_embeddings, q_emb)
        best = int(np.argmax(sims))
        if sims[best] >= sim_threshold:
            return mapping[prompts[best]]

    # 3) Generation fallback
    inputs = tokenizer(full_p, return_tensors="pt", padding=True, truncation=True)
    # Follow the model's actual device instead of assuming "cuda".
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    # `temperature` is omitted: it only applies when sampling, and this call
    # uses beam search.
    gen_ids = model.generate(
        **inputs,
        max_length=max_length,
        num_beams=4,
        repetition_penalty=2.0,
        length_penalty=1.2,
        early_stopping=True,
        no_repeat_ngram_size=2,
    )
    return tokenizer.decode(gen_ids[0], skip_special_tokens=True)

# Sample test queries (from sample dataset)
sample_questions = [
    "What year was the college founded?",
    "When was the college established?",
    "What courses are offered in the CSE department?",
    "Who is the HOD of the CSE department?",
    "Who is the HOD of the ECE department?",
    "कॉलेज में कितने छात्र हैं?",
    "కళాశాల పేరు ఏమిటి?",
]

# Print each question followed by the answer produced for it.
for question in sample_questions:
    reply = answer_question(question)
    print(f"Q: {question}\nA: {reply}\n")


Q: What year was the college founded?
A: It was established in 2001.

Q: When was the college established?
A: The college was founded in 2001.

Q: What courses are offered in the CSE department?
A: The CSE department offers courses such as Data Structures, Algorithms, Computer Networks, Operating Systems, Artificial Intelligence, Machine Learning, and Web Development.

Q: Who is the HOD of the CSE department?
A: Dr. D. Jaya Kumari is the head of the CSE department.

Q: Who is the HOD of the ECE department?
A: Dr. E. Kusuma Kumari is the Head of the ECE Department at Sri Vasavi Engineering College.

Q: कॉलेज में कितने छात्र हैं?
A: कॉलेज में 4000 छात्र नामांकित हैं।

Q: కళాశాల పేరు ఏమిటి?
A: కళాశాల పూర్తి పేరు శ్రీ వాసవి ఇంజనీరింగ్ కళాశాల.

