In [1]:
# ============================================================
# End-to-end: Fine-tune Flan-T5 with LoRA on CSV ‚Üí Agentic RAG
# ============================================================
# Install dependencies (run once)
!pip install -q transformers datasets peft accelerate sentencepiece \
             langchain langchain-community langgraph faiss-cpu sentence-transformers

[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/2.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[91m‚ï∏[0m[90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.8/2.5 MB[0m [31m50.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m2.5/2.5 MB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m155.4/155.4 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m46.1/46.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m

In [2]:
# -------------------------
# 0. Imports & basic config
# -------------------------
import os, random, math, json
import pandas as pd
from datasets import Dataset, DatasetDict
import torch
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    TrainingArguments, Trainer, DataCollatorForSeq2Seq, pipeline
)
from peft import LoraConfig, get_peft_model
from tqdm.auto import tqdm

# For embeddings + vector DB
from sentence_transformers import SentenceTransformer
import faiss
# LangChain / LangGraph for agentic flow
from langchain import PromptTemplate, LLMChain
from langchain_community.llms import HuggingFacePipeline
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langgraph.graph import StateGraph, START, END
from typing import TypedDict

In [18]:
# -------------------------
# Config - change as needed
# -------------------------
CSV_PATH = "/content/Cleaned Dataset w-o code.csv"   # <--- adjust if different
FINETUNE_DIR = "./finetuned_flan_csv"
EPOCHS = 5
BS = 4                      # per device batch size (reduce for low VRAM)
LR = 3e-4
MAX_INPUT_LEN = 192
MAX_TARGET_LEN = 128
LORA_R = 8
SEED = 42
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.manual_seed(SEED)
random.seed(SEED)
os.makedirs(FINETUNE_DIR, exist_ok=True)

In [19]:
# -------------------------
# 1) Load & prepare CSV
# -------------------------
df = pd.read_csv(CSV_PATH)
print("Loaded CSV shape:", df.shape)
print("Columns:", df.columns.tolist())

# Pick input/output columns automatically:
# - prefer columns named 'input' and 'output'
# - else use first col as input and last col as target
if {"input", "output"}.issubset(df.columns):
    inp_col, out_col = "input", "output"
elif df.shape[1] >= 2:
    inp_col, out_col = df.columns[0], df.columns[-1]
else:
    raise ValueError("CSV must have at least 2 columns (input & target).")

# Fill NaNs and cast to strings to avoid Arrow errors
df = df[[inp_col, out_col]].fillna("").astype(str)
df = df.rename(columns={inp_col: "input_text", out_col: "target_text"})
print("Using columns:", "input_text -> target_text")
display(df.head(3))

# Create simple instruction-style prompt template for fine-tuning:
def make_pair(inp, tgt):
    # adapt template to your task; keep concise so the model focuses on content
    prompt = f"Instruction: Answer the question based on the CSV row data.\nInput: {inp}\n\nAnswer:"
    return prompt, tgt

pairs = [make_pair(i, t) for i, t in zip(df["input_text"], df["target_text"])]
train_df = pd.DataFrame(pairs, columns=["input_text", "target_text"])

# small train/test split
from sklearn.model_selection import train_test_split
train_pd, test_pd = train_test_split(train_df, test_size=0.06, random_state=SEED)
hf_dset = DatasetDict({
    "train": Dataset.from_pandas(train_pd.reset_index(drop=True)),
    "test": Dataset.from_pandas(test_pd.reset_index(drop=True))
})
print("Train/test sizes:", len(hf_dset["train"]), len(hf_dset["test"]))

Loaded CSV shape: (45, 13)
Columns: ['case_number', 'date', 'time', 'mine', 'owner', 'district', 'state', 'code', 'cause', 'fatalities', 'persons', 'narrative', 'summary']
Using columns: input_text -> target_text


Unnamed: 0,input_text,target_text
0,,On 16 May 2015 at Khetri Copper Mine (Jhunjhun...
1,2.0,On 28 November 2015 at Kayad Underground Mine ...
2,3.0,On 14 January 2015 at Chikla Manganese Mine (B...


Train/test sizes: 42 3


In [20]:
# -------------------------
# 2) Tokenizer & tokenization
# -------------------------
MODEL_NAME = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
# ensure pad token exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token":"<|pad|>"})

def preprocess(batch):
    inputs = tokenizer(batch["input_text"], truncation=True, padding="max_length", max_length=MAX_INPUT_LEN)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(batch["target_text"], truncation=True, padding="max_length", max_length=MAX_TARGET_LEN)
    inputs["labels"] = labels["input_ids"]
    return inputs

tokenized = hf_dset.map(preprocess, batched=True, remove_columns=hf_dset["train"].column_names)
tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
print("Sample tokenized batch:", {k: v.shape for k,v in tokenized["train"][0].items() if hasattr(v,'shape')})

Map:   0%|          | 0/42 [00:00<?, ? examples/s]



Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Sample tokenized batch: {'input_ids': torch.Size([192]), 'attention_mask': torch.Size([192]), 'labels': torch.Size([128])}


In [21]:
# -------------------------
# 3) Load model & apply LoRA (PEFT)
# -------------------------
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
# resize token embeddings if tokenizer added tokens
model.resize_token_embeddings(len(tokenizer))

lora_cfg = LoraConfig(
    r=LORA_R,
    lora_alpha=16,
    target_modules=["q", "v"],   # reasonable default for T5-style attention
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)
model = get_peft_model(model, lora_cfg)
model = model.to(device)
model.print_trainable_parameters()

trainable params: 884,736 || all params: 248,419,584 || trainable%: 0.3561


In [22]:
# -------------------------
# 4) TrainingArguments & Trainer
# -------------------------
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = TrainingArguments(
    output_dir=FINETUNE_DIR,
    per_device_train_batch_size=BS,
    per_device_eval_batch_size=BS,
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    save_total_limit=2,
    remove_unused_columns=False,
    push_to_hub=False,
    report_to="none",
    seed=SEED
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    data_collator=data_collator,
)


In [23]:
# -------------------------
# 5) Fine-tune
# -------------------------
print("Starting fine-tuning... (this can take time)")
trainer.train()
print("Training finished. Saving...")
model.save_pretrained(FINETUNE_DIR)
tokenizer.save_pretrained(FINETUNE_DIR)
print("Saved fine-tuned model at", FINETUNE_DIR)

Starting fine-tuning... (this can take time)




Epoch,Training Loss,Validation Loss
1,No log,16.103046
2,No log,14.27937
3,No log,13.090337
4,No log,12.413056
5,10.066200,12.146858




Training finished. Saving...




Saved fine-tuned model at ./finetuned_flan_csv


In [24]:
# ======================================================
# ‚úÖ 6) Quick Evaluation of Fine-Tuned Flan-T5 (LoRA)
# ======================================================

from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch

BASE = "google/flan-t5-base"
ADAPTER_DIR = "./finetuned_flan_csv"   # folder with your LoRA adapter + tokenizer
SEED = 42
torch.manual_seed(SEED)

# ‚úÖ Load tokenizer from adapter folder (includes pad token from training)
tokenizer = AutoTokenizer.from_pretrained(ADAPTER_DIR, use_fast=True)

# ‚úÖ Load base model and resize embeddings to match adapter tokenizer
base_model = AutoModelForSeq2SeqLM.from_pretrained(BASE)
base_model.resize_token_embeddings(len(tokenizer))

# ‚úÖ Attach the LoRA adapter weights
model = PeftModel.from_pretrained(base_model, ADAPTER_DIR)

# ‚úÖ Create a text2text-generation pipeline
gen_pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

print("‚úÖ Model & pipeline loaded successfully.")

# ------------------------------------------------------
# Simple evaluation: generate predictions on test samples
# ------------------------------------------------------

def gen_from_input(inp: str) -> str:
    """Generate model prediction for one input string."""
    prompt = (
        f"Instruction: Answer the question based on the CSV row data.\n"
        f"Input: {inp}\n\nAnswer:"
    )
    out = gen_pipe(
        prompt,
        max_new_tokens=128,
        num_return_sequences=1,
        temperature=0.3,
        do_sample=False,
    )[0]["generated_text"]
    return out.strip()

# Sample a few test examples from your held-out test set (from Section 5)
test_samples = test_pd.sample(min(20, len(test_pd)), random_state=SEED)

matches = 0
for i, row in test_samples.iterrows():
    inp = row["input_text"]
    tgt = row["target_text"]
    pred = gen_from_input(inp)
    print(f"\nüîπ Example {i+1}")
    print(f"Input: {inp[:150]}...")
    print(f"Target: {tgt}")
    print(f"Prediction: {pred}")
    if tgt.lower() in pred.lower():
        matches += 1

print(f"\n‚úÖ Simple containment accuracy: {matches}/{len(test_samples)} "
      f"= {matches/len(test_samples):.3f}")


Device set to use cpu


‚úÖ Model & pipeline loaded successfully.

üîπ Example 40
Input: Instruction: Answer the question based on the CSV row data.
Input: 40.0

Answer:...
Target: On 16 August 2015 at Tripura Drilling Mine (West Tripura, Tripura), a Dy.S.Engineer, Deb Das Chkraborty (58 M), was fatally injured when an 81kg diving board extension fell 26m. The accident occurred during drill pipe operations and was in contravention of OMR, 1984 regulations 23(3) and 25(3).
Prediction: 0

üîπ Example 26
Input: Instruction: Answer the question based on the CSV row data.
Input: 26.0

Answer:...
Target: On 25 June 2015 at Billi Markundi Stone Mine (Sonebhadra, Uttar Pradesh), a Contract Labour, Nisha Kumari (19 F), was instantly killed when her scarf got entangled in an unguarded compressor belt-drive. The accident contravened Regulation 174(2) & (5) of the Metalliferous Mines Regulations, 1961, as the moving parts were not fenced.
Prediction: 

üîπ Example 27
Input: Instruction: Answer the question based on th

In [25]:
# ===========================
# Sections 7, 8 and 9 (full)
# ===========================
import os, json, re
from typing import TypedDict
import faiss
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.llms import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain
from langgraph.graph import StateGraph, START, END
from transformers import pipeline

# ---- Assumptions: these exist from previous steps ----
# df         : pandas.DataFrame with your CSV rows
# gen_pipe   : pipeline("text2text-generation", model=..., tokenizer=..., device=...)
# FINETUNE_DIR: path to save artifacts (string)
# If any of these are missing, set them appropriately before running.

if "df" not in globals():
    raise RuntimeError("df (DataFrame) not found in globals. Load your CSV into df before running this cell.")
if "gen_pipe" not in globals():
    raise RuntimeError("gen_pipe not found. Ensure you created the generation pipeline in Section 6.")

os.makedirs(FINETUNE_DIR, exist_ok=True)

# -------------------------
# 7) Build FAISS vector store
# -------------------------
def row_to_text(row):
    # convert a pandas Series (row) to a single string
    return " | ".join([f"{col}: {row[col]}" for col in row.index.tolist()])

docs = [row_to_text(row) for _, row in df.iterrows()]
print(f"Prepared {len(docs)} documents for FAISS.")

# Use sentence-transformers mini model for embeddings (small & fast)
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
embedder = HuggingFaceEmbeddings(model_name=EMBED_MODEL)

# Build LangChain FAISS vectorstore (wraps a faiss index internally)
lc_faiss = FAISS.from_texts(docs, embedding=embedder)
print("‚úÖ FAISS vectorstore built (LangChain wrapper).")

# For optional raw faiss index saving later, access lc_faiss.index
try:
    raw_index = lc_faiss.index
except AttributeError:
    raw_index = None
    print("‚ö†Ô∏è Could not find raw index attribute on lc_faiss (may be implementation-specific).")




Prepared 45 documents for FAISS.
‚úÖ FAISS vectorstore built (LangChain wrapper).


In [26]:
# -------------------------
# 8) Build LangGraph Agentic RAG
# -------------------------
# wrap the pipeline into a LangChain LLM wrapper
llm_hf = HuggingFacePipeline(pipeline=gen_pipe)

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=(
        "You are a data expert. Use ONLY the provided CSV context to answer the question. "
        "If the context does not contain the answer, say you are not sure.\n\n"
        "Context:\n{context}\n\nQuestion: {question}\n\nAnswer:"
    ),
)
qa_chain = LLMChain(llm=llm_hf, prompt=prompt)

class RAGState(TypedDict):
    question: str
    context: str
    answer: str
    attempt: int

def retrieve_node(state: RAGState) -> RAGState:
    q = state["question"]
    # get top-k docs (k adjustable)
    results = lc_faiss.similarity_search(q, k=6)
    context = "\n".join([doc.page_content for doc in results])
    return {"question": q, "context": context, "answer": "", "attempt": 0}

def filter_node(state: RAGState) -> RAGState:
    # If question contains a year (e.g., 2015) filter context lines to that year
    years = re.findall(r"\b(19|20)\d{2}\b", state["question"])
    if years:
        # extract full year strings from question
        matched_years = re.findall(r"\b(19|20)\d{2}\b", state["question"])
        # matched_years returns list of captured groups; re-run to get full
        matches_full = re.findall(r"\b(?:19|20)\d{2}\b", state["question"])
        if matches_full:
            filtered = [line for line in state["context"].split("\n") if any(year in line for year in matches_full)]
            if filtered:
                return {**state, "context": "\n".join(filtered)}
    return state

def reason_node(state: RAGState) -> RAGState:
    ans = qa_chain.run(context=state["context"], question=state["question"])
    return {**state, "answer": ans}

def reflect_node(state: RAGState) -> RAGState:
    # Simple reflection: if question asked for a year and answer doesn't mention it,
    # attempt one retry with a more explicit instruction.
    years = re.findall(r"\b(?:19|20)\d{2}\b", state["question"])
    if years and state.get("attempt", 0) < 1 and not any(y in state["answer"] for y in years):
        refl_prompt = PromptTemplate(
            input_variables=["context", "question", "prev_answer"],
            template=(
                "Previous answer:\n{prev_answer}\n\n"
                "Using the same context below, re-check the answer and correct it such that the final answer "
                "explicitly references the requested year if the data supports it. If not supported, say 'not enough data'.\n\n"
                "Context:\n{context}\n\nQuestion: {question}\n\nRevised Answer:"
            )
        )
        refl_chain = LLMChain(llm=llm_hf, prompt=refl_prompt)
        revised = refl_chain.run(context=state["context"], question=state["question"], prev_answer=state["answer"])
        return {**state, "answer": revised, "attempt": state.get("attempt", 0) + 1}
    return state

def output_node(state: RAGState) -> RAGState:
    print("\nüß† Retrieved context (snippet):\n", (state["context"][:1000] + "...") if len(state["context"])>1000 else state["context"])
    print("\nüí¨ Final Answer:\n", state["answer"])
    return state

# Build the StateGraph
graph = StateGraph(RAGState)
graph.add_node("Retriever", retrieve_node)
graph.add_node("Filter", filter_node)
graph.add_node("Reasoner", reason_node)
graph.add_node("Reflector", reflect_node)
graph.add_node("Output", output_node)

graph.add_edge(START, "Retriever")
graph.add_edge("Retriever", "Filter")
graph.add_edge("Filter", "Reasoner")
graph.add_edge("Reasoner", "Reflector")
graph.add_edge("Reflector", "Output")
graph.add_edge("Output", END)

rag_agent = graph.compile()
print("‚úÖ LangGraph RAG agent compiled.")


‚úÖ LangGraph RAG agent compiled.


In [28]:
# -------------------------
# 9) Run queries & save artifacts
# -------------------------
queries = [
    "What is the main cause of accidents in 2015?",
    "List the cause and place of accident for case_number 23.",
    "Which place has the most fatalities?"
]

for q in queries:
    print("\n" + "="*60)
    print("QUERY:", q)
    rag_agent.invoke({"question": q, "context": "", "answer": "", "attempt": 0})

# Save FAISS index and docs for reuse
# Try to save raw index if available
if raw_index is not None:
    index_path = os.path.join(FINETUNE_DIR, "faiss.index")
    faiss.write_index(raw_index, index_path)
    print("‚úÖ Saved raw FAISS index to", index_path)
# Save docs JSON (rows -> texts)
docs_path = os.path.join(FINETUNE_DIR, "docs.json")
with open(docs_path, "w", encoding="utf-8") as f:
    json.dump(docs, f, ensure_ascii=False, indent=2)
print("‚úÖ Saved docs to", docs_path)

print("All done.")



QUERY: What is the main cause of accidents in 2015?

üß† Retrieved context (snippet):
 input_text: 25.0 | target_text: On 25 February 2015 at Billi Markundi Stone mine (Sonebhadra, Uttar Pradesh), a Labour, Anita Kumari (22 F), was fatally strangulated when her scarf got entangled in an unguarded compressor belt-drive. The accident was in contravention of Regulations 174(2) & (5) (regarding fencing of machinery) and 34(7)(a) (regarding mine management) of the Metalliferous Mines Regulations, 1961.
input_text: 38.0 | target_text: On 18 December 2015 at Geleki Production Oil Mine (Sibsagar, Assam), a Scrapper Mazdoor, Kanak Hatimuria (40 M), was fatally injured after losing his balance and falling 3.5m onto an iron gas pipe. The accident was in contravention of OMR 84 regulations (Reg. 18(3), (98)) and OMR' 84 regulations (Reg. 16(1), 27, 87, 88) regarding safe operations and personal protective equipment.
input_text: 35.0 | target_text: On 07 July 2015 at Agaria Marble Mine (Rajsamand