In [None]:
import os
os.environ["EURI_API_KEY"] = "use your own key"

In [2]:
# ================================
# Namma Metro RAG Demo (with 50% cutoff message)
# ================================
# REQUIREMENTS:
#   pip install pandas numpy nltk requests faiss-cpu
#
# IMPORTANT: Do NOT hardcode your key in code.
#   import os; os.environ["EURI_API_KEY"] = "YOUR_KEY"
# ================================

import os
import json
import re
import string
import unicodedata
from typing import List, Dict, Tuple, Any

import pandas as pd
import numpy as np
import requests

# --- Read API key from env (preferred). ---
EURI_API_KEY = os.getenv("EURI_API_KEY", "")
if not EURI_API_KEY:
    raise ValueError("EURI_API_KEY environment variable not set. Please set it to your API key.")


In [3]:
# -----------------------------
# 1) Dataset (original text)
# -----------------------------
import os
dir_path = "C:\Data Science\Assignments\Final\metro_rag_application\data"
file_path = os.path.join(dir_path, "dataset.txt")

with open(file_path, "r", encoding="utf-8") as f:
    dataset = f.read()
print(dataset)

1. A Heroic Moment—Fire Averted at Hebbagodi

On the night of August 23, 2025, a crisis was averted at the newly inaugurated Hebbagodi Metro station on the Yellow Line. Around 11:50 PM, a car stalled beneath the elevated station on the waterlogged Hosur Road. Smoke began billowing from the engine bay—a potential fire hazard. That’s when Shivakumar, the on-duty security guard, sprang into action. He grabbed a fire extinguisher from inside the station and swiftly doused the flames, likely caused by a short circuit. His quick response prevented damage to metro infrastructure and avoided additional traffic disruption. Officials praised the effectiveness of routine fire safety training in empowering such timely action. 
Hindustan Times
This incident underscores the vulnerability of metro infrastructure to external hazards and highlights how preparedness and vigilance—especially at newly operational stations—are crucial in maintaining safety.

2. Managing Crush: Barricades at RV Road Station

In [4]:
# Optional: save dataset to CSV
pd.DataFrame({"text": [dataset]}).to_csv("namma_metro_dataset.csv", index=False)

In [5]:
# -----------------------------
# 2) Text cleaning
# -----------------------------
import nltk
nltk.download("punkt", quiet=True)
nltk.download("stopwords", quiet=True)
nltk.download("wordnet", quiet=True)
nltk.download("omw-1.4", quiet=True)

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text(
    text: str,
    lower: bool = True,
    remove_punct: bool = True,
    remove_stop: bool = True,
    lemmatize_tokens: bool = True,
    remove_numbers: bool = True,
    keep_only_alpha: bool = True
):
    text = unicodedata.normalize("NFKD", text)
    if lower:
        text = text.lower()
    if remove_punct:
        text = text.translate(str.maketrans("", "", string.punctuation))
    tokens = nltk.word_tokenize(text)

    cleaned_tokens: List[str] = []
    for token in tokens:
        if remove_numbers and token.isdigit():
            continue
        if keep_only_alpha and not token.isalpha():
            continue
        if remove_stop and token in stop_words:
            continue
        if lemmatize_tokens:
            token = lemmatizer.lemmatize(token)
        cleaned_tokens.append(token)
    return " ".join(cleaned_tokens)

cleaned_data = clean_text(dataset)

print(f"Cleaned length: {len(cleaned_data)}")
print(f"Clean data: {cleaned_data}")



Cleaned length: 5939
Clean data: heroic averted hebbagodi night august crisis averted newly inaugurated hebbagodi metro station yellow line around pm car stalled beneath elevated station waterlogged hosur road smoke began billowing engine potential fire hazard shivakumar onduty security guard sprang action grabbed fire extinguisher inside station swiftly doused flame likely caused short circuit quick response prevented damage metro infrastructure avoided additional traffic disruption official praised effectiveness routine fire safety training empowering timely action hindustan time incident underscore vulnerability metro infrastructure external hazard highlight preparedness newly operational crucial maintaining safety managing crush barricade rv road station opening yellow line significantly boosted ridership particularly rv road interchange passenger transfer green line head toward electronics city daily interchange volume surged estimated passenger manage resulting crowding bmrcl ins

In [6]:
# -----------------------------
# 3) Chunking
# -----------------------------
def chunk_text(text: str, max_char: int = 500, overlap: int = 100) -> List[str]:
    assert max_char > overlap >= 0, "max_char must be > overlap >= 0"
    chunks: List[str] = []
    i = 0
    n = len(text)
    while i < n:
        piece = text[i : i + max_char]
        chunks.append(piece)
        i += max_char - overlap
    return chunks

In [7]:
chunks = chunk_text(cleaned_data, max_char=500, overlap=100)
print(f"Total chunks created: {len(chunks)}")
print(f"chunk preview: {chunks}")

Total chunks created: 15
chunk preview: ['heroic averted hebbagodi night august crisis averted newly inaugurated hebbagodi metro station yellow line around pm car stalled beneath elevated station waterlogged hosur road smoke began billowing engine potential fire hazard shivakumar onduty security guard sprang action grabbed fire extinguisher inside station swiftly doused flame likely caused short circuit quick response prevented damage metro infrastructure avoided additional traffic disruption official praised effectiveness routine fire ', 'ro infrastructure avoided additional traffic disruption official praised effectiveness routine fire safety training empowering timely action hindustan time incident underscore vulnerability metro infrastructure external hazard highlight preparedness newly operational crucial maintaining safety managing crush barricade rv road station opening yellow line significantly boosted ridership particularly rv road interchange passenger transfer green line hea

In [8]:
# 4) Embeddings via Euron API
#now generate embeddings for each chunk using euron api
import requests
import numpy as np

def generate_embeddings(text):
    url = "https://api.euron.one/api/v1/euri/embeddings"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"{EURI_API_KEY}"
    }
    payload = {
        "input": text,
        "model": "text-embedding-3-small"
    }

    response = requests.post(url, headers=headers, json=payload)
    data = response.json()
    
    embedding = np.array(data['data'][0]['embedding'])
    
    return embedding

In [9]:
def create_embeddings(all_chunks: List[str]):
    emb_list: List[np.ndarray] = []
    meta: List[Dict[str, Any]] = []
    for idx, chunk in enumerate(all_chunks):
        vec = generate_embeddings(chunk)
        emb_list.append(vec.astype("float32"))
        meta.append({"id": idx, "text": chunk})
    return emb_list, meta

emb_list, meta = create_embeddings(chunks)

In [10]:
emb_list

[array([ 0.00190056, -0.02314615,  0.01010014, ..., -0.01120654,
         0.04015619,  0.00729002], dtype=float32),
 array([ 0.00660523, -0.01757628,  0.04887142, ..., -0.00424478,
        -0.01780224,  0.01797978], dtype=float32),
 array([-0.01517036, -0.02738903,  0.06595332, ...,  0.00469526,
        -0.01883596,  0.01227358], dtype=float32),
 array([ 0.00380984, -0.03213087,  0.06771402, ...,  0.00286046,
         0.01881493,  0.01088701], dtype=float32),
 array([0.01935416, 0.00617793, 0.06675509, ..., 0.00952975, 0.01972918,
        0.00410188], dtype=float32),
 array([0.04742255, 0.03163421, 0.07885545, ..., 0.01760012, 0.03873752,
        0.02069165], dtype=float32),
 array([ 0.01055104,  0.03592876,  0.05575153, ..., -0.0038447 ,
        -0.00438673, -0.00160084], dtype=float32),
 array([ 0.01873054, -0.00930195,  0.04663005, ..., -0.00080458,
        -0.01184115,  0.00035282], dtype=float32),
 array([ 0.02773855, -0.04637218,  0.04445823, ...,  0.02564687,
        -0.01295331

In [11]:
meta

[{'id': 0,
  'text': 'heroic averted hebbagodi night august crisis averted newly inaugurated hebbagodi metro station yellow line around pm car stalled beneath elevated station waterlogged hosur road smoke began billowing engine potential fire hazard shivakumar onduty security guard sprang action grabbed fire extinguisher inside station swiftly doused flame likely caused short circuit quick response prevented damage metro infrastructure avoided additional traffic disruption official praised effectiveness routine fire '},
 {'id': 1,
  'text': 'ro infrastructure avoided additional traffic disruption official praised effectiveness routine fire safety training empowering timely action hindustan time incident underscore vulnerability metro infrastructure external hazard highlight preparedness newly operational crucial maintaining safety managing crush barricade rv road station opening yellow line significantly boosted ridership particularly rv road interchange passenger transfer green line h

In [12]:
# 5) FAISS index save/load (cosine via IP)
# -----------------------------
import faiss

def save_faiss_index(
    emb_list: List[np.ndarray],
    meta: List[Dict[str, Any]],
    index_path: str = "index_vecass1.faiss",
    meta_path: str = "meta_vecass1.jsonl"
) -> None:
    xb = np.vstack(emb_list).astype("float32")
    faiss.normalize_L2(xb)
    dim = xb.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(xb)
    faiss.write_index(index, index_path)

    with open(meta_path, "w", encoding="utf-8") as f:
        for item in meta:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

    print(f"FAISS index saved -> {index_path}")
    print(f"Metadata saved   -> {meta_path} (records: {len(meta)})")

In [13]:
save_faiss_index(emb_list, meta, "index_vecass1.faiss", "meta_vecass1.jsonl")

FAISS index saved -> index_vecass1.faiss
Metadata saved   -> meta_vecass1.jsonl (records: 15)


In [14]:
# Load FAISS index and metadata
from typing import Tuple, List, Dict, Any

def load_faiss_index(index_path: str, meta_path: str):
    index = faiss.read_index(index_path)
    meta_list: List[Dict[str, Any]] = []
    with open(meta_path, "r", encoding="utf-8") as f:
        for line in f:
            meta_list.append(json.loads(line))
    print(f"Loaded index: {index_path} | metadata records: {len(meta_list)}")
    return index, meta_list




In [15]:
index, meta = load_faiss_index("index_vecass1.faiss", "meta_vecass1.jsonl")

Loaded index: index_vecass1.faiss | metadata records: 15


In [16]:
index

<faiss.swigfaiss_avx2.IndexFlat; proxy of <Swig Object of type 'faiss::IndexFlat *' at 0x0000017364F8BC90> >

In [17]:
meta

[{'id': 0,
  'text': 'heroic averted hebbagodi night august crisis averted newly inaugurated hebbagodi metro station yellow line around pm car stalled beneath elevated station waterlogged hosur road smoke began billowing engine potential fire hazard shivakumar onduty security guard sprang action grabbed fire extinguisher inside station swiftly doused flame likely caused short circuit quick response prevented damage metro infrastructure avoided additional traffic disruption official praised effectiveness routine fire '},
 {'id': 1,
  'text': 'ro infrastructure avoided additional traffic disruption official praised effectiveness routine fire safety training empowering timely action hindustan time incident underscore vulnerability metro infrastructure external hazard highlight preparedness newly operational crucial maintaining safety managing crush barricade rv road station opening yellow line significantly boosted ridership particularly rv road interchange passenger transfer green line h

In [18]:
# 6) Search + threshold logic
# -----------------------------
THRESHOLD = 0.30  # 50%

def search_faiss(index: faiss.IndexFlatIP, query: str, top_k: int = 6):
    q = generate_embeddings(query).reshape(1, -1).astype("float32")
    faiss.normalize_L2(q)
    scores, idxs = index.search(q, top_k)
    return scores, idxs

In [19]:
def results_to_paragraph(
    data_output: Tuple[np.ndarray, np.ndarray],
    meta: List[Dict[str, Any]],
    threshold: float = THRESHOLD
):
    scores, idxs = data_output
    scores = np.asarray(scores).ravel().astype(float)
    idxs = np.asarray(idxs).ravel().astype(int)

    meta_by_id = {int(m["id"]): m["text"] for m in meta}

    selected: List[str] = []
    for i, s in zip(idxs, scores):
        if s >= threshold and i in meta_by_id:
            selected.append(meta_by_id[i])

    # Custom message per your requirement:
    if not selected:
        return "I am not trend on given data"

    return " ".join(selected)

In [20]:
def retrieve_context(
    data_output: Tuple[np.ndarray, np.ndarray],
    meta: List[Dict[str, Any]],
    top_n: int = 3,
    threshold: float = THRESHOLD
):
    scores, idxs = data_output
    scores = np.asarray(scores).ravel().astype(float)
    idxs = np.asarray(idxs).ravel().astype(int)

    meta_by_id = {int(m["id"]): m["text"] for m in meta}

    pairs = [(i, s) for i, s in zip(idxs, scores) if s >= threshold and i in meta_by_id]
    pairs = pairs[:top_n]

    return [(meta_by_id[i], float(s)) for i, s in pairs]

In [21]:

# 7) Prompt + completion (only if threshold passed)

def generate_completion(prompt, model="gpt-4.1-nano"):
    url = "https://api.euron.one/api/v1/euri/chat/completions"
    headers = {
        "Authorization": f"Bearer {EURI_API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 500,
        "temperature": 0.3
    }
    response = requests.post(url, headers=headers, json=payload)
    return response.json()['choices'][0]['message']['content']
	
	


In [22]:
def build_prompt(query: str, context_chunks: List[Tuple[str, float]]) -> str:
    context_texts = "\n\n".join([c[0] for c in context_chunks])
    prompt = f"""You are a helpful assistant. 
Here is some context from the knowledge base:

{context_texts}

Based on the above, answer this question:

{query}
"""
    return prompt

In [23]:
# 8) Demo run
# -----------------------------
query = "Kempapura to JP Nagar 4th Phase"

data_output = search_faiss(index, query, top_k=6)

# First, check the thresholded paragraph:
paragraph = results_to_paragraph(data_output, meta, threshold=THRESHOLD)
print("\n--- Thresholded Paragraph or Message ---\n", paragraph, "\n")
if paragraph.strip() == "I am not trend on given data":
    # Nothing passed the 50% threshold; stop here as requested.
    pass
else:
    # We have relevant context; proceed to build a prompt and call the model.
    context_chunks = retrieve_context(data_output, meta, top_n=3, threshold=THRESHOLD)
    # Safety: if, for some reason, context_chunks is empty, also show the same message.
    if not context_chunks:
        print("I am not trend on given data")
    else:
        prompt = build_prompt(query, context_chunks)
        print("\n--- Prompt Sent to Model ---\n", prompt, "\n")
        answer = generate_completion(prompt)
        print("--- Model Answer ---\n", answer)


--- Thresholded Paragraph or Message ---
 e tasked preparing detailed project report dpr evaluates cost benefit technical feasibility phase expansion environmental challenge delay progress meanwhile phase metro expansion encompassing fully elevated network split corridor jp nagar phase kempapura hosahalli kadabagere hit regulatory roadblock environmental impact assessment eia report delayed initial round public stakeholder meeting saw low turnout necessitated additional public consultation pushed back eia final dpr tree impact estimate  operational wikipedia ongoing future line pink line km route kalena agrahara nagawara including km elevated section open march km underground corridor via mg road dairy slated open september blue line airport link intended connect central silk board kempegowda international airport via kr pura nagawara entire line expected ready june phase corridor orange line orrwest kempapura jp nagar phase approved expected grey line also approved target completion 