In [6]:
# ====== Initialization Cell (Run ONCE) ======
# Loads embedding model, FAISS index, metadata, BM25, (optional) reranker.
# After this cell, call: answer_query("your question") as many times as you want.

import os
import json
import faiss
import numpy as np
import re
from datetime import datetime
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
from sklearn.preprocessing import MinMaxScaler

# -------- CONFIG (update paths if needed) ----------
API_MAPPING_FILE = r"C:\Users\Arpit.x.Tripathi\Downloads\Rag_chatbot\api_unified_data_full.json"    # output from Step 2
FAISS_INDEX_FILE = r"C:\Users\Arpit.x.Tripathi\Downloads\Rag_chatbot\updated_api_index.faiss"     # output from Step 2
EMBED_MODEL_NAME = r"C:\Users\Arpit.x.Tripathi\Downloads\Rag_chatbot\models\all-MiniLM-L6-v2"

# Optional local reranker (cross-encoder).
# If this path is invalid or missing HF files, the code will safely fall back to RRF (non-LLM).
RERANKER_MODEL = CrossEncoder(r"C:\Users\Arpit.x.Tripathi\Downloads\Rag_chatbot\Reranking-Model\local-ms-marco")
# You can also use the hub id directly (requires internet):
# RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"

print("Loading API metadata...")
with open(API_MAPPING_FILE, "r", encoding="utf-8") as f:
    api_metadata = json.load(f)
print(f"Loaded {len(api_metadata)} API entries")

print("Loading FAISS index...")
index = faiss.read_index(FAISS_INDEX_FILE)
print("FAISS index loaded. vectors:", index.ntotal)

print(f"Loading embedding model: {EMBED_MODEL_NAME} ...")
embed_model = SentenceTransformer(EMBED_MODEL_NAME)
print("Embedding model loaded.")

# ---------- Text cleaning / tokenization ----------
def clean_text_tokens(text: str):
    if not isinstance(text, str):
        text = ""
    text = re.sub(r'[^a-zA-Z0-9 ]', ' ', text).lower()
    tokens = text.split()
    return tokens

# Add parameters & tags into corpus text to improve relevance
def metadata_to_text(m):
    params = m.get('parameters', []) or []
    ptext = " ".join([f"{p.get('in','')}:{p.get('name','')}:{p.get('type','')}" for p in params])
    rtext = " ".join([f"{code}:{resp.get('description','')}" for code, resp in (m.get('responses') or {}).items()])
    return " | ".join([
        str(m.get('path','')),
        str(m.get('method','')),
        str(m.get('summary','')),
        str(m.get('description','')),
        "tags:" + " ".join(m.get('tags',[])),
        "params:" + ptext,
        "responses:" + rtext
    ])

print("Building BM25 index...")
bm25_corpus = []
for m in api_metadata:
    txt = metadata_to_text(m)
    bm25_corpus.append(clean_text_tokens(txt))
bm25 = BM25Okapi(bm25_corpus)
print("BM25 ready.")

# --------- (Optional) Reranker loading (safe) ----------
reranker = None
reranker_device = "cpu"

def _has_hf_files(model_dir: str) -> bool:
    """Check if a local folder looks like a valid HF model (minimal check)."""
    if not os.path.isdir(model_dir):
        return False
    needed = ["config.json"]  # minimal; you can add more like tokenizer_config.json, pytorch_model.bin
    return all(os.path.isfile(os.path.join(model_dir, f)) for f in needed)

try:
    from sentence_transformers import CrossEncoder
    if isinstance(RERANKER_MODEL, str) and os.path.isdir(RERANKER_MODEL) and _has_hf_files(RERANKER_MODEL):
        print(f"Loading reranker from local path: {RERANKER_MODEL} ...")
        reranker = CrossEncoder(RERANKER_MODEL, device=reranker_device, local_files_only=True)
        print("Reranker loaded (local).")
    elif isinstance(RERANKER_MODEL, str) and "/" in RERANKER_MODEL:
        print(f"Loading reranker from HF Hub: {RERANKER_MODEL} ...")
        reranker = CrossEncoder(RERANKER_MODEL, device=reranker_device)
        print("Reranker loaded (hub).")
    else:
        print(f"Reranker not available at '{RERANKER_MODEL}'. Using RRF fallback.")
        reranker = None
except Exception as e:
    print("Failed to load reranker; continuing without it. Error:", str(e))
    reranker = None

# --------- Utility functions ----------
def normalize_query(q: str) -> str:
    """Lightweight typo fix & phrasing cleanup to improve retrieval without LLMs."""
    q_norm = (q or "").lower().strip()
    fixes = {
        "dtails": "details",
        "responsible for get": "get",
        "api responsible for": "api for",
        "booking dtail": "booking detail",
        "bokking": "booking",
        "resrvation": "reservation",
    }
    for k, v in fixes.items():
        q_norm = q_norm.replace(k, v)
    q_norm = re.sub(r"\s+", " ", q_norm).strip()
    return q_norm

def embed_query(query: str):
    vec = embed_model.encode([query], convert_to_numpy=True)
    if vec.ndim == 1:
        vec = vec.reshape(1, -1)
    return vec.astype("float32")

def dense_search(query_vec, top_k=50):
    D, I = index.search(query_vec, top_k)
    results = []
    for idx, dist in zip(I[0], D[0]):
        if idx < 0:
            continue
        # Convert L2 distance to a similarity-like score (bounded, higher=better)
        sim = 1.0 / (1.0 + float(dist))
        results.append((int(idx), sim))
    return results

def bm25_search(query, top_k=50):
    tokens = clean_text_tokens(query)
    scores = bm25.get_scores(tokens)
    ranked = np.argsort(scores)[::-1][:top_k]
    return [(int(i), float(scores[i])) for i in ranked]

def _minmax(arr):
    """Safe MinMax scaling for list/np array; returns zeros if degenerate."""
    arr = np.array(arr, dtype=float).reshape(-1,1)
    if arr.size == 0:
        return np.array([])
    if np.allclose(arr.min(), arr.max()):
        return np.zeros_like(arr).flatten()
    return MinMaxScaler().fit_transform(arr).flatten()

def hybrid_retrieve(query, top_k=20, bm25_weight=0.35):
    """
    Hybrid retrieval combining dense (FAISS) and BM25 with MinMax normalization.
    Returns list of dicts: {'id', 'score', 'meta'}
    """
    # Use normalized query for retrieval (keep original for display)
    query_norm = normalize_query(query)

    qvec = embed_query(query_norm)
    dense = dense_search(qvec, top_k*2)
    bm25_res = bm25_search(query_norm, top_k*2)

    d_norm = _minmax([s for _, s in dense]) if len(dense) > 0 else np.array([])
    b_norm = _minmax([s for _, s in bm25_res]) if len(bm25_res) > 0 else np.array([])

    dense_dict = {dense[i][0]: float(d_norm[i]) for i in range(len(dense))} if len(dense) > 0 else {}
    bm25_dict = {bm25_res[i][0]: float(b_norm[i]) for i in range(len(bm25_res))} if len(bm25_res) > 0 else {}

    all_ids = set(list(dense_dict.keys()) + list(bm25_dict.keys()))
    hybrid = []
    for idx in all_ids:
        d = dense_dict.get(idx, 0.0)
        b = bm25_dict.get(idx, 0.0)
        score = (1 - bm25_weight) * d + bm25_weight * b
        hybrid.append((idx, float(score)))
    hybrid = sorted(hybrid, key=lambda x: x[1], reverse=True)[:top_k]
    return [{"id": idx, "score": s, "meta": api_metadata[idx]} for idx, s in hybrid]

def rerank_candidates(query: str, candidates: list, top_k=5):
    """
    Use cross-encoder to rerank candidate list if available.
    Returns top_k list with 'rerank_score'.
    """
    if not candidates:
        return []
    if reranker is None:
        ranked = sorted(candidates, key=lambda x: x["score"], reverse=True)[:top_k]
        for r in ranked:
            r["rerank_score"] = None
        return ranked

    pair_texts = []
    for c in candidates:
        m = c["meta"]
        text = metadata_to_text(m)
        pair_texts.append((normalize_query(query), text))

    scores = reranker.predict(pair_texts)
    for i, s in enumerate(scores):
        candidates[i]["rerank_score"] = float(s)
    ranked = sorted(candidates, key=lambda x: x["rerank_score"], reverse=True)[:top_k]
    return ranked

# ---------- Parameter helpers & Output formatting ----------
def _extract_params(m):
    """Normalized parameter records (optional; kept for future use)."""
    params = m.get("parameters", []) or []
    out = []
    for p in params:
        out.append({
            "name": p.get("name", ""),
            "in": p.get("in", "query"),
            "type": (p.get("type") or (p.get("schema", {}) or {}).get("type") or "string"),
            "required": bool(p.get("required", False)),
            "description": p.get("description", "")
        })
    return out

def _param_names_only(m):
    """Return a comma-separated list of parameter NAMES (no type/in/required)."""
    params = m.get("parameters", []) or []
    names = []
    for p in params:
        name = (p.get("name") or "").strip()
        if name and name != "(body)":
            names.append(name)
    # unique but keep order
    seen = set()
    uniq = [n for n in names if not (n in seen or seen.add(n))]
    return ", ".join(uniq)

def pretty_workflow_text(query: str, reranked: list) -> str:
    """
    Prints exactly in the requested format:
    User Query: ...
    
    Suggested API Workflow:
    
    Step i: <summary or path>.
      Endpoint: <path>  |  Method: <METHOD>
      Parameters: <CommaSeparatedParamNames>   # only if any
    """
    lines = []
    # Header
    lines.append(f"User Query: {query.strip()}\n")
    lines.append("Suggested API Workflow:\n")

    for i, c in enumerate(reranked, start=1):
        m = c["meta"]
        summary = (m.get("summary") or m.get("description") or m.get("path") or "").strip()
        if summary and not summary.endswith("."):
            summary += "."
        path = m.get("path", "").strip()
        method = (m.get("method", "GET") or "GET").upper()
        param_names = _param_names_only(m)

        lines.append(f"Step {i}: {summary}")
        lines.append(f"  Endpoint: {path}  |  Method: {method}")
        if param_names:
            lines.append(f"  Parameters: {param_names}")
        lines.append("")  # blank line after each step

    # Remove trailing blank line
    if lines and lines[-1] == "":
        lines.pop()
    return "\n".join(lines)

# ---------- Reciprocal Rank Fusion (fallback if no reranker) ----------
def rrf_fusion(dense_results, bm25_results, k=60, top_k=5):
    """
    dense_results: list[(idx, score)] sorted desc by dense
    bm25_results:  list[(idx, score)] sorted desc by bm25
    Returns list of dicts like hybrid_retrieve but using RRF.
    """
    def rank_map(res):
        return {i: r for r, (i, _) in enumerate(res, start=1)}

    d_sorted = sorted(dense_results, key=lambda x: x[1], reverse=True)
    b_sorted = sorted(bm25_results, key=lambda x: x[1], reverse=True)
    r_dense = rank_map(d_sorted)
    r_bm25 = rank_map(b_sorted)

    # Proper union of sets
    all_ids = set(r_dense.keys()) | set(r_bm25.keys())

    fused = []
    for i in all_ids:
        s = (1 / (k + r_dense.get(i, 9999))) + (1 / (k + r_bm25.get(i, 9999)))
        fused.append((i, s))
    fused = sorted(fused, key=lambda x: x[1], reverse=True)[:top_k]
    return [{"id": i, "score": s, "meta": api_metadata[i]} for i, s in fused]


print("\nInitialization complete âœ…  You can now call:")
print('  print(response)')

Loading API metadata...
Loaded 1024 API entries
Loading FAISS index...
FAISS index loaded. vectors: 1024
Loading embedding model: C:\Users\Arpit.x.Tripathi\Downloads\Rag_chatbot\models\all-MiniLM-L6-v2 ...
Embedding model loaded.
Building BM25 index...
BM25 ready.
Reranker not available at 'CrossEncoder(
  (model): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 384, padding_idx=0)
        (position_embeddings): Embedding(512, 384)
        (token_type_embeddings): Embedding(2, 384)
        (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-5): 6 x BertLayer(
            (attention): BertAttention(
              (self): BertSdpaSelfAttention(
                (query): Linear(in_features=384, out_features=384, bias=True)
                (key): Linear(in_features

In [9]:

response = answer_query("Which api responsible for check in flow", use_reranker=False)
print("\nðŸ§© Final Developer Response:\n")
print(response)



ðŸ§© Final Developer Response:

User Query: Which api responsible for check in flow

Suggested API Workflow:

Step 1: Gets the document check status for a specific passenger if there are any.
  Endpoint: /api/nsk/v1/booking/passengers/{passengerKey}/documents/check  |  Method: GET
  Parameters: passengerKey

Step 2: Performs the health check.
  Endpoint: /api/v3/health  |  Method: GET

Step 3: Gets the passengers lift status for a specific journey based on the booking in state.
  Endpoint: /api/nsk/v1/booking/checkin/journey/{journeyKey}/status  |  Method: GET
  Parameters: journeyKey

Step 4: Gets the checkin pre-validation requirements for a specific journey.
  Endpoint: /api/nsk/v2/booking/checkin/journey/{journeyKey}/requirements  |  Method: GET
  Parameters: journeyKey

Step 5: Gets the document check status for a specific passenger's infant if there are any.
  Endpoint: /api/nsk/v1/booking/passengers/{passengerKey}/infant/documents/check  |  Method: GET
  Parameters: passengerKe