In [1]:

!pip install -qU sentence-transformers rank_bm25 scikit-learn gradio python-dotenv openai


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.6/21.6 MB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.2/315.2 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:

import os
import json
import pickle
from typing import List, Dict, Any
from dataclasses import dataclass, field

import numpy as np
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
from sklearn.metrics.pairwise import cosine_similarity

# Gradio for UI
import gradio as gr

# Optional OpenAI client (only used if user sets key)
import openai
from dotenv import load_dotenv
load_dotenv()


False

In [3]:
# Colab cell (code)
MODEL_NAME = "all-MiniLM-L6-v2"  # small & fast
CHUNK_MIN_WORDS = 5  # avoid insane tiny chunks

class SimpleIndexer:
    def __init__(self, model_name=MODEL_NAME):
        self.model = SentenceTransformer(model_name)
        self.docs = []         # list of {id,title,text,meta}
        self.sentences = []    # list of {doc_id, text, title}
        self.bm25 = None
        self.embeddings = None

    def _chunk_text(self, text:str):
        # naive sentence-splitting and small-window combining to create chunks
        raw_sents = [s.strip() for s in text.replace("\n", " ").split(".") if s.strip()]
        chunks = []
        buffer = ""
        for s in raw_sents:
            if len((buffer + " " + s).split()) <= 40:
                buffer = (buffer + " " + s).strip()
            else:
                if len(buffer.split())>=CHUNK_MIN_WORDS:
                    chunks.append(buffer.strip())
                buffer = s
        if len(buffer.split())>=CHUNK_MIN_WORDS:
            chunks.append(buffer.strip())
        return chunks

    def ingest(self, items: List[Dict[str,Any]], rebuild: bool = True):
        """
        items: list of dicts with keys: id (optional), title (optional), text (required), meta (optional)
        """
        for it in items:
            doc_id = it.get("id") or f"doc_{len(self.docs)}"
            self.docs.append({"id": doc_id, "title": it.get("title",""), "text": it["text"], "meta": it.get("meta",{})})
        # build sentence chunks
        self.sentences = []
        for d in self.docs:
            chunks = self._chunk_text(d["text"])
            for c in chunks:
                self.sentences.append({"doc_id": d["id"], "title": d.get("title",""), "text": c})
        if rebuild:
            self._build_indexes()

    def _build_indexes(self):
        tokenized = [s["text"].lower().split() for s in self.sentences]
        self.bm25 = BM25Okapi(tokenized)
        texts = [s["text"] for s in self.sentences]
        if len(texts)>0:
            self.embeddings = self.model.encode(texts, show_progress_bar=True, convert_to_numpy=True)
        else:
            self.embeddings = np.zeros((0, self.model.get_sentence_embedding_dimension()))

    def save(self, path="index_state.pkl"):
        with open(path,"wb") as f:
            pickle.dump({"docs":self.docs, "sentences": self.sentences, "embeddings": self.embeddings}, f)

    def load(self, path="index_state.pkl"):
        with open(path,"rb") as f:
            data = pickle.load(f)
        self.docs = data["docs"]
        self.sentences = data["sentences"]
        self.embeddings = data["embeddings"]
        # rebuild bm25 tokenization
        tokenized = [s["text"].lower().split() for s in self.sentences]
        self.bm25 = BM25Okapi(tokenized)


In [4]:
# Colab cell (code)
# Tunable params
BM25_TOP_K = 10
EMB_TOP_K = 10
HYBRID_K = 6
HYBRID_WEIGHT = 0.5   # weight embedding vs bm25 (0..1)
COSINE_THRESHOLD = 0.55

def bm25_scores(indexer: SimpleIndexer, query: str):
    tokens = query.lower().split()
    return np.array(indexer.bm25.get_scores(tokens))

def embed_scores(indexer: SimpleIndexer, query: str):
    q_emb = indexer.model.encode([query], convert_to_numpy=True)[0]
    if indexer.embeddings is None or len(indexer.embeddings)==0:
        return np.array([])
    sims = cosine_similarity([q_emb], indexer.embeddings)[0]
    return sims

def hybrid_retrieve(indexer: SimpleIndexer, query: str, top_k: int=HYBRID_K):
    if len(indexer.sentences)==0:
        return []
    bm25 = bm25_scores(indexer, query)
    q_emb = indexer.model.encode([query], convert_to_numpy=True)[0]
    emb_sims = embed_scores(indexer, query)

    # normalize
    bm25_norm = (bm25 - bm25.min()) / (bm25.max() - bm25.min() + 1e-9) if bm25.size>0 else np.zeros_like(bm25)
    emb_norm = (emb_sims - emb_sims.min()) / (emb_sims.max() - emb_sims.min() + 1e-9) if emb_sims.size>0 else np.zeros_like(emb_sims)

    combined = HYBRID_WEIGHT * emb_norm + (1 - HYBRID_WEIGHT) * bm25_norm
    idxs = combined.argsort()[::-1][:top_k]

    results = []
    for i in idxs:
        results.append({
            "idx": int(i),
            "text": indexer.sentences[i]["text"],
            "doc_id": indexer.sentences[i]["doc_id"],
            "title": indexer.sentences[i].get("title",""),
            "bm25_score": float(bm25[i]),
            "emb_sim": float(emb_sims[i]) if emb_sims.size>0 else 0.0,
            "combined_score": float(combined[i])
        })
    return results

def validate_retrieval(retrieved):
    strong = [r for r in retrieved if r["emb_sim"] >= COSINE_THRESHOLD]
    return {
        "valid": len(strong) > 0,
        "supporting_chunks": strong,
        "support_count": len(strong)
    }


In [5]:
# Colab cell (code)
import os
from textwrap import dedent

# set OPENAI_API_KEY below if you'd like to use OpenAI. Otherwise the system returns concatenated context.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")  # in Colab, you can set env with %env or input
if OPENAI_API_KEY:
    openai.api_key = OPENAI_API_KEY

SYSTEM_PROMPT = dedent("""
You are an expert product support assistant. Use only the provided context (evidence snippets) to answer the user's question.
If the answer cannot be found in the provided evidence, respond with "I don't know based on the provided documentation." Be concise and cite supporting chunk indices in square brackets.
""").strip()

def build_prompt(query: str, retrieved: List[Dict[str,Any]]):
    ctx = "\n\n".join([f"[{i}] {r['text']}" for i,r in enumerate(retrieved)])
    prompt = f"{SYSTEM_PROMPT}\n\nQUESTION: {query}\n\nCONTEXT:\n{ctx}\n\nAnswer briefly and cite chunk numbers like [0], [1]. If unsure say you don't know."
    return prompt

def call_llm_openai(prompt: str, model="gpt-3.5-turbo", max_tokens=256, temperature=0.0):
    if not openai.api_key:
        return {"text": None, "error": "OPENAI_API_KEY not set"}
    resp = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role":"system","content": SYSTEM_PROMPT},
            {"role":"user","content": prompt}
        ],
        max_tokens=max_tokens,
        temperature=temperature
    )
    return {"text": resp.choices[0].message.content}


In [6]:
# Colab cell (code)
indexer = SimpleIndexer()

def run_query(query: str, use_llm: bool = True, top_k: int = HYBRID_K):
    retrieved = hybrid_retrieve(indexer, query, top_k=top_k)
    validation = validate_retrieval(retrieved)
    result = {"query": query, "retrieved": retrieved, "validation": validation}
    if use_llm and openai.api_key:
        prompt = build_prompt(query, [r["text"] for r in retrieved])
        llm = call_llm_openai(prompt)
        if llm.get("text"):
            result["answer"] = llm["text"]
        else:
            result["answer"] = "LLM call failed or key not set. See retrieved context below."
    else:
        # fallback deterministic "answer" - concatenated supporting chunks (or message)
        if validation["support_count"]>0:
            s = "\n\n".join([c["text"] for c in validation["supporting_chunks"][:3]])
            result["answer"] = f"(LLM disabled) Best-supported context snippets:\n\n{s}"
        else:
            result["answer"] = "(LLM disabled) No strongly supporting chunks found. Try rephrasing or ingest more docs."

    return result


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [7]:
# Colab cell (code)
sample_faqs = [
    {"id":"faq_1","title":"Reset password","text":"To reset your password go to Settings > Account > Reset password. You will receive an email with a reset link that is valid for 1 hour."},
    {"id":"faq_2","title":"API keys location","text":"API keys are under Project > Settings > API Keys. You can create, revoke, and rotate keys there. Never share keys publicly."},
    {"id":"faq_3","title":"Rate limits","text":"The API rate limit is 100 requests per minute per account. Exceeding it returns 429 Too Many Requests. Use exponential backoff."},
    {"id":"faq_4","title":"Billing cycle","text":"Billing is monthly. You will be charged on the first of each month for the previous month's usage."},
    {"id":"faq_5","title":"SAML SSO setup","text":"To set up SAML SSO, provide ACS URL and Entity ID from your identity provider in the SSO settings and upload the IdP certificate."},
    {"id":"faq_6","title":"Supported file types","text":"Our import supports CSV, JSON, and XLSX files for data import. CSV must contain header row. Maximum file size is 5MB."}
]

# ingest into indexer
indexer.ingest(sample_faqs)
print(f"Indexed docs: {len(indexer.docs)}, sentence chunks: {len(indexer.sentences)}")


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Indexed docs: 6, sentence chunks: 6


In [8]:
# Colab cell (code)
res = run_query("How long is the password reset link valid?", use_llm=False)
print("Answer:\n", res["answer"], "\n")
print("Top retrieved chunks:")
for r in res["retrieved"][:6]:
    print(f"- (bm25={r['bm25_score']:.2f}, emb_sim={r['emb_sim']:.2f}) [{r['doc_id']}] {r['text'][:150]}...")


Answer:
 (LLM disabled) Best-supported context snippets:

To reset your password go to Settings > Account > Reset password You will receive an email with a reset link that is valid for 1 hour 

Top retrieved chunks:
- (bm25=5.24, emb_sim=0.63) [faq_1] To reset your password go to Settings > Account > Reset password You will receive an email with a reset link that is valid for 1 hour...
- (bm25=0.30, emb_sim=0.23) [faq_3] The API rate limit is 100 requests per minute per account Exceeding it returns 429 Too Many Requests Use exponential backoff...
- (bm25=0.32, emb_sim=0.17) [faq_4] Billing is monthly You will be charged on the first of each month for the previous month's usage...
- (bm25=0.00, emb_sim=0.13) [faq_5] To set up SAML SSO, provide ACS URL and Entity ID from your identity provider in the SSO settings and upload the IdP certificate...
- (bm25=0.00, emb_sim=0.07) [faq_2] API keys are under Project > Settings > API Keys You can create, revoke, and rotate keys there Never share 

In [9]:
# Colab cell (code)
from io import StringIO, BytesIO
import pandas as pd

def ingest_file(file):
    """
    Accepts a file object from Gradio:
    - If JSON: expects list of {id,title,text,meta}
    - If txt: treat entire file as single doc
    - If csv: assume columns 'id','title','text' or at least 'text'
    """
    if file is None:
        return "No file provided."
    fn = file.name.lower()
    content = file.read().decode("utf-8")
    items = []
    try:
        if fn.endswith(".json"):
            parsed = json.loads(content)
            if isinstance(parsed, list):
                items = parsed
            elif isinstance(parsed, dict):
                items = [parsed]
        elif fn.endswith(".csv"):
            df = pd.read_csv(StringIO(content))
            # map required column names heuristically
            if "text" in df.columns:
                for _,row in df.iterrows():
                    items.append({"id": row.get("id"), "title": row.get("title",""), "text": str(row["text"])})
            else:
                # fallback: create one doc per row with concatenated fields
                for _,row in df.iterrows():
                    txt = " ".join([f"{c}: {row[c]}" for c in df.columns])
                    items.append({"title": row.get("title",""), "text": txt})
        else:
            # treat as plain text single doc
            items = [{"id": file.name, "title": file.name, "text": content}]
    except Exception as e:
        return f"Failed to parse file: {e}"

    indexer.ingest(items)
    return f"Ingested {len(items)} docs. Total docs: {len(indexer.docs)}, chunks: {len(indexer.sentences)}"

def gradio_query(q, use_llm):
    out = run_query(q, use_llm=use_llm)
    # format retrieved into readable table
    rows = []
    for r in out["retrieved"]:
        rows.append([r["idx"], r["doc_id"], f"{r['bm25_score']:.2f}", f"{r['emb_sim']:.2f}", r["text"]])
    return out["answer"], out["validation"]["valid"], rows

with gr.Blocks() as demo:
    gr.Markdown("## RAG Support Copilot — Colab Demo")
    with gr.Row():
        with gr.Column(scale=1):
            file_in = gr.File(label="Upload docs (.json/.csv/.txt) to ingest", file_count="single")
            ingest_btn = gr.Button("Ingest file")
            ingest_out = gr.Textbox(label="Ingest output")
            stats = gr.Markdown(value=f"**Indexed docs:** {len(indexer.docs)}  \n**Chunks:** {len(indexer.sentences)}")
        with gr.Column(scale=1):
            query_txt = gr.Textbox(label="Ask a support question", placeholder="e.g. Where can I find API keys?")
            use_llm_checkbox = gr.Checkbox(label="Use OpenAI LLM (requires OPENAI_API_KEY)", value=bool(openai.api_key))
            ask_btn = gr.Button("Ask")
            answer_out = gr.Textbox(label="Answer", lines=6)
            valid_badge = gr.Label(value="Validation")
            retrieved_table = gr.Dataframe(headers=["idx","doc_id","bm25","emb_sim","text"], datatype="array")
    ingest_btn.click(fn=ingest_file, inputs=[file_in], outputs=[ingest_out,], show_progress=True)
    # update stats after ingest by chaining a small function (reuse ingest_out then update)
    def ingest_and_stats(f):
        msg = ingest_file(f)
        return msg
    ask_btn.click(fn=gradio_query, inputs=[query_txt, use_llm_checkbox], outputs=[answer_out, valid_badge, retrieved_table])

demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://bc28ac8c262830c1f9.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [10]:
# Colab cell (code)
# Save
indexer.save("index_state.pkl")
print("Saved index_state.pkl")

# To load later in a fresh runtime:
# indexer = SimpleIndexer()
# indexer.load("index_state.pkl")


Saved index_state.pkl


In [11]:
import os
os.environ["OPENAI_API_KEY"] = "sk-..."
openai.api_key = os.environ["OPENAI_API_KEY"]
