In [None]:
%run 07_memory.ipynb

# Without Guardrails

- No redaction (personal info is not masked)
- Answers off-topic questions
- The model itself is already very good at handling jailbreaks and harmful-content checks

In [5]:
answer_query("9566123456 is my phone number")

Router decision: direct


"I understand you've shared your phone number as 9566123456. How can I assist you further today?"

In [6]:
answer_query("what was my phone number")

Router decision: direct


'You shared your phone number as 9566123456. How can I assist you further?'

In [15]:
answer_query("give me a cooking recipe for butter chicken")

Router decision: direct


"Certainly! Here's a delicious recipe for Butter Chicken (Murgh Makhani):\n\n### Ingredients:\n\n#### For the chicken marinade:\n- 500 grams boneless chicken, cut into bite-sized pieces\n- 1/2 cup yogurt\n- 1 tablespoon lemon juice\n- 1 tablespoon ginger-garlic paste\n- 1 teaspoon red chili powder\n- 1/2 teaspoon turmeric powder\n- 1 teaspoon garam masala\n- Salt to taste\n\n#### For the gravy:\n- 3 tablespoons butter\n- 1 large onion, finely chopped\n- 2 teaspoons ginger-garlic paste\n- 2 large tomatoes, pureed\n- 1 teaspoon red chili powder\n- 1 teaspoon coriander powder\n- 1/2 teaspoon turmeric powder\n- 1 teaspoon garam masala\n- 1/2 cup heavy cream\n- 1 tablespoon dried fenugreek leaves (kasuri methi) – optional\n- Salt to taste\n- Fresh coriander leaves for garnish\n\n### Instructions:\n\n1. **Marinate the chicken:**\n   - In a bowl, mix yogurt, lemon juice, ginger-garlic paste, red chili powder, turmeric, garam masala, and salt.\n   - Add the chicken pieces and coat well.\n   - 

In [26]:
import re

# ---------------- Strong Off-Topic Blockers ---------------- #
OFF_TOPIC_KEYWORDS = [
    # food
    "recipe",
    "cook",
    "cooking",
    "biryani",
    "chicken",
    "pizza",
    # shopping
    "buy",
    "shopping",
    "cheapest",
    "amazon",
    "flipkart",
    # travel
    "travel",
    "flight",
    "hotel",
    # health
    "doctor",
    "medicine",
    "health",
    # fitness
    "gym",
    "workout",
    # programming help
    "python code",
    "write code",
    "script",
    "bug",
    "program",
    # entertainment
    "movie",
    "netflix",
    "song",
    "music",
    # personal life
    "love",
    "relationship",
    "dating",
]

# Phrases that mark a message as a follow-up to the running conversation;
# inbound_check lets these through.
FOLLOWUP_KEYWORDS = [
    "previous message",
    "last answer",
    "repeat",
    "continue",
    "explain that",
    "explain more",
    "summarize",
    "what did i ask",
    "what did you say",
]

# Short greetings / acknowledgements that inbound_check lets through.
GREETINGS = [
    "hi",
    "hello",
    "hey",
    "thanks",
    "ok",
]


# ====================== INBOUND CHECK ====================== #
def inbound_check(query: str):
    """
    Runs BEFORE sending query to router/LLM.
    Validates, sanitizes, and blocks harmful or strongly off-topic input.

    Order of checks:
      1. harmful intent  -> blocked
      2. jailbreak       -> blocked
      3. off-topic       -> blocked, unless the message is a bare greeting
                            or contains a follow-up phrase
      4. PII masking     -> applied to every query that is let through

    Returns a dict that is either
      {"status": "blocked", "message": <refusal text>} or
      {"status": "ok", "cleaned_query": <query with PII masked>}.
    """

    q = query.lower().strip()

    # 1. Block harmful categories.
    # Safety checks run first and unconditionally, so a message cannot skip
    # them just by starting with "hi" or containing a follow-up phrase.
    if detect_harmful_intent(query):
        return {"status": "blocked", "message": "I can't help with that."}

    # 2. Block jailbreak attempts
    if detect_jailbreak(query):
        return {"status": "blocked", "message": "Request denied for safety reasons."}

    # 3. Greetings and follow-ups are exempt from the off-topic filter only.
    # A greeting must be the WHOLE message (ignoring trailing punctuation);
    # a prefix test would also match "history ..." or "okra recipe".
    is_greeting = q.strip(".,!? ") in GREETINGS
    is_followup = any(key in q for key in FOLLOWUP_KEYWORDS)

    # 4. Block strong off-topic content.
    # Whole-word match (optional plural "s") so that finance terms such as
    # "subscription" or "buyback" are not caught by "script" / "buy".
    if not (is_greeting or is_followup):
        off_topic = any(
            re.search(rf"\b{re.escape(key)}s?\b", q)
            for key in OFF_TOPIC_KEYWORDS
        )
        if off_topic:
            return {
                "status": "blocked",
                "message": "I can only answer questions related to company annual reports and financial information."
            }

    # 5. Mask PII on every allowed path (greetings and follow-ups included)
    cleaned = mask_pii(query)

    return {"status": "ok", "cleaned_query": cleaned}



# ====================== OUTBOUND CHECK ====================== #
def outbound_check(response: str):
    """
    Post-processes the LLM response before it is returned to the user.

    Returns the fixed placeholder "[Output filtered by safety policy]" when
    the response mentions both "system prompt" and "gpt" (case-insensitive);
    otherwise returns the response after passing it through mask_pii.
    """
    lowered = response.lower()

    # Heuristic for leaked system instructions: both phrases must be present.
    # NOTE(review): requiring "gpt" as well looks narrow — confirm intent.
    mentions_system_prompt = "system prompt" in lowered
    mentions_gpt = "gpt" in lowered
    if mentions_system_prompt and mentions_gpt:
        return "[Output filtered by safety policy]"

    # Redact any PII that made it into the generated text.
    return mask_pii(response)



# ====================== UTILITIES ====================== #
def detect_harmful_intent(text: str):
    """
    Return True if `text` contains one of the harmful keywords/phrases.

    Matching is case-insensitive and word-based: a keyword must start at a
    word boundary and may carry a common inflection (s, ed, ing, er, ers,
    ful). Plain substring matching would also flag unrelated words such as
    "pharmaceutical" (contains "harm") or "shack" (contains "hack").
    Multi-word phrases tolerate any run of whitespace between their words.
    """
    harmful_keywords = [
        "kill myself", "harm", "make a bomb", "hack", "steal",
        "ddos", "suicide", "drug", "weapon"
    ]
    # Optional inflection so "hacking", "drugs", "harmful" are still caught.
    suffix = r"(?:s|ed|ing|er|ers|ful)?"
    t = text.lower()
    return any(
        re.search(
            r"\b" + r"\s+".join(re.escape(word) for word in k.split()) + suffix + r"\b",
            t,
        )
        for k in harmful_keywords
    )


def detect_jailbreak(text: str):
    """
    Return True if `text` contains any known jailbreak phrase.

    The comparison is a case-insensitive substring test against a fixed
    list of trigger phrases.
    """
    lowered = text.lower()
    for trigger in (
        "ignore previous instructions",
        "pretend to be",
        "act as dan",
        "reveal your system prompt",
        "bypass",
        "jailbreak",
    ):
        if trigger in lowered:
            return True
    return False


def mask_pii(text: str):
    """
    Replace email addresses and 10-digit phone numbers in `text` with
    placeholder tokens and return the result.

    - Emails become "[EMAIL_REDACTED]". The domain part accepts digits, so
      addresses such as "bob@mail123.com" are covered.
    - Standalone 10-digit numbers become "[PHONE_REDACTED]".

    Emails are masked first: if phones went first, a numeric local part
    ("9566123456@example.com") would be rewritten to a token containing
    "]" and the email pattern would then leave a stray "[" behind.
    """
    # Mask email addresses
    text = re.sub(
        r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}",
        "[EMAIL_REDACTED]",
        text,
    )
    # Mask phone numbers (exactly 10 consecutive digits, word-bounded)
    text = re.sub(r"\b\d{10}\b", "[PHONE_REDACTED]", text)
    return text


In [27]:
def answer_query(query, session_id="session_1"):
    """
    Answer a user query with inbound and outbound guardrails applied.

    Flow:
      1. inbound_check: a blocked query returns its refusal message at once;
         otherwise the cleaned (PII-masked) query is used from here on.
      2. route_query decides between "direct" mode and the RAG pipeline.
      3. Direct mode answers through memory_chat.
      4. RAG mode checks the cache, then retrieves, reranks and generates,
         and stores the result in the cache and in conversation memory.
      5. Every answer, on every path, goes through outbound_check before
         being returned.

    route_query, memory_chat, cache_get/cache_set, retrieve_candidates,
    rerank_documents, generate_answer and with_memory are not defined in
    this cell — presumably they come from the notebook loaded with
    `%run 07_memory.ipynb`; verify.

    Args:
        query: the raw user message.
        session_id: conversation-memory session key passed to memory_chat
            and with_memory.

    Returns:
        The refusal message (str) if the query was blocked, otherwise the
        answer after outbound filtering.
    """

    inbound = inbound_check(query)
    if inbound["status"] == "blocked":
        return inbound["message"]
    query = inbound["cleaned_query"]

    # STEP 0 — ROUTER
    decision = route_query(query)
    print("Router decision:", decision)

    # ---------------------------
    # DIRECT MODE → use MEMORY
    # ---------------------------
    if decision == "direct":
        answer = memory_chat(query, session_id=session_id)
        cache_set(query, answer)
        # Outbound guardrail applies to direct answers too.
        return outbound_check(answer)

    # ---------------------------
    # RAG MODE
    # ---------------------------

    # STEP 1 — CACHE CHECK
    cached = cache_get(query)
    if cached:
        print("CACHE HIT")
        # The cache holds unfiltered answers, so filter on the way out.
        return outbound_check(cached)

    print("CACHE MISS → Running full RAG pipeline...")

    # STEP 2 — RETRIEVE
    retrieved = retrieve_candidates(query)

    # STEP 3 — RERANK
    reranked = rerank_documents(query, retrieved, top_k=6)

    # STEP 4 — GENERATE
    answer = generate_answer(query, reranked)

    # STEP 5 — STORE FINAL ANSWER IN CACHE
    cache_set(query, answer)

    # STEP 6 — STORE BOTH QUERY & ANSWER IN MEMORY
    # NOTE(review): each invoke presumably runs the memory chain (and so the
    # LLM) just to record a turn — confirm this is the intended way to
    # append to history.
    with_memory.invoke(
        {"input": f"USER: {query}"},
        config={"configurable": {"session_id": session_id}}
    )

    with_memory.invoke(
        {"input": f"ASSISTANT: {answer}"},
        config={"configurable": {"session_id": session_id}}
    )

    # Filter the generated answer (previously referenced an undefined name).
    safe_output = outbound_check(answer)
    return safe_output

# After Guardrails

- Redaction (masking of personal info) is done
- Off-topic questions are refused
- Jailbreak and harmful-content checks (even though the model was already very good at this)

In [24]:
answer_query("9566123456 is my phone number")

Router decision: direct


'Thank you for sharing your phone number. How can I assist you today?'

In [25]:
answer_query("in my previous message, was my phone number redacted?") 

Router decision: direct


'Yes, in your previous message, your phone number was shown as "[PHONE_REDACTED]," indicating that it was redacted for privacy reasons. How can I assist you further?'

In [28]:
answer_query("give me a cooking recipe for butter chicken")

'I can only answer questions related to company annual reports and financial information.'