In [20]:
# This cell creates the recommended project folders and renames "interface" to "client" if it exists.
from pathlib import Path

PROJECT_ROOT = Path(r"C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project")
PROJECT_ROOT.mkdir(parents=True, exist_ok=True)

# Rename interface -> client (only if interface exists and client does not)
interface_dir = PROJECT_ROOT / "interface"
client_dir = PROJECT_ROOT / "client"
if interface_dir.exists() and not client_dir.exists():
    interface_dir.rename(client_dir)

folders_to_create = [
    PROJECT_ROOT / "backend",                     # Backend code (Python + RAG + Ollama calls)
    PROJECT_ROOT / "client",                      # UI layer (Gradio/Streamlit/Web)

    PROJECT_ROOT / "data" / "raw",                # Raw sources (docx/pdf/xlsx)
    PROJECT_ROOT / "data" / "processed",          # Cleaned text outputs (optional)

    PROJECT_ROOT / "data" / "domain_documents" / "traditional_dishes",      # Main dishes (one dish per file)
    PROJECT_ROOT / "data" / "domain_documents" / "desserts",                # Desserts (one dish per file)
    PROJECT_ROOT / "data" / "domain_documents" / "cooking_methods",         # Cooking methods explanations
    PROJECT_ROOT / "data" / "domain_documents" / "ingredients_reference",   # Ingredients/spices reference
    PROJECT_ROOT / "data" / "domain_documents" / "metadata",                # Sources and notes

    PROJECT_ROOT / "vectorstore",                 # Vector database files (Chroma/FAISS/Qdrant)
    PROJECT_ROOT / "rag_store",                   # Cached chunks/exports (optional)
    PROJECT_ROOT / "exports",                     # Output exports
    PROJECT_ROOT / "logs",                        # Logs
    PROJECT_ROOT / "notebooks",                   # Jupyter notebooks
]

for p in folders_to_create:
    p.mkdir(parents=True, exist_ok=True)

print("Project root:", PROJECT_ROOT)
print("Raw data folder:", PROJECT_ROOT / "data" / "raw")
print("Domain documents folder:", PROJECT_ROOT / "data" / "domain_documents")
print("Done.")

Project root: C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project
Raw data folder: C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project\data\raw
Domain documents folder: C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project\data\domain_documents
Done.


In [21]:
# Cell 1: Check that Ollama is installed and running

import subprocess

def run_cmd(cmd: str) -> None:
    # Run a shell command and print its output
    result = subprocess.run(cmd, capture_output=True, text=True, shell=True)
    print(result.stdout)
    if result.stderr:
        print("STDERR:", result.stderr)

# Show Ollama version
run_cmd("ollama -v")

# List installed local models
run_cmd("ollama list")

ollama version is 0.14.2

NAME                       ID              SIZE      MODIFIED       
llama3:latest              365c0bd3c000    4.7 GB    20 minutes ago    
nomic-embed-text:latest    0a109f422b47    274 MB    6 days ago        
llama3.1:latest            46e0c10c039e    4.9 GB    6 days ago        
llama2:latest              78e26419b446    3.8 GB    7 days ago        



In [22]:
# Cell 2: Download the required Ollama model

import subprocess

def run_cmd(cmd: str) -> None:
    # Run a shell command and print its output
    result = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        shell=True,
        encoding="utf-8",
        errors="ignore"
    )
    print(result.stdout)
    if result.stderr:
        print("STDERR:", result.stderr)

# Download the llama3 model locally via Ollama
run_cmd("ollama pull llama3")


STDERR: [?2026h[?25l[1Gpulling manifest ⠋ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠙ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠹ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠸ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠼ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠴ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠦ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠧ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠇ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest [K
pulling 6a0746a1ec1a: 100% ▕██████████████████▏ 4.7 GB                         [K
pulling 4fa551d4f938: 100% ▕██████████████████▏  12 KB                         [K
pulling 8ab4849b038c: 100% ▕██████████████████▏  254 B                         [K
pulling 577073ffcc6c: 100% ▕██████████████████▏  110 B                         [K
pulling 3f8eb4da87fa: 100% ▕██████████████████▏  485 B                         [K
verifying sha256 digest [K
writing manifest [K
su

In [23]:
# Save Ollama text generation helper into the backend folder

from pathlib import Path

# Project root path
PROJECT_ROOT = Path(r"C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project")

# Target file path
TARGET_FILE = PROJECT_ROOT / "backend" / "ollama_client.py"

# Ensure backend folder exists
TARGET_FILE.parent.mkdir(parents=True, exist_ok=True)

# Code to be saved
code = """\
# Ollama text generation helper for the RAG pipeline

import subprocess

def ollama_generate(prompt: str, model: str = "llama3") -> str:
    # Run the model and return the generated text
    result = subprocess.run(
        ["ollama", "run", model],
        input=prompt,
        text=True,
        capture_output=True,
        encoding="utf-8",
        errors="ignore"
    )
    return (result.stdout or "").strip()
"""

# Write file
TARGET_FILE.write_text(code, encoding="utf-8")

print(f"File saved successfully at: {TARGET_FILE}")

File saved successfully at: C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project\backend\ollama_client.py


In [24]:
# Inspect the Ollama client file and preview its content

from pathlib import Path

# Define the project root path
PROJECT_ROOT = Path(r"C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project")

# Point to the Ollama client file
p = PROJECT_ROOT / "backend" / "ollama_client.py"

# Print the file path
print("File path:", p)

# Check if the file exists
print("File exists:", p.exists())

# Print the first 200 lines of the file for inspection
print("\n--- File content (first 200 lines) ---\n")
print("\n".join(
    p.read_text(encoding="utf-8", errors="ignore").splitlines()[:200]
))

File path: C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project\backend\ollama_client.py
File exists: True

--- File content (first 200 lines) ---

# Ollama text generation helper for the RAG pipeline

import subprocess

def ollama_generate(prompt: str, model: str = "llama3") -> str:
    # Run the model and return the generated text
    result = subprocess.run(
        ["ollama", "run", model],
        input=prompt,
        text=True,
        capture_output=True,
        encoding="utf-8",
        errors="ignore"
    )
    return (result.stdout or "").strip()


In [25]:
# Reload the backend module and run a quick test to avoid using an old cached version

import sys
import importlib

PROJECT_ROOT = r"C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project"

# Add the project root to Python path so imports like backend.* work
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

# Import the module and reload it to apply the latest file changes
import backend.ollama_client as oc
importlib.reload(oc)

# Print available generation functions to confirm they exist
print("Functions found:", [x for x in dir(oc) if "ollama_generate" in x])

# Test the helper function and print the result
test_prompt = "اكتب كلمة واحدة فقط بدون أي شرح: مرحبا"
print(oc.ollama_generate_one_word_ar(test_prompt, model="llama3"))

Functions found: ['ollama_generate', 'ollama_generate_one_word_ar']
مرحبا


In [27]:
from pathlib import Path
from docx import Document

# Define the project root directory
PROJECT_ROOT = Path(r"C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project")

# Define the raw data folder and expected DOCX file path
RAW_DIR = PROJECT_ROOT / "data" / "raw"
DOCX_PATH = RAW_DIR / "dhofar flavor.docx"

# Define where the reusable loader script will be saved
TARGET_FILE = PROJECT_ROOT / "backend" / "load_docx.py"

def read_docx_text(docx_path: str) -> str:
    # Read a DOCX file and return non-empty paragraphs as a single string
    doc = Document(docx_path)
    parts = []
    for p in doc.paragraphs:
        t = (p.text or "").strip()
        if t:
            parts.append(t)
    return "\n".join(parts)

# Show expected file location and raw folder status
print("Expected DOCX path:", DOCX_PATH)
print("Raw folder exists:", RAW_DIR.exists())

# List files inside data/raw to help debugging
if RAW_DIR.exists():
    print("Files inside data/raw:", [x.name for x in RAW_DIR.iterdir()])

# Run the reader now in Jupyter and print a preview
if not DOCX_PATH.exists():
    print("DOCX not found. Please move or rename the file.")
    raw_text = ""
else:
    raw_text = read_docx_text(str(DOCX_PATH))
    print("DOCX loaded. Characters:", len(raw_text))
    print("Preview (first 800 chars):")
    print(raw_text[:800])

# Create the backend folder if needed and save the same logic as a Python file
TARGET_FILE.parent.mkdir(parents=True, exist_ok=True)

code_to_save = f"""\
# Read domain text from a DOCX file (used as a knowledge source for RAG)

from pathlib import Path
from docx import Document

# Define the project root directory
PROJECT_ROOT = Path(r"{str(PROJECT_ROOT)}")

# Define the path to the domain DOCX file
DOCX_PATH = PROJECT_ROOT / "data" / "raw" / "dhofar flavor.docx"

def read_docx_text(docx_path: str) -> str:
    # Read a DOCX file and return non-empty paragraphs as a single string
    doc = Document(docx_path)
    parts = []
    for p in doc.paragraphs:
        t = (p.text or "").strip()
        if t:
            parts.append(t)
    return "\\n".join(parts)

# Run a small test when executed directly
if __name__ == "__main__":
    if not DOCX_PATH.exists():
        print("DOCX not found at:", DOCX_PATH)
    else:
        raw_text = read_docx_text(str(DOCX_PATH))
        print("DOCX loaded. Characters:", len(raw_text))
        print(raw_text[:800])
"""

# Write the file to disk
TARGET_FILE.write_text(code_to_save, encoding="utf-8")

# Confirm that the file was saved
print("Saved:", TARGET_FILE)

Expected DOCX path: C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project\data\raw\dhofar flavor.docx
Raw folder exists: True
Files inside data/raw: ['dhofar flavor.docx', 'dhofar flavor.pdf']
DOCX loaded. Characters: 41906
Preview (first 800 chars):
شوربة الجريش
ID: DF-001
نوع الأكلة: شوربة لحم
المنطقة: محافظة ظفار
طريقة الطهي: سلق / طبخ بطيء
الوصف:
تُعد شوربة الجريش من الأكلات التقليدية المعروفة في محافظة ظفار، وتُؤكل غالبًا كوجبة يومية دافئة، خاصة في الأجواء الباردة أو في المساء، ولا ترتبط بمناسبة محددة، لكنها حاضرة بشكل متكرر في البيوت لما تتميز به من قيمة غذائية عالية وقوام مشبع. تُحضَّر من الجريش المطبوخ مع اللحم والسمن والتوابل مثل القرفة واللومي اليابس، مما يمنحها نكهة غنية ومميّزة. تُقدَّم شوربة الجريش عادةً ساخنة، ويمكن تناولها بمفردها أو مع الخبز، وأحيانًا تُقدَّم إلى جانب أطباق خفيفة أخرى، وتعكس بساطة المطبخ الظفاري واعتماده على المكونات المحلية المتوفرة.
المكونات:
2 رأس بصل متوسط الحجم مفروم
نصف كيلو لحم بدون عظم، مقطع مكعبات
6 حبات لومي يابس مقشّر
ملح حسب الرغبة
1 

In [28]:
# This cell does two things:
# 1) Runs the recipe chunking now and prints a preview in Jupyter
# 2) Saves the same logic into backend/chunking.py at the end

from pathlib import Path
import re

# Define the project root directory
PROJECT_ROOT = Path(r"C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project")

# Define where the chunking module will be saved
TARGET_FILE = PROJECT_ROOT / "backend" / "chunking.py"
TARGET_FILE.parent.mkdir(parents=True, exist_ok=True)

def split_recipes_docx(text: str):
    # Split text into recipe blocks using the DF-ID markers
    lines = [l.strip() for l in text.splitlines() if l.strip()]
    blocks = []
    current = []

    id_pattern = re.compile(r"^ID\s*:\s*DF-\S+", re.IGNORECASE)

    for line in lines:
        if id_pattern.match(line):
            # Save the previous block when a new ID starts
            if current:
                blocks.append("\n".join(current).strip())
                current = []
            current.append(line)
        else:
            # Append normal lines to the current block
            current.append(line)

    # Save the last block
    if current:
        blocks.append("\n".join(current).strip())

    # Fix ordering if the ID line appears before the recipe name
    fixed_blocks = []
    for b in blocks:
        b_lines = b.splitlines()
        if len(b_lines) >= 2 and id_pattern.match(b_lines[0]) and not id_pattern.match(b_lines[1]):
            b_lines = [b_lines[1], b_lines[0]] + b_lines[2:]
        fixed_blocks.append("\n".join(b_lines))

    return fixed_blocks

# Run chunking now in Jupyter (raw_text must already exist from the DOCX loader cell)
blocks = split_recipes_docx(raw_text)
print("Recipes found:", len(blocks))
print("Sample preview:")
print(blocks[0][:800] if blocks else "No blocks found")

# Save the same code into backend/chunking.py
code_to_save = f"""\
# Chunking utilities for splitting domain text into recipe blocks

import re

def split_recipes_docx(text: str):
    # Split text into recipe blocks using the DF-ID markers
    lines = [l.strip() for l in text.splitlines() if l.strip()]
    blocks = []
    current = []

    id_pattern = re.compile(r"^ID\\s*:\\s*DF-\\S+", re.IGNORECASE)

    for line in lines:
        if id_pattern.match(line):
            # Save the previous block when a new ID starts
            if current:
                blocks.append("\\n".join(current).strip())
                current = []
            current.append(line)
        else:
            # Append normal lines to the current block
            current.append(line)

    # Save the last block
    if current:
        blocks.append("\\n".join(current).strip())

    # Fix ordering if the ID line appears before the recipe name
    fixed_blocks = []
    for b in blocks:
        b_lines = b.splitlines()
        if len(b_lines) >= 2 and id_pattern.match(b_lines[0]) and not id_pattern.match(b_lines[1]):
            b_lines = [b_lines[1], b_lines[0]] + b_lines[2:]
        fixed_blocks.append("\\n".join(b_lines))

    return fixed_blocks

if __name__ == "__main__":
    # Minimal self-test
    sample = "Dish Name\\nID: DF-001\\nIngredients: ...\\n"
    out = split_recipes_docx(sample)
    print("Blocks:", len(out))
"""

TARGET_FILE.write_text(code_to_save, encoding="utf-8")
print("Saved:", TARGET_FILE)

Recipes found: 45
Sample preview:
شوربة الجريش
Saved: C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project\backend\chunking.py


In [29]:
# This cell does two things:
# 1) Runs the DOCX parsing now and prints a preview in Jupyter
# 2) Saves the same logic into backend/parser.py at the end

from pathlib import Path
import re

# Define the project root directory
PROJECT_ROOT = Path(r"C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project")

# Define where the parser module will be saved
TARGET_FILE = PROJECT_ROOT / "backend" / "parser.py"
TARGET_FILE.parent.mkdir(parents=True, exist_ok=True)

# ================== NORMALIZATION ==================
def normalize_ar(text: str) -> str:
    # Normalize Arabic text for internal matching and keywords
    if not text:
        return ""
    text = re.sub(r"[\u064B-\u065F\u0670]", "", text)
    text = text.replace("أ", "ا").replace("إ", "ا").replace("آ", "ا")
    text = text.replace("ى", "ي").replace("ة", "ه")
    text = re.sub(r"\s+", " ", text).strip().lower()
    return text

# ================== PARSE RECIPES ==================
def parse_recipes_from_docx_text(raw_text: str):
    # Parse structured recipes from raw DOCX text using DF IDs and section headers
    lines = [l.strip() for l in raw_text.splitlines() if l.strip()]
    id_pat = re.compile(r"^ID\s*:\s*(DF-[A-Za-z0-9\-]+)\s*$", re.IGNORECASE)

    recipes = []
    cur = None
    section = None
    last_text_line = None

    def new_recipe():
        # Create a new recipe object
        return {
            "id": None,
            "name": None,
            "type": None,
            "region": None,
            "cook_method": None,
            "description": "",
            "ingredients": [],
            "prep": "",
            "keywords": []
        }

    def flush():
        # Finalize and store the current recipe if it has an ID
        nonlocal cur
        if not cur:
            return
        cur["description"] = cur["description"].strip()
        cur["prep"] = cur["prep"].strip()

        cur["keywords"] = list(set(
            normalize_ar(w)
            for w in ((cur["name"] or "").split() + (cur["type"] or "").split())
            if w
        ))

        if cur.get("id"):
            recipes.append(cur)
        cur = None

    for line in lines:
        m = id_pat.match(line)
        if m:
            flush()
            cur = new_recipe()
            cur["id"] = m.group(1).strip()

            if last_text_line and ":" not in last_text_line:
                cur["name"] = last_text_line.strip()
            else:
                cur["name"] = None

            section = None
            continue

        if not line.startswith(("نوع الأكلة", "المنطقة", "طريقة الطهي", "الوصف", "المكونات", "طريقة التحضير")):
            if len(line) <= 60:
                last_text_line = line

        if not cur:
            continue

        if "نوع الأكلة" in line:
            cur["type"] = line.split(":", 1)[-1].strip()
            continue
        if "المنطقة" in line:
            cur["region"] = line.split(":", 1)[-1].strip()
            continue
        if "طريقة الطهي" in line:
            cur["cook_method"] = line.split(":", 1)[-1].strip()
            continue

        if line.startswith("الوصف"):
            section = "description"
            continue
        if line.startswith("المكونات"):
            section = "ingredients"
            continue
        if line.startswith("طريقة التحضير"):
            section = "prep"
            continue

        if section == "description":
            cur["description"] += line + " "
        elif section == "ingredients":
            if ":" not in line and not line.startswith(("ملاحظات", "تكفي")):
                cur["ingredients"].append(line.strip())
        elif section == "prep":
            cur["prep"] += line + " "

    flush()
    return recipes

# ================== RUN NOW IN JUPYTER ==================
recipes = parse_recipes_from_docx_text(raw_text)
print("Parsed recipes:", len(recipes))
if recipes:
    print("First recipe ID:", recipes[0].get("id"))
    print("First recipe name:", recipes[0].get("name"))
    print("Ingredients sample:", (recipes[0].get("ingredients") or [])[:3])
else:
    print("No recipes parsed. Check the DOCX format and ID lines.")

# ================== SAVE MODULE FILE ==================
code_to_save = f"""\
# Parse recipes from DOCX text into structured objects

import re

def normalize_ar(text: str) -> str:
    # Normalize Arabic text for internal matching and keywords
    if not text:
        return ""
    text = re.sub(r"[\\u064B-\\u065F\\u0670]", "", text)
    text = text.replace("أ","ا").replace("إ","ا").replace("آ","ا")
    text = text.replace("ى","ي").replace("ة","ه")
    text = re.sub(r"\\s+"," ", text).strip().lower()
    return text

def parse_recipes_from_docx_text(raw_text: str):
    # Parse structured recipes from raw text using DF IDs and section headers
    lines = [l.strip() for l in raw_text.splitlines() if l.strip()]
    id_pat = re.compile(r"^ID\\s*:\\s*(DF-[A-Za-z0-9\\-]+)\\s*$", re.IGNORECASE)

    recipes = []
    cur = None
    section = None
    last_text_line = None

    def new_recipe():
        # Create a new recipe object
        return {{
            "id": None,
            "name": None,
            "type": None,
            "region": None,
            "cook_method": None,
            "description": "",
            "ingredients": [],
            "prep": "",
            "keywords": []
        }}

    def flush():
        # Finalize and store the current recipe if it has an ID
        nonlocal cur
        if not cur:
            return
        cur["description"] = cur["description"].strip()
        cur["prep"] = cur["prep"].strip()
        cur["keywords"] = list(set(
            normalize_ar(w)
            for w in ((cur["name"] or "").split() + (cur["type"] or "").split())
            if w
        ))
        if cur.get("id"):
            recipes.append(cur)
        cur = None

    for line in lines:
        m = id_pat.match(line)
        if m:
            flush()
            cur = new_recipe()
            cur["id"] = m.group(1).strip()
            if last_text_line and ":" not in last_text_line:
                cur["name"] = last_text_line.strip()
            else:
                cur["name"] = None
            section = None
            continue

        if not line.startswith(("نوع الأكلة", "المنطقة", "طريقة الطهي", "الوصف", "المكونات", "طريقة التحضير")):
            if len(line) <= 60:
                last_text_line = line

        if not cur:
            continue

        if "نوع الأكلة" in line:
            cur["type"] = line.split(":", 1)[-1].strip()
            continue
        if "المنطقة" in line:
            cur["region"] = line.split(":", 1)[-1].strip()
            continue
        if "طريقة الطهي" in line:
            cur["cook_method"] = line.split(":", 1)[-1].strip()
            continue

        if line.startswith("الوصف"):
            section = "description"
            continue
        if line.startswith("المكونات"):
            section = "ingredients"
            continue
        if line.startswith("طريقة التحضير"):
            section = "prep"
            continue

        if section == "description":
            cur["description"] += line + " "
        elif section == "ingredients":
            if ":" not in line and not line.startswith(("ملاحظات", "تكفي")):
                cur["ingredients"].append(line.strip())
        elif section == "prep":
            cur["prep"] += line + " "

    flush()
    return recipes

if __name__ == "__main__":
    # Minimal self-test
    sample = "اسم\\nID: DF-001\\nنوع الأكلة: ...\\nالمكونات\\nسكر\\nطريقة التحضير\\nخطوة\\n"
    out = parse_recipes_from_docx_text(sample)
    print("Parsed:", len(out))
"""

TARGET_FILE.write_text(code_to_save, encoding="utf-8")
print("Saved:", TARGET_FILE)

Parsed recipes: 44
First recipe ID: DF-001
First recipe name: شوربة الجريش
Ingredients sample: ['2 رأس بصل متوسط الحجم مفروم', 'نصف كيلو لحم بدون عظم، مقطع مكعبات', '6 حبات لومي يابس مقشّر']
Saved: C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project\backend\parser.py


In [30]:
print(recipes[0]["id"], recipes[0]["name"])

DF-001 شوربة الجريش


In [31]:
# This cell does two things:
# 1) Builds the keyword-to-recipes index now and prints a preview in Jupyter
# 2) Saves the same logic into backend/keyword_index.py at the end

from pathlib import Path
from collections import defaultdict
import re

# Ensure recipes exist (must be created by the parser cell)
assert "recipes" in globals(), "Variable 'recipes' is not defined. Run the parsing cell first."

# Define the project root directory
PROJECT_ROOT = Path(r"C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project")

# Define where the keyword index module will be saved
TARGET_FILE = PROJECT_ROOT / "backend" / "keyword_index.py"
TARGET_FILE.parent.mkdir(parents=True, exist_ok=True)

def normalize_ar(text: str) -> str:
    # Normalize Arabic text for keyword matching
    if not text:
        return ""
    text = re.sub(r"[\u064B-\u065F\u0670]", "", text)
    text = text.replace("أ", "ا").replace("إ", "ا").replace("آ", "ا")
    text = text.replace("ى", "ي").replace("ة", "ه")
    text = re.sub(r"\s+", " ", text).strip().lower()
    return text

# Build keyword → recipes index
keyword_to_recipes = defaultdict(list)

for r in recipes:
    for kw in r.get("keywords", []):
        k = normalize_ar(kw)
        if k:
            keyword_to_recipes[k].append({
                "id": r.get("id"),
                "name": r.get("name")
            })

# Show results now in Jupyter
print("Unique keywords:", len(keyword_to_recipes))

test_kw = "شوربة"
print("Test:", test_kw, "=>",
      keyword_to_recipes.get(normalize_ar(test_kw), [])[:5])

# Save the same logic into backend/keyword_index.py
code_to_save = """\
# Build a keyword-to-recipes index for fast lookup

from collections import defaultdict
import re

def normalize_ar(text: str) -> str:
    # Normalize Arabic text for keyword matching
    if not text:
        return ""
    text = re.sub(r"[\\u064B-\\u065F\\u0670]", "", text)
    text = text.replace("أ","ا").replace("إ","ا").replace("آ","ا")
    text = text.replace("ى","ي").replace("ة","ه")
    text = re.sub(r"\\s+"," ", text).strip().lower()
    return text

def build_keyword_index(recipes):
    # Create a mapping from keyword to related recipes
    keyword_to_recipes = defaultdict(list)

    for r in recipes:
        for kw in r.get("keywords", []):
            k = normalize_ar(kw)
            if k:
                keyword_to_recipes[k].append({
                    "id": r.get("id"),
                    "name": r.get("name")
                })

    return keyword_to_recipes

if __name__ == "__main__":
    # Minimal self-test
    sample = [{
        "id": "DF-001",
        "name": "Test Dish",
        "keywords": ["شوربة", "تقليدي"]
    }]
    idx = build_keyword_index(sample)
    print("Keywords:", list(idx.keys()))
"""

TARGET_FILE.write_text(code_to_save, encoding="utf-8")
print("Saved:", TARGET_FILE)

Unique keywords: 110
Test: شوربة => [{'id': 'DF-001', 'name': 'شوربة الجريش'}]
Saved: C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project\backend\keyword_index.py


In [34]:
# ================== STRICT KEYWORD MATCHING (INGREDIENTS ONLY) ==================
# This cell:
# 1) Runs the keyword search now and prints results in Jupyter
# 2) Saves the same code into backend/keyword_router.py at the end

from pathlib import Path
import re

# ================== ORIGINAL CODE (UNCHANGED) ==================
def keyword_suggest(query: str, top_k: int = 12):
    q = normalize_ar(query)
    if not q:
        return [], "none"

    # Strict search inside ingredients only
    pattern = re.compile(rf"(^|\s){re.escape(q)}(\s|$)")
    out = []
    for r in recipes:
        ing_text = normalize_ar(" ".join(r.get("ingredients", [])))
        if pattern.search(ing_text):
            out.append({"id": r["id"], "name": r["name"]})

    # Remove duplicates
    seen = set()
    uniq = []
    for it in out:
        key = (it["id"], it["name"])
        if key not in seen:
            seen.add(key)
            uniq.append(it)

    return uniq[:top_k], "direct" if uniq else "none"

def show_keyword_results(query: str):
    items, mode = keyword_suggest(query, top_k=12)
    if not items:
        return "ما لقيت أكلات تحتوي هذه الكلمة داخل المكونات."
    text = f"أكلات تحتوي ({query}) في المكونات:\n"
    for i, it in enumerate(items, 1):
        text += f"{i}) {it['name']} — ({it['id']})\n"
    text += "\nاكتبي رقم الأكلة لعرض التفاصيل."
    return text

# ================== RUN NOW (JUPYTER OUTPUT) ==================
print(show_keyword_results("سمك"))
print("----")
print(show_keyword_results("لحم"))

# ================== SAVE SAME CODE TO FILE ==================
PROJECT_ROOT = Path(r"C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project")
TARGET_FILE = PROJECT_ROOT / "backend" / "keyword_router.py"
TARGET_FILE.parent.mkdir(parents=True, exist_ok=True)

code_to_save = """\
import re

def normalize_ar(text: str) -> str:
    if not text:
        return ""
    text = re.sub(r"[\\u064B-\\u065F\\u0670]", "", text)
    text = text.replace("أ","ا").replace("إ","ا").replace("آ","ا")
    text = text.replace("ى","ي").replace("ة","ه")
    text = re.sub(r"\\s+"," ", text).strip().lower()
    return text

def keyword_suggest(query: str, recipes, top_k: int = 12):
    q = normalize_ar(query)
    if not q:
        return [], "none"

    pattern = re.compile(rf"(^|\\s){re.escape(q)}(\\s|$)")
    out = []
    for r in recipes:
        ing_text = normalize_ar(" ".join(r.get("ingredients", [])))
        if pattern.search(ing_text):
            out.append({"id": r["id"], "name": r["name"]})

    seen = set()
    uniq = []
    for it in out:
        key = (it["id"], it["name"])
        if key not in seen:
            seen.add(key)
            uniq.append(it)

    return uniq[:top_k], "direct" if uniq else "none"

def show_keyword_results(query: str, recipes):
    items, mode = keyword_suggest(query, recipes, top_k=12)
    if not items:
        return "ما لقيت أكلات تحتوي هذه الكلمة داخل المكونات."
    text = f"أكلات تحتوي ({query}) في المكونات:\\n"
    for i, it in enumerate(items, 1):
        text += f"{i}) {it['name']} — ({it['id']})\\n"
    text += "\\nاكتبي رقم الأكلة لعرض التفاصيل."
    return text
"""

TARGET_FILE.write_text(code_to_save, encoding="utf-8")
print("Saved:", TARGET_FILE)

أكلات تحتوي (سمك) في المكونات:
1) مرق السمك بالنارجيل — (DF-014)
2) عطراية (الحبار) — (DF-016)
3) الربيس — (DF-017)
4) الصيادية — (DF-018)
5) المضبي — (DF-023)
6) المالح — (DF-036)

اكتبي رقم الأكلة لعرض التفاصيل.
----
أكلات تحتوي (لحم) في المكونات:
1) شوربة الجريش — (DF-001)
2) سوب اللحم والطماطم — (DF-002)
3) كمباه مقشّد — (DF-008)
4) الجريــش باللحم — (DF-012)
5) رز مقزّح — (DF-013)
6) لحم مفور — (DF-022)
7) المضبي — (DF-023)
8) المقديد — (DF-024)
9) المعجين — (DF-025)
10) قبولي — (DF-040)

اكتبي رقم الأكلة لعرض التفاصيل.
Saved: C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project\backend\keyword_router.py


In [35]:
# ================== EMBEDDINGS + FAISS (RUN + SAVE) ==================
# This cell:
# 1) Builds embeddings and FAISS index now and prints output in Jupyter
# 2) Saves the same logic into backend/vectorstore_faiss.py (single file)

from pathlib import Path
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

# ================== RUN NOW (JUPYTER OUTPUT) ==================

# Define embedding model
EMBED_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

# Load embedding model
embedder = SentenceTransformer(EMBED_MODEL)

# Prepare documents for vector database
texts = []
for r in recipes:
    doc = (
        f"اسم: {r['name']}\n"
        f"ID: {r['id']}\n"
        f"نوع: {r.get('type','')}\n"
        f"المنطقة: {r.get('region','')}\n"
        f"طريقة الطهي: {r.get('cook_method','')}\n"
        f"الوصف: {r.get('description','')}\n"
        f"المكونات: {', '.join(r.get('ingredients', [])[:20])}\n"
        f"الكلمات المفتاحية: {', '.join(r.get('keywords', []))}\n"
        f"طريقة التحضير: {r.get('prep','')}"
    )
    texts.append(doc)

# Generate embeddings
emb = embedder.encode(texts, show_progress_bar=True)
emb = np.array(emb, dtype=np.float32)

# Build FAISS index
index = faiss.IndexFlatL2(emb.shape[1])
index.add(emb)

# Print result in Jupyter
print("FAISS ready | vectors:", index.ntotal, "| dim:", emb.shape[1])

# ================== SAVE SAME CODE TO FILE ==================

PROJECT_ROOT = Path(r"C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project")
TARGET_FILE = PROJECT_ROOT / "backend" / "vectorstore_faiss.py"
TARGET_FILE.parent.mkdir(parents=True, exist_ok=True)

code_to_save = """\
# Build embeddings and FAISS vector database for recipe documents

import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

EMBED_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

def build_faiss_index(recipes):
    # Load embedding model
    embedder = SentenceTransformer(EMBED_MODEL)

    # Prepare documents for vector database
    texts = []
    for r in recipes:
        doc = (
            f"اسم: {r['name']}\\n"
            f"ID: {r['id']}\\n"
            f"نوع: {r.get('type','')}\\n"
            f"المنطقة: {r.get('region','')}\\n"
            f"طريقة الطهي: {r.get('cook_method','')}\\n"
            f"الوصف: {r.get('description','')}\\n"
            f"المكونات: {', '.join(r.get('ingredients', [])[:20])}\\n"
            f"الكلمات المفتاحية: {', '.join(r.get('keywords', []))}\\n"
            f"طريقة التحضير: {r.get('prep','')}"
        )
        texts.append(doc)

    # Generate embeddings
    emb = embedder.encode(texts, show_progress_bar=True)
    emb = np.array(emb, dtype=np.float32)

    # Create FAISS index
    index = faiss.IndexFlatL2(emb.shape[1])
    index.add(emb)

    return index, texts

if __name__ == "__main__":
    # Minimal self-test
    sample = [{
        "id": "DF-001",
        "name": "Test Dish",
        "type": "Traditional",
        "region": "Dhofar",
        "cook_method": "Boiling",
        "description": "Test description",
        "ingredients": ["لحم", "ملح"],
        "keywords": ["تقليدي"],
        "prep": "Test steps"
    }]
    idx, docs = build_faiss_index(sample)
    print("FAISS vectors:", idx.ntotal)
"""

TARGET_FILE.write_text(code_to_save, encoding="utf-8")
print("Saved:", TARGET_FILE)




Batches:   0%|          | 0/2 [00:00<?, ?it/s]

FAISS ready | vectors: 44 | dim: 384
Saved: C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project\backend\vectorstore_faiss.py


In [37]:
# Retrieval (RAG) + Ollama Generate (Domain-Restricted)

import subprocess
import numpy as np

def retrieve_top_chunks(query: str, top_k: int = 4):
    # Convert the user query into an embedding vector
    q_emb = embedder.encode([query])
    q_emb = np.array(q_emb, dtype=np.float32)

    # Search the FAISS index to get the nearest recipe documents
    D, I = index.search(q_emb, top_k)

    # Collect the matched recipe texts using their indices
    chunks = []
    for idx in I[0]:
        chunks.append(texts[idx])

    # Return the retrieved chunks as context for the LLM
    return chunks

def ollama_generate(prompt: str, model="llama3"):
    # Send the prompt to Ollama and return the generated response text
    r = subprocess.run(
        ["ollama", "run", model],
        input=prompt,
        text=True,
        capture_output=True,
        encoding="utf-8",
        errors="ignore"
    )
    return (r.stdout or "").strip()

# A domain warning to remind the assistant to answer only from the provided documents
DOMAIN_DISCLAIMER = (
    "تنبيه: هذا الشات يجيب فقط من وثائق (وصفات ظفار) داخل المشروع. "
    "إذا المعلومة غير موجودة في الوثائق، سيتم إرجاع: غير موجود في البيانات."
)

def rag_answer(user_q: str, model="llama3"):
    # Retrieve top relevant recipe chunks for the question
    chunks = retrieve_top_chunks(user_q, top_k=4)

    # Combine chunks into one context string
    context = "\n\n---\n\n".join(chunks)

    # Build a strict prompt that forces answers from context only
    prompt = f"""
{DOMAIN_DISCLAIMER}
قواعد مهمة:
- استخدم (السياق) فقط.
- إذا لم تجد الإجابة في السياق، اكتب بالضبط: غير موجود في البيانات
- لا تخمّن ولا تضف معلومات خارج السياق.
السياق:
{context}
سؤال المستخدم:
{user_q}
الإجابة:
"""

    # Generate the final answer using the LLM
    return ollama_generate(prompt, model=model)

In [39]:
# This cell defines a "smart router" for your chatbot:
# It detects what the user wants (ingredients/prep/description/all), finds recipes by name,
# suggests recipes by ingredient keyword (strict), handles number selection, and falls back to RAG for long questions.

import re  # Regular expressions for strict word matching

# ---------- 1) Intent Detection (decide what the user is asking for) ----------
def detect_intent(user_text: str) -> str:
    t = normalize_ar(user_text)  # Normalize the user text for easier matching

    wants_ing = any(w in t for w in ["مكونات", "مقادير", "المكونات", "ingredients"])  # Check if user wants ingredients
    wants_prep = any(w in t for w in ["طريقة", "تحضير", "اطبخ", "اسوي", "كيف", "خطوات", "prep", "cook"])  # Check if user wants steps
    wants_desc = any(w in t for w in ["وصف", "نبذه", "نبذة", "تعريف", "description", "about"])  # Check if user wants description
    wants_all  = any(w in t for w in ["كل", "كامل", "كامله", "كامله", "everything", "all"])  # Check if user wants everything

    # Use a clear priority order
    if wants_all:  # If user asked for everything
        return "all"
    if wants_ing and wants_prep and wants_desc:  # If user asked for all parts together
        return "all"
    if wants_ing:  # If user asked for ingredients
        return "ingredients"
    if wants_prep:  # If user asked for preparation steps
        return "prep"
    if wants_desc:  # If user asked for description
        return "description"

    # Default: when dish name is mentioned without a clear intent, show everything
    return "all"


# ---------- 2) Formatting (show only the requested part) ----------
def format_recipe_part(r: dict, intent: str) -> str:
    header = f"{r['name']} — ({r['id']})\n"  # Title line with recipe name and ID

    meta = ""  # Meta info block (type/region/cook method)
    if r.get("type"): 
        meta += f"نوع الأكلة: {r['type']}\n"  # Add recipe type if available
    if r.get("region"): 
        meta += f"المنطقة: {r['region']}\n"  # Add region if available
    if r.get("cook_method"): 
        meta += f"طريقة الطهي: {r['cook_method']}\n"  # Add cooking method if available

    if intent == "description":  # If user wants description only
        body = f"\nالوصف:\n{r.get('description','').strip()}" if r.get("description") else "\nالوصف:\nغير موجود في البيانات"
        return (header + meta + body).strip()

    if intent == "ingredients":  # If user wants ingredients only
        if r.get("ingredients"):
            ing = "\n".join([f"- {x}" for x in r["ingredients"]])  # Convert ingredient list into bullet lines
            return (header + meta + "\nالمكونات:\n" + ing).strip()
        return (header + meta + "\nالمكونات:\nغير موجود في البيانات").strip()

    if intent == "prep":  # If user wants preparation steps only
        body = f"\nطريقة التحضير:\n{r.get('prep','').strip()}" if r.get("prep") else "\nطريقة التحضير:\nغير موجود في البيانات"
        return (header + meta + body).strip()

    # intent == "all" (show all parts)
    parts = [header + meta]  # Start with header + meta

    if r.get("description"):
        parts.append("الوصف:\n" + r["description"].strip())  # Add description if available
    else:
        parts.append("الوصف:\nغير موجود في البيانات")  # Fallback if missing

    if r.get("ingredients"):
        parts.append("المكونات:\n" + "\n".join([f"- {x}" for x in r["ingredients"]]))  # Add ingredients list
    else:
        parts.append("المكونات:\nغير موجود في البيانات")  # Fallback if missing

    if r.get("prep"):
        parts.append("طريقة التحضير:\n" + r["prep"].strip())  # Add preparation steps
    else:
        parts.append("طريقة التحضير:\nغير موجود في البيانات")  # Fallback if missing

    return "\n\n".join(parts).strip()  # Join sections with blank lines


# ---------- 3) Find recipe by name (full or partial match) ----------
def find_recipes_in_text(user_text: str, top_k: int = 5):
    q = normalize_ar(user_text)  # Normalize the user query
    matches = []  # Store (score, recipe) matches

    for r in recipes:  # Loop through all recipes
        name_n = normalize_ar(r["name"])  # Normalize recipe name
        if name_n and (name_n in q or q in name_n):  # Match if the name is inside the message or the message is inside the name
            score = len(name_n)  # Longer name match usually means a more precise match
            matches.append((score, r))  # Save the match

    matches.sort(key=lambda x: x[0], reverse=True)  # Sort by score descending
    return [r for _, r in matches[:top_k]]  # Return top matches only


# ---------- 4) Strict keyword search in ingredients only ----------
def suggest_recipes_by_term(term: str, top_k: int = 12):
    t = normalize_ar(term)  # Normalize the keyword
    if not t:
        return []  # If empty after normalization, return nothing

    pattern = re.compile(rf"(^|\s){re.escape(t)}(\s|$)")  # Strict whole-word match
    results = []  # Store matching recipes (id + name)

    for r in recipes:  # Loop through all recipes
        ing_text = normalize_ar(" ".join(r.get("ingredients", [])))  # Join ingredients and normalize as one text
        if pattern.search(ing_text):  # Check if keyword exists in ingredients only
            results.append({"id": r["id"], "name": r["name"]})  # Add match

    seen = set()  # Track duplicates
    unique = []  # Store unique matches
    for it in results:
        key = (it["id"], it["name"])  # Unique key
        if key not in seen:
            seen.add(key)
            unique.append(it)

    return unique[:top_k]  # Return top results


# ---------- 5) Main dispatcher (decide what to do) ----------
def smart_router(user_text: str, state: dict):
    msg = user_text.strip()  # Remove extra spaces
    msg_n = normalize_ar(msg)  # Normalize message (kept for future expansions)

    # (A) If the user typed a number after ingredient suggestions
    if msg.isdigit() and state.get("last_suggestions"):
        choice = int(msg)  # Convert to integer
        opts = state["last_suggestions"]  # Get last suggestions list
        if 1 <= choice <= len(opts):
            picked_id = opts[choice - 1]["id"]  # Get chosen recipe ID
            picked = next((r for r in recipes if r["id"] == picked_id), None)  # Find full recipe by ID
            state["last_suggestions"] = None  # Clear suggestions after choosing

            if picked:
                return format_recipe_part(picked, "all"), state  # Show full recipe
            return "ما قدرت أجيب تفاصيل الأكلة.", state  # Fallback if not found
        return f"اختاري رقم بين 1 و {len(opts)}", state  # Ask for a valid range

    # (B) If the message contains a dish name (full or partial)
    name_hits = find_recipes_in_text(msg, top_k=6)  # Find recipes by name in the message
    if name_hits:
        intent = detect_intent(msg)  # Detect user intent (ingredients/prep/description/all)

        if len(name_hits) > 1:  # If multiple recipes match the name
            state["last_name_options"] = name_hits  # Save options for number selection
            state["last_suggestions"] = None  # Clear ingredient suggestions
            out = "لقيت أكثر من أكلة/نسخة مطابقة، اختاري رقم:\n\n"
            for i, r in enumerate(name_hits, 1):
                out += f"{i}) {r['name']} — ({r['id']})\n"
            out += "\nاكتبي رقم الاختيار."
            return out, state

        state["last_name_options"] = None  # Clear name options
        state["last_suggestions"] = None  # Clear suggestions
        return format_recipe_part(name_hits[0], intent), state  # Show the matched recipe part

    # (C) If the user typed a number after name options list
    if msg.isdigit() and state.get("last_name_options"):
        choice = int(msg)  # Convert to integer
        opts = state["last_name_options"]  # Get the stored name options
        if 1 <= choice <= len(opts):
            picked = opts[choice - 1]  # Pick the selected recipe
            state["last_name_options"] = None  # Clear options
            return format_recipe_part(picked, "all"), state  # Show full recipe
        return f"اختاري رقم بين 1 و {len(opts)}", state  # Ask for a valid range

    # (D) If the message is short, treat it as a keyword for ingredient search
    if len(msg.split()) <= 2:
        items = suggest_recipes_by_term(msg, top_k=12)  # Get suggestions by ingredient keyword
        state["last_suggestions"] = items if items else None  # Store suggestions for number selection
        state["last_name_options"] = None  # Clear name options
        return render_suggestions(msg, items), state  # Show suggestions list (render_suggestions must exist)

    # (E) If the message is long, use RAG + Ollama if available
    state["last_suggestions"] = None  # Clear suggestions
    state["last_name_options"] = None  # Clear name options

    if "rag_answer" in globals():  # Check if RAG function exists in the notebook
        return rag_answer(msg, model=MODEL_NAME), state  # Use RAG answer

    return "rag_answer غير مُعرّفة. شغّلي خلية RAG (FAISS + Ollama) أولاً.", state  # Final fallback

In [40]:
# This cell implements the final rule-based router for the chatbot.
# It normalizes Arabic text, finds recipes by name or ingredient keyword,
# formats recipe output, keeps conversation state, and falls back to RAG for long questions.
# At the end, the same code is saved into backend/router.py

import re
from pathlib import Path


# ===================== NORMALIZATION =====================
def normalize_ar(text: str) -> str:
    # Normalize Arabic text to make matching consistent
    if not text:
        return ""
    text = re.sub(r"[\u064B-\u065F\u0670]", "", text)  # Remove Arabic diacritics
    text = text.replace("أ","ا").replace("إ","ا").replace("آ","ا")  # Normalize Alef forms
    text = text.replace("ى","ي").replace("ة","ه")  # Normalize Ya and Ta Marbuta
    text = re.sub(r"\s+"," ", text).strip().lower()  # Remove extra spaces and lowercase
    return text


# ===================== FIND BY NAME (FULL/PARTIAL) =====================
def find_recipes_by_name(user_text: str):
    # Find recipes where the name matches the user text (full or partial)
    q = normalize_ar(user_text)
    hits = []
    for r in recipes:
        name_n = normalize_ar(r.get("name",""))
        if not name_n:
            continue
        if q == name_n or q in name_n or name_n in q:
            hits.append(r)

    # Prefer longer (more specific) names
    hits.sort(key=lambda x: len(normalize_ar(x.get("name",""))), reverse=True)
    return hits


# ===================== STRICT TERM IN INGREDIENTS ONLY =====================
def find_recipes_by_term(term: str):
    # Match keyword strictly inside ingredients only
    t = normalize_ar(term)
    if not t:
        return []

    pattern = re.compile(rf"(^|\s){re.escape(t)}(\s|$)")
    out = []
    for r in recipes:
        ing_text = normalize_ar(" ".join(r.get("ingredients", [])))
        if pattern.search(ing_text):
            out.append(r)
    return out


# ===================== FORMAT RECIPE (NO KEYWORDS SHOWN) =====================
def format_recipe(r: dict) -> str:
    # Convert a recipe dictionary into a readable response
    out = f"{r.get('name','')} — ({r.get('id','')})\n"

    if r.get("type"):
        out += f"نوع الأكلة: {r['type']}\n"
    if r.get("region"):
        out += f"المنطقة: {r['region']}\n"
    if r.get("cook_method"):
        out += f"طريقة الطهي: {r['cook_method']}\n"

    if r.get("description"):
        out += "\nالوصف:\n" + r["description"].strip()

    if r.get("ingredients"):
        out += "\n\nالمكونات:\n"
        for ing in r["ingredients"]:
            out += f"- {ing}\n"

    if r.get("prep"):
        out += "\nطريقة التحضير:\n" + r["prep"].strip()

    return out.strip()


# ===================== HELPERS =====================
def is_short_term(msg: str) -> bool:
    # One or two words are treated as a keyword search
    return len(msg.split()) <= 2


# ===================== STATE =====================
STATE = {
    "options": None,        # Last list of recipe options
    "options_mode": None    # Source of options (name or term)
}


# ===================== MAIN ROUTER =====================
def fixed_router(msg: str):
    # Main decision function that routes user input
    msg = (msg or "").strip()
    if not msg:
        return "اكتبي كلمة مفتاحية (مثل: لحم/سمك) أو اسم أكلة أو سؤال طويل."

    # 0) User selects a number from a previous list
    if msg.isdigit() and STATE["options"]:
        idx = int(msg) - 1
        opts = STATE["options"]
        if 0 <= idx < len(opts):
            picked = opts[idx]
            STATE["options"] = None
            STATE["options_mode"] = None
            return format_recipe(picked)
        return "اختاري رقم صحيح من القائمة."

    # 1) Search by recipe name
    name_matches = find_recipes_by_name(msg)
    if name_matches:
        if len(name_matches) == 1:
            STATE["options"] = None
            STATE["options_mode"] = None
            return format_recipe(name_matches[0])

        STATE["options"] = name_matches
        STATE["options_mode"] = "name"
        out = "لقيت أكثر من أكلة مطابقة للاسم، اختاري رقم:\n\n"
        for i, r in enumerate(name_matches, 1):
            out += f"{i}) {r['name']} — ({r['id']})\n"
        out += "\nاكتبي رقم الأكلة لعرض التفاصيل."
        return out

    # 2) Short keyword → strict ingredient search
    if is_short_term(msg):
        term_matches = find_recipes_by_term(msg)
        if term_matches:
            STATE["options"] = term_matches
            STATE["options_mode"] = "term"
            out = f"أكلات تحتوي ({msg}) داخل المكونات:\n\n"
            for i, r in enumerate(term_matches, 1):
                out += f"{i}) {r['name']} — ({r['id']})\n"
            out += "\nاكتبي رقم الأكلة لعرض التفاصيل."
            return out
        return "ما لقيت أكلات تحتوي هذه الكلمة داخل المكونات."

    # 3) Long question → RAG
    if "rag_answer" in globals():
        return rag_answer(msg, model=MODEL_NAME)

    return "rag_answer غير معرّفة. شغّلي خلية RAG (FAISS + Ollama) أولاً."


# ===================== QUICK TESTS =====================
print(fixed_router("لحم"))
print("----")
print(fixed_router("1"))
print("----")
print(fixed_router("الجريش"))


# ===================== SAVE SAME CODE TO FILE =====================
PROJECT_ROOT = Path(r"C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project")
TARGET_FILE = PROJECT_ROOT / "backend" / "router.py"
TARGET_FILE.parent.mkdir(parents=True, exist_ok=True)

code_to_save = """\
import re

def normalize_ar(text: str) -> str:
    if not text:
        return ""
    text = re.sub(r"[\\u064B-\\u065F\\u0670]", "", text)
    text = text.replace("أ","ا").replace("إ","ا").replace("آ","ا")
    text = text.replace("ى","ي").replace("ة","ه")
    text = re.sub(r"\\s+"," ", text).strip().lower()
    return text

def find_recipes_by_name(user_text: str, recipes):
    q = normalize_ar(user_text)
    hits = []
    for r in recipes:
        name_n = normalize_ar(r.get("name",""))
        if name_n and (q == name_n or q in name_n or name_n in q):
            hits.append(r)
    hits.sort(key=lambda x: len(normalize_ar(x.get("name",""))), reverse=True)
    return hits

def find_recipes_by_term(term: str, recipes):
    t = normalize_ar(term)
    if not t:
        return []
    pattern = re.compile(rf"(^|\\s){re.escape(t)}(\\s|$)")
    out = []
    for r in recipes:
        ing_text = normalize_ar(" ".join(r.get("ingredients", [])))
        if pattern.search(ing_text):
            out.append(r)
    return out

def format_recipe(r: dict) -> str:
    out = f"{r.get('name','')} — ({r.get('id','')})\\n"
    if r.get("type"): out += f"نوع الأكلة: {r['type']}\\n"
    if r.get("region"): out += f"المنطقة: {r['region']}\\n"
    if r.get("cook_method"): out += f"طريقة الطهي: {r['cook_method']}\\n"
    if r.get("description"): out += "\\nالوصف:\\n" + r["description"].strip()
    if r.get("ingredients"):
        out += "\\n\\nالمكونات:\\n"
        for ing in r["ingredients"]:
            out += f"- {ing}\\n"
    if r.get("prep"): out += "\\nطريقة التحضير:\\n" + r["prep"].strip()
    return out.strip()
"""

TARGET_FILE.write_text(code_to_save, encoding="utf-8")
print("Saved:", TARGET_FILE)

لقيت أكثر من أكلة مطابقة للاسم، اختاري رقم:

1) أرز مع اللوبيا الحمراء ( رز ودجر) — (DF-028)
2) سوب اللحم والطماطم — (DF-002)
3) الجريــش باللحم — (DF-012)
4) لحم مفور — (DF-022)
5) الحمضوت — (DF-034)

اكتبي رقم الأكلة لعرض التفاصيل.
----
أرز مع اللوبيا الحمراء ( رز ودجر) — (DF-028)
نوع الأكلة: أرز + بقوليات / طبق رئيسي
المنطقة: محافظة ظفار
طريقة الطهي: طبخ بالأرز

الوصف:
رز ودجر من الأكلات اليومية الشائعة في البيوت الظفارية، خاصة في المناطق الريفية. يُحضَّر من الأرز واللوبيا الحمراء ويُقدَّم غالبًا مع السمن البلدي، وقد يُحضَّر أحيانًا في المناسبات الخاصة.

المكونات:
- 3  أكواب أرز بسمتي
- 1.5 كوب دجر (لوبيا حمراء)
- 3 بصلة كبيرة
- 3 فصوص ثوم
- 3 ملاعق كبيرة سمن
- 1 ملعقة كبيرة ملح
- 6 أكواب ماء

طريقة التحضير:
يُنقع الدجر في الماء لمدة أربع ساعات، ثم يُسلق نصف سلقة. يُفرم البصل ويُحمّر بالسمن حتى يذبل. يُضاف الثوم ويُقلّب قليلًا. يُضاف الدجر ويُقلّب مع البصل جيدًا. يُغسل الأرز ويُضاف إلى القدر. يُضاف الماء والملح. يُترك على نار متوسطة حتى ينضج الأرز تمامًا. تكفي كم شخص: من 4 إلى 6 أشخ

In [41]:
# This cell cleans recipe text for display:
# It removes any part that starts with "الكلمات المفتاحية:" from description/prep,
# then formats the recipe without showing keywords.
# At the end, it tests the output in Jupyter and saves the same logic into backend/clean_display.py

import re
from pathlib import Path

# Cut any text starting from the "keywords" label (even if it appears mid-line)
def strip_keywords_anywhere(text: str) -> str:
    # Return empty string if input is empty
    if not text:
        return ""
    # Split the text at the first occurrence of the keywords label and keep the part before it
    text = re.split(r"(?:الكلمات\s+المفتاحية|كلمات\s+مفتاحية)\s*:\s*", text, maxsplit=1)[0]
    # Remove extra spaces
    return text.strip()

# Clean a recipe dict to ensure keywords are not leaked inside description/prep
def clean_recipe_for_display(r: dict) -> dict:
    # Make a shallow copy so we do not modify the original recipe object
    rr = dict(r)
    # Clean description field
    rr["description"] = strip_keywords_anywhere(rr.get("description", ""))
    # Clean preparation steps field
    rr["prep"] = strip_keywords_anywhere(rr.get("prep", ""))
    # Return cleaned recipe dict
    return rr

# Format a recipe for display (keywords never shown)
def format_recipe(r: dict) -> str:
    # Clean the recipe before formatting
    r = clean_recipe_for_display(r)

    # Build header line
    out = f"{r.get('name','')} — ({r.get('id','')})\n"

    # Add meta fields if they exist
    if r.get("type"):
        out += f"نوع الأكلة: {r['type']}\n"
    if r.get("region"):
        out += f"المنطقة: {r['region']}\n"
    if r.get("cook_method"):
        out += f"طريقة الطهي: {r['cook_method']}\n"

    # Add description if available
    if r.get("description"):
        out += "\nالوصف:\n" + r["description"].strip()

    # Add ingredients list if available
    if r.get("ingredients"):
        out += "\n\nالمكونات:\n"
        for ing in r["ingredients"]:
            out += f"- {ing}\n"

    # Add preparation steps if available
    if r.get("prep"):
        out += "\nطريقة التحضير:\n" + r["prep"].strip()

    # Return final formatted text
    return out.strip()

# ===================== QUICK TESTS (JUPYTER OUTPUT) =====================
# These prints should NOT show the "الكلمات المفتاحية" label
print(format_recipe(next(r for r in recipes if r["id"] == "DF-034")))  # الحمضوت
print("----")
print(format_recipe(next(r for r in recipes if r["id"] == "DF-024")))  # المقديد

# ===================== SAVE SAME CODE TO FILE =====================
PROJECT_ROOT = Path(r"C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project")
TARGET_FILE = PROJECT_ROOT / "backend" / "clean_display.py"
TARGET_FILE.parent.mkdir(parents=True, exist_ok=True)

code_to_save = """\
import re

def strip_keywords_anywhere(text: str) -> str:
    if not text:
        return ""
    text = re.split(r"(?:الكلمات\\s+المفتاحية|كلمات\\s+مفتاحية)\\s*:\\s*", text, maxsplit=1)[0]
    return text.strip()

def clean_recipe_for_display(r: dict) -> dict:
    rr = dict(r)
    rr["description"] = strip_keywords_anywhere(rr.get("description", ""))
    rr["prep"] = strip_keywords_anywhere(rr.get("prep", ""))
    return rr

def format_recipe(r: dict) -> str:
    r = clean_recipe_for_display(r)

    out = f"{r.get('name','')} — ({r.get('id','')})\\n"
    if r.get("type"): out += f"نوع الأكلة: {r['type']}\\n"
    if r.get("region"): out += f"المنطقة: {r['region']}\\n"
    if r.get("cook_method"): out += f"طريقة الطهي: {r['cook_method']}\\n"

    if r.get("description"):
        out += "\\nالوصف:\\n" + r["description"].strip()

    if r.get("ingredients"):
        out += "\\n\\nالمكونات:\\n"
        for ing in r["ingredients"]:
            out += f"- {ing}\\n"

    if r.get("prep"):
        out += "\\nطريقة التحضير:\\n" + r["prep"].strip()

    return out.strip()
"""

TARGET_FILE.write_text(code_to_save, encoding="utf-8")
print("Saved:", TARGET_FILE)

الحمضوت — (DF-034)
نوع الأكلة: أكلة تقليدية
المنطقة: محافظة ظفار
طريقة الطهي: طبخ بطيء

الوصف:
الحمضوت أكلة ظفارية شهيرة تُحضَّر من القطميم (السمن البلدي الأبيض) مع جريش القمح وقليل من الملح. تُطبخ عادةً لتحويل القطميم إلى سمن عربي، وتُقدَّم في صحن ويمكن أكلها بالخبز أو بمفردها، ويكون طعمها مالحًا.

المكونات:
- القطميم (السمن البلدي الأبيض)
- جريش القمح
- الملح

طريقة التحضير:
يُطبخ القطميم مع جريش القمح وقليل من الملح حتى يتكوّن خليط متماسك. تُقدَّم مباشرة بعد النضج. تكفي كم شخص: حسب الكمية
----
المقديد — (DF-024)
نوع الأكلة: لحم محفوظ / طبق تقليدي
المنطقة: محافظة ظفار
طريقة الطهي: تمليح وتجفيف

الوصف:
المقديد من الأكلات التقليدية المعروفة في محافظة ظفار، ويُعد من طرق حفظ اللحم القديمة التي اعتمد عليها الأهالي قديمًا. يُحضَّر المقديد من شرائح اللحم التي تُملّح وتُجفَّف في الهواء الطلق لفترة حتى تنشف تمامًا، مما يسمح بحفظها لفترات طويلة دون الحاجة إلى التبريد. لا يرتبط المقديد بمناسبة معيّنة، بل يُحضَّر غالبًا للاستعمال لاحقًا عند الحاجة، ويُؤكل كوجبة رئيسية بعد طبخه أو تسخينه، وغالبًا

In [42]:
# This cell makes the router safer:
# It calls RAG in a protected way (try/except) so the UI will not crash if RAG fails.
# It defines a safe_rag wrapper, a fixed_router_safe that uses it, and a chat_fn for the UI.
# At the end, it saves the same logic into backend/router_safe.py

from pathlib import Path

# Set the Ollama model name you have installed
MODEL_NAME = "llama3"  # You can change to "llama3.1" if you have it

def safe_rag(msg: str):
    # If rag_answer is not defined, return a clear message instead of crashing
    if "rag_answer" not in globals():
        return "RAG غير جاهز. شغّلي خلية (FAISS + rag_answer) أولاً."
    try:
        # Call RAG normally
        return rag_answer(msg, model=MODEL_NAME)
    except Exception as e:
        # Catch errors so the interface stays working
        return f"صار خطأ في RAG: {type(e).__name__} — {e}"

def fixed_router_safe(msg: str):
    # Main router that uses safe_rag instead of calling RAG directly
    msg = (msg or "").strip()
    if not msg:
        return "اكتبي مكوّن (لحم/سمك/تمر) أو اسم أكلة أو سؤال."

    # 0) If user picks a number from the last options list
    if msg.isdigit() and STATE.get("options"):
        idx = int(msg) - 1
        opts = STATE["options"]
        if 0 <= idx < len(opts):
            picked = opts[idx]
            STATE["options"] = None
            return format_recipe(picked)
        return "اختاري رقم صحيح من القائمة."

    # 1) Short message: keyword search in ingredients only
    if is_short_term(msg):
        term_hits = find_recipes_by_term(msg)
        if term_hits:
            STATE["options"] = term_hits
            out = f"أكلات تحتوي داخل المكونات فقط ({msg}):\n\n"
            for i, r in enumerate(term_hits[:12], 1):
                out += f"{i}) {r['name']} — ({r['id']})\n"
            out += "\nاكتبي رقم الأكلة لعرض التفاصيل."
            return out

        # If nothing found, fallback to safe RAG instead of returning 'not found'
        return safe_rag(msg)

    # 2) Search by recipe name
    name_hits = find_recipes_by_name(msg)
    if name_hits:
        if len(name_hits) == 1:
            STATE["options"] = None
            return format_recipe(name_hits[0])

        STATE["options"] = name_hits
        out = "لقيت أكثر من أكلة مطابقة للاسم، اختاري رقم:\n\n"
        for i, r in enumerate(name_hits, 1):
            out += f"{i}) {r['name']} — ({r['id']})\n"
        out += "\nاكتبي رقم الأكلة لعرض التفاصيل."
        return out

    # 3) Any other case: use safe RAG fallback
    return safe_rag(msg)

def chat_fn(message, history):
    # Gradio ChatInterface expects a function that returns one text string
    return fixed_router_safe(message)

# ===================== SAVE SAME CODE TO FILE =====================
PROJECT_ROOT = Path(r"C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project")
TARGET_FILE = PROJECT_ROOT / "backend" / "router_safe.py"
TARGET_FILE.parent.mkdir(parents=True, exist_ok=True)

code_to_save = """\
MODEL_NAME = "llama3"

def safe_rag(msg: str):
    if "rag_answer" not in globals():
        return "RAG غير جاهز. شغّلي خلية (FAISS + rag_answer) أولاً."
    try:
        return rag_answer(msg, model=MODEL_NAME)
    except Exception as e:
        return f"صار خطأ في RAG: {type(e).__name__} — {e}"

def fixed_router_safe(msg: str):
    msg = (msg or "").strip()
    if not msg:
        return "اكتبي مكوّن (لحم/سمك/تمر) أو اسم أكلة أو سؤال."

    if msg.isdigit() and STATE.get("options"):
        idx = int(msg) - 1
        opts = STATE["options"]
        if 0 <= idx < len(opts):
            picked = opts[idx]
            STATE["options"] = None
            return format_recipe(picked)
        return "اختاري رقم صحيح من القائمة."

    if is_short_term(msg):
        term_hits = find_recipes_by_term(msg)
        if term_hits:
            STATE["options"] = term_hits
            out = f"أكلات تحتوي داخل المكونات فقط ({msg}):\\n\\n"
            for i, r in enumerate(term_hits[:12], 1):
                out += f"{i}) {r['name']} — ({r['id']})\\n"
            out += "\\nاكتبي رقم الأكلة لعرض التفاصيل."
            return out
        return safe_rag(msg)

    name_hits = find_recipes_by_name(msg)
    if name_hits:
        if len(name_hits) == 1:
            STATE["options"] = None
            return format_recipe(name_hits[0])

        STATE["options"] = name_hits
        out = "لقيت أكثر من أكلة مطابقة للاسم، اختاري رقم:\\n\\n"
        for i, r in enumerate(name_hits, 1):
            out += f"{i}) {r['name']} — ({r['id']})\\n"
        out += "\\nاكتبي رقم الأكلة لعرض التفاصيل."
        return out

    return safe_rag(msg)

def chat_fn(message, history):
    return fixed_router_safe(message)
"""

TARGET_FILE.write_text(code_to_save, encoding="utf-8")
print("Saved:", TARGET_FILE)

Saved: C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project\backend\router_safe.py


In [43]:
#نتاكد انه كل شئ محفوظ
from pathlib import Path

PROJECT_ROOT = Path(r"C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project")
files = [
    PROJECT_ROOT / "backend" / "ollama_client.py",
    PROJECT_ROOT / "backend" / "keyword_router.py",
    PROJECT_ROOT / "backend" / "vectorstore_faiss.py",
    PROJECT_ROOT / "backend" / "rag_pipeline.py",
    PROJECT_ROOT / "backend" / "router.py",
    PROJECT_ROOT / "backend" / "clean_display.py",
    PROJECT_ROOT / "backend" / "router_safe.py",
]

print("Project root:", PROJECT_ROOT)
ok = 0
for f in files:
    exists = f.exists()
    print(("OK " if exists else "MISSING "), f)
    ok += 1 if exists else 0

print("\nSaved files:", ok, "/", len(files))
print("Save completion %:", round((ok/len(files))*100, 1), "%")

Project root: C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project
OK  C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project\backend\ollama_client.py
OK  C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project\backend\keyword_router.py
OK  C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project\backend\vectorstore_faiss.py
OK  C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project\backend\rag_pipeline.py
OK  C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project\backend\router.py
OK  C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project\backend\clean_display.py
OK  C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project\backend\router_safe.py

Saved files: 7 / 7
Save completion %: 100.0 %


In [44]:
# This cell checks Ollama readiness:
# It verifies Ollama is installed, the service is running, the model exists,
# then runs a quick test prompt. At the end, it saves the same code into backend/check_ollama.py

import subprocess, shutil
from pathlib import Path

def check_ollama(model="llama3"):
    # Check if the ollama executable exists in PATH
    if shutil.which("ollama") is None:
        return "Ollama is not found in PATH. Install Ollama, then open a new terminal."

    # Check if the Ollama service responds to 'ollama list'
    r = subprocess.run(["ollama", "list"], text=True, capture_output=True, encoding="utf-8", errors="ignore")
    if r.returncode != 0:
        return "Ollama is installed but not running. Run: ollama serve\n" + (r.stderr or "").strip()[:400]

    # Check if the requested model exists in the local list
    out = (r.stdout or "").strip()
    if model not in out:
        return f"Ollama is running but model ({model}) is not available. Run: ollama pull {model}\n\nAvailable models:\n{out}"

    # Quick run test to confirm generation works
    t = subprocess.run(
        ["ollama", "run", model],
        input="Say only: OK",
        text=True,
        capture_output=True,
        encoding="utf-8",
        errors="ignore"
    )
    if t.returncode == 0 and (t.stdout or "").strip():
        return "Ollama is ready.\n" + t.stdout.strip()[:200]

    return "Ollama is running but the test prompt returned an empty response.\n" + (t.stderr or "").strip()[:300]

# Run now (Jupyter output)
print(check_ollama("llama3"))

# Save the same code to a file
PROJECT_ROOT = Path(r"C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project")
TARGET_FILE = PROJECT_ROOT / "backend" / "check_ollama.py"
TARGET_FILE.parent.mkdir(parents=True, exist_ok=True)

code_to_save = """\
import subprocess, shutil

def check_ollama(model="llama3"):
    if shutil.which("ollama") is None:
        return "Ollama is not found in PATH. Install Ollama, then open a new terminal."

    r = subprocess.run(["ollama", "list"], text=True, capture_output=True, encoding="utf-8", errors="ignore")
    if r.returncode != 0:
        return "Ollama is installed but not running. Run: ollama serve\\n" + (r.stderr or "").strip()[:400]

    out = (r.stdout or "").strip()
    if model not in out:
        return f"Ollama is running but model ({model}) is not available. Run: ollama pull {model}\\n\\nAvailable models:\\n{out}"

    t = subprocess.run(
        ["ollama", "run", model],
        input="Say only: OK",
        text=True,
        capture_output=True,
        encoding="utf-8",
        errors="ignore"
    )
    if t.returncode == 0 and (t.stdout or "").strip():
        return "Ollama is ready.\\n" + t.stdout.strip()[:200]

    return "Ollama is running but the test prompt returned an empty response.\\n" + (t.stderr or "").strip()[:300]
"""

TARGET_FILE.write_text(code_to_save, encoding="utf-8")
print("Saved:", TARGET_FILE)

Ollama is ready.
OK
Saved: C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project\backend\check_ollama.py


In [45]:
# This cell defines the final engine:
# It combines strict ingredient matching, name search, general search, and RAG (FAISS + Ollama).
# It also runs quick tests in Jupyter and saves the same code into backend/engine.py.

import re
import numpy as np
import subprocess
from pathlib import Path


# ===================== NORMALIZE =====================
def normalize_ar(text: str) -> str:
    # Normalize Arabic text for consistent matching
    if not text:
        return ""
    text = re.sub(r"[\u064B-\u065F\u0670]", "", text)  # Remove Arabic diacritics
    text = text.replace("أ","ا").replace("إ","ا").replace("آ","ا")  # Normalize Alef forms
    text = text.replace("ى","ي").replace("ة","ه")  # Normalize Ya and Ta Marbuta
    text = re.sub(r"\s+"," ", text).strip().lower()  # Clean spaces and lowercase
    return text


# ===================== TOKENIZE (ingredients only) =====================
def tokenize_ar(text: str):
    # Tokenize Arabic text into a set of words, with optional "ال" removal
    t = normalize_ar(text)
    t = re.sub(r"[^0-9\u0621-\u064A]+", " ", t)  # Keep Arabic letters and numbers only
    words = [w for w in t.split() if w]  # Split into words
    tokens = set(words)  # Use set for fast lookup
    for w in words:
        if w.startswith("ال") and len(w) > 2:
            tokens.add(w[2:])  # Add the word without "ال"
    return tokens

def canonical_term(term: str):
    # Normalize a single search term and remove "ال" if it exists
    t = normalize_ar(term)
    t = re.sub(r"[^0-9\u0621-\u064A]+", " ", t).strip()
    if not t:
        return ""
    if t.startswith("ال") and len(t) > 2:
        return t[2:]
    return t


# ===================== STRICT ingredient search (100%) =====================
def find_recipes_by_ingredient_strict(term: str):
    # Strictly match the term as a full token inside ingredients only
    t = canonical_term(term)
    if not t:
        return []
    out = []
    for r in recipes:
        ing_text = " ".join(r.get("ingredients", []))  # Ingredients only
        tokens = tokenize_ar(ing_text)  # Token set of ingredients
        if t in tokens:  # Exact token match
            out.append(r)
    return out


# ===================== general search (name/type/region/desc/ingredients) =====================
def find_recipes_general(term: str):
    # General substring search across multiple fields (for words like "تقليدي")
    t = normalize_ar(term)
    if not t:
        return []
    out = []
    for r in recipes:
        hay = " ".join([
            r.get("name",""),
            r.get("type","") or "",
            r.get("region","") or "",
            r.get("cook_method","") or "",
            r.get("description","") or "",
            " ".join(r.get("ingredients", []))
        ])
        if t in normalize_ar(hay):
            out.append(r)
    return out


# ===================== name search (user input inside name) =====================
def find_recipes_by_name(user_text: str):
    # Find recipes where user text appears in the recipe name
    q = normalize_ar(user_text)
    hits = []
    for r in recipes:
        name_n = normalize_ar(r.get("name",""))
        if not name_n:
            continue
        if q == name_n or (q and q in name_n):
            hits.append(r)
    hits.sort(key=lambda x: len(normalize_ar(x.get("name",""))), reverse=True)
    return hits


# ===================== format =====================
def format_recipe(r: dict) -> str:
    # Format a recipe into a response text (without emojis)
    out = f"{r.get('name','')} — ({r.get('id','')})\n"
    if r.get("type"): out += f"نوع الأكلة: {r['type']}\n"
    if r.get("region"): out += f"المنطقة: {r['region']}\n"
    if r.get("cook_method"): out += f"طريقة الطهي: {r['cook_method']}\n"

    if r.get("description"):
        out += "\nالوصف:\n" + r["description"].strip()

    if r.get("ingredients"):
        out += "\n\nالمكونات:\n" + "\n".join([f"- {x}" for x in r["ingredients"]])

    if r.get("prep"):
        out += "\n\nطريقة التحضير:\n" + r["prep"].strip()

    return out.strip()

def is_short_term(msg: str) -> bool:
    # One or two words is treated as a keyword search
    return len((msg or "").split()) <= 2


# ===================== RAG (FAISS + Ollama) =====================
MODEL_NAME = "llama3"  # Change only if you use a different local model

DOMAIN_DISCLAIMER = (
    "تنبيه: هذا الشات يجيب فقط من وثائق (وصفات ظفار) داخل المشروع. "
    "إذا المعلومة غير موجودة في الوثائق، سيتم إرجاع: غير موجود في البيانات."
)

def retrieve_top_chunks(query: str, top_k: int = 4):
    # Embed the query and search the FAISS index
    q_emb = embedder.encode([query])
    q_emb = np.array(q_emb, dtype=np.float32)
    D, I = index.search(q_emb, top_k)

    chunks = []
    for idx in I[0]:
        chunks.append(texts[idx])
    return chunks

def ollama_generate(prompt: str, model=MODEL_NAME):
    # Call Ollama and return the model output
    r = subprocess.run(
        ["ollama", "run", model],
        input=prompt,
        text=True,
        capture_output=True,
        encoding="utf-8",
        errors="ignore"
    )
    return (r.stdout or "").strip()

def rag_answer(user_q: str, model=MODEL_NAME):
    # Retrieve context and ask the model to answer using context only
    chunks = retrieve_top_chunks(user_q, top_k=4)
    context = "\n\n---\n\n".join(chunks)

    prompt = f"""
{DOMAIN_DISCLAIMER}

قواعد مهمة:
- استخدم (السياق) فقط.
- إذا لم تجد الإجابة في السياق، اكتب بالضبط: غير موجود في البيانات
- لا تخمّن ولا تضف معلومات خارج السياق.

السياق:
{context}

سؤال المستخدم:
{user_q}

الإجابة:
"""
    out = ollama_generate(prompt, model=model)
    return out if out else "غير موجود في البيانات"


# ===================== STATE + ROUTER =====================
STATE = {"options": None}

def fixed_router(msg: str):
    # Main router that combines strict ingredients, general search, name search, and RAG
    msg = (msg or "").strip()
    if not msg:
        return "اكتبي مكوّن (لحم/سمك/تمر) أو كلمة عامة (تقليدي/تراث) أو اسم أكلة."

    # (0) User selects a number from the last options list
    if msg.isdigit() and STATE["options"]:
        idx = int(msg) - 1
        opts = STATE["options"]
        if 0 <= idx < len(opts):
            picked = opts[idx]
            STATE["options"] = None
            return format_recipe(picked)
        return "اختاري رقم صحيح من القائمة."

    # (1) Short keyword: strict ingredient search first
    if is_short_term(msg):
        strict_hits = find_recipes_by_ingredient_strict(msg)
        if strict_hits:
            STATE["options"] = strict_hits[:12]
            out = "أكلات تحتوي داخل المكونات فقط:\n\n"
            for i, r in enumerate(STATE["options"], 1):
                out += f"{i}) {r['name']} — ({r['id']})\n"
            out += "\nاكتبي رقم الأكلة لعرض التفاصيل."
            return out

        # If not found in ingredients, use general search
        general_hits = find_recipes_general(msg)
        if general_hits:
            STATE["options"] = general_hits[:12]
            out = "ما لقيت داخل المكونات، لكن لقيتها في البحث العام:\n\n"
            for i, r in enumerate(STATE["options"], 1):
                out += f"{i}) {r['name']} — ({r['id']})\n"
            out += "\nاكتبي رقم الأكلة لعرض التفاصيل."
            return out

        return "ما لقيت نتائج."

    # (2) Longer text: try name search first
    name_hits = find_recipes_by_name(msg)
    if name_hits:
        if len(name_hits) == 1:
            STATE["options"] = None
            return format_recipe(name_hits[0])

        STATE["options"] = name_hits[:12]
        out = "لقيت أكثر من أكلة مطابقة للاسم، اختاري رقم:\n\n"
        for i, r in enumerate(STATE["options"], 1):
            out += f"{i}) {r['name']} — ({r['id']})\n"
        out += "\nاكتبي رقم الأكلة لعرض التفاصيل."
        return out

    # (3) Fallback: use RAG when no clear name match
    return rag_answer(msg, model=MODEL_NAME)


# ===================== quick tests (Jupyter output) =====================
print(fixed_router("لحم"))
print("----")
print(fixed_router("تقليدي"))
print("----")
print(fixed_router("المقديد"))


# ===================== SAVE SAME CODE TO FILE =====================
PROJECT_ROOT = Path(r"C:\\Users\\USER PC\\OneDrive\\Desktop\\dhofar_flavor_chat project")
TARGET_FILE = PROJECT_ROOT / "backend" / "engine.py"
TARGET_FILE.parent.mkdir(parents=True, exist_ok=True)

code_to_save = r'''import re
import numpy as np
import subprocess

def normalize_ar(text: str) -> str:
    if not text:
        return ""
    text = re.sub(r"[\u064B-\u065F\u0670]", "", text)
    text = text.replace("أ","ا").replace("إ","ا").replace("آ","ا")
    text = text.replace("ى","ي").replace("ة","ه")
    text = re.sub(r"\s+"," ", text).strip().lower()
    return text

def tokenize_ar(text: str):
    t = normalize_ar(text)
    t = re.sub(r"[^0-9\u0621-\u064A]+", " ", t)
    words = [w for w in t.split() if w]
    tokens = set(words)
    for w in words:
        if w.startswith("ال") and len(w) > 2:
            tokens.add(w[2:])
    return tokens

def canonical_term(term: str):
    t = normalize_ar(term)
    t = re.sub(r"[^0-9\u0621-\u064A]+", " ", t).strip()
    if not t:
        return ""
    if t.startswith("ال") and len(t) > 2:
        return t[2:]
    return t

def find_recipes_by_ingredient_strict(term: str, recipes):
    t = canonical_term(term)
    if not t:
        return []
    out = []
    for r in recipes:
        ing_text = " ".join(r.get("ingredients", []))
        tokens = tokenize_ar(ing_text)
        if t in tokens:
            out.append(r)
    return out

def find_recipes_general(term: str, recipes):
    t = normalize_ar(term)
    if not t:
        return []
    out = []
    for r in recipes:
        hay = " ".join([
            r.get("name",""),
            r.get("type","") or "",
            r.get("region","") or "",
            r.get("cook_method","") or "",
            r.get("description","") or "",
            " ".join(r.get("ingredients", []))
        ])
        if t in normalize_ar(hay):
            out.append(r)
    return out

def find_recipes_by_name(user_text: str, recipes):
    q = normalize_ar(user_text)
    hits = []
    for r in recipes:
        name_n = normalize_ar(r.get("name",""))
        if not name_n:
            continue
        if q == name_n or (q and q in name_n):
            hits.append(r)
    hits.sort(key=lambda x: len(normalize_ar(x.get("name",""))), reverse=True)
    return hits

def format_recipe(r: dict) -> str:
    out = f"{r.get('name','')} — ({r.get('id','')})\n"
    if r.get("type"): out += f"نوع الأكلة: {r['type']}\n"
    if r.get("region"): out += f"المنطقة: {r['region']}\n"
    if r.get("cook_method"): out += f"طريقة الطهي: {r['cook_method']}\n"

    if r.get("description"):
        out += "\nالوصف:\n" + r["description"].strip()

    if r.get("ingredients"):
        out += "\n\nالمكونات:\n" + "\n".join([f"- {x}" for x in r["ingredients"]])

    if r.get("prep"):
        out += "\n\nطريقة التحضير:\n" + r["prep"].strip()

    return out.strip()

def is_short_term(msg: str) -> bool:
    return len((msg or "").split()) <= 2

MODEL_NAME = "llama3"

DOMAIN_DISCLAIMER = (
    "تنبيه: هذا الشات يجيب فقط من وثائق (وصفات ظفار) داخل المشروع. "
    "إذا المعلومة غير موجودة في الوثائق، سيتم إرجاع: غير موجود في البيانات."
)

def retrieve_top_chunks(query: str, top_k: int, embedder, index, texts):
    q_emb = embedder.encode([query])
    q_emb = np.array(q_emb, dtype=np.float32)
    D, I = index.search(q_emb, top_k)
    return [texts[idx] for idx in I[0]]

def ollama_generate(prompt: str, model=MODEL_NAME):
    r = subprocess.run(
        ["ollama", "run", model],
        input=prompt,
        text=True,
        capture_output=True,
        encoding="utf-8",
        errors="ignore"
    )
    return (r.stdout or "").strip()

def rag_answer(user_q: str, embedder, index, texts, top_k: int = 4, model=MODEL_NAME):
    chunks = retrieve_top_chunks(user_q, top_k=top_k, embedder=embedder, index=index, texts=texts)
    context = "\n\n---\n\n".join(chunks)

    prompt = f"""
{DOMAIN_DISCLAIMER}

قواعد مهمة:
- استخدم (السياق) فقط.
- إذا لم تجد الإجابة في السياق، اكتب بالضبط: غير موجود في البيانات
- لا تخمّن ولا تضف معلومات خارج السياق.

السياق:
{context}

سؤال المستخدم:
{user_q}

الإجابة:
"""
    out = ollama_generate(prompt, model=model)
    return out if out else "غير موجود في البيانات"

STATE = {"options": None}

def fixed_router(msg: str, recipes, embedder=None, index=None, texts=None):
    msg = (msg or "").strip()
    if not msg:
        return "اكتبي مكوّن (لحم/سمك/تمر) أو كلمة عامة (تقليدي/تراث) أو اسم أكلة."

    if msg.isdigit() and STATE["options"]:
        idx = int(msg) - 1
        opts = STATE["options"]
        if 0 <= idx < len(opts):
            picked = opts[idx]
            STATE["options"] = None
            return format_recipe(picked)
        return "اختاري رقم صحيح من القائمة."

    if is_short_term(msg):
        strict_hits = find_recipes_by_ingredient_strict(msg, recipes)
        if strict_hits:
            STATE["options"] = strict_hits[:12]
            out = "أكلات تحتوي داخل المكونات فقط:\n\n"
            for i, r in enumerate(STATE["options"], 1):
                out += f"{i}) {r['name']} — ({r['id']})\n"
            out += "\nاكتبي رقم الأكلة لعرض التفاصيل."
            return out

        general_hits = find_recipes_general(msg, recipes)
        if general_hits:
            STATE["options"] = general_hits[:12]
            out = "ما لقيت داخل المكونات، لكن لقيتها في البحث العام:\n\n"
            for i, r in enumerate(STATE["options"], 1):
                out += f"{i}) {r['name']} — ({r['id']})\n"
            out += "\nاكتبي رقم الأكلة لعرض التفاصيل."
            return out

        return "ما لقيت نتائج."

    name_hits = find_recipes_by_name(msg, recipes)
    if name_hits:
        if len(name_hits) == 1:
            STATE["options"] = None
            return format_recipe(name_hits[0])

        STATE["options"] = name_hits[:12]
        out = "لقيت أكثر من أكلة مطابقة للااسم، اختاري رقم:\n\n"
        for i, r in enumerate(STATE["options"], 1):
            out += f"{i}) {r['name']} — ({r['id']})\n"
        out += "\nاكتبي رقم الأكلة لعرض التفاصيل."
        return out

    if embedder is not None and index is not None and texts is not None:
        return rag_answer(msg, embedder=embedder, index=index, texts=texts, top_k=4, model=MODEL_NAME)

    return "RAG غير جاهز. وفري embedder/index/texts أو شغلي خلية FAISS."
'''

TARGET_FILE.write_text(code_to_save, encoding="utf-8")
print("Saved:", TARGET_FILE)

أكلات تحتوي داخل المكونات فقط:

1) شوربة الجريش — (DF-001)
2) سوب اللحم والطماطم — (DF-002)
3) كمباه مقشّد — (DF-008)
4) القصابية — (DF-011)
5) الجريــش باللحم — (DF-012)
6) رز مقزّح — (DF-013)
7) لحم مفور — (DF-022)
8) المضبي — (DF-023)
9) المقديد — (DF-024)
10) المعجين — (DF-025)
11) قبولي — (DF-040)

اكتبي رقم الأكلة لعرض التفاصيل.
----
ما لقيت داخل المكونات، لكن لقيتها في البحث العام:

1) شوربة الجريش — (DF-001)
2) خبز الثخين — (DF-003)
3) العطراية — (DF-005)
4) خبز لَحوح — (DF-007)
5) عيش بالنارجيل — (DF-010)
6) القصابية — (DF-011)
7) الجريــش باللحم — (DF-012)
8) رز مقزّح — (DF-013)
9) لبنية الكزيب — (DF-015)
10) الربيس — (DF-017)
11) الصيادية — (DF-018)
12) القشاط — (DF-019)

اكتبي رقم الأكلة لعرض التفاصيل.
----
ما لقيت داخل المكونات، لكن لقيتها في البحث العام:

1) المقديد — (DF-024)

اكتبي رقم الأكلة لعرض التفاصيل.
Saved: C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project\backend\engine.py


In [46]:
# This cell cleans and sanitizes the parsed recipes list.
# It removes any leaked text that accidentally got attached to description/prep/ingredients
# (مثل: "الكلمات المفتاحية", "تكفي كم شخص", أو بداية وصفة ثانية داخل نفس النص).
# It runs the cleaning now in Jupyter, then saves the same logic into backend/sanitize_recipes.py.

import re
from pathlib import Path

# Clean a single text field (description/prep/ingredient line) from leaked next-recipe text
def _clean_text_field(text: str) -> str:
    # Return empty string if the input is missing
    if not text:
        return ""

    # Convert to string and remove extra spaces
    t = str(text).strip()

    # Cut the text if these markers appear inside the field
    cut_markers = [
        r"\bتكفي\s*كم\s*شخص\b",
        r"\bالكلمات\s*المفتاحية\b",
    ]
    for m in cut_markers:
        t = re.split(m, t, maxsplit=1)[0].strip()

    # Cut if a new recipe title line appears inside the text (example: "... — (DF-026)")
    t = re.split(
        r"\n\s*[\u0600-\u06FF0-9 ()\-\–—_]+?\s*—\s*\(DF-\d+\)\s*\n",
        t,
        maxsplit=1
    )[0].strip()

    # Cut if a new recipe ID appears inside the text (a sign of a new record)
    t = re.split(r"\n\s*ID\s*:\s*DF-\d+\s*\n", t, maxsplit=1)[0].strip()
    t = re.split(r"\n\s*DF-\d+\s*\n", t, maxsplit=1)[0].strip()

    # Reduce very long blank-line sequences
    t = re.sub(r"\n{3,}", "\n\n", t).strip()
    return t

# Clean the ingredients list and remove leading bullet symbols
def _clean_ingredients_list(ings):
    # Return empty list if ingredients are missing
    if not ings:
        return []
    cleaned = []
    for x in ings:
        # Clean each ingredient line
        s = _clean_text_field(x)
        # Remove bullet prefixes like "-" or "•"
        s = re.sub(r"^\s*[-•]+\s*", "", s).strip()
        if s:
            cleaned.append(s)
    return cleaned

# Sanitize all recipes in-place and remove unwanted keys
def sanitize_recipes(recipes_list):
    """
    Removes leaked text (keywords/serves/next recipe) if it got attached to a recipe.
    Run once after recipes parsing.
    """
    for r in recipes_list:
        # Clean basic text fields
        r["name"] = (r.get("name") or "").strip()
        r["description"] = _clean_text_field(r.get("description", ""))
        r["prep"] = _clean_text_field(r.get("prep", ""))

        # Clean ingredients list
        r["ingredients"] = _clean_ingredients_list(r.get("ingredients", []))

        # Remove keys that should not exist or should not be shown
        for k in ["keywords", "key_words", "tags", "serves", "portion", "تكفي", "كلمات_مفتاحية"]:
            if k in r:
                r.pop(k, None)

    return recipes_list

# Run now (Jupyter output)
recipes = sanitize_recipes(recipes)
print("Recipes sanitized: leaked text removed from description/prep/ingredients.")

# Save the same code to a file
PROJECT_ROOT = Path(r"C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project")
TARGET_FILE = PROJECT_ROOT / "backend" / "sanitize_recipes.py"
TARGET_FILE.parent.mkdir(parents=True, exist_ok=True)

code_to_save = """\
import re

def _clean_text_field(text: str) -> str:
    if not text:
        return ""
    t = str(text).strip()

    cut_markers = [
        r"\\bتكفي\\s*كم\\s*شخص\\b",
        r"\\bالكلمات\\s*المفتاحية\\b",
    ]
    for m in cut_markers:
        t = re.split(m, t, maxsplit=1)[0].strip()

    t = re.split(
        r"\\n\\s*[\\u0600-\\u06FF0-9 ()\\-\\–—_]+?\\s*—\\s*\\(DF-\\d+\\)\\s*\\n",
        t,
        maxsplit=1
    )[0].strip()

    t = re.split(r"\\n\\s*ID\\s*:\\s*DF-\\d+\\s*\\n", t, maxsplit=1)[0].strip()
    t = re.split(r"\\n\\s*DF-\\d+\\s*\\n", t, maxsplit=1)[0].strip()

    t = re.sub(r"\\n{3,}", "\\n\\n", t).strip()
    return t

def _clean_ingredients_list(ings):
    if not ings:
        return []
    cleaned = []
    for x in ings:
        s = _clean_text_field(x)
        s = re.sub(r"^\\s*[-•]+\\s*", "", s).strip()
        if s:
            cleaned.append(s)
    return cleaned

def sanitize_recipes(recipes_list):
    for r in recipes_list:
        r["name"] = (r.get("name") or "").strip()
        r["description"] = _clean_text_field(r.get("description", ""))
        r["prep"] = _clean_text_field(r.get("prep", ""))

        r["ingredients"] = _clean_ingredients_list(r.get("ingredients", []))

        for k in ["keywords", "key_words", "tags", "serves", "portion", "تكفي", "كلمات_مفتاحية"]:
            if k in r:
                r.pop(k, None)

    return recipes_list
"""

TARGET_FILE.write_text(code_to_save, encoding="utf-8")
print("Saved:", TARGET_FILE)

Recipes sanitized: leaked text removed from description/prep/ingredients.
Saved: C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project\backend\sanitize_recipes.py


In [47]:
# This cell creates ONE final backend module that merges all improvements into one file.
# It writes backend/app_engine.py which includes:
# - recipe sanitization (remove leaked text)
# - clean display (remove keywords if leaked)
# - strict ingredient matching (100% token match)
# - name search + general search
# - safe RAG wrapper (FAISS + Ollama)
# - a stateful Engine class for chat routing

from pathlib import Path

PROJECT_ROOT = Path(r"C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project")
TARGET_FILE = PROJECT_ROOT / "backend" / "app_engine.py"
TARGET_FILE.parent.mkdir(parents=True, exist_ok=True)

code = r'''
import re
import numpy as np
import subprocess

# ===================== NORMALIZATION =====================
def normalize_ar(text: str) -> str:
    if not text:
        return ""
    text = re.sub(r"[\u064B-\u065F\u0670]", "", text)  # Remove Arabic diacritics
    text = text.replace("أ","ا").replace("إ","ا").replace("آ","ا")  # Normalize Alef
    text = text.replace("ى","ي").replace("ة","ه")  # Normalize Ya / Ta Marbuta
    text = re.sub(r"\s+"," ", text).strip().lower()  # Clean spaces and lowercase
    return text

# ===================== SANITIZE RECIPES (remove leaked next-recipe text) =====================
def _clean_text_field(text: str) -> str:
    if not text:
        return ""

    t = str(text).strip()

    # Cut if markers appear inside description/prep
    cut_markers = [
        r"\bتكفي\s*كم\s*شخص\b",
        r"\bالكلمات\s*المفتاحية\b",
    ]
    for m in cut_markers:
        t = re.split(m, t, maxsplit=1)[0].strip()

    # Cut if a new recipe title line appears inside the text (example: "... — (DF-026)")
    t = re.split(r"\n\s*[\u0600-\u06FF0-9 ()\-\–—_]+?\s*—\s*\(DF-\d+\)\s*\n", t, maxsplit=1)[0].strip()

    # Cut if a new recipe ID appears inside the text
    t = re.split(r"\n\s*ID\s*:\s*DF-\d+\s*\n", t, maxsplit=1)[0].strip()
    t = re.split(r"\n\s*DF-\d+\s*\n", t, maxsplit=1)[0].strip()

    # Clean extra blank lines
    t = re.sub(r"\n{3,}", "\n\n", t).strip()
    return t

def _clean_ingredients_list(ings):
    if not ings:
        return []
    cleaned = []
    for x in ings:
        s = _clean_text_field(x)
        s = re.sub(r"^\s*[-•]+\s*", "", s).strip()  # Remove bullet prefixes
        if s:
            cleaned.append(s)
    return cleaned

def sanitize_recipes(recipes_list):
    # Clean recipes in-place and remove unwanted keys
    for r in recipes_list:
        r["name"] = (r.get("name") or "").strip()
        r["description"] = _clean_text_field(r.get("description", ""))
        r["prep"] = _clean_text_field(r.get("prep", ""))
        r["ingredients"] = _clean_ingredients_list(r.get("ingredients", []))

        for k in ["keywords", "key_words", "tags", "serves", "portion", "تكفي", "كلمات_مفتاحية"]:
            if k in r:
                r.pop(k, None)
    return recipes_list

# ===================== CLEAN DISPLAY (strip keywords if leaked) =====================
def strip_keywords_anywhere(text: str) -> str:
    if not text:
        return ""
    text = re.split(r"(?:الكلمات\s+المفتاحية|كلمات\s+مفتاحية)\s*:\s*", text, maxsplit=1)[0]
    return text.strip()

def clean_recipe_for_display(r: dict) -> dict:
    rr = dict(r)
    rr["description"] = strip_keywords_anywhere(rr.get("description", ""))
    rr["prep"] = strip_keywords_anywhere(rr.get("prep", ""))
    return rr

# ===================== TOKENIZE (ingredients only) =====================
def tokenize_ar(text: str):
    t = normalize_ar(text)
    t = re.sub(r"[^0-9\u0621-\u064A]+", " ", t)  # Keep Arabic letters and numbers only
    words = [w for w in t.split() if w]
    tokens = set(words)
    for w in words:
        if w.startswith("ال") and len(w) > 2:
            tokens.add(w[2:])  # Add without "ال"
    return tokens

def canonical_term(term: str):
    t = normalize_ar(term)
    t = re.sub(r"[^0-9\u0621-\u064A]+", " ", t).strip()
    if not t:
        return ""
    return t[2:] if t.startswith("ال") and len(t) > 2 else t

# ===================== SEARCH =====================
def find_recipes_by_ingredient_strict(term: str, recipes):
    # Strict token match in ingredients only
    t = canonical_term(term)
    if not t:
        return []
    out = []
    for r in recipes:
        ing_text = " ".join(r.get("ingredients", []))
        tokens = tokenize_ar(ing_text)
        if t in tokens:
            out.append(r)
    return out

def find_recipes_general(term: str, recipes):
    # General substring search across multiple fields
    t = normalize_ar(term)
    if not t:
        return []
    out = []
    for r in recipes:
        hay = " ".join([
            r.get("name",""),
            r.get("type","") or "",
            r.get("region","") or "",
            r.get("cook_method","") or "",
            r.get("description","") or "",
            " ".join(r.get("ingredients", []))
        ])
        if t in normalize_ar(hay):
            out.append(r)
    return out

def find_recipes_by_name(user_text: str, recipes):
    # Find if user text exists inside recipe name
    q = normalize_ar(user_text)
    hits = []
    for r in recipes:
        name_n = normalize_ar(r.get("name",""))
        if not name_n:
            continue
        if q == name_n or (q and q in name_n):
            hits.append(r)
    hits.sort(key=lambda x: len(normalize_ar(x.get("name",""))), reverse=True)
    return hits

# ===================== FORMAT =====================
def format_recipe(r: dict) -> str:
    # Format recipe for display (no keywords leaked)
    r = clean_recipe_for_display(r)

    out = f"{r.get('name','')} — ({r.get('id','')})\n"
    if r.get("type"): out += f"نوع الأكلة: {r['type']}\n"
    if r.get("region"): out += f"المنطقة: {r['region']}\n"
    if r.get("cook_method"): out += f"طريقة الطهي: {r['cook_method']}\n"

    if r.get("description"):
        out += "\nالوصف:\n" + r["description"].strip()

    if r.get("ingredients"):
        out += "\n\nالمكونات:\n" + "\n".join([f"- {x}" for x in r["ingredients"]])

    if r.get("prep"):
        out += "\n\nطريقة التحضير:\n" + r["prep"].strip()

    return out.strip()

def is_short_term(msg: str) -> bool:
    return len((msg or "").split()) <= 2

# ===================== RAG (FAISS + Ollama) =====================
DOMAIN_DISCLAIMER = (
    "تنبيه: هذا الشات يجيب فقط من وثائق (وصفات ظفار) داخل المشروع. "
    "إذا المعلومة غير موجودة في الوثائق، سيتم إرجاع: غير موجود في البيانات."
)

def retrieve_top_chunks(query: str, embedder, index, texts, top_k: int = 4):
    q_emb = embedder.encode([query])
    q_emb = np.array(q_emb, dtype=np.float32)
    _, I = index.search(q_emb, top_k)
    return [texts[idx] for idx in I[0]]

def ollama_generate(prompt: str, model: str = "llama3") -> str:
    r = subprocess.run(
        ["ollama", "run", model],
        input=prompt,
        text=True,
        capture_output=True,
        encoding="utf-8",
        errors="ignore"
    )
    return (r.stdout or "").strip()

def rag_answer(user_q: str, embedder, index, texts, model: str = "llama3", top_k: int = 4) -> str:
    chunks = retrieve_top_chunks(user_q, embedder, index, texts, top_k=top_k)
    context = "\n\n---\n\n".join(chunks)
    prompt = f"""
{DOMAIN_DISCLAIMER}

قواعد مهمة:
- استخدم (السياق) فقط.
- إذا لم تجد الإجابة في السياق، اكتب بالضبط: غير موجود في البيانات
- لا تخمّن ولا تضف معلومات خارج السياق.

السياق:
{context}

سؤال المستخدم:
{user_q}

الإجابة:
"""
    out = ollama_generate(prompt, model=model)
    return out if out else "غير موجود في البيانات"

# ===================== STATEFUL ENGINE =====================
class Engine:
    # This class stores state and routes messages
    def __init__(self, recipes, embedder=None, index=None, texts=None, model_name="llama3"):
        self.recipes = recipes
        self.embedder = embedder
        self.index = index
        self.texts = texts
        self.model_name = model_name
        self.state = {"options": None}

    def safe_rag(self, msg: str) -> str:
        # If RAG is not available, return a safe message
        if self.embedder is None or self.index is None or self.texts is None:
            return "RAG غير جاهز. شغّلي خلية FAISS + texts أولاً."
        try:
            return rag_answer(msg, self.embedder, self.index, self.texts, model=self.model_name, top_k=4)
        except Exception as e:
            return f"صار خطأ في RAG: {type(e).__name__} — {e}"

    def chat(self, msg: str) -> str:
        # Main router logic with state
        msg = (msg or "").strip()
        if not msg:
            return "اكتبي مكوّن (لحم/سمك/تمر) أو كلمة عامة (تقليدي/تراث) أو اسم أكلة."

        # If user selects a number from options
        if msg.isdigit() and self.state.get("options"):
            idx = int(msg) - 1
            opts = self.state["options"]
            if 0 <= idx < len(opts):
                picked = opts[idx]
                self.state["options"] = None
                return format_recipe(picked)
            return "اختاري رقم صحيح من القائمة."

        # Short term: strict ingredients first, then general, then safe RAG
        if is_short_term(msg):
            strict_hits = find_recipes_by_ingredient_strict(msg, self.recipes)
            if strict_hits:
                self.state["options"] = strict_hits[:12]
                out = "أكلات تحتوي داخل المكونات فقط:\n\n"
                for i, r in enumerate(self.state["options"], 1):
                    out += f"{i}) {r['name']} — ({r['id']})\n"
                out += "\nاكتبي رقم الأكلة لعرض التفاصيل."
                return out

            general_hits = find_recipes_general(msg, self.recipes)
            if general_hits:
                self.state["options"] = general_hits[:12]
                out = "نتائج من البحث العام:\n\n"
                for i, r in enumerate(self.state["options"], 1):
                    out += f"{i}) {r['name']} — ({r['id']})\n"
                out += "\nاكتبي رقم الأكلة لعرض التفاصيل."
                return out

            return self.safe_rag(msg)

        # Longer input: try name search, else safe RAG
        name_hits = find_recipes_by_name(msg, self.recipes)
        if name_hits:
            if len(name_hits) == 1:
                self.state["options"] = None
                return format_recipe(name_hits[0])

            self.state["options"] = name_hits[:12]
            out = "لقيت أكثر من أكلة مطابقة للاسم، اختاري رقم:\n\n"
            for i, r in enumerate(self.state["options"], 1):
                out += f"{i}) {r['name']} — ({r['id']})\n"
            out += "\nاكتبي رقم الأكلة لعرض التفاصيل."
            return out

        return self.safe_rag(msg)
'''

TARGET_FILE.write_text(code, encoding="utf-8")
print("Saved:", TARGET_FILE)

Saved: C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project\backend\app_engine.py


In [49]:
# This cell runs the Gradio UI and uses backend/app_engine.py as the chatbot engine.
# It loads the DOCX domain, parses recipes, sanitizes them, builds FAISS,
# then tests the engine and launches Gradio.

!pip -q install gradio python-docx sentence-transformers faiss-cpu

import os
import re
import numpy as np
import faiss
import gradio as gr
from pathlib import Path
from docx import Document
from sentence_transformers import SentenceTransformer

# Project paths
PROJECT_ROOT = Path(r"C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project")
DOCX_PATH = PROJECT_ROOT / "data" / "raw" / "dhofar flavor.docx"

# Import the merged engine module
import sys
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from backend.app_engine import Engine, sanitize_recipes

# Read DOCX as raw text
def read_docx_text(docx_path: str) -> str:
    doc = Document(docx_path)
    parts = []
    for p in doc.paragraphs:
        t = (p.text or "").strip()
        if t:
            parts.append(t)
    return "\n".join(parts)

# Parse recipes from raw text (same logic you used previously)
def parse_recipes_from_docx_text(raw_text: str):
    lines = [l.strip() for l in raw_text.splitlines() if l.strip()]
    id_pat = re.compile(r"^ID\s*:\s*(DF-[A-Za-z0-9\-]+)\s*$", re.IGNORECASE)

    recipes = []
    cur = None
    section = None
    last_text_line = None

    def new_recipe():
        return {
            "id": None,
            "name": None,
            "type": None,
            "region": None,
            "cook_method": None,
            "description": "",
            "ingredients": [],
            "prep": "",
            "keywords": []
        }

    def normalize_ar_local(text: str) -> str:
        if not text:
            return ""
        text = re.sub(r"[\u064B-\u065F\u0670]", "", text)
        text = text.replace("أ","ا").replace("إ","ا").replace("آ","ا")
        text = text.replace("ى","ي").replace("ة","ه")
        text = re.sub(r"\s+"," ", text).strip().lower()
        return text

    def flush():
        nonlocal cur
        if not cur:
            return
        cur["description"] = (cur["description"] or "").strip()
        cur["prep"] = (cur["prep"] or "").strip()
        cur["keywords"] = list(set(
            normalize_ar_local(w)
            for w in ((cur["name"] or "").split() + (cur["type"] or "").split())
            if w
        ))
        if cur.get("id"):
            recipes.append(cur)
        cur = None

    for line in lines:
        m = id_pat.match(line)
        if m:
            flush()
            cur = new_recipe()
            cur["id"] = m.group(1).strip()
            if last_text_line and ":" not in last_text_line:
                cur["name"] = last_text_line.strip()
            else:
                cur["name"] = None
            section = None
            continue

        if not line.startswith(("نوع الأكلة", "المنطقة", "طريقة الطهي", "الوصف", "المكونات", "طريقة التحضير")):
            if len(line) <= 60:
                last_text_line = line

        if not cur:
            continue

        if "نوع الأكلة" in line:
            cur["type"] = line.split(":", 1)[-1].strip()
            continue
        if "المنطقة" in line:
            cur["region"] = line.split(":", 1)[-1].strip()
            continue
        if "طريقة الطهي" in line:
            cur["cook_method"] = line.split(":", 1)[-1].strip()
            continue

        if line.startswith("الوصف"):
            section = "description"
            continue
        if line.startswith("المكونات"):
            section = "ingredients"
            continue
        if line.startswith("طريقة التحضير"):
            section = "prep"
            continue

        if section == "description":
            cur["description"] += line + " "
        elif section == "ingredients":
            if ":" not in line and not line.startswith(("ملاحظات", "تكفي")):
                cur["ingredients"].append(line.strip())
        elif section == "prep":
            cur["prep"] += line + " "

    flush()
    return recipes

# Load and build everything
if not DOCX_PATH.exists():
    raise FileNotFoundError(f"DOCX not found: {DOCX_PATH}")

raw_text = read_docx_text(str(DOCX_PATH))
recipes = parse_recipes_from_docx_text(raw_text)
recipes = sanitize_recipes(recipes)

# Build texts for FAISS
texts = []
for r in recipes:
    doc = (
        f"اسم: {r.get('name','')}\n"
        f"ID: {r.get('id','')}\n"
        f"نوع: {r.get('type','')}\n"
        f"المنطقة: {r.get('region','')}\n"
        f"طريقة الطهي: {r.get('cook_method','')}\n"
        f"الوصف: {r.get('description','')}\n"
        f"المكونات: {', '.join(r.get('ingredients', [])[:25])}\n"
        f"طريقة التحضير: {r.get('prep','')}"
    )
    texts.append(doc)

# Embeddings + FAISS
EMBED_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
embedder = SentenceTransformer(EMBED_MODEL)
emb = embedder.encode(texts, show_progress_bar=True)
emb = np.array(emb, dtype=np.float32)

index = faiss.IndexFlatL2(emb.shape[1])
index.add(emb)

# Create the engine
engine = Engine(recipes=recipes, embedder=embedder, index=index, texts=texts, model_name="llama3")

# Quick tests (Jupyter output)
print("Parsed recipes:", len(recipes))
print("Test 1:", engine.chat("سمك"))
print("----")

# Gradio UI
def chat_fn(message, history):
    return engine.chat(message)

demo = gr.ChatInterface(
    fn=chat_fn,
    title="Dhofar Flavor Chat",
    description=(
        "اكتبي مكوّن (لحم/سمك/تمر) لاقتراح أكلات.\n"
        "اكتبي اسم أكلة لعرض تفاصيلها.\n"
        "اكتبي سؤال طويل ليتم البحث باستخدام RAG."
    ),
)

demo.launch()

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Parsed recipes: 44
Test 1: أكلات تحتوي داخل المكونات فقط:

1) مرق السمك بالنارجيل — (DF-014)
2) عطراية (الحبار) — (DF-016)
3) الربيس — (DF-017)
4) الصيادية — (DF-018)
5) المضبي — (DF-023)
6) المالح — (DF-036)

اكتبي رقم الأكلة لعرض التفاصيل.
----
* Running on local URL:  http://127.0.0.1:7861
* To create a public link, set `share=True` in `launch()`.




In [50]:
# This cell creates required folders and saves:
# 1) data/processed/recipes.pkl from the "recipes" variable
# 2) vectorstore/faiss_index/index.faiss + index.pkl using LangChain FAISS if "vectordb" exists

from pathlib import Path
import pickle
import os

# Change only this if your project folder is different
PROJECT_ROOT = Path(r"C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project")

# Target paths
RECIPES_PKL = PROJECT_ROOT / "data" / "processed" / "recipes.pkl"
FAISS_DIR   = PROJECT_ROOT / "vectorstore" / "faiss_index"

# Create folders
RECIPES_PKL.parent.mkdir(parents=True, exist_ok=True)
FAISS_DIR.mkdir(parents=True, exist_ok=True)

# -----------------------------
# 1) Save recipes.pkl
# -----------------------------
if "recipes" not in globals() or not isinstance(recipes, list) or len(recipes) == 0:
    print("recipes.pkl NOT saved because variable 'recipes' is missing or empty.")
    print("Run your parsing cell first so 'recipes' becomes a list of recipe dicts.")
else:
    with open(RECIPES_PKL, "wb") as f:
        pickle.dump(recipes, f)
    print("Saved recipes.pkl at:", str(RECIPES_PKL))
    print("Recipes count:", len(recipes))

# -----------------------------
# 2) Save FAISS index for the WEB engine (LangChain format)
# This must create: index.faiss + index.pkl
# -----------------------------
if "vectordb" in globals() and vectordb is not None:
    try:
        # LangChain FAISS object has save_local()
        vectordb.save_local(str(FAISS_DIR))
        print("Saved LangChain FAISS at:", str(FAISS_DIR))
        print("Expected files:",
              (FAISS_DIR / "index.faiss").exists(),
              (FAISS_DIR / "index.pkl").exists())
    except Exception as e:
        print("Failed to save LangChain FAISS:", type(e).__name__, "-", str(e)[:300])
        print("Make sure 'vectordb' is a LangChain FAISS object (not faiss.Index).")
else:
    print("FAISS NOT saved because variable 'vectordb' not found.")
    print("If your web engine uses LangChain FAISS.load_local(), you must build vectordb then run this cell.")
    print("Tip: build vectordb using LangChain FAISS.from_texts(...) or FAISS.from_documents(...).")

Saved recipes.pkl at: C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project\data\processed\recipes.pkl
Recipes count: 44
FAISS NOT saved because variable 'vectordb' not found.
If your web engine uses LangChain FAISS.load_local(), you must build vectordb then run this cell.
Tip: build vectordb using LangChain FAISS.from_texts(...) or FAISS.from_documents(...).


In [53]:
# This cell builds a LangChain FAISS index from data/processed/recipes.pkl
# and saves it to: vectorstore/faiss_index/index.faiss + index.pkl

import os
import re
import pickle
from pathlib import Path
from dotenv import load_dotenv

from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_ollama import OllamaEmbeddings

load_dotenv()

# Project root (change only if your folder is different)
PROJECT_ROOT = Path(r"C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project")

# Paths (same structure used by rag_engine.py)
FAISS_PATH_ENV = os.getenv("FAISS_PATH", str(PROJECT_ROOT / "vectorstore" / "faiss_index"))
FAISS_PATH = FAISS_PATH_ENV if os.path.isabs(FAISS_PATH_ENV) else str(PROJECT_ROOT / FAISS_PATH_ENV)

EMBED_OLLAMA_MODEL = os.getenv("EMBED_OLLAMA_MODEL", "nomic-embed-text")

RECIPES_PKL = PROJECT_ROOT / "data" / "processed" / "recipes.pkl"
if not RECIPES_PKL.exists():
    raise FileNotFoundError(f"recipes.pkl not found at: {RECIPES_PKL}")

with open(RECIPES_PKL, "rb") as f:
    recipes = pickle.load(f)

# Remove anything after "الكلمات المفتاحية:" if it leaked into description/prep
def strip_keywords_anywhere(text: str) -> str:
    if not text:
        return ""
    t = str(text)
    t = re.split(r"(?:الكلمات\s+المفتاحية|كلمات\s+مفتاحية)\s*:\s*", t, maxsplit=1)[0]
    return t.strip()

def clean_recipe_for_rag(r: dict) -> dict:
    rr = dict(r)
    rr["description"] = strip_keywords_anywhere(rr.get("description", ""))
    rr["prep"] = strip_keywords_anywhere(rr.get("prep", ""))
    return rr

# Arabic normalization (for aliases only)
def normalize_ar(text: str) -> str:
    if not text:
        return ""
    text = re.sub(r"[\u064B-\u065F\u0670]", "", text)
    text = text.replace("أ", "ا").replace("إ", "ا").replace("آ", "ا")
    text = text.replace("ى", "ي").replace("ة", "ه")
    text = re.sub(r"\s+", " ", text).strip().lower()
    return text

# Aliases improve retrieval quality for known spelling variants (example: القماحة)
def build_name_aliases(dish_title: str):
    title = (dish_title or "").strip()
    if not title:
        return []
    title_n = normalize_ar(title)

    if any(x in title_n for x in ["القماحه", "قماحه", "القماحة", "قماحة", "قمحه"]):
        return ["القماحة", "قماحة", "قماحه", "قمحه"]

    return []

# Build Documents for FAISS
docs = []
for r in recipes:
    r = clean_recipe_for_rag(r)

    dish_id = (r.get("id") or "").strip()
    dish_title = (r.get("name") or "").strip()
    region = (r.get("region") or "").strip()

    text_parts = []

    if dish_title:
        text_parts.append(f"اسم الطبق: {dish_title}")

        aliases = build_name_aliases(dish_title)
        if aliases:
            text_parts.append("اسماء بديلة:\n" + "\n".join([f"- {a}" for a in aliases]))

    if dish_id:
        text_parts.append(f"ID: {dish_id}")

    if region:
        text_parts.append(f"المنطقة: {region}")

    if r.get("description"):
        text_parts.append("الوصف:\n" + str(r["description"]).strip())

    ings = r.get("ingredients") or []
    if ings:
        text_parts.append("المكونات:\n" + "\n".join([f"- {x}" for x in ings]))

    if r.get("prep"):
        text_parts.append("طريقة التحضير:\n" + str(r["prep"]).strip())

    content = "\n\n".join(text_parts).strip()

    docs.append(
        Document(
            page_content=content,
            metadata={
                "source": "recipes.pkl",
                "dish_id": dish_id,
                "dish_title": dish_title,
            },
        )
    )

# Build + save LangChain FAISS
embeddings = OllamaEmbeddings(model=EMBED_OLLAMA_MODEL)

os.makedirs(FAISS_PATH, exist_ok=True)

# Remove old index files if they exist
for fn in ["index.faiss", "index.pkl"]:
    fp = os.path.join(FAISS_PATH, fn)
    if os.path.exists(fp):
        os.remove(fp)

db = FAISS.from_documents(docs, embeddings)
db.save_local(FAISS_PATH)

print("FAISS index built successfully.")
print("FAISS_PATH:", FAISS_PATH)
print("EMBED_OLLAMA_MODEL:", EMBED_OLLAMA_MODEL)
print("Documents:", len(docs))
print("index.faiss exists:", os.path.exists(os.path.join(FAISS_PATH, "index.faiss")))
print("index.pkl exists:", os.path.exists(os.path.join(FAISS_PATH, "index.pkl")))

FAISS index built successfully.
FAISS_PATH: C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project\vectorstore\faiss_index
EMBED_OLLAMA_MODEL: nomic-embed-text
Documents: 44
index.faiss exists: True
index.pkl exists: True


In [52]:
!pip install -U langchain-ollama

Collecting langchain-ollama
  Using cached langchain_ollama-1.0.1-py3-none-any.whl.metadata (2.5 kB)
Collecting ollama<1.0.0,>=0.6.0 (from langchain-ollama)
  Using cached ollama-0.6.1-py3-none-any.whl.metadata (4.3 kB)
Using cached langchain_ollama-1.0.1-py3-none-any.whl (29 kB)
Using cached ollama-0.6.1-py3-none-any.whl (14 kB)
Installing collected packages: ollama, langchain-ollama

   ---------------------------------------- 2/2 [langchain-ollama]

Successfully installed langchain-ollama-1.0.1 ollama-0.6.1


In [2]:
import pickle
from pathlib import Path

PKL_PATH = Path(r"C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project\data\processed\recipes.pkl")

with open(PKL_PATH, "rb") as f:
    recipes = pickle.load(f)

print("✅ Loaded:", PKL_PATH)
print("🔢 Count:", len(recipes))
print("🧾 Keys sample:", list(recipes[0].keys()))

# عرض أول 10 أسماء + ID
print("\n--- First 10 (name / id) ---")
for i, r in enumerate(recipes[:10], 1):
    print(f"{i:02d}) {r.get('name','')} | {r.get('id','')}")

def search_name(q):
    q = (q or "").strip()
    hits = [r for r in recipes if q in str(r.get("name",""))]
    print(f"\n🔎 '{q}' -> {len(hits)} result(s)")
    for i, r in enumerate(hits[:30], 1):
        print(f"{i:02d}) {r.get('name','')} | {r.get('id','')}")
    return hits

hits1 = search_name("قبولي")
hits2 = search_name("القبولي")

✅ Loaded: C:\Users\USER PC\OneDrive\Desktop\dhofar_flavor_chat project\data\processed\recipes.pkl
🔢 Count: 44
🧾 Keys sample: ['id', 'name', 'type', 'region', 'cook_method', 'description', 'ingredients', 'prep']

--- First 10 (name / id) ---
01) شوربة الجريش | DF-001
02) سوب اللحم والطماطم | DF-002
03) خبز الثخين | DF-003
04) الفندال المقصّص | DF-004
05) العطراية | DF-005
06) القراص | DF-006
07) خبز لَحوح | DF-007
08) كمباه مقشّد | DF-008
09) الدَّجَر مع الماش | DF-009
10) عيش بالنارجيل | DF-010

🔎 'قبولي' -> 1 result(s)
01) قبولي | DF-040

🔎 'القبولي' -> 0 result(s)
