In [1]:
import pandas as pd
import re
import unicodedata
from sentence_transformers import SentenceTransformer, util

# -----------------------------
# 1. Text Cleaning
# -----------------------------
def normalize_text(s):
    if not isinstance(s, str):
        return ""
    s = s.lower()
    s = unicodedata.normalize("NFKD", s)
    s = re.sub(r"[^a-zA-Z\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def simplify_ingredients(s):
    if not isinstance(s, str):
        return ""
    s = s.lower()

    # Kill Latin names and parenthesis content
    s = re.sub(r"\([^)]*\)", " ", s)
    s = re.sub(r"\b[a-z]{3,10}\s[a-z]{3,10}\b", "", s)
    s = re.sub(r"\b[a-z]{5,}\s*\(.*?\)", "", s)

    # Keyword simplifications
    replacements = {
        "bengal gram": "chickpeas",
        "black gram": "urad dal",
        "green gram": "moong dal",
        "kidney bean": "rajma",
        "curd": "yogurt",
        "garam masala powder": "garam masala",
        "red chilli powder": "chili powder",
        "turmeric powder": "turmeric",
        "mustard seeds": "rai",
        "coriander powder": "dhania powder",
        "cumin powder": "jeera powder",
        "cottage cheese": "paneer",
    }
    for k, v in replacements.items():
        s = s.replace(k, v)

    # Strip weird extra tokens
    s = re.sub(r"[^a-z\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s


# -----------------------------
# 2. Load Dataset
# -----------------------------
df = pd.read_excel("data/recipes.xlsx")

def find_col(possible_names):
    for name in possible_names:
        if name in df.columns:
            return name
    return None

name_col = find_col(["_recipe_name_orig", "recipe_name", "Recipe_Name"])
ing_col = find_col(["ingredient_name_orig", "ingredient_name", "Ingredient_Name"])
food_col = find_col(["food_name_org", "food_name", "Food_Name"])

if not name_col:
    raise KeyError("No recipe name column found in your file.")

def combine_text(row):
    ing = str(row[ing_col]) if ing_col and ing_col in row else ""
    food = str(row[food_col]) if food_col and food_col in row else ""
    return (ing + " " + food).strip()

df["combined_text"] = df.apply(combine_text, axis=1)

grouped = (
    df.groupby(name_col)["combined_text"]
    .apply(lambda x: " ".join(x))
    .reset_index(name="full_text")
)

grouped["clean_text"] = grouped["full_text"].apply(simplify_ingredients)
grouped["recipe_clean"] = grouped[name_col].apply(normalize_text)

print("✅ Number of unique recipes:", len(grouped))

# -----------------------------
# 3. Embeddings
# -----------------------------
print("🔧 Loading model (this might take 20–30 seconds)...")
model = SentenceTransformer("all-mpnet-base-v2")
recipe_embeddings = model.encode(grouped["clean_text"].tolist(), convert_to_tensor=True)

# -----------------------------
# 4. Search Function
# -----------------------------
def find_similar_recipes(query, top_k=5, threshold=0.35):
    query_clean = normalize_text(query)
    query_emb = model.encode(query_clean, convert_to_tensor=True)
    scores = util.cos_sim(query_emb, recipe_embeddings)[0]
    top_results = scores.argsort(descending=True)

    print(f"\n🔍 Query -> '{query}'\n")
    found = 0
    for idx in top_results[:top_k * 3]:  # allow a wider net, then filter
        idx = int(idx)
        score = float(scores[idx])
        if score < threshold:
            continue
        found += 1
        name = grouped.iloc[idx]["recipe_clean"]
        ingredients = grouped.iloc[idx]["clean_text"][:180] + "..."
        print(f"🍲 Recipe: {name}  (Score: {score:.3f})")
        print(f"   Ingredients: {ingredients}\n")
        if found >= top_k:
            break

    if found == 0:
        print("❌ No strong matches found.\n")

# -----------------------------
# 5. Example Run
# -----------------------------
find_similar_recipes("Rajma Masala")


  from .autonotebook import tqdm as notebook_tqdm


✅ Number of unique recipes: 1015
🔧 Loading model (this might take 20–30 seconds)...

🔍 Query -> 'Rajma Masala'

🍲 Recipe: eggplant brinjal rice vangi bhat  (Score: 0.536)
   Ingredients: rice parboiled milled dal dal brinjal tamarind pulp asafoetida pepper black oil water distilled...

🍲 Recipe: plain khitchdi plain khichri khichdi  (Score: 0.533)
   Ingredients: rice parboiled milled dal ginger fresh butter...

🍲 Recipe: garlic chutney poondu chutney lahasun ki chutney  (Score: 0.532)
   Ingredients: dal tamarind pulp garlic asafoetida chillies red shallots...

🍲 Recipe: mixed pulse and vegetable salad  (Score: 0.523)
   Ingredients: rajmah red lettuce cucumber green elongate potato big chillies green onion big pepper black beans kabuli whole dried raw...

🍲 Recipe: kidney bean sandwich with cottage cheese  (Score: 0.520)
   Ingredients: rajmah red chillies green garlic white water unsalted...

