In [6]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# ---------------------------
# Load and clean the dataset
# ---------------------------
df = pd.read_excel('data/recipes.xlsx')
df = df[['recipe_name', 'food_name_org']].copy()
df = df.dropna(subset=['recipe_name'])
df['food_name_org'] = df['food_name_org'].fillna('')

def clean_text(text):
    text = str(text).lower().strip()
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text

df['recipe_name'] = df['recipe_name'].apply(clean_text)
df['food_name_org'] = df['food_name_org'].apply(clean_text)

# ---------------------------
# Combine all ingredient names per recipe
# ---------------------------
recipes_grouped = (
    df.groupby('recipe_name')['food_name_org']
    .apply(lambda x: ' '.join(x))
    .reset_index()
)

recipes_grouped = recipes_grouped.drop_duplicates(subset=['recipe_name']).reset_index(drop=True)

# ---------------------------
# TF-IDF representation
# ---------------------------
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(recipes_grouped['food_name_org'])

# Compute cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# ---------------------------
# Function to get similar recipes
# ---------------------------
def get_similar_recipes(recipe_name, cosine_sim=cosine_sim, recipes=recipes_grouped):
    recipe_name = clean_text(recipe_name)
    if recipe_name not in recipes['recipe_name'].values:
        return []

    idx = recipes[recipes['recipe_name'] == recipe_name].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:6]
    return [recipes.iloc[i[0]]['recipe_name'] for i in sim_scores]

# ---------------------------
# Test the model
# ---------------------------
results1 = get_similar_recipes('bhel puri')
print("Top similar recipes to 'bhel puri':")
print(results1)

#
results2 = get_similar_recipes('Bhel Poori')
print("Top similar recipes to 'grilled chicken':")
print(results2)


Top similar recipes to 'bhel puri':
['khakhra chaat', 'split bengal gram dal channa dal', 'oniongreen chilli paranthaparatha pyaaz aur hari mirch ka paranthaparatha', 'vegetarian nargisi kofta curry', 'potato samosa aloo ka samosa']
Top similar recipes to 'grilled chicken':
[]


In [15]:
def get_recipes_by_ingredients(ingredients, tfidf=tfidf, tfidf_matrix=tfidf_matrix, recipes=recipes_grouped):
    ingredients = clean_text(ingredients)
    ingredients_vec = tfidf.transform([ingredients])
    sim_scores = cosine_similarity(ingredients_vec, tfidf_matrix).flatten()
    top_indices = sim_scores.argsort()[::-1][:5]
    return recipes.iloc[top_indices]['recipe_name'].tolist()
get_recipes_by_ingredients("potato tomato onion")



['shepherds pie with minced meat',
 'spinach and potato palak aloo',
 'vegetable soup',
 'stuffed tomatoes bharwa tamatar',
 'potato with curd']

In [None]:


import os
from pathlib import Path
import pickle
import re
import unicodedata
import numpy as np
import pandas as pd


from sentence_transformers import SentenceTransformer, util
from rapidfuzz import fuzz


import torch


DATA_PATH = Path("data/recipes.xlsx")
MODELS_DIR = Path("models")
MODELS_DIR.mkdir(parents=True, exist_ok=True)


MODEL_NAME = "all-MiniLM-L6-v2"  
BATCH_SIZE = 64                   
TOP_PREFILTER = 200               
MIN_SCORE = 0.30                  
SEM_FUZZY_WEIGHT = (0.7, 0.3)     
# ---------- helpers ----------
def normalize_text(s, keep_digits=False):

    if not isinstance(s, str):
        return ""
    s = s.lower().strip()
    s = unicodedata.normalize("NFKD", s)
    if keep_digits:
        s = re.sub(r"[^0-9a-z\s]", " ", s)
    else:
        s = re.sub(r"[^a-z\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def find_col(df, options):
    for name in options:
        if name in df.columns:
            return name
    return None


if not DATA_PATH.exists():
    raise FileNotFoundError(f"Missing {DATA_PATH}. Put your recipes.xlsx in data/ or change DATA_PATH.")

df = pd.read_excel(DATA_PATH)

name_col = find_col(df, ["recipe_name_orig", "recipe_name", "_recipe_name_orig", "RecipeName", "translatedRecipeName", "recipe_name_org"])
ing_col  = find_col(df, ["ingredient_name_org", "ingredient_name", "ingredient", "Ingredient_Name", "food_name_org", "food_name"])
food_col = find_col(df, ["food_name_org", "food_name", "food", "Food_Name", "translatedIngredients"])

if not name_col:
    raise KeyError("No recipe name column found. Check your Excel headers (tried several options).")

def combine_row_text(row):
    parts = []
    if ing_col and ing_col in row and pd.notna(row[ing_col]):
        parts.append(str(row[ing_col]))
    if food_col and food_col in row and pd.notna(row[food_col]):
        parts.append(str(row[food_col]))
    return " ".join(parts).strip()

df = df.dropna(subset=[name_col]).reset_index(drop=True)
df["combined_text"] = df.apply(combine_row_text, axis=1)
print(f"Loaded {len(df)} rows. Using recipe-name column: '{name_col}'. ingredient/food columns: '{ing_col}', '{food_col}'")

grouped = (
    df.groupby(name_col)["combined_text"]
      .apply(lambda texts: " ".join([t for t in texts.astype(str) if t.strip() != ""]))
      .reset_index(name="full_text")
)

grouped["recipe_clean"] = grouped[name_col].apply(lambda x: normalize_text(x if pd.notna(x) else ""))
grouped["clean_text"] = grouped["full_text"].apply(lambda x: normalize_text(x if pd.notna(x) else "", keep_digits=True))

print(f"Number of unique recipes after grouping: {len(grouped)}")
if grouped[name_col].duplicated().any():
    print("Warning: duplicate recipe names found after grouping. Consider using a unique id to distinguish.")

# ---------- load model & compute embeddings ----------
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}. Model: {MODEL_NAME} (batch_size={BATCH_SIZE})")
model = SentenceTransformer(MODEL_NAME, device=device)


texts_to_encode = (grouped["recipe_clean"] + " " + grouped["clean_text"]).tolist()


recipe_embeddings = model.encode(
    texts_to_encode,
    batch_size=BATCH_SIZE,
    convert_to_tensor=True,
    show_progress_bar=True,
    device=device
)

if recipe_embeddings.dtype != torch.float32 and hasattr(recipe_embeddings, "to"):
    recipe_embeddings = recipe_embeddings.to(torch.float32)

np.save(MODELS_DIR / "recipe_embeddings.npy", recipe_embeddings.cpu().numpy())
grouped.to_pickle(MODELS_DIR / "recipes_mapping.pkl")

print("Saved embeddings and mapping to models/")


In [None]:
# Improved, fixed, and commented semantic + fuzzy recipe search
# Save this as search_recipes.py and run. Adjust DATA_PATH and model_name as needed.

import os
from pathlib import Path
import re
import unicodedata
import numpy as np
import pandas as pd

# embeddings & fuzzy
from sentence_transformers import SentenceTransformer, util
from rapidfuzz import fuzz

# optional torch only used for device detection and saving/loading tensors
import torch

# ---------- config ----------
DATA_PATH = Path("data/recipes.xlsx")
MODELS_DIR = Path("models")
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Choose model: strong but slow -> "all-mpnet-base-v2"; fast & decent -> "all-MiniLM-L6-v2"
MODEL_NAME = "all-MiniLM-L6-v2"
BATCH_SIZE = 64
TOP_PREFILTER = 200
MIN_SCORE = 0.30
SEM_FUZZY_WEIGHT = (0.7, 0.3)

# ---------- helpers ----------
def normalize_text(s, keep_digits=False):
    """Lowercase, normalize unicode, remove punctuation."""
    if not isinstance(s, str):
        return ""
    s = s.lower().strip()
    s = unicodedata.normalize("NFKD", s)
    if keep_digits:
        s = re.sub(r"[^0-9a-z\s]", " ", s)
    else:
        s = re.sub(r"[^a-z\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def find_col(df, options):
    for name in options:
        if name in df.columns:
            return name
    return None

# ---------- load data ----------
if not DATA_PATH.exists():
    raise FileNotFoundError(f"Missing {DATA_PATH}")

df = pd.read_excel(DATA_PATH)

name_col = find_col(df, [
    "recipe_name_orig", "recipe_name", "_recipe_name_orig",
    "RecipeName", "translatedRecipeName", "recipe_name_org"
])
ing_col = find_col(df, [
    "ingredient_name_org", "ingredient_name", "ingredient",
    "Ingredient_Name", "food_name_org", "food_name"
])
food_col = find_col(df, [
    "food_name_org", "food_name", "food",
    "Food_Name", "translatedIngredients"
])

if not name_col:
    raise KeyError("No recipe name column found.")

def combine_row_text(row):
    parts = []
    if ing_col and pd.notna(row.get(ing_col)):
        parts.append(str(row[ing_col]))
    if food_col and pd.notna(row.get(food_col)):
        parts.append(str(row[food_col]))
    return " ".join(parts).strip()

df = df.dropna(subset=[name_col]).reset_index(drop=True)
df["combined_text"] = df.apply(combine_row_text, axis=1)

print(
    f"Loaded {len(df)} rows. "
    f"Recipe column: '{name_col}', ingredient columns: '{ing_col}', '{food_col}'"
)

# ---------- group into one row per recipe ----------
grouped = (
    df.groupby(name_col)["combined_text"]
      .apply(lambda texts: " ".join(t for t in texts.astype(str) if t.strip()))
      .reset_index(name="full_text")
)

grouped["recipe_clean"] = grouped[name_col].apply(normalize_text)
grouped["clean_text"] = grouped["full_text"].apply(
    lambda x: normalize_text(x, keep_digits=True)
)

print(f"Number of unique recipes: {len(grouped)}")

# ---------- load model ----------
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
model = SentenceTransformer(MODEL_NAME, device=device)

# ---------- load or compute embeddings ----------
emb_path = MODELS_DIR / "recipe_embeddings.npy"
map_path = MODELS_DIR / "recipes_mapping.pkl"

texts_to_encode = (grouped["recipe_clean"] + " " + grouped["clean_text"]).tolist()

if emb_path.exists() and map_path.exists():
    recipe_embeddings = torch.from_numpy(np.load(emb_path)).to(device)
    grouped = pd.read_pickle(map_path)
    print("Loaded embeddings and mapping from models/")
else:
    recipe_embeddings = model.encode(
        texts_to_encode,
        batch_size=BATCH_SIZE,
        convert_to_tensor=True,
        show_progress_bar=True,
        device=device
    )

    if recipe_embeddings.dtype != torch.float32:
        recipe_embeddings = recipe_embeddings.to(torch.float32)

    np.save(emb_path, recipe_embeddings.cpu().numpy())
    grouped.to_pickle(map_path)
    print("Computed and saved embeddings to models/")

# ---------- semantic + fuzzy search ----------
def search_recipe_semantic(
    query,
    top_k=5,
    min_score=MIN_SCORE,
    boost_keywords=None,
    top_prefilter=TOP_PREFILTER,
    sem_fuzzy_weight=SEM_FUZZY_WEIGHT
):
    q = normalize_text(query, keep_digits=True)

    q_emb = model.encode(q, convert_to_tensor=True, device=device)
    scores = util.cos_sim(q_emb, recipe_embeddings)[0].cpu().numpy()

    # FIX 1: normalize semantic scores before blending
    sem = scores
    sem = (sem - sem.min()) / (sem.max() - sem.min() + 1e-8)

    N = len(sem)
    top_idx = np.argsort(-sem)[:min(top_prefilter, N)]

    combined_scores = sem.copy()
    sem_w, fuzzy_w = sem_fuzzy_weight

    for i in top_idx:
        # FIX 2: fuzzy match against name + ingredients
        fuzz_text = (
            str(grouped.loc[i, name_col]) + " " +
            str(grouped.loc[i, "clean_text"])
        )
        fuzz_score = fuzz.partial_ratio(query, fuzz_text) / 100.0
        combined_scores[i] = sem_w * sem[i] + fuzzy_w * fuzz_score

        # FIX 3: keyword boost at scoring level
        if boost_keywords:
            for kw in boost_keywords:
                if kw in grouped.loc[i, "clean_text"]:
                    combined_scores[i] += 0.05

    sorted_idx = np.argsort(-combined_scores)

    out = []
    for i in sorted_idx:
        if len(out) >= top_k:
            break
        if combined_scores[i] < min_score:
            continue
        out.append({
            "recipe_name": grouped.loc[i, name_col],
            "recipe_clean": grouped.loc[i, "recipe_clean"],
            "score": round(float(combined_scores[i]), 4),
            "ingredients_preview": (
                grouped.loc[i, "clean_text"][:250] +
                ("..." if len(grouped.loc[i, "clean_text"]) > 250 else "")
            )
        })

    # FIX 4: explicit fallback instead of silent failure
    if not out:
        fallback_idx = np.argsort(-sem)[:top_k]
        for i in fallback_idx:
            out.append({
                "recipe_name": grouped.loc[i, name_col],
                "recipe_clean": grouped.loc[i, "recipe_clean"],
                "score": round(float(sem[i]), 4),
                "fallback": True,
                "ingredients_preview": (
                    grouped.loc[i, "clean_text"][:250] +
                    ("..." if len(grouped.loc[i, "clean_text"]) > 250 else "")
                )
            })

    return out




  from .autonotebook import tqdm as notebook_tqdm


Loaded 10271 rows. Recipe column: 'recipe_name', ingredient columns: 'ingredient_name_org', 'food_name_org'
Number of unique recipes: 1015
Using device: cpu
Loaded embeddings and mapping from models/

Search results:
Cashewnut burfi (Kaju burfi/Kaju katli) (score=0.85)
  Ingredients preview: kewra essence kewra essence kaju cashew nut anacardium occidentale sugar sugar white water water distilled fat ghee butter

Khasta Kachori (score=0.7885)
  Ingredients preview: fat oil sunflower

Khoa ladoo (score=0.7827)
  Ingredients preview: kewra essence kewra essence almonds almond prunus amygdalus desiccated coconut coconut kernal dry cocos nucifera pistachionut pistachio nuts pistacla vera khoa khoa castor sugar sugar icing

Bottle gourd burfi (Ghiya/Lauki burfi) (score=0.7557)
  Ingredients preview: color green colour kewra essence kewra essence ghia bottle gourd elongate pale green lagenaria vulgaris khoya khoa sugar sugar white water water distilled fat ghee butter fat ghee butter

Daneda

In [4]:
if __name__ == "__main__":
    results = search_recipe_semantic(
        "chapati",
        top_k=7,
        min_score=0.25,
        boost_keywords=["curry", "rice", "indian"]
    )

    print("\nSearch results:")
    for r in results:
        print(f"{r['recipe_name']} (score={r['score']})")
        print("  Ingredients preview:", r["ingredients_preview"])
        if r.get("fallback"):
            print("  [fallback result]")
        print()


Search results:
Chapati/Roti (score=0.9571)
  Ingredients preview: whole wheat flour wheat flour atta triticum aestivum water water distilled ghee or butter ghee butter

Masala vada (score=0.613)
  Ingredients preview: channa dal bengal gram dal cicer arietinum arhar dal red gram dal cajanus cajan green chilli chillies green all varieties capsicum annum curry leaves curry leaves murraya koenigii ginger ginger fresh zingiber officinale onion onion big allium cepa fa...

Paneer soup (score=0.6128)
  Ingredients preview: cabbage cabbage green brassica oleracea var capitata f alba curry leaves curry leaves murraya koenigii garlic garlic small clove allium sativum black pepper pepper black piper nigrum mustard seed mustard seeds brassica nigra paneer paneer ginger ging...

Methi malai paneer (score=0.6127)
  Ingredients preview: kasoori methi fenugreek leaves trigonella foenum graecum peas shelled peas fresh pisum sativum onion onion big allium cepa black cardamom cardamom black elettaria 