# Hybrid Recommender System (Content + SVD + Basket)

This notebook:

1. Loads the recipe dataset (`../data/full_dataset.csv`); user interactions are generated synthetically  
2. Preprocesses data  
3. Builds 3 recommender models:
   - Content-based (TF-IDF + cosine similarity)
   - Collaborative (SVD)
   - Basket Model (ingredient → recipe inverted index)
4. Saves all trained models as pickle files  
5. These files will be used by recommender_api.py

In [2]:
# hybrid_train.py
# Hybrid Recommender System (Content + SVD + Basket) - TRAINING SCRIPT

import ast
import json
import pickle
from difflib import get_close_matches
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split

# =============================================================
# Paths
# =============================================================

RAW_DIR = Path("../data")
CSV_FILE = RAW_DIR / "full_dataset.csv"
PROCESSED_DIR = RAW_DIR / "processed"
PROCESSED_DIR.mkdir(exist_ok=True)

# =============================================================
# Utility
# =============================================================

def clean_list_string(s):
  if isinstance(s, str) and s.strip().startswith("["):
    try:
      return " ".join(eval(s))
    except Exception:
      return s
  return str(s)

# =============================================================
# Recipe Preprocessing
# =============================================================

def preprocess_recipes(csv_file):
  recipes = pd.read_csv(csv_file)
  for col in ["ingredients", "directions", "NER"]:
    recipes[col] = recipes[col].apply(clean_list_string)
  recipes["text"] = (recipes["ingredients"] + " " + recipes["NER"]).str.strip()
  recipes = recipes[["title", "ingredients", "text"]].reset_index()
  recipes.rename(columns={"index": "id", "title": "name"}, inplace=True)
  recipes["name"] = recipes["name"].astype(str).fillna("unknown")
  return recipes

# =============================================================
# TF-IDF
# =============================================================

def compute_tfidf(recipes, max_features=50000):
  tfidf = TfidfVectorizer(
    stop_words="english",
    max_features=max_features,
    dtype=np.float32,
  )
  tfidf_matrix = tfidf.fit_transform(recipes["text"])
  tfidf_norm = normalize(tfidf_matrix)
  return tfidf, csr_matrix(tfidf_norm)

# =============================================================
# Basket Similarity Index
# =============================================================

def build_ingredient_index(recipes):
  ingredient_index = {}
  for i, ingredients in enumerate(recipes["ingredients"].str.split()):
    for ing in set(ingredients):
      ingredient_index.setdefault(ing, set()).add(i)
  return ingredient_index

# =============================================================
# Synthetic Interactions + SVD
# =============================================================

def generate_realistic_interactions(recipes, n_users=800, avg_items=40):
  rng = np.random.default_rng(42)
  recipe_ids = recipes["id"].values
  popularity = rng.lognormal(0, 1, size=len(recipe_ids))
  popularity /= popularity.sum()
  rows = []
  for user_id in range(1, n_users + 1):
    n_items = max(10, int(rng.normal(avg_items, avg_items * 0.3)))
    chosen = rng.choice(recipe_ids, size=n_items, replace=False, p=popularity)
    user_bias = rng.normal(0, 0.4)
    for rid in chosen:
      rating = 4.0 + user_bias + rng.normal(0, 0.4)
      rating = np.clip(rating, 1, 5)
      rows.append(
        {
          "user_id": str(user_id),
          "recipe_id": int(rid),
          "rating": float(rating),
        }
      )
  return pd.DataFrame(rows)

def train_optimized_svd(ratings_df):
  reader = Reader(rating_scale=(1, 5))
  data = Dataset.load_from_df(
    ratings_df[["user_id", "recipe_id", "rating"]],
    reader,
  )
  trainset, testset = train_test_split(data, test_size=0.1, random_state=42)
  best_rmse = float("inf")
  best_model = None
  patience, wait = 2, 0

  for epochs in range(15, 40, 5):
    model = SVD(
      n_factors=80,
      n_epochs=epochs,
      lr_all=0.007,
      reg_all=0.04,
      biased=True,
      random_state=42,
    )
    model.fit(trainset)
    rmse = accuracy.rmse(model.test(testset), verbose=False)
    print(f"Epochs={epochs} RMSE={rmse:.4f}")
    if rmse < best_rmse:
      best_rmse = rmse
      best_model = model
      wait = 0
    else:
      wait += 1
      if wait >= patience:
        break
  print(f"✔ Best RMSE: {best_rmse:.4f}")
  return best_model

# =============================================================
# MAIN
# =============================================================

if __name__ == "__main__":
  print("✔ Preprocessing recipes (full)...")
  recipes_full = preprocess_recipes(CSV_FILE)
  recipes_full.to_pickle(PROCESSED_DIR / "recipes_full.pkl")
  print(f"✔ Full recipes shape: {recipes_full.shape}")

  # ---- Serving subset (for fast online recommendations) ----
  SERVING_SIZE = 100_000
  recipes = recipes_full.head(SERVING_SIZE).reset_index(drop=True)
  print(f"✔ Serving subset shape: {recipes.shape}")

  # =========================================================
  # Build MY_MENU from restaurant menu names via fuzzy matching
  # =========================================================

  MENU_NAMES = [
    # Snacks
    "Sweet Corn",
    "Masala Corn",
    "Spring Roll",
    "Honey Chilli Potato/Gobhi",
    "Chilli Potato",
    "Manchurian Dry",
    "Manchurian Gravy",
    "Cheese Garlic Bread",
    "Stuffed Garlic Bread",
    "Mushroom Chilli Fry",
    "Mushroom Chilli Gravy",
    "Cheese Chilli Fry",
    "Cheese Chilli Gravy",
    "Champ Chilli Fry",
    "Champ Chilli Gravy",
    "Paneer 65",
    "Cheese Finger",

    # Breakfast
    "Aloo Parantha",
    "Gobhi Parantha",
    "Mooli Parantha (Seasonal)",
    "Paneer Parantha",
    "Veg Poha",
    "Upma",
    "Chana Bhatura (2 Pcs)",

    # Maggi
    "Plain Maggi",
    "Veggie Masala Maggi",
    "Makhni Maggi",
    "Paneer Maggi",

    # South Indian
    "Paper Dosa",
    "Masala Dosa",
    "Onion Dosa",
    "Onion Masala Dosa",
    "Ghee Masala Dosa",
    "Paneer Dosa",
    "Rava Dosa",
    "Mix Veg Uttapam",
    "Paneer Uttapam",
    "Vada Sambhar (2 Pcs)",
    "Idli Sambhar (4 Pcs)",
    "Sambhar Rice",
    "Punugullu (12 Pcs)",

    # Momos
    "Veg Steam Momos",
    "Fried Momos",
    "Paneer Steam Momos",
    "Paneer Fried Momos",
    "KFC Momos",
    "Tandoori Momos",
    "Afghani Momos",

    # Rice Bowl
    "Dal Rice Bowl",
    "Nutri Rice Bowl",
    "White Chana Rice Bowl",
    "Rajma Rice Bowl",
    "Paneer Rice Bowl",

    # Indian Veg Thali
    "Budget Thali",
    "Special Thali",
    "D&D's Special Thali",

    # Main Course
    "Dal Fry",
    "Aloo Jeera",
    "Dum Aloo",
    "Dal Makhni",
    "Chana Masala",
    "Matar Paneer",
    "Mushroom Masala",
    "Shahi Paneer",
    "Paneer Lababdar",

    # Chinese Combo
    "Honey Chilli Potato + Noodles/Fried Rice",
    "Manchurian + Noodles/Fried Rice",
    "Cheese Chilli + Noodles/Fried Rice",
    "Regular Chinese Platter",
    "Special Chinese Platter",

    # Indian Breads
    "Plain Roti",
    "Butter Tawa Roti",
    "Plain Prantha",
    "Lachha Prantha",

    # Rice (Jeera / Pulao / Biryani)
    "Jeera Rice (Half)",
    "Jeera Rice (Full)",
    "Matar Pulao (Half)",
    "Matar Pulao (Full)",
    "Veg Biryani (Half)",
    "Veg Biryani (Full)",

    # Fried Rice
    "Mix Veg Fried Rice",
    "Chilli Garlic Fried Rice",
    "Paneer Fried Rice",
    "Mushroom Fried Rice",
    "Veggie Singapore Rice",
    "D&D’s Special Fried Rice",

    # Noodles
    "Veggie Noodles",
    "Schezwan Noodles",
    "Chilli Garlic Noodles",
    "Hakka Noodles",
    "Singapuri Noodles",

    # Wraps
    "Veggie Wrap",
    "Cheese Corn Wrap",
    "Paneer Tikka Wrap",
    "Paneer Patty Wrap",
    "Mushroom Cheese Wrap",
    "Super Veggie Wrap",

    # Burger & Fries
    "Aloo Patty Burger",
    "Cheese Burger",
    "Super Veggie Burger",
    "Paneer Patty Burger",
    "Maharaja Burger",
    "Salted Fries",
    "Masala Fries",
    "Peri Peri Fries",
    "Cheesy Fries",

    # Pizza
    "Margherita Pizza",
    "Cheese & Tomato Pizza",
    "Farm House Pizza",
    "Paneer Makhni Pizza",
    "Fully Loaded Pizza",

    # Pasta
    "White Sauce Pasta",
    "Red Sauce Pasta",
    "Mix Sauce Pasta",
    "Cheese Corn Pasta",
    "Cheese Chilli Pasta",

    # Sandwich
    "Veg Grilled Sandwich",
    "Veg Cheese Grilled Sandwich",
    "Cheese Chilli Sandwich",
    "Paneer Grilled Sandwich",
    "Club Sandwich (3 Layer)",

    # Extras
    "Plain Curd",
    "Mix Raita",
    "Papad",
    "Seasonal Salad",

    # Krushers
    "Chocolate Krusher",
    "Oreo Krusher",
    "Kitkat Krusher",
    "Mango Krusher",
    "Pineapple Krusher",
    "Cold Coffee Krusher",
    "Black Current Krusher",
    "Butterscotch Krusher",

    # Mojito
    "Green Apple Mojito",
    "Green Mint Mojito",
    "Blueberry Mojito",
    "Strawberry Mojito",

    # Beverage
    "Tea",
    "Lassi",
    "Coffee",
    "Mango Shake",
    "Soft Drinks",
]

  print("✔ Building MY_MENU via fuzzy matching on serving subset...")
  title_map = {}
  for _, row in recipes.iterrows():
    norm = str(row["name"]).strip().lower()
    title_map[norm] = (int(row["id"]), row["name"])

  all_titles_norm = list(title_map.keys())

  MY_MENU = {}
  for name in MENU_NAMES:
    q = name.strip().lower()
    match = get_close_matches(q, all_titles_norm, n=1, cutoff=0.3)
    if not match:
      print(f"❌ No close match found for menu item: '{name}'")
      continue
    best = match[0]
    rid, full_title = title_map[best]
    MY_MENU[rid] = name
    print(f"✅ '{name}' -> id={rid}, subset_title='{full_title}'")

  MY_MENU_IDS = list(MY_MENU.keys())
  print(f"✔ Final MY_MENU size (serving subset): {len(MY_MENU)} items")

  with open(PROCESSED_DIR / "my_menu.json", "w") as f:
    json.dump(MY_MENU, f)
  print("✔ Saved my_menu.json built from serving subset")

  print("✔ Computing TF-IDF on serving subset...")
  tfidf, tfidf_norm = compute_tfidf(recipes)

  print("✔ Building ingredient index (serving subset)...")
  ingredient_index = build_ingredient_index(recipes)

  print("✔ Generating interactions + training SVD (serving subset)...")
  ratings = generate_realistic_interactions(recipes)
  svd = train_optimized_svd(ratings)

  # Precompute helpers for serving subset
  name_map = {
    str(n).lower(): i
    for i, n in enumerate(recipes["name"])
    if pd.notnull(n)
  }
  user_to_seen = (
    ratings.groupby("user_id")["recipe_id"]
    .apply(set)
    .to_dict()
  )

  # Save artifacts for serving subset
  print("✔ Saving serving artifacts for FastAPI...")
  recipes.to_pickle(PROCESSED_DIR / "recipes.pkl")
  with open(PROCESSED_DIR / "tfidf.pkl", "wb") as f:
    pickle.dump(tfidf, f)
  with open(PROCESSED_DIR / "tfidf_norm.pkl", "wb") as f:
    pickle.dump(tfidf_norm, f)
  with open(PROCESSED_DIR / "ingredient_index.pkl", "wb") as f:
    pickle.dump(ingredient_index, f)
  with open(PROCESSED_DIR / "svd_model.pkl", "wb") as f:
    pickle.dump(svd, f)
  with open(PROCESSED_DIR / "name_map.pkl", "wb") as f:
    pickle.dump(name_map, f)
  with open(PROCESSED_DIR / "user_to_seen.pkl", "wb") as f:
    pickle.dump(user_to_seen, f)
  ratings.to_pickle(PROCESSED_DIR / "ratings.pkl")

  print("✔ All serving artifacts saved in ../data/processed")


✔ Preprocessing recipes (full)...
✔ Full recipes shape: (2231142, 4)
✔ Serving subset shape: (100000, 4)
✔ Building MY_MENU via fuzzy matching on serving subset...
✅ 'Sweet Corn' -> id=25918, subset_title='Sweet Popcorn'
✅ 'Masala Corn' -> id=61691, subset_title='Tamale Corn'
✅ 'Spring Roll' -> id=68324, subset_title='Spring Rolls'
✅ 'Honey Chilli Potato/Gobhi' -> id=6577, subset_title='Honey Chili'
✅ 'Chilli Potato' -> id=16535, subset_title='Chilled Potato Soup'
✅ 'Manchurian Dry' -> id=33865, subset_title='Mandarin Dessert'
✅ 'Manchurian Gravy' -> id=44183, subset_title='Chicken Gravy'
✅ 'Cheese Garlic Bread' -> id=49899, subset_title='Cheesy Garlic Bread'
✅ 'Stuffed Garlic Bread' -> id=93507, subset_title='Best Garlic Bread'
✅ 'Mushroom Chilli Fry' -> id=94581, subset_title='Mushroom Curry'
✅ 'Mushroom Chilli Gravy' -> id=69010, subset_title='Mushroom Gravy'
✅ 'Cheese Chilli Fry' -> id=26688, subset_title='Cheesy Chili Franks'
✅ 'Cheese Chilli Gravy' -> id=26688, subset_title='Chee