In [1]:
import pandas as pd
import numpy as np
import json
import joblib
from pathlib import Path
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
import warnings


# Single seed shared by the notebook; applied to NumPy's global (legacy) RNG.
SEED = 42
np.random.seed(seed=SEED)


In [24]:
# Build the location with pathlib so the separator is right on every OS.
# A hard-coded backslash ("Data\recipes.csv") is only a directory separator
# on Windows; elsewhere it is treated as part of the file name.
DATA_PATH = Path("Data") / "recipes.csv"


# Read the dataset (pandas accepts path-like objects directly)
df = pd.read_csv(DATA_PATH)

In [6]:
# Sanity-check the load: report (rows, columns), then preview the first five rows.
print("Dataset Shape:", df.shape)
df.head(n=5)

Dataset Shape: (522517, 28)


Unnamed: 0,RecipeId,Name,AuthorId,AuthorName,CookTime,PrepTime,TotalTime,DatePublished,Description,Images,...,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions
0,38,Low-Fat Berry Blue Frozen Dessert,1533,Dancer,PT24H,PT45M,PT24H45M,1999-08-09T21:46:00Z,Make and share this Low-Fat Berry Blue Frozen ...,"c(""https://img.sndimg.com/food/image/upload/w_...",...,1.3,8.0,29.8,37.1,3.6,30.2,3.2,4.0,,"c(""Toss 2 cups berries with sugar."", ""Let stan..."
1,39,Biryani,1567,elly9812,PT25M,PT4H,PT4H25M,1999-08-29T13:12:00Z,Make and share this Biryani recipe from Food.com.,"c(""https://img.sndimg.com/food/image/upload/w_...",...,16.6,372.8,368.4,84.4,9.0,20.4,63.4,6.0,,"c(""Soak saffron in warm milk for 5 minutes and..."
2,40,Best Lemonade,1566,Stephen Little,PT5M,PT30M,PT35M,1999-09-05T19:52:00Z,This is from one of my first Good House Keepi...,"c(""https://img.sndimg.com/food/image/upload/w_...",...,0.0,0.0,1.8,81.5,0.4,77.2,0.3,4.0,,"c(""Into a 1 quart Jar with tight fitting lid, ..."
3,41,Carina's Tofu-Vegetable Kebabs,1586,Cyclopz,PT20M,PT24H,PT24H20M,1999-09-03T14:54:00Z,This dish is best prepared a day in advance to...,"c(""https://img.sndimg.com/food/image/upload/w_...",...,3.8,0.0,1558.6,64.2,17.3,32.1,29.3,2.0,4 kebabs,"c(""Drain the tofu, carefully squeezing out exc..."
4,42,Cabbage Soup,1538,Duckie067,PT30M,PT20M,PT50M,1999-09-19T06:19:00Z,Make and share this Cabbage Soup recipe from F...,"""https://img.sndimg.com/food/image/upload/w_55...",...,0.1,0.0,959.3,25.1,4.8,17.7,4.3,4.0,,"c(""Mix everything together and bring to a boil..."


In [7]:


# Nutrition features used for scaling, neighbour search and ranking.
# The order matters: target vectors are built positionally against this list.
NUTRITION_COLS = [
    'Calories',
    'FatContent',
    'SaturatedFatContent',
    'CholesterolContent',
    'SodiumContent',
    'CarbohydrateContent',
    'FiberContent',
    'SugarContent',
    'ProteinContent',
]

# Coerce every nutrition column to a number (unparsable entries become NaN),
# then drop rows missing any nutrition value and renumber the index 0..n-1.
# The coercion already turns bad values into NaN, so a single dropna after it
# removes both originally-missing and unparsable rows.
df = (
    df.assign(**{
        col: pd.to_numeric(df[col], errors='coerce')
        for col in NUTRITION_COLS
    })
    .dropna(subset=NUTRITION_COLS)
    .reset_index(drop=True)
)


def _split_ingredients(value):
    """Return the ingredient entry as a list of strings.

    Lists pass through unchanged, missing values become an empty list
    (instead of the bogus ['nan'] that str(NaN).split(';') would produce),
    and anything else is stringified and split on ';'.
    """
    if isinstance(value, list):
        return value
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return []
    return str(value).split(';')


# NOTE(review): other list-like columns in the dataset preview are serialized
# as c("...", "...") strings; if this column uses the same format, splitting
# on ';' will not separate the individual ingredients -- confirm.
df['RecipeIngredientParts'] = df['RecipeIngredientParts'].apply(_split_ingredients)


In [None]:
# Fit a min-max scaler on the nutrition columns, then keep the scaled matrix
# that the nearest-neighbour index is built from.
scaler = MinMaxScaler().fit(df[NUTRITION_COLS])
X_scaled = scaler.transform(df[NUTRITION_COLS])

In [None]:
# Cosine-distance neighbour index over the scaled nutrition matrix;
# each query returns 50 candidates by default.
knn = NearestNeighbors(metric="cosine", n_neighbors=50)
knn.fit(X_scaled)

0,1,2
,n_neighbors,50
,radius,1.0
,algorithm,'auto'
,leaf_size,30
,metric,'cosine'
,p,2
,metric_params,
,n_jobs,


In [None]:
def compute_bmi(weight, height_cm):
    """Return the body-mass index: weight divided by height (in metres) squared.

    `height_cm` is converted from centimetres to metres before squaring.
    A zero height raises ZeroDivisionError.
    """
    height_m = height_cm / 100
    return weight / (height_m ** 2)

def compute_bmr(weight, height, age, gender):
    """Estimate basal metabolic rate from weight, height, age and gender.

    The formula has the Mifflin-St Jeor form: 10*weight + 6.25*height - 5*age,
    plus 5 when `gender` is "male" (case-insensitive) and minus 161 for any
    other value.
    Presumably weight is in kg, height in cm and age in years -- confirm
    against the caller.
    """
    is_male = gender.lower() == "male"
    base = 10*weight + 6.25*height - 5*age
    offset = 5 if is_male else -161
    return base + offset

def activity_multiplier(activity):
    """Look up the multiplier applied to BMR for an activity-level label.

    `activity` must be exactly one of the labels listed below; any other
    value raises KeyError.
    """
    levels = (
        ("Little/no exercise", 1.2),
        ("Light exercise", 1.375),
        ("Moderate exercise (3-5 days/wk)", 1.55),
        ("Very active (6-7 days/wk)", 1.725),
        ("Extra active (very active & physical job)", 1.9),
    )
    return dict(levels)[activity]

def compute_tdee(bmr, activity):
    """Return total daily energy expenditure: BMR times the activity multiplier.

    `activity` is passed to `activity_multiplier`, so an unknown label
    raises KeyError.
    """
    factor = activity_multiplier(activity)
    return bmr * factor

In [None]:
def estimate_target_vector(tdee, meals_per_day, goal):
    """Build the per-meal nutrition target as a 9-element list.

    The first entry is `tdee / meals_per_day`; the remaining eight are fixed
    values selected by `goal` ("weight_loss", "muscle_gain", or anything else
    for the default profile). The call sites pair this list positionally with
    NUTRITION_COLS.
    """
    # Fixed targets for the eight non-calorie entries, keyed by goal.
    goal_profiles = {
        "weight_loss": (20, 5, 50, 400, 40, 10, 5, 35),
        "muscle_gain": (25, 6, 70, 500, 45, 8, 8, 45),
    }
    default_profile = (22, 5, 60, 450, 45, 9, 6, 40)

    calories_per_meal = tdee / meals_per_day
    profile = goal_profiles.get(goal, default_profile)
    return [calories_per_meal, *profile]

In [None]:
def health_penalty(df, bmi, goal):
    """Compute a per-row penalty from raw nutrition values.

    Starts from zeros (one per row of `df`) and adds:
      * 0.01 * Calories and 0.02 * FatContent when `bmi` is 30 or more;
      * 0.03 * SugarContent when `goal` is "weight_loss".
    Columns are only read when their rule applies.
    """
    # (rule applies?, weight, column) -- evaluated in this order.
    rules = (
        (bmi >= 30, 0.01, 'Calories'),
        (bmi >= 30, 0.02, 'FatContent'),
        (goal == "weight_loss", 0.03, 'SugarContent'),
    )

    total = np.zeros(len(df))
    for applies, weight, column in rules:
        if applies:
            total += weight * df[column]
    return total

In [None]:
def hybrid_rank(df_candidates, target_vector, bmi, goal):
    """Re-rank candidate recipes by similarity to the target minus a health penalty.

    Parameters
    ----------
    df_candidates : DataFrame containing the NUTRITION_COLS columns.
    target_vector : raw (unscaled) nutrition targets, ordered like NUTRITION_COLS.
    bmi, goal     : forwarded to `health_penalty`.

    Returns
    -------
    A copy of `df_candidates` with `similarity_score`, `health_penalty` and
    `final_score` (0.7 * similarity - 0.3 * penalty) columns added, sorted by
    `final_score` in descending order.
    """
    # Put the target in the same min-max space as the candidates before
    # comparing them; comparing a raw target against scaled rows mixes two
    # feature spaces. Naming the columns matches how the scaler was fitted.
    target_scaled = scaler.transform(
        pd.DataFrame([target_vector], columns=NUTRITION_COLS)
    )
    candidates_scaled = scaler.transform(df_candidates[NUTRITION_COLS])
    similarity = cosine_similarity(target_scaled, candidates_scaled)[0]

    # Work on a copy (and avoid shadowing the notebook-level `df`).
    ranked = df_candidates.copy()
    ranked['similarity_score'] = similarity
    ranked['health_penalty'] = health_penalty(ranked, bmi, goal)

    ranked['final_score'] = (
        0.7 * ranked['similarity_score']
        - 0.3 * ranked['health_penalty']
    )

    return ranked.sort_values("final_score", ascending=False)

In [None]:
def recommend(user_input, top_k=10):
    """Return the `top_k` best-ranked recipes for a user profile.

    Parameters
    ----------
    user_input : mapping with the keys 'weight', 'height', 'age', 'gender',
        'activity', 'meals_per_day' and 'goal'.
    top_k : number of rows to return from the ranked candidates.

    Returns
    -------
    DataFrame: the first `top_k` rows of the `hybrid_rank` output.
    """
    bmi = compute_bmi(user_input['weight'], user_input['height'])
    bmr = compute_bmr(
        user_input['weight'],
        user_input['height'],
        user_input['age'],
        user_input['gender']
    )
    tdee = compute_tdee(bmr, user_input['activity'])
    goal = user_input['goal']

    target_vector = estimate_target_vector(
        tdee,
        user_input['meals_per_day'],
        goal
    )

    # The scaler was fitted on a DataFrame, so hand it named columns; a bare
    # list loses the feature names and makes scikit-learn warn.
    target_scaled = scaler.transform(
        pd.DataFrame([target_vector], columns=NUTRITION_COLS)
    )

    # Keep the index's default pool size, but widen it when the caller asks
    # for more rows than that; never request more neighbours than were fitted.
    n_candidates = min(max(top_k, knn.n_neighbors), knn.n_samples_fit_)
    _, indices = knn.kneighbors(target_scaled, n_neighbors=n_candidates)

    # kneighbors returns positional row indices into the fitted matrix.
    candidates = df.iloc[indices[0]]
    ranked = hybrid_rank(candidates, target_vector, bmi, goal)

    return ranked.head(top_k)

In [None]:
# Persist the fitted index, the scaler and the feature order under ./model
Path("model").mkdir(exist_ok=True)

for artifact, target in ((knn, "model/knn.pkl"), (scaler, "model/scaler.pkl")):
    joblib.dump(artifact, target)

# The schema file records the column order the scaler and index expect.
with open("model/nutrition_schema.json", "w") as schema_file:
    schema_file.write(json.dumps(NUTRITION_COLS))

print("Model artifacts exported successfully.")

Model artifacts exported successfully.
