### Data Exploration
The recipes dataset contains 522,517 recipes from 312 different categories. This dataset provides information about each recipe like cooking times, servings, ingredients, nutrition, instructions, and more.

In [1]:
import pandas as pd

In [2]:
dataset = pd.read_csv("../dataset/recipes.csv")

In [3]:
dataset.head(2)

Unnamed: 0,RecipeId,Name,AuthorId,AuthorName,CookTime,PrepTime,TotalTime,DatePublished,Description,Images,...,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions
0,38,Low-Fat Berry Blue Frozen Dessert,1533,Dancer,PT24H,PT45M,PT24H45M,1999-08-09T21:46:00Z,Make and share this Low-Fat Berry Blue Frozen ...,"c(""https://img.sndimg.com/food/image/upload/w_...",...,1.3,8.0,29.8,37.1,3.6,30.2,3.2,4.0,,"c(""Toss 2 cups berries with sugar."", ""Let stan..."
1,39,Biryani,1567,elly9812,PT25M,PT4H,PT4H25M,1999-08-29T13:12:00Z,Make and share this Biryani recipe from Food.com.,"c(""https://img.sndimg.com/food/image/upload/w_...",...,16.6,372.8,368.4,84.4,9.0,20.4,63.4,6.0,,"c(""Soak saffron in warm milk for 5 minutes and..."


In [4]:
dataset.shape

(522517, 28)

In [5]:
dataset.columns

Index(['RecipeId', 'Name', 'AuthorId', 'AuthorName', 'CookTime', 'PrepTime',
       'TotalTime', 'DatePublished', 'Description', 'Images', 'RecipeCategory',
       'Keywords', 'RecipeIngredientQuantities', 'RecipeIngredientParts',
       'AggregatedRating', 'ReviewCount', 'Calories', 'FatContent',
       'SaturatedFatContent', 'CholesterolContent', 'SodiumContent',
       'CarbohydrateContent', 'FiberContent', 'SugarContent', 'ProteinContent',
       'RecipeServings', 'RecipeYield', 'RecipeInstructions'],
      dtype='object')

In [6]:
dataset.isnull().sum()

RecipeId                           0
Name                               0
AuthorId                           0
AuthorName                         0
CookTime                       82545
PrepTime                           0
TotalTime                          0
DatePublished                      0
Description                        5
Images                             1
RecipeCategory                   751
Keywords                       17237
RecipeIngredientQuantities         3
RecipeIngredientParts              0
AggregatedRating              253223
ReviewCount                   247489
Calories                           0
FatContent                         0
SaturatedFatContent                0
CholesterolContent                 0
SodiumContent                      0
CarbohydrateContent                0
FiberContent                       0
SugarContent                       0
ProteinContent                     0
RecipeServings                182911
RecipeYield                   348071
R

### Data Preprocessing

In [53]:
features = dataset[['Name', 'RecipeIngredientQuantities', 'RecipeIngredientParts','RecipeInstructions']]

In [54]:
features.head(3)

Unnamed: 0,Name,RecipeIngredientQuantities,RecipeIngredientParts,RecipeInstructions
0,Low-Fat Berry Blue Frozen Dessert,"c(""4"", ""1/4"", ""1"", ""1"")","c(""blueberries"", ""granulated sugar"", ""vanilla ...","c(""Toss 2 cups berries with sugar."", ""Let stan..."
1,Biryani,"c(""1"", ""4"", ""2"", ""2"", ""8"", ""1/4"", ""8"", ""1/2"", ...","c(""saffron"", ""milk"", ""hot green chili peppers""...","c(""Soak saffron in warm milk for 5 minutes and..."
2,Best Lemonade,"c(""1 1/2"", ""1"", NA, ""1 1/2"", NA, ""3/4"")","c(""sugar"", ""lemons, rind of"", ""lemon, zest of""...","c(""Into a 1 quart Jar with tight fitting lid, ..."


In [55]:
full_text = features.loc[0, 'RecipeIngredientParts']
print(full_text)

c("blueberries", "granulated sugar", "vanilla yogurt", "lemon juice")


In [56]:
# Converting from R-style list to Python List
def to_python_list(r_list_string):
    if(isinstance(r_list_string, str)):
        return r_list_string.strip('c()').replace('"', '').split(', ')
    return []

In [57]:
test = to_python_list(full_text)
print(test)

['blueberries', 'granulated sugar', 'vanilla yogurt', 'lemon juice']


In [58]:
features.loc[:, 'RecipeIngredientParts'] = features['RecipeIngredientParts'].apply(to_python_list)

In [59]:
features.loc[:, 'RecipeInstructions'] = features['RecipeInstructions'].apply(to_python_list)
features.loc[:, 'RecipeIngredientQuantities'] = features['RecipeIngredientQuantities'].apply(to_python_list)

In [28]:
features.head(2)

Unnamed: 0,RecipeIngredientQuantities,RecipeIngredientParts,RecipeInstructions
0,"[4, 1/4, 1, 1]","[blueberries, granulated sugar, vanilla yogurt...","[Toss 2 cups berries with sugar., Let stand fo..."
1,"[1, 4, 2, 2, 8, 1/4, 8, 1/2, 1, 1, 1/4, 1/4, 1...","[saffron, milk, hot green chili peppers, onion...",[Soak saffron in warm milk for 5 minutes and p...


#### Score Similarity Each Recipe (v1)
1. Simple, fast
2. Can’t handle fuzzy matches like "garlic cloves" vs "garlic"
3. Doesn’t consider importance of ingredients

In [60]:
def ingredient_match_score_v1(recipe_ingredients, user_ingredients):
    return len(set(recipe_ingredients).intersection(set(user_ingredients)))

In [66]:
user_ingredients = ["onion", "garlic", "tomato"]

In [67]:
features_v1 = features.copy()  # Make an explicit copy
features_v1['match_score'] = features_v1['RecipeIngredientParts'].apply(
    lambda recipe: ingredient_match_score_v1(recipe, user_ingredients)
)

In [68]:
top_recipes = features_v1.sort_values(by='match_score', ascending=False)
best_match = top_recipes.iloc[0]

In [69]:
best_match

Name                                               Blumenthal's Small Frittatas
RecipeIngredientQuantities    [2, 1/2, 1/2, NA, 1/2, 1, 4, 2, 1/4, 1/8, 1/8,...
RecipeIngredientParts         [frozen hash browns, cheddar cheese, sausage, ...
RecipeInstructions            [Preheat over to 325 degrees., Grease (or use ...
match_score                                                                   2
Name: 13848, dtype: object

In [70]:
print("Name: ", best_match['Name'])
print("Ingredients:", best_match['RecipeIngredientParts'])
print("\nInstructions:")
for step in best_match['RecipeInstructions']:
    print("-", step)

Name:  Blumenthal's Small Frittatas
Ingredients: ['frozen hash browns', 'cheddar cheese', 'sausage', 'olive oil', 'onion', 'garlic', 'eggs', 'oregano', 'marjoram', 'sage', 'tarragon', 'plain yogurt', 'biscuit mix']

Instructions:
- Preheat over to 325 degrees.
- Grease (or use paper liners) 12 muffin tins.
- Saute onions and garlic in olive oil.
- Cook sausage.
- Microwave or steam-cook potatoes (do not overcook).
- Toss onions
- garlic
- sausage
- and potatoes with cheese.
- Divide evenly into 12 tins.
- Whip eggs
- egg whites
- and fajita seasoning.
- Mix in other seasonings.
- Pour mixture evenly over potatoe mixture.
- Bake 25 minutes.
- Turn oven off
- but let sit in oven 10 minutes.
- Serve warm.
- Mix in yogurt and biscuit mix.



#### TF-IDF + Cosine Similarity (Score Similarity v2)
1. **Term Frequency** – how often a word appears in a document
2. **Inverse Document Frequency** – how rare that word is across all documents

##### Pros:
1. Matches exact words
2. Handles frequent/common words
3. Can rank results better
4. Learns from the whole dataset

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [38]:
features_v2 = features.copy()

In [71]:
# Each recipe’s ingredient list becomes a “document”:
# ["chicken", "salt", "garlic"] → "chicken salt garlic"
corpus = features_v2['RecipeIngredientParts'].apply(lambda parts: ' '.join(parts))

In [72]:
# Vectorize each recipe's ingredients

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

In [73]:
user_query = "onion garlic tomato"

In [74]:
# Vectorize the user's ingredients
user_vec = vectorizer.transform([user_query])

In [75]:
# Use cosine similarity to compare
cos_sim = cosine_similarity(user_vec, tfidf_matrix)

In [76]:
top_idx = np.argmax(cos_sim)
top_recipe = features.iloc[top_idx]

In [77]:
print("Name: ", top_recipe['Name'])
print("Top recipe match:")
print("\nIngredients:", top_recipe['RecipeIngredientParts'])

print("\nInstructions:")
for step in top_recipe['RecipeInstructions']:
    print("-", step)

Name:  Mexican Bean Dip
Top recipe match:

Ingredients: ['tomato sauce', 'onion']

Instructions:
- Place all ingredients in a medium Microwave-safe bowl.
- Mix well.
- Cover with plastic wrap.
- Microwave on High 6 minutes.
- Stir at 3 minutes.
- Serve with corn chips.
