In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Load the raw recipe table from the project-relative dataset folder.
df = pd.read_csv(r"dataset/full_dataset.csv")

# Keep only rows at positions 100..9999. The explicit .copy() turns the subset
# into an independent frame, so adding columns to `df` afterwards does not
# raise pandas' SettingWithCopyWarning on a slice of the loaded table.
df = df.iloc[100:10000].copy()

In [3]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER
0,0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."
3,3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken"", ""chicken gravy"", ""cream of mushroo..."
4,4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""peanut butter"", ""graham cracker crumbs"", ""bu..."


In [4]:
import nltk
# Fetch the 'omw-1.4' NLTK data package.
# NOTE(review): presumably required by the WordNet lemmatizer used further
# down in this notebook - confirm for the installed NLTK version.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/omnesvera45/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [7]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict

# Fetch the NLTK data packages needed for stop-word removal and lemmatization
for nltk_package in ('stopwords', 'wordnet'):
    nltk.download(nltk_package)

# Shared lemmatizer instance used by the ingredient normalizer
lemmatizer = WordNetLemmatizer()

# Generic English stop words
stop_words = set(stopwords.words('english'))

# Preparation/state words that are frequent in ingredient lists but do not
# identify the ingredient itself
custom_stop_words = {
    'chopped', 'sliced', 'fresh', 'dried',
    'ground', 'whole', 'finely', 'minced',
}

# Synonym mapping: variant spelling -> canonical ingredient name.
# Entries are applied in the order written, so keep the order when extending
# this table for your own data.
synonym_map = {
    # peppers
    'bell pepper': 'bell pepper',
    'capsicum': 'bell pepper',
    # herbs and aromatics
    'cilantro': 'coriander',
    'garlic powder': 'garlic',
    # plural forms
    'onions': 'onion',
    'tomatoes': 'tomato',
    # more specific -> generic
    'olive oil': 'oil',
    'black pepper': 'pepper',
}


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/omnesvera45/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/omnesvera45/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:

def normalize_ingredient(ingredient):
    """Normalize free-text ingredient data into a space-separated string of lemmas.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is replaced by a space, stop words are dropped, the remaining
    words are lemmatized, and finally the synonym table is applied.

    Relies on the module-level ``stop_words``, ``custom_stop_words``,
    ``lemmatizer`` and ``synonym_map`` objects.

    Args:
    - ingredient (str): Raw ingredient text (a single ingredient, a
      comma-separated list, or a serialized list of ingredients).

    Returns:
    - str: The normalized text. An empty string is returned for non-string
      input such as ``None`` or the ``NaN`` pandas uses for missing cells.
    """
    # Missing values reach this function as None/NaN when it is applied to a
    # DataFrame column; they carry no text, so normalize them to ''.
    if not isinstance(ingredient, str):
        return ''

    # Step 1: Convert to lowercase
    ingredient = ingredient.lower()

    # Step 2: Replace punctuation, digits and special characters with a space.
    # Substituting a space (instead of deleting the character) keeps words that
    # are separated only by punctuation apart, e.g. "chicken,chillies" or
    # "cream-cheese" no longer collapse into a single token.
    ingredient = re.sub(r'[^a-zA-Z\s]', ' ', ingredient)

    # Step 3: Tokenize on whitespace (also collapses repeated spaces)
    tokens = ingredient.split()

    # Step 4: Remove generic and ingredient-specific stop words
    tokens = [
        word for word in tokens
        if word not in stop_words and word not in custom_stop_words
    ]

    # Step 5: Lemmatize the remaining words
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Step 6: Reconstruct the ingredient string
    normalized_ingredient = ' '.join(tokens)

    # Step 7: Apply synonym mapping, in table order. str.replace is already a
    # no-op when the key is absent, so no membership test is needed.
    # NOTE(review): plural keys such as 'onions' are presumably singularized by
    # the lemmatizer before this step and therefore never match - confirm.
    for key, value in synonym_map.items():
        normalized_ingredient = normalized_ingredient.replace(key, value)

    return normalized_ingredient

# Run the normalizer over every recipe's raw ingredient text
ingredients_column = df['ingredients']
normalized_ingredients = ingredients_column.map(normalize_ingredient)

# Keep the normalized text next to the original columns, export the result
# for the later cells, and show a short preview
df['normalized_ingredients'] = normalized_ingredients
df.to_csv('recipes_normalized.csv', index=False)
print(df.head(5))

     Unnamed: 0                        title  \
100         100                     Pancakes   
101         101  Crustless Vegetable Ham Pie   
102         102       Red Cabbage And Apples   
103         103          German Potato Salad   
104         104            Jan'S Winter Soup   

                                           ingredients  \
100  ["1 c. flour", "1 tsp. soda", "1 tsp. salt", "...   
101  ["1/4 c. butter", "1/4 lb. mushrooms, sliced",...   
102  ["1 c. red cabbage, finely chopped", "2 Tbsp. ...   
103  ["4 lb. potatoes, cooked", "6 slices bacon, di...   
104  ["2 Tbsp. butter", "2 medium onions, chopped",...   

                                            directions  \
100  ["Mix dry ingredients.", "Add egg, margarine a...   
101  ["Preheat oven to 325\u00b0.", "In large fryin...   
102  ["In a saucepan filled with lightly salted boi...   
103  ["Peel and cut potatoes in thin slices.", "Fry...   
104  ["Saute onions and garlic in butter until onio...   

             

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
def find_similar_recipes(user_ingredients, df, tfidf_matrix, tfidf_vectorizer, top_n=5):
    """
    Find top N recipes based on similarity to user's ingredients.

    The user's text is normalized with ``normalize_ingredient``, projected
    into the TF-IDF space of ``tfidf_vectorizer`` and compared against every
    row of ``tfidf_matrix`` using cosine similarity.

    Args:
    - user_ingredients (str): User's input ingredients as a single string.
    - df (pd.DataFrame): DataFrame containing recipe data. Row i (by position)
      must correspond to row i of ``tfidf_matrix``.
    - tfidf_matrix (scipy.sparse.csr_matrix): TF-IDF matrix of recipe ingredients.
    - tfidf_vectorizer (TfidfVectorizer): Fitted TF-IDF vectorizer.
    - top_n (int): Number of top recipes to return.

    Returns:
    - pd.DataFrame: Up to ``top_n`` rows of ``df``, most similar first. Empty
      (same columns, no rows) when ``top_n`` is zero or negative.

    Raises:
    - ValueError: If ``df`` and ``tfidf_matrix`` have different row counts.
    """
    # The similarity scores are positional indices into tfidf_matrix; if the
    # frame was re-sliced or reloaded after the matrix was built, df.iloc would
    # silently return unrelated recipes.
    if tfidf_matrix.shape[0] != len(df):
        raise ValueError(
            "df and tfidf_matrix must have the same number of rows "
            f"(got {len(df)} and {tfidf_matrix.shape[0]})"
        )

    # Guard: with top_n == 0 the slice below becomes [-0:], which selects
    # *every* recipe instead of none.
    if top_n <= 0:
        return df.iloc[0:0]

    # Normalize the user's ingredients
    normalized_user_ingredients = normalize_ingredient(user_ingredients)

    # Transform the user's ingredients into a TF-IDF vector
    user_tfidf_vector = tfidf_vectorizer.transform([normalized_user_ingredients])

    # Compute cosine similarity between user's ingredients and all recipes
    cosine_similarities = cosine_similarity(user_tfidf_vector, tfidf_matrix).flatten()

    # argsort is ascending: take the last top_n positions and reverse them so
    # the best match comes first
    top_recipe_indices = cosine_similarities.argsort()[-top_n:][::-1]

    # Return the top N recipes
    return df.iloc[top_recipe_indices]

In [None]:

# Read back the table exported by the normalization step
df = pd.read_csv('recipes_normalized.csv')

# Recompute the normalized text from the raw ingredient column
df['normalized_ingredients'] = [
    normalize_ingredient(raw_text) for raw_text in df['ingredients']
]

# Build the TF-IDF model: learn the vocabulary from the normalized ingredients
# and encode every recipe as one row of the resulting matrix
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['normalized_ingredients'])

In [11]:



# Ask which ingredients are available
user_ingredients = input("Enter the ingredients you have (separated by commas): ")

# Look up the five closest recipes in the TF-IDF space
top_recipes = find_similar_recipes(
    user_ingredients,
    df,
    tfidf_matrix,
    tfidf_vectorizer,
    top_n=5,
)

# Show only the title and the raw ingredient list of each match
columns_to_show = ['title', 'ingredients']
print("\nTop Recipes Similar to Your Ingredients:")
print(top_recipes[columns_to_show])

Enter the ingredients you have (separated by commas): chicken, chillies

Top Recipes Similar to Your Ingredients:
                                 title  \
2061                        Chilli Log   
4578         Impossible Quesadilla Pie   
322   Mexicali Chicken And Cheese Bake   
7665                Chicken-N-Stuffing   
2685                   Poached Chicken   

                                            ingredients  
2061         ["1 16 oz. cream cheese", "chilli powder"]  
4578  ["2 cans (4 oz.) green chillies, chopped", "4 ...  
322   ["3 c. cubed, cooked chicken", "8 oz. (2 c.) s...  
7665  ["1 can cream of chicken soup", "2 chicken bre...  
2685                  ["3 lb. chicken (whole)", "salt"]  
