In [1]:

import numpy as np
import pandas as pd
import sklearn 
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:

df = pd.read_excel('data/recipes.xlsx')

In [3]:

df = df.drop(columns=['aloo gobhi'])


In [4]:

df = df.dropna(subset=['amount', 'unit'])


In [5]:

df = df.drop(columns=[
    'recipe_code_org',
    'recipe_name_org',
    'amount_org',
    'unit_org',
    'food_code_org',
    'food_name'
])


In [6]:

(df.isnull().sum()/df.shape[0])*100


recipe_code            0.000000
recipe_name            0.000000
ingredient_name_org    0.000000
food_name_org          0.000000
food_code              1.168679
amount                 0.000000
unit                   0.000000
dtype: float64

In [7]:

(df.isnull().sum()/df.shape[0])*100


recipe_code            0.000000
recipe_name            0.000000
ingredient_name_org    0.000000
food_name_org          0.000000
food_code              1.168679
amount                 0.000000
unit                   0.000000
dtype: float64

In [8]:

df['food_code'] = df['food_code'].fillna('unknown')



In [9]:

df.head()


Unnamed: 0,recipe_code,recipe_name,ingredient_name_org,food_name_org,food_code,amount,unit
0,ASC001,Hot tea (Garam Chai),Milk,"Milk, whole, Cow",L002,50.0,ml
1,ASC001,Hot tea (Garam Chai),Sugar,"Sugar, white",I502,2.0,tsp
2,ASC001,Hot tea (Garam Chai),Tea leaves,"Tea, black, infusion, average",V510,1.5,tsp
3,ASC001,Hot tea (Garam Chai),Water,"Water, distilled",K505,1.5,C
4,ASC002,Instant coffee,Hot milk,"Milk, whole, Cow",L002,75.0,ml


In [10]:

df['recipe_name']


0                                     Hot tea (Garam Chai)
1                                     Hot tea (Garam Chai)
2                                     Hot tea (Garam Chai)
3                                     Hot tea (Garam Chai)
4                                           Instant coffee
                               ...                        
10266    Garlic chickpea soup (Lahasun aur chane ka sho...
10267    Garlic chickpea soup (Lahasun aur chane ka sho...
10268    Garlic chickpea soup (Lahasun aur chane ka sho...
10269    Garlic chickpea soup (Lahasun aur chane ka sho...
10270    Garlic chickpea soup (Lahasun aur chane ka sho...
Name: recipe_name, Length: 10268, dtype: object

In [11]:

print(np.__version__)
print(pd.__version__)


2.3.3
2.3.3


In [12]:

df = pd.read_excel('data/recipes.xlsx')
# Keep only the columns you need
df = df[['recipe_name', 'food_name_org']].copy()  # 👈 use .copy() to avoid warnings

# Clean text function
def clean_text(text):
    text = str(text).lower().strip()
    text = re.sub(r'[^a-z\s]', '', text)  # remove non-alphabetic chars
    text = re.sub(r'\s+', ' ', text)      # remove extra spaces
    return text

# Apply cleaning
df['recipe_name'] = df['recipe_name'].apply(clean_text)
df['food_name_org'] = df['food_name_org'].fillna('').apply(clean_text)

# Group by recipe to combine ingredients
recipes_grouped = (
    df.groupby('recipe_name')['food_name_org']
    .apply(lambda x: ' '.join(x))
    .reset_index()
)

# Preview
print(recipes_grouped.head())


                               recipe_name  \
0                          afghani chicken   
1                               al yakhani   
2          almond biscuit badam ke biscuit   
3  almond cardamom cake badam elaichi cake   
4                almond soup badam ka soup   

                                       food_name_org  
0  fenugreek leaves trigonella foenum graecum lem...  
1  bottle gourd elongate pale green lagenaria vul...  
2  almond essence wheat flour refined triticum ae...  
3  wheat flour refined triticum aestivum cardamom...  
4  wheat flour refined triticum aestivum parsley ...  


In [13]:
'''
recipes_grouped = recipes_grouped.drop_duplicates(subset=['recipe_name']).reset_index(drop=True)
'''


"\nrecipes_grouped = recipes_grouped.drop_duplicates(subset=['recipe_name']).reset_index(drop=True)\n"

In [14]:

# Step 4: TF-IDF + cosine similarity
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(recipes_grouped['food_name_org'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


In [15]:

def get_similar_recipes(recipe_name, cosine_sim=cosine_sim, recipes=recipes_grouped): 
    recipe_name = clean_text(recipe_name)
    if recipe_name not in recipes['recipe_name'].values:
        return []

    idx = recipes[recipes['recipe_name'] == recipe_name].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:6]
    return [recipes.iloc[i[0]]['recipe_name'] for i in sim_scores]

    

# Example
get_similar_recipes('bhel puri')


['khakhra chaat',
 'split bengal gram dal channa dal',
 'oniongreen chilli paranthaparatha pyaaz aur hari mirch ka paranthaparatha',
 'vegetarian nargisi kofta curry',
 'potato samosa aloo ka samosa']