In [25]:
import zipfile
import os

# Unpack the downloaded archive into a local working folder
zip_path = "indian-food-101.zip"   # adjust if the archive has a different name
extract_path = "dataset_folder"

with zipfile.ZipFile(zip_path, mode='r') as archive:
    # extractall creates the target folder when it does not exist yet
    archive.extractall(path=extract_path)

print("✅ Dataset extracted successfully!")

# List what the archive contained so the CSV name can be confirmed
print(os.listdir(extract_path))


✅ Dataset extracted successfully!
['indian_food.csv']


In [26]:
import pandas as pd

# Location of the extracted CSV inside the extraction folder
csv_path = os.path.join(extract_path, "indian_food.csv")

# Load the whole file into a DataFrame
df = pd.read_csv(filepath_or_buffer=csv_path)

# Preview the first five rows
df.head()


Unnamed: 0,name,ingredients,diet,prep_time,cook_time,flavor_profile,course,state,region
0,Balu shahi,"Maida flour, yogurt, oil, sugar",vegetarian,45,25,sweet,dessert,West Bengal,East
1,Boondi,"Gram flour, ghee, sugar",vegetarian,80,30,sweet,dessert,Rajasthan,West
2,Gajar ka halwa,"Carrots, milk, sugar, ghee, cashews, raisins",vegetarian,15,60,sweet,dessert,Punjab,North
3,Ghevar,"Flour, ghee, kewra, milk, clarified butter, su...",vegetarian,15,30,sweet,dessert,Rajasthan,West
4,Gulab jamun,"Milk powder, plain flour, baking powder, ghee,...",vegetarian,15,40,sweet,dessert,West Bengal,East


In [27]:
# Structural overview: dtype and non-null count for every column
df.info()

# Per-column tally of missing entries (isna is the same check as isnull)
df.isna().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255 entries, 0 to 254
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   name            255 non-null    object
 1   ingredients     255 non-null    object
 2   diet            255 non-null    object
 3   prep_time       255 non-null    int64 
 4   cook_time       255 non-null    int64 
 5   flavor_profile  255 non-null    object
 6   course          255 non-null    object
 7   state           255 non-null    object
 8   region          254 non-null    object
dtypes: int64(2), object(7)
memory usage: 18.1+ KB


Unnamed: 0,0
name,0
ingredients,0
diet,0
prep_time,0
cook_time,0
flavor_profile,0
course,0
state,0
region,1


In [28]:
import pandas as pd

# Preview the first ten rows
df.iloc[:10]


Unnamed: 0,name,ingredients,diet,prep_time,cook_time,flavor_profile,course,state,region
0,Balu shahi,"Maida flour, yogurt, oil, sugar",vegetarian,45,25,sweet,dessert,West Bengal,East
1,Boondi,"Gram flour, ghee, sugar",vegetarian,80,30,sweet,dessert,Rajasthan,West
2,Gajar ka halwa,"Carrots, milk, sugar, ghee, cashews, raisins",vegetarian,15,60,sweet,dessert,Punjab,North
3,Ghevar,"Flour, ghee, kewra, milk, clarified butter, su...",vegetarian,15,30,sweet,dessert,Rajasthan,West
4,Gulab jamun,"Milk powder, plain flour, baking powder, ghee,...",vegetarian,15,40,sweet,dessert,West Bengal,East
5,Imarti,"Sugar syrup, lentil flour",vegetarian,10,50,sweet,dessert,West Bengal,East
6,Jalebi,"Maida, corn flour, baking soda, vinegar, curd,...",vegetarian,10,50,sweet,dessert,Uttar Pradesh,North
7,Kaju katli,"Cashews, ghee, cardamom, sugar",vegetarian,10,20,sweet,dessert,-1,-1
8,Kalakand,"Milk, cottage cheese, sugar",vegetarian,20,30,sweet,dessert,West Bengal,East
9,Kheer,"Milk, rice, sugar, dried fruits",vegetarian,10,40,sweet,dessert,-1,-1


In [29]:
# Summarise column dtypes and non-null counts for the loaded frame
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255 entries, 0 to 254
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   name            255 non-null    object
 1   ingredients     255 non-null    object
 2   diet            255 non-null    object
 3   prep_time       255 non-null    int64 
 4   cook_time       255 non-null    int64 
 5   flavor_profile  255 non-null    object
 6   course          255 non-null    object
 7   state           255 non-null    object
 8   region          254 non-null    object
dtypes: int64(2), object(7)
memory usage: 18.1+ KB


In [30]:
# Fill missing values.
# The raw data marks some unknown entries with a literal -1 (for example the
# state and region of "Kaju katli" in the ten-row preview). fillna() only
# touches real NaN values, so turn that placeholder into NaN first; otherwise
# those rows would keep "-1" as if it were a genuine category.
text_columns = ['diet', 'flavor_profile', 'course', 'state', 'region']
is_placeholder = df[text_columns].isin(['-1', -1])  # text or numeric form

df = df.copy()  # work on a fresh frame instead of mutating the loaded one
df[text_columns] = df[text_columns].mask(is_placeholder)

df = df.fillna({
    'flavor_profile': 'unknown',
    'course': 'unknown',
    'state': 'unknown',
    'region': 'unknown',
    # assume vegetarian if missing; use the label the dataset itself uses so
    # no third spelling of the same category is introduced
    'diet': 'vegetarian'
})

# Drop any rows that are completely empty (if any); reassign rather than
# using inplace=True so re-running the cell behaves predictably
df = df.dropna(how='all')

print("✅ Missing values handled successfully!")
df.isnull().sum()


✅ Missing values handled successfully!


Unnamed: 0,0
name,0
ingredients,0
diet,0
prep_time,0
cook_time,0
flavor_profile,0
course,0
state,0
region,0


In [31]:
# Total time per dish = preparation time + cooking time
# NOTE(review): state/region use -1 as an "unknown" placeholder in the preview
# rows; if prep_time/cook_time do the same, this sum would be negative for
# those dishes — verify against the raw data.
df['total_time'] = df['prep_time'].add(df['cook_time'])

# Show the new column next to the two columns it was derived from
df.loc[:, ['name', 'prep_time', 'cook_time', 'total_time']].head()


Unnamed: 0,name,prep_time,cook_time,total_time
0,Balu shahi,45,25,70
1,Boondi,80,30,110
2,Gajar ka halwa,15,60,75
3,Ghevar,15,30,45
4,Gulab jamun,15,40,55


In [32]:
# Keep only the columns the recommender uses; prep_time, cook_time and state
# are dropped here
selected_columns = [
    'name',
    'ingredients',
    'diet',
    'flavor_profile',
    'course',
    'region',
    'total_time',
]
df = df[selected_columns]
df.head()


Unnamed: 0,name,ingredients,diet,flavor_profile,course,region,total_time
0,Balu shahi,"Maida flour, yogurt, oil, sugar",vegetarian,sweet,dessert,East,70
1,Boondi,"Gram flour, ghee, sugar",vegetarian,sweet,dessert,West,110
2,Gajar ka halwa,"Carrots, milk, sugar, ghee, cashews, raisins",vegetarian,sweet,dessert,North,75
3,Ghevar,"Flour, ghee, kewra, milk, clarified butter, su...",vegetarian,sweet,dessert,West,45
4,Gulab jamun,"Milk powder, plain flour, baking powder, ghee,...",vegetarian,sweet,dessert,East,55


In [33]:
# Dietary preferences per health condition. Each entry names the diet label,
# the acceptable flavor profiles and the acceptable courses used to filter
# the dish table.
disease_map = {
    'anemia': {
        'diet': 'Veg',
        'flavor_profile': ['spicy', 'mild'],
        'course': ['main course'],
    },
    'diabetes': {
        'diet': 'Veg',
        'flavor_profile': ['mild'],
        'course': ['main course', 'snack'],
    },
    'obesity': {
        'diet': 'Veg',
        'flavor_profile': ['mild'],
        'course': ['snack', 'salad'],
    },
    'hypertension': {
        'diet': 'Veg',
        'flavor_profile': ['mild'],
        'course': ['main course'],
    },
}


In [34]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def recommend_meals(disease, top_n=5):
    """Suggest up to ``top_n`` randomly chosen dishes for a health condition.

    Parameters
    ----------
    disease : str
        Condition name; matched case-insensitively against the keys of the
        module-level ``disease_map``.
    top_n : int, default 5
        Maximum number of dishes to return.

    Returns
    -------
    pandas.DataFrame or None
        Rows of the module-level ``df`` (name, ingredients, flavor_profile,
        course, region, total_time) drawn at random without replacement from
        the dishes matching the condition's preferences. ``None`` is returned,
        after printing a hint, when the condition is not in ``disease_map``.
    """
    disease = disease.lower()

    if disease not in disease_map:
        print("❌ Disease not recognized. Try: Anemia, Diabetes, Obesity, Hypertension.")
        return

    prefs = disease_map[disease]
    output_columns = ['name', 'ingredients', 'flavor_profile', 'course', 'region', 'total_time']

    # The dataset spells diets 'vegetarian' / 'non vegetarian' while
    # disease_map says 'Veg'. A plain equality test therefore never matched
    # and every lookup ended with an empty frame. Compare both sides on one
    # normalised label instead.
    diet_aliases = {
        'vegetarian': 'veg',
        'non vegetarian': 'non-veg',
        'non-vegetarian': 'non-veg',
    }
    diet_labels = df['diet'].astype(str).str.strip().str.lower().replace(diet_aliases)
    wanted_diet = prefs['diet'].strip().lower()
    wanted_diet = diet_aliases.get(wanted_diet, wanted_diet)

    # Filter dataset based on disease preferences
    filtered = df[
        (diet_labels == wanted_diet) &
        (df['flavor_profile'].isin(prefs['flavor_profile'])) &
        (df['course'].isin(prefs['course']))
    ]

    if filtered.empty:
        print("⚠️ No direct match found. Showing healthy Veg meals instead.")
        filtered = df[diet_labels == 'veg']

    # The earlier ingredient vectorisation / cosine-similarity step was never
    # used for ranking and failed outright on an empty selection, so it has
    # been removed. Dishes are simply sampled at random.
    sample_size = max(0, min(top_n, len(filtered)))
    if sample_size == 0:
        # Nothing to draw from (or nothing requested): return an empty table
        return filtered.head(0)[output_columns]

    indices = np.random.choice(len(filtered), sample_size, replace=False)
    return filtered.iloc[indices][output_columns]


In [35]:
# First ten ingredient strings, to see what the raw text looks like
df['ingredients'].iloc[:10]


Unnamed: 0,ingredients
0,"Maida flour, yogurt, oil, sugar"
1,"Gram flour, ghee, sugar"
2,"Carrots, milk, sugar, ghee, cashews, raisins"
3,"Flour, ghee, kewra, milk, clarified butter, su..."
4,"Milk powder, plain flour, baking powder, ghee,..."
5,"Sugar syrup, lentil flour"
6,"Maida, corn flour, baking soda, vinegar, curd,..."
7,"Cashews, ghee, cardamom, sugar"
8,"Milk, cottage cheese, sugar"
9,"Milk, rice, sugar, dried fruits"


In [36]:
# Broadened dietary preferences per health condition. Each entry names the
# diet label, the acceptable flavor profiles and the acceptable courses used
# to filter the dish table.
disease_map = {
    'anemia': {
        'diet': 'Veg',
        'flavor_profile': ['spicy', 'mild', 'sweet'],
        'course': ['main course', 'snack', 'dessert'],
    },
    'diabetes': {
        'diet': 'Veg',
        'flavor_profile': ['mild', 'spicy'],
        'course': ['main course', 'snack'],
    },
    'obesity': {
        'diet': 'Veg',
        'flavor_profile': ['mild'],
        'course': ['snack', 'salad', 'main course'],
    },
    'hypertension': {
        'diet': 'Veg',
        'flavor_profile': ['mild', 'spicy'],
        'course': ['main course', 'snack'],
    },
}


In [37]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Ensure no empty ingredient text.
# Missing entries become empty strings so the length filter below really
# drops them. Filling with a placeholder word instead would pass the filter
# and surface later as a made-up ingredient list.
df = df.assign(ingredients=df['ingredients'].fillna(''))
df = df[df['ingredients'].str.len() > 2]

def recommend_meals(disease, top_n=5):
    """Suggest up to ``top_n`` randomly chosen dishes for a health condition.

    Parameters
    ----------
    disease : str
        Condition name; matched case-insensitively against the keys of the
        module-level ``disease_map``.
    top_n : int, default 5
        Maximum number of dishes to return.

    Returns
    -------
    pandas.DataFrame or None
        Rows of the module-level ``df`` (name, ingredients, flavor_profile,
        course, region, total_time) drawn at random without replacement from
        the dishes matching the condition's preferences. ``None`` is returned,
        after printing a hint, when the condition is not in ``disease_map``.
    """
    disease = disease.lower()

    if disease not in disease_map:
        print("❌ Disease not recognized. Try: Anemia, Diabetes, Obesity, Hypertension.")
        return

    prefs = disease_map[disease]
    output_columns = ['name', 'ingredients', 'flavor_profile', 'course', 'region', 'total_time']

    # Lower-casing alone is not enough: the dataset says 'vegetarian' /
    # 'non vegetarian' while disease_map says 'Veg', so the comparison never
    # matched. Map both sides onto the same short labels before comparing.
    diet_aliases = {
        'vegetarian': 'veg',
        'non vegetarian': 'non-veg',
        'non-vegetarian': 'non-veg',
    }
    diet_labels = df['diet'].astype(str).str.strip().str.lower().replace(diet_aliases)
    wanted_diet = prefs['diet'].strip().lower()
    wanted_diet = diet_aliases.get(wanted_diet, wanted_diet)

    wanted_flavors = [x.lower() for x in prefs['flavor_profile']]
    wanted_courses = [x.lower() for x in prefs['course']]

    # Flexible (case-insensitive) filtering
    filtered = df[
        (diet_labels == wanted_diet) &
        (df['flavor_profile'].str.lower().isin(wanted_flavors)) &
        (df['course'].str.lower().isin(wanted_courses))
    ]

    # Fallback if no matches found
    if filtered.empty:
        print(f"⚠️ No exact match found for {disease}. Showing healthy Veg meals instead.")
        filtered = df[diet_labels == 'veg']

    # The ingredient vectorisation that used to sit here produced a matrix
    # nobody read and failed on an empty selection, so it has been removed.
    sample_size = max(0, min(top_n, len(filtered)))
    if sample_size == 0:
        # Nothing to draw from (or nothing requested): return an empty table
        return filtered.head(0)[output_columns]

    indices = np.random.choice(len(filtered), sample_size, replace=False)
    recommendations = filtered.iloc[indices][output_columns]

    return recommendations


In [38]:
# Normalise the ingredients column in one pass: no NaN, always str, and no
# leading/trailing whitespace
df['ingredients'] = df['ingredients'].fillna('').astype(str).str.strip()

# Keep only dishes that still carry some ingredient text after cleaning
has_ingredients = df['ingredients'].str.len() > 2
df = df[has_ingredients]

print("✅ Ingredients cleaned properly! Total dishes:", len(df))
df['ingredients'].head(5)


✅ Ingredients cleaned properly! Total dishes: 255


Unnamed: 0,ingredients
0,"Maida flour, yogurt, oil, sugar"
1,"Gram flour, ghee, sugar"
2,"Carrots, milk, sugar, ghee, cashews, raisins"
3,"Flour, ghee, kewra, milk, clarified butter, su..."
4,"Milk powder, plain flour, baking powder, ghee,..."


In [39]:
# Distinct diet labels present in the data
pd.unique(df['diet'])


array(['vegetarian', 'non vegetarian'], dtype=object)

In [40]:
# Normalize common variants of the diet label onto two short codes
diet_aliases = {
    'vegetarian': 'veg',
    'non vegetarian': 'non-veg',
    'non-vegetarian': 'non-veg',
    'unknown': 'veg'  # assume veg if unknown
}

# Lower-case first so the alias lookup does not depend on capitalisation
df['diet'] = df['diet'].astype(str).str.lower().replace(diet_aliases)

print(df['diet'].unique())


['veg' 'non-veg']


In [41]:
# Select the vegetarian dishes once and reuse the selection for both outputs
veg_dishes = df[df['diet'] == 'veg']

print("Total Veg dishes:", len(veg_dishes))
veg_dishes[['name', 'flavor_profile', 'course']].head(5)


Total Veg dishes: 226


Unnamed: 0,name,flavor_profile,course
0,Balu shahi,sweet,dessert
1,Boondi,sweet,dessert
2,Gajar ka halwa,sweet,dessert
3,Ghevar,sweet,dessert
4,Gulab jamun,sweet,dessert


In [42]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

def recommend_meals(disease, top_n=5, random_state=None):
    """Suggest up to ``top_n`` randomly chosen dishes for a health condition.

    Parameters
    ----------
    disease : str
        Condition name; matched case-insensitively against the keys of the
        module-level ``disease_map``.
    top_n : int, default 5
        Maximum number of dishes to return.
    random_state : int or numpy random generator, optional
        Passed to ``DataFrame.sample``. Supply a value to get the same
        recommendations on every run; leave as ``None`` for a fresh draw.

    Returns
    -------
    pandas.DataFrame or None
        Rows of the module-level ``df`` with the columns name, ingredients,
        flavor_profile, course and region. ``None`` is returned, after
        printing a hint, when the condition is not in ``disease_map``.
    """
    disease = disease.lower()

    if disease not in disease_map:
        print("❌ Disease not recognized. Try: Anemia, Diabetes, Obesity, Hypertension.")
        return

    prefs = disease_map[disease]
    output_columns = ['name', 'ingredients', 'flavor_profile', 'course', 'region']

    wanted_flavors = [x.lower() for x in prefs['flavor_profile']]
    wanted_courses = [x.lower() for x in prefs['course']]

    # Flexible filtering: flavor and course are compared case-insensitively;
    # the diet column is compared against the lower-cased preference as-is
    filtered = df[
        (df['diet'] == prefs['diet'].lower()) &
        (df['flavor_profile'].str.lower().isin(wanted_flavors)) &
        (df['course'].str.lower().isin(wanted_courses))
    ]

    # Fallback
    if filtered.empty:
        print(f"⚠️ No exact match found for {disease}. Showing healthy Veg meals instead.")
        filtered = df[df['diet'] == 'veg']

    # If still empty, show first few dishes
    if filtered.empty:
        print("⚠️ Still no Veg dishes found. Showing first few available recipes.")
        return df.head(top_n)[output_columns]

    # Ensure ingredients are valid
    filtered = filtered.dropna(subset=['ingredients'])
    filtered = filtered[filtered['ingredients'].str.strip().str.len() > 2]

    # Final fallback
    if filtered.empty:
        print("⚠️ No usable ingredient text found — showing first few Veg dishes instead.")
        return df[df['diet'] == 'veg'].head(top_n)[output_columns]

    # The ingredient vectorisation that used to run here built a matrix that
    # was never read, so it only cost time on every call. It has been removed;
    # selection is a plain random sample.
    sample_size = max(0, min(top_n, len(filtered)))
    return filtered.sample(n=sample_size, random_state=random_state)[output_columns]


In [43]:
# Demo: print a heading and show the recommendation table for two conditions
demo_requests = [
    ("💪 Recommended meals for Anemia:", 'Anemia'),
    ("\n🩸 Recommended meals for Diabetes:", 'Diabetes'),
]

for heading, condition in demo_requests:
    print(heading)
    display(recommend_meals(condition))


💪 Recommended meals for Anemia:


Unnamed: 0,name,ingredients,flavor_profile,course,region
105,Navrattan korma,"Green beans, potatoes, khus khus, low fat, gar...",spicy,main course,North
130,Idli,"Split urad dal, urad dal, idli rice, thick poh...",spicy,snack,South
72,Aloo shimla mirch,"Potato, shimla mirch, garam masala, amchur pow...",spicy,main course,North
253,Mawa Bati,"Milk powder, dry fruits, arrowroot powder, all...",sweet,dessert,Central
207,Surnoli,"Rice flakes, yogurt, raw rice, jaggery, grated...",spicy,snack,West



🩸 Recommended meals for Diabetes:


Unnamed: 0,name,ingredients,flavor_profile,course,region
84,Daal puri,"Moong dal, garam masala powder, garlic, green ...",spicy,main course,East
136,Keerai poriyal,"Amaranth leaves, split urad dal, mustard seeds...",spicy,main course,South
193,Kombdi vade,"Rice flour, urad dal, wheat flour, gram flour,...",spicy,snack,West
181,Dhokla,"Rava, coconut, gram flour, mustard, sesame",spicy,snack,West
119,Shahi paneer,"Cottage cheese, malai, garam masala, ginger, t...",spicy,main course,North
