In [None]:
import pandas as pd

# ==========================================================
# STEP 1 — Load Your Indian Food Dataset
# ==========================================================
# Load only the Indian food nutrition file. Reading further CSVs into `df`
# at this point would silently replace it, while the rest of this cell (the
# `source` label and the output filename) describes the Indian dataset.
df = pd.read_csv("Indian_Food_Nutrition_Processed.csv")
# ==========================================================
# STEP 2 — Basic Cleanup
# ==========================================================
# Drop columns whose header starts with "Unnamed" (pandas' placeholder name
# for a column that had a blank header in the CSV).
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# Normalise headers to lower-case snake_case. The space is replaced literally,
# so regex=False keeps the behaviour identical across pandas versions.
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_', regex=False)

# Remove columns that contain no data at all.
df = df.dropna(axis=1, how='all')

# De-duplicate only AFTER the headers are normalised: names such as "Protein"
# and "protein " collapse to the same label, and a duplicated label would make
# df[label] return a DataFrame instead of a Series in the later steps.
df = df.loc[:, ~df.columns.duplicated(keep='first')]

df['source'] = 'Indian_Food_Dataset'

# ==========================================================
# STEP 3 — Detect & Clean Nutrition Columns
# ==========================================================
# Try to find possible nutrition columns automatically
def find_col(possible_names):
    """Return the first column of the global ``df`` matching any fragment.

    Columns are scanned in their DataFrame order; a column matches when its
    lower-cased name contains at least one of the strings in
    ``possible_names``. Returns ``None`` when no column matches.
    """
    return next(
        (
            column
            for column in df.columns
            if any(fragment in column.lower() for fragment in possible_names)
        ),
        None,
    )

# Name fragments used to locate each nutrient column, keyed by the uniform
# column name the match is renamed to further down.
nutrient_keywords = {
    'calories': ['calorie', 'energy'],
    'protein': ['protein'],
    'fat': ['fat'],
    'carbs': ['carb', 'carbohydrate'],
}
detected = {target: find_col(words) for target, words in nutrient_keywords.items()}

cal_col = detected['calories']
protein_col = detected['protein']
fat_col = detected['fat']
carb_col = detected['carbs']

# Keep only the nutrients that were actually found in this dataset.
found_cols = [source for source in detected.values() if source is not None]

# Coerce to numbers; values that cannot be parsed become NaN.
for source in found_cols:
    df[source] = pd.to_numeric(df[source], errors='coerce')

# Replace missing values with the per-column median.
medians = df[found_cols].median()
df[found_cols] = df[found_cols].fillna(medians)

# Give the detected columns their uniform names.
rename_map = {source: target for target, source in detected.items() if source}
df.rename(columns=rename_map, inplace=True)

# ==========================================================
# STEP 4 — Add Derived Columns
# ==========================================================
# Use the existing 'name' column when there is one; otherwise fall back to the
# first column of the frame. Either way, store it as title-cased text.
name_source = df['name'] if 'name' in df.columns else df.iloc[:, 0]
df['name'] = name_source.astype(str).str.title()

# A bare "egg" substring also fires on egg-free or unrelated dishes such as
# "Eggless Cake", "Egg-Less Cake" or "Eggplant Curry". The negative lookahead
# skips exactly those cases and still matches "egg", "eggs", "egg curry", ...
egg_pattern = r'egg(?![ -]?less|plant)'

# Flag column -> name fragments (regex alternatives) that trigger the flag.
# Matching is a case-insensitive substring search on the dish name.
allergens = {
    'is_allergen_peanuts': ['peanut', 'groundnut'],
    'is_allergen_dairy': ['milk', 'cheese', 'paneer', 'curd', 'butter', 'ghee'],
    'is_allergen_eggs': [egg_pattern],
    'is_allergen_gluten': ['wheat', 'barley', 'rye', 'bread', 'roti', 'chapati'],
    'is_allergen_seafood': ['fish', 'prawn', 'shrimp', 'crab'],
    'is_allergen_soy': ['soy', 'soya'],
    'is_allergen_treenuts': ['almond', 'cashew', 'walnut', 'pistachio', 'hazelnut']
}

for col, keywords in allergens.items():
    df[col] = df['name'].str.contains('|'.join(keywords), case=False, na=False)

# Suitability flags: True when the value is below the dataset median of that
# nutrient; False for every row when the nutrient column is not available.
# NOTE(review): fat is used as the hypertension criterion here - confirm this
# is the intended nutrient.
if 'carbs' in df.columns:
    df['suitable_diabetes'] = df['carbs'] < df['carbs'].median()
else:
    df['suitable_diabetes'] = False

if 'fat' in df.columns:
    df['suitable_hypertension'] = df['fat'] < df['fat'].median()
else:
    df['suitable_hypertension'] = False

# A dish is vegetarian unless its name mentions meat, egg or seafood. The
# seafood fragments are reused from the allergen table so both flags agree
# (e.g. a "shrimp" dish is seafood AND non-vegetarian).
non_veg_keywords = (
    ['chicken', 'mutton', 'beef', 'meat', egg_pattern]
    + allergens['is_allergen_seafood']
)
df['is_veg'] = ~df['name'].str.contains(
    '|'.join(non_veg_keywords), case=False, na=False
)

# ==========================================================
# STEP 5 — Final Dataset
# ==========================================================
# Nutrient columns are optional: include only those present in the frame.
nutrient_cols = [
    nutrient
    for nutrient in ('calories', 'protein', 'fat', 'carbs')
    if nutrient in df.columns
]

# Output column order: identity, nutrients, diet flag, allergen flags,
# suitability flags.
final_cols = [
    'source',
    'name',
    *nutrient_cols,
    'is_veg',
    *allergens,
    'suitable_diabetes',
    'suitable_hypertension',
]
df_final = df.loc[:, final_cols].copy()

# ==========================================================
# STEP 6 — Save & Display
# ==========================================================
# Write the result without the row index, then report and preview it.
df_final.to_csv("Cleaned_Indian_Food_Dataset.csv", index=False)
print("✅ Cleaned and organized Indian food dataset created successfully!")
print("Shape:", df_final.shape)
display(df_final.head(20))


✅ Cleaned and organized Indian food dataset created successfully!
Shape: (1014, 16)


Unnamed: 0,source,name,calories,protein,fat,carbs,is_veg,is_allergen_peanuts,is_allergen_dairy,is_allergen_eggs,is_allergen_gluten,is_allergen_seafood,is_allergen_soy,is_allergen_treenuts,suitable_diabetes,suitable_hypertension
0,Indian_Food_Dataset,Hot Tea (Garam Chai),16.14,0.39,0.53,2.58,True,False,False,False,False,False,False,False,True,True
1,Indian_Food_Dataset,Instant Coffee,23.16,0.64,0.75,3.65,True,False,False,False,False,False,False,False,True,True
2,Indian_Food_Dataset,Espreso Coffee,51.54,1.75,2.14,6.62,True,False,False,False,False,False,False,False,True,True
3,Indian_Food_Dataset,Iced Tea,10.34,0.03,0.01,2.7,True,False,False,False,False,False,False,False,True,True
4,Indian_Food_Dataset,Raw Mango Drink (Aam Panna),35.92,0.16,0.03,9.05,True,False,False,False,False,False,False,False,True,True
5,Indian_Food_Dataset,Fruit Punch (With Fresh Juices),36.12,0.14,0.03,9.38,True,False,False,False,False,False,False,False,True,True
6,Indian_Food_Dataset,Fruit Punch (With Squashes),23.13,0.07,0.02,5.99,True,False,False,False,False,False,False,False,True,True
7,Indian_Food_Dataset,Lemonade,20.8,0.03,0.01,5.48,True,False,False,False,False,False,False,False,True,True
8,Indian_Food_Dataset,Lem-O-Gin,21.52,0.08,0.03,5.55,True,False,False,False,False,False,False,False,True,True
9,Indian_Food_Dataset,Cumin Infused Water (Jeere/Zeere Ka Pani),9.09,0.17,0.11,1.86,True,False,False,False,False,False,False,False,True,True
