In [15]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

In [17]:
# 1. LOAD DATASETS (vegetables_USDA.csv is already filtered)
veggies = pd.read_csv('vegetables_USDA.csv')
users = pd.read_csv('user_profiles_enhanced.csv')

print(f"Veggies: {len(veggies)} vegetables")
print(f"Users: {len(users)} Sri Lankan profiles")

Veggies: 166 vegetables
Users: 999 Sri Lankan profiles


In [18]:
# 2. CLEAN VEGETABLE DATASET (this dataset is already veg-only)
# Drop junk columns
veggies.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

In [20]:
# Exact nutrient columns
core_nutrients = [
    'Energ_Kcal', 'Protein_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)', 'Sugar_Tot_(g)',
    'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)', 'Phosphorus_(mg)', 'Potassium_(mg)', 
    'Sodium_(mg)', 'Zinc_(mg)', 'Vit_C_(mg)', 'Vit_A_RAE', 'Vit_K_(µg)'
]

available_nutrients = [col for col in core_nutrients if col in veggies.columns]
print(f"{len(available_nutrients)} nutrients: {available_nutrients}")

15 nutrients: ['Energ_Kcal', 'Protein_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)', 'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)', 'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)', 'Vit_C_(mg)', 'Vit_A_RAE', 'Vit_K_(µg)']


In [21]:
# Clean numerics only
for col in available_nutrients:
    veggies[col] = pd.to_numeric(veggies[col], errors='coerce').fillna(0)

In [24]:
# Remove invalid entries
veg_clean = veggies[['NDB_No', 'Shrt_Desc'] + available_nutrients]
veg_clean = veg_clean[veg_clean['Energ_Kcal'] > 0].drop_duplicates('NDB_No')
print(f"{len(veg_clean)} clean vegetables")

166 clean vegetables


In [27]:
# 3. CREATE USER TARGET VECTORS (PER SERVING)
print("\nUser nutrient targets (per vegetable serving)")
user_targets = pd.DataFrame(index=users.index, columns=available_nutrients)

# RDA targets PER SERVING (divided by 5 daily servings)
rda_per_serving = {
    'Energ_Kcal': users['TEE'] * 0.2 / 5,      # 20% calories from veggies
    'Protein_(g)': 56/5, 'Carbohydrt_(g)': 130/5, 'Fiber_TD_(g)': 28/5,
    'Sugar_Tot_(g)': 25/5, 'Calcium_(mg)': 1000/5, 'Iron_(mg)': 8/5,
    'Magnesium_(mg)': 310/5, 'Phosphorus_(mg)': 700/5,
    'Potassium_(mg)': 4700/5, 'Sodium_(mg)': 2300/5,
    'Zinc_(mg)': 8/5, 'Vit_C_(mg)': 75/5, 'Vit_A_RAE': 700/5
}

for col in available_nutrients:
    if col in rda_per_serving:
        user_targets[col] = rda_per_serving[col]




User nutrient targets (per vegetable serving)


In [28]:
# PERSONALIZE BY CONDITIONS
for idx, row in users.iterrows():
    conditions = str(row.get('Medical_Conditions', '')).upper()
    
    # Diabetes: Low carb/sugar
    if 'DIABETES' in conditions:
        user_targets.loc[idx, 'Carbohydrt_(g)'] *= 0.6
        user_targets.loc[idx, 'Sugar_Tot_(g)'] *= 0.4
    
    # Hypertension: Low sodium, high potassium
    if 'HYPERTENSION' in conditions:
        user_targets.loc[idx, 'Sodium_(mg)'] *= 0.6
        user_targets.loc[idx, 'Potassium_(mg)'] *= 1.4
    
    # Anemia: High iron
    if 'ANEMIA' in conditions:
        user_targets.loc[idx, 'Iron_(mg)'] *= 2.0
    
    # Thyroid: High iodine/calcium (proxy)
    if 'THYROID' in conditions:
        user_targets.loc[idx, 'Calcium_(mg)'] *= 1.3

In [29]:
user_targets.fillna(0, inplace=True)
user_targets['user_id'] = users['District'] + '_' + users['Age'].astype(str)

In [31]:
# 4. COSINE SIMILARITY MATCHING
scaler = StandardScaler()
user_norm = scaler.fit_transform(user_targets[available_nutrients])
veg_norm = scaler.transform(veg_clean[available_nutrients])

sim_matrix = cosine_similarity(user_norm, veg_norm)
top5_idx = np.argsort(sim_matrix, axis=1)[:, -5:][:, ::-1]

print(f"Similarity matrix: {sim_matrix.shape}")

Similarity matrix: (999, 166)


In [32]:
# 5. DISPLAY TOP RECOMMENDATIONS
print("\nTOP 5 VEGETABLE RECOMMENDATIONS:")
for i in range(min(5, len(users))):
    user_id = user_targets.loc[i, 'user_id']
    conditions = users.loc[i, 'Medical_Conditions']
    
    print(f"\n{user_id} ({conditions})")
    print("-" * 40)
    
    top_veggies = veg_clean.iloc[top5_idx[i]]
    display_cols = ['Shrt_Desc', 'Fiber_TD_(g)', 'Vit_C_(mg)', 'Potassium_(mg)', 'Iron_(mg)']
    top_veggies['sim_score'] = sim_matrix[i][top5_idx[i]]
    
    print(top_veggies[display_cols + ['sim_score']].round(2).head())


TOP 5 VEGETABLE RECOMMENDATIONS:

Polonnaruwa_22 (nan)
----------------------------------------
                                             Shrt_Desc  Fiber_TD_(g)  \
147                    PEAS,GRN,SPLIT,MATURE SEEDS,RAW          25.5   
154            USDA COMMODITY,BF PATTIES W/VPP,FRZ,RAW           1.3   
156  USDA COMMODITY,BF,GROUND,BULK/COARSE GROUND,FR...           0.0   
145                          MUNG BNS,MATURE SEEDS,RAW          16.3   
144       LIMA BNS,THIN SEEDED (BABY),MATURE SEEDS,RAW          20.6   

     Vit_C_(mg)  Potassium_(mg)  Iron_(mg)  sim_score  
147         1.8           823.0       4.82       0.07  
154         0.0           294.0       2.66       0.05  
156         0.0           246.0       1.69       0.05  
145         4.8          1246.0       6.74       0.05  
144         0.0          1403.0       6.19       0.05  

Colombo_48 (Thyroid Disorder, IBS)
----------------------------------------
                       Shrt_Desc  Fiber_TD_(g)  Vit_C_(mg

In [34]:
# 6. SAVE RESULTS
veg_clean.to_csv('vegetables_final.csv', index=False)
user_targets.to_csv('user_targets.csv', index=False)
np.save('top5_recommendations.npy', top5_idx)
pd.DataFrame({'user_id': user_targets['user_id'], 'top5_indices': list(top5_idx)}).to_csv('user_recommendations.csv', index=False)
