In [98]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import warnings
from datetime import datetime

warnings.filterwarnings('ignore')

# 1. LOAD & ROBUSTLY CLEAN DATASETS
veggies = pd.read_csv('vegetables_USDA.csv')
users = pd.read_csv('user_profiles_enhanced.csv')
dist_profiles = pd.read_csv('final_datasets/district_profiles_comprehensive.csv')
veg_dist_suit = pd.read_csv('final_datasets/vegetable_district_suitability.csv')
veg_pref_matrix = pd.read_csv('final_datasets/vegetable_preference_matrix.csv')
veg_season = pd.read_csv('vegetable_seasonality_sri_lanka_comprehensive.csv')
recipes = pd.read_csv('sri_lankan_recipes_comprehensive.csv')

# Load Household datasets for validation
hh_member_profiles = pd.read_csv('household_member_profiles.csv')
hh_aggregates = pd.read_csv('household_aggregates.csv')

# --- CotD 2024 Nutritious Priority Data (Bulletin Page 10) ---
COTD_VEGGIES = {
    'Ampara': ['Bean (yard long)', 'Kathurumurunga'],
    'Anuradhapura': ['Bean (yard long)', 'Kathurumurunga'],
    'Badulla': ['Bean (yard long)', 'Eggplant', 'Kankun', 'Kathurumurunga'],
    'Batticaloa': ['Bean (yard long)', 'Kathurumurunga'],
    'Colombo': ['Bean (yard long)', 'Kathurumurunga'],
    'Galle': ['Bean (yard long)', 'Eggplant', 'Kathurumurunga', 'Mukunuwenna', 'Okra'],
    'Gampaha': ['Bean (yard long)', 'Eggplant', 'Kankun', 'Kathurumurunga', 'Mukunuwenna', 'Okra'],
    'Hambantota': ['Banana blossom', 'Bean (yard long)', 'Kathurumurunga', 'Okra'],
    'Jaffna': ['Ash Plantain', 'Bean (yard long)', 'Eggplant', 'Kankun', 'Kathurumurunga', 'Okra'],
    'Kalutara': ['Bean (yard long)', 'Kathurumurunga'],
    'Kandy': ['Bean (yard long)', 'Kathurumurunga', 'Okra'],
    'Kegalle': ['Bean (yard long)', 'Kathurumurunga'],
    'Kilinochchi': ['Bean (yard long)', 'Kathurumurunga'],
    'Mannar': ['Bean (yard long)', 'Eggplant', 'Kathurumurunga', 'Okra'],
    'Matale': ['Bean (yard long)', 'Kathurumurunga'],
    'Matara': ['Bean (yard long)', 'Kathurumurunga'],
    'Monaragala': ['Bean (yard long)', 'Kathurumurunga'],
    'Mullaitivu': ['Bean (yard long)', 'Kathurumurunga'],
    'Nuwara Eliya': ['Bean (yard long)', 'Kathurumurunga'],
    'Polonnaruwa': ['Bean (yard long)', 'Kathurumurunga'],
    'Puttalam': ['Bean (yard long)', 'Kathurumurunga'],
    'Ratnapura': ['Bean (yard long)', 'Kathurumurunga'],
    'Trincomalee': ['Bean (yard long)', 'Kathurumurunga'],
    'Vavuniya': ['Bean (yard long)', 'Kathurumurunga'],
}

USDA_TO_COTD_MAP = {
    'BEANS, GREEN': 'Bean (yard long)',
    'BEANS, SNAP, RAW': 'Bean (yard long)',
    'MORINGA LEAVES, RAW': 'Kathurumurunga',
    'SWAMP CABBAGE': 'Kankun',
    'WATERSPINACH, RAW': 'Kankun',
    'BEET GREENS, RAW': 'Mukunuwenna',
    'OKRA, RAW': 'Okra',
    'EGGPLANT, RAW': 'Eggplant',
    'PLANTAIN, GREEN, RAW': 'Ash Plantain',
}

def clean_string_for_matching(s):
    if pd.isna(s): return ""
    s = str(s).replace('"', '').replace("'", "").replace(',', ' ').replace('RAW', '')
    return "".join(s.split()).upper()

# Pre-calculate cleaned sets for strict matching
local_veg_names = set(veg_season['vegetable_name'].apply(clean_string_for_matching).unique())
sri_local_codes = set(veg_season['usda_code'].apply(clean_string_for_matching).unique())

# 2. CLEAN VEGETABLE DATASET
core_nutrients = [
    'Energ_Kcal', 'Protein_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)', 'Sugar_Tot_(g)',
    'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)', 'Phosphorus_(mg)', 'Potassium_(mg)', 
    'Sodium_(mg)', 'Zinc_(mg)', 'Vit_C_(mg)', 'Vit_A_RAE', 'Vit_K_(Âµg)'
]
available_nutrients = [col for col in core_nutrients if col in veggies.columns]
for col in available_nutrients:
    veggies[col] = pd.to_numeric(veggies[col], errors='coerce').fillna(0)

veg_clean = veggies[['NDB_No', 'Shrt_Desc'] + available_nutrients].copy()
veg_clean = veg_clean[veg_clean['Energ_Kcal'] > 0].drop_duplicates('NDB_No').reset_index(drop=True)

# 3. RDA & HOUSEHOLD AGGREGATION (Layer 4)
COTD_RATIOS = {'child': 0.16, 'adolescent_girl': 0.30, 'adult_male': 0.29, 'adult_female': 0.25}

def aggregate_household(family_members, district):
    """Aggregates family profiles using CotD 2024 ratios."""
    if not family_members: return None, {"message": "No family data"}
    agg_tee, agg_bmr, agg_servings, total_weight = 0, 0, 0, 0
    agg_target = np.zeros(len(available_nutrients))
    conditions, allergies, prefs = set(), set(), set()

    for member in family_members:
        age, gender = member['age'], member['gender'].lower()
        role = 'child' if age < 8 else 'adolescent_girl' if 14 <= age <= 15 and gender == 'female' else 'adult_male' if 30 <= age <= 59 and gender == 'male' else 'adult_female'
        ratio = COTD_RATIOS.get(role, 0.25)
        
        # BMR using Mifflin-St Jeor Equation
        bmr = (10 * member['weight'] + 6.25 * member['height'] - 5 * age + (5 if gender == 'male' else -161))
        tee = bmr * member['activity']
        bmi = member['weight'] / ((member['height']/100) ** 2)
        servings = 4 if bmi < 18.5 else 5 if bmi < 25 else 3
        
        member_target = np.array([tee * 0.2 / 5 if n == 'Energ_Kcal' else 1.0 for n in available_nutrients])
        agg_tee += tee * ratio; agg_servings += servings * ratio; agg_target += member_target * ratio; total_weight += ratio

        if member.get('conditions'): conditions.update(member['conditions'].split(', '))
        if member.get('allergies'): allergies.update(member['allergies'].split(', '))

    if total_weight > 0:
        agg_target /= total_weight
        agg_tee /= total_weight
        agg_bmr /= total_weight
        agg_servings /= total_weight

    profile = dist_profiles[dist_profiles['district'] == district]
    base_cost = 905 if profile.empty else profile['food_self_sufficiency'].values[0] * 905
    agg_cost = base_cost * total_weight

    return agg_target, {
        'agg_tee': agg_tee, 'agg_servings': agg_servings * len(family_members),
        'conditions': ', '.join(conditions), 'allergies': ', '.join(allergies),
        'total_weight': total_weight, 'agg_cost': agg_cost, 'district': district
    }

# 4. RECOMMENDATION ENGINE (V5 - Actions, Seasonality, CotD Priority)
def get_weighted_recommendations(user_idx=None, agg_target=None, agg_meta=None, num_recs=5):
    if agg_target is not None:
        u_target = agg_target.reshape(1, -1); conditions = agg_meta['conditions'].upper()
        allergies = agg_meta['allergies'].upper(); district = agg_meta.get('district', 'Colombo')
    else:
        user = users.iloc[user_idx]; u_target = user_targets_df.iloc[user_idx][available_nutrients].values.reshape(1, -1)
        conditions = str(user.get('Medical_Conditions', '')).upper(); allergies = str(user.get('Allergies_Intolerances', '')).upper(); district = user['District']

    is_maha = datetime.now().month in [10, 11, 12, 1, 2, 3]
    weights = np.ones(len(available_nutrients))
    nutrient_indices = {feat: i for i, feat in enumerate(available_nutrients)}
    if 'ANEMIA' in conditions: weights[nutrient_indices['Iron_(mg)']] = 4.0
    if 'HYPERTENSION' in conditions: weights[nutrient_indices['Potassium_(mg)']] = 3.0; weights[nutrient_indices['Sodium_(mg)']] = 2.5

    filtered_veg = veg_clean.copy()
    if pd.notna(allergies) and allergies != '':
        for allergy in allergies.split(','): filtered_veg = filtered_veg[~filtered_veg['Shrt_Desc'].str.contains(allergy.strip(), case=False)]

    scaler = StandardScaler(); veg_scaled = scaler.fit_transform(filtered_veg[available_nutrients])
    weighted_u_scaled = scaler.transform(u_target * weights); sims = cosine_similarity(weighted_u_scaled, veg_scaled * weights)[0]
    final_scores = sims.copy(); reasons = []

    user_pref = veg_pref_matrix[veg_pref_matrix['district_name'] == district]
    district_cotd = COTD_VEGGIES.get(district, [])

    for i, (idx, veg_row) in enumerate(filtered_veg.iterrows()):
        desc_raw = veg_row['Shrt_Desc'].upper(); desc_clean = clean_string_for_matching(desc_raw)
        
        # Locality Check & Exotic Exclusion
        code_match = desc_clean in sri_local_codes
        name_exact_match = desc_clean in local_veg_names
        forbidden = ['NEW ZEALAND', 'CHINESE', 'PAK-CHOI', 'SWISS CHARD', 'CELERIAC', 'CARDOON', 'ARTICHOKE']
        if (not (code_match or name_exact_match)) or any(f in desc_raw for f in forbidden):
            final_scores[i] = -np.inf; continue 

        # CotD Nutritious Priority Boost (Page 10)
        veg_cotd_name = USDA_TO_COTD_MAP.get(desc_raw, next((v for k, v in USDA_TO_COTD_MAP.items() if k in desc_raw), None))
        if veg_cotd_name and veg_cotd_name in district_cotd:
            final_scores[i] += 0.35
            reasons.append(f"CotD 2024 Priority: {veg_cotd_name}")

        # Seasonality Boost (Scaled by Preference)
        season_match = veg_season[veg_season['usda_code'].apply(clean_string_for_matching) == desc_clean]
        if not season_match.empty:
            s_row = season_match.iloc[0]
            if is_maha and s_row['maha_season'] == 'YES':
                boost = 0.4
                pref_match = user_pref[user_pref['vegetable_usda_code'].apply(clean_string_for_matching) == desc_clean]
                if not pref_match.empty: boost *= (pref_match['overall_preference_index'].values[0] / 10)
                final_scores[i] += boost
                reasons.append(f"In-season Maha Boost (scaled: {boost:.2f})")

    # Tie-breaking diversity noise
    np.random.seed(42)
    final_scores += np.random.uniform(-0.02, 0.02, len(final_scores))
    top_indices = np.argsort(final_scores)[-num_recs:][::-1]
    valid_indices = [idx for idx in top_indices if final_scores[idx] != -np.inf]
    results = filtered_veg.iloc[valid_indices].copy()
    results['final_score'] = [final_scores[idx] for idx in valid_indices]
    
    return results, list(set(reasons))

# 5. LAYER 5: RECIPE RECOMMENDATION
def recommend_recipes(top_veggies, agg_info, district, num_recs=3):
    """Matches recipes to top veggies, filters 'Sri Lankan Traditional', and scales servings/cost."""
    matched_rec = []
    recipes['main_veg_clean'] = recipes['vegetables_usda'].apply(lambda x: [clean_string_for_matching(v) for v in eval(x)])
    for veg in top_veggies:
        veg_cleaned = clean_string_for_matching(veg)
        candidates = recipes[(recipes['main_veg_clean'].apply(lambda x: veg_cleaned in x)) & (recipes['cuisine_type'].str.contains('Traditional', case=False))]
        if not candidates.empty:
            candidates = candidates.sort_values(['popularity_score', 'traditional_rating'], ascending=False).head(2)
            for _, rec in candidates.iterrows():
                scaled_servings = rec['servings'] * agg_info['total_weight']
                total_cost = rec['cost_per_serving_lkr'] * scaled_servings
                # Apply district cost multiplier (PDF page 7/11)
                if district in ['Hambantota', 'Matara', 'Galle']: total_cost *= 0.95 
                matched_rec.append({
                    'recipe_name': rec['recipe_name'],
                    'total_cost_lkr': round(total_cost, 2),
                    'scaled_servings': round(scaled_servings),
                    'reason': f"Uses {veg} - Southern Affordable Traditional"
                })
    return sorted({r['recipe_name']: r for r in matched_rec}.values(), key=lambda x: x['total_cost_lkr'])[:num_recs]

# 6. FINAL EXECUTION & VALIDATION (Tangalle Case Study)
def run_household_recs_final(hh_id, district):
    """Refines affordability using bulletin data and generates final actionable plan."""
    members = hh_member_profiles[hh_member_profiles['Family_ID'] == hh_id]
    family_list = [{'age': r['Age'], 'gender': r['Gender'], 'height': r['Height(cm)'], 'weight': r['Weight(kg)'], 'activity': r['Physical_Activity_Level'], 'conditions': str(r.get('Medical_Conditions', ''))} for _, r in members.iterrows()]
    
    agg_target, agg_meta = aggregate_household(family_list, district)
    recs, reasons = get_weighted_recommendations(agg_target=agg_target, agg_meta=agg_meta)
    
    # Affordability Refinement (2024 Bulletin)
    non_afford_rate = 0.37 
    if district in ['Batticaloa', 'Ampara', 'Nuwara Eliya', 'Anuradhapura']: non_afford_rate = 0.51
    elif district in ['Hambantota', 'Matara', 'Galle']: non_afford_rate = 0.30
        
    agg_meta['non_affordability_risk'] = non_afford_rate
    agg_meta['affordability_gap_lkr'] = agg_meta['agg_cost'] * non_afford_rate
    agg_meta['recommended_recipes'] = recommend_recipes(recs['Shrt_Desc'].head(3).tolist(), agg_meta, district)
    
    return recs, reasons, agg_meta

# Final Execution Check
try:
    tangalle_hh_id = hh_member_profiles[hh_member_profiles['District'] == 'Hambantota']['Family_ID'].iloc[0]
    final_recs, final_reasons, final_meta = run_household_recs_final(tangalle_hh_id, 'Hambantota')
    
    # Save final Plan
    pd.DataFrame(final_meta['recommended_recipes']).to_csv('household_recipes_final.csv', index=False)
    print(f"Success! Final action plan for {tangalle_hh_id} generated.")
    print(f"Priority Veggie: {final_recs['Shrt_Desc'].iloc[0]}")
    print(f"Top Recipe: {final_meta['recommended_recipes'][0]['recipe_name']} (LKR {final_meta['recommended_recipes'][0]['total_cost_lkr']})")
except Exception as e:
    print(f"Validation Error: {e}")

Success! Final action plan for Hambantota_HH001 generated.
Priority Veggie: CABBAGE,RAW
Validation Error: list index out of range
