In [5]:
import pandas as pd
import numpy as np
import ast  # FIXED: this import was missing (needed for ast.literal_eval)
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import warnings
from datetime import datetime

# Suppress all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# 1. LOAD & ROBUSTLY CLEAN DATASETS
# All paths are relative to the current working directory.
veggies = pd.read_csv('vegetables_USDA.csv')
# NOTE(review): `users` and `dist_profiles` are not referenced in the visible
# code -- confirm whether they are still needed.
users = pd.read_csv('user_profiles_enhanced.csv')
dist_profiles = pd.read_csv('final_datasets/district_profiles_comprehensive.csv')
# The 'usda_code' column of this table defines which vegetables count as local.
veg_season = pd.read_csv('vegetable_seasonality_sri_lanka_comprehensive.csv')
recipes = pd.read_csv('sri_lankan_recipes_comprehensive.csv')
hh_member_profiles = pd.read_csv('household_member_profiles.csv')

# Load new price datasets
# get_real_price consults wholesale_df first and market_df as a fallback.
wholesale_df = pd.read_csv('wholesale historical data.csv')
market_df = pd.read_csv('vegetable_prices_pruned_features.csv')

# Nutrients used for feature engineering
core_nutrients = ['Energ_Kcal', 'Protein_(g)', 'Fiber_TD_(g)', 'Iron_(mg)', 'Potassium_(mg)', 'Vit_C_(mg)']
# Keep only the nutrient columns that actually exist in the USDA table,
# preserving the order of core_nutrients.
available_nutrients = [col for col in core_nutrients if col in veggies.columns]

# Coerce each nutrient column to numeric; unparseable or missing values become 0.
for col in available_nutrients:
    veggies[col] = pd.to_numeric(veggies[col], errors='coerce').fillna(0)

# One row per 'NDB_No' (first occurrence kept), re-indexed from 0.
veg_clean = veggies.drop_duplicates('NDB_No').reset_index(drop=True)

def clean_string_for_matching(s):
    """Normalize a vegetable description into a compact, upper-case match key.

    Quotes are removed, commas are treated as word separators, the standalone
    word ``RAW`` is dropped regardless of letter case, and the remaining words
    are concatenated without any whitespace.

    Args:
        s: Description to normalize. Missing values (``None`` / NaN) are
            accepted; any other value is converted with ``str``.

    Returns:
        The normalized key, or ``""`` for a missing value.
    """
    if pd.isna(s):
        return ""
    # Upper-case first so that 'raw' and 'Raw' are dropped just like 'RAW'.
    text = str(s).upper().replace('"', '').replace("'", "")
    words = text.replace(',', ' ').split()
    # Compare whole words so that names merely containing the letters
    # (e.g. STRAWBERRIES) are left intact.
    return "".join(word for word in words if word != 'RAW')

# Distinct normalized keys of every 'usda_code' listed in the seasonality table.
sri_local_codes = {clean_string_for_matching(code) for code in veg_season['usda_code']}

# 2. PRICE LOOKUP FUNCTION
def _mean_matching_price(prices, name_col, year_col, price_col, veg_key, month, year):
    """Return the mean ``price_col`` of rows matching name/month/year, else None."""
    names = prices[name_col].str.strip().str.upper()
    rows = prices[
        (names == veg_key) &
        (prices['Month'] == month) & (prices[year_col] == year)
    ]
    if rows.empty:
        return None
    mean_price = rows[price_col].mean()
    # Rows can match while every price is missing; treat that as "no price"
    # so the caller moves on to the next source instead of returning NaN.
    return None if pd.isna(mean_price) else mean_price


def get_real_price(veg_name, month, year):
    """Look up the average price of a vegetable for a given month and year.

    Sources are tried in order:
      1. ``wholesale_df`` (``Vegetable_Name`` / ``Month`` / ``ISO_Year``),
         averaging ``Avg_Weekly_Price``;
      2. ``market_df`` (``Vegetable`` / ``Month`` / ``Year``), averaging
         ``Weekly_Price``;
      3. a fixed default of ``905.0 / 7``.

    Args:
        veg_name: Vegetable description; only the text before the first
            comma is used, compared case-insensitively and ignoring
            surrounding whitespace.
        month: Month value compared against the ``Month`` column.
        year: Year value compared against ``ISO_Year`` / ``Year``.

    Returns:
        The mean price from the first source with a usable value, otherwise
        the default. The result is never NaN.
    """
    veg_key = veg_name.split(',')[0].strip().upper()

    # Try wholesale historical data first
    price = _mean_matching_price(
        wholesale_df, 'Vegetable_Name', 'ISO_Year', 'Avg_Weekly_Price',
        veg_key, month, year,
    )
    if price is not None:
        return price

    # Fallback to vegetable_prices_pruned_features
    price = _mean_matching_price(
        market_df, 'Vegetable', 'Year', 'Weekly_Price',
        veg_key, month, year,
    )
    if price is not None:
        return price

    # Default CotD base
    # NOTE(review): the table columns are named as weekly prices while this
    # fallback is 905 / 7 ("approximate daily") -- confirm the units agree.
    return 905.0 / 7  # Approximate daily

# 3. HOUSEHOLD AGGREGATION & RECIPE LOGIC
# Weighting ratio per household-member category; the four values add up to 1.00.
COTD_RATIOS = dict(
    child=0.16,
    adolescent_girl=0.30,
    adult_male=0.29,
    adult_female=0.25,
)

def _parse_recipe_vegetables(raw):
    """Parse one ``vegetables_usda`` cell into a list of upper-case names."""
    if not isinstance(raw, str):
        return []
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        # One malformed cell must not abort every recommendation.
        return []
    if isinstance(parsed, str):
        # A lone quoted name rather than a list; avoid iterating its characters.
        parsed = [parsed]
    if not isinstance(parsed, (list, tuple, set)):
        return []
    return [str(item).strip().upper() for item in parsed]


def recommend_recipes_safe(top_veggies, agg_info, district, num_recs=3):
    """Suggest up to ``num_recs`` recipes that use the recommended vegetables.

    For each vegetable, the first recipe whose parsed ``vegetables_usda`` list
    mentions the vegetable's leading name (the text before the first comma)
    is costed with ``get_real_price`` for the current month and year,
    multiplied by the household's total weight.

    Args:
        top_veggies: Iterable of vegetable descriptions (e.g. ``"ONIONS,RAW"``).
        agg_info: Mapping providing ``'total_weight'`` for the household.
        district: District name; Hambantota, Matara and Galle get a 5%
            cost reduction.
        num_recs: Maximum number of recipes to return.

    Returns:
        A list of dicts with keys ``recipe_name``, ``total_cost_lkr``,
        ``scaled_servings`` and ``reason``, unique by recipe name and sorted
        by ascending cost. When nothing matches, a single default
        "Sri Lankan Mixed Vegetable Curry" entry is used instead.

    Side effects:
        Stores the parsed vegetable lists in ``recipes['veg_list']``.
    """
    recipes['veg_list'] = recipes['vegetables_usda'].apply(_parse_recipe_vegetables)

    total_weight = agg_info['total_weight']
    scaled_servings = round(4 * total_weight)
    # Read the clock once so month and year always come from the same instant.
    now = datetime.now()
    # Southern discount
    discount = 0.95 if district in ('Hambantota', 'Matara', 'Galle') else 1.0

    matched = []
    for veg in top_veggies:
        veg_name = veg.split(',')[0].upper()
        uses_veg = recipes['veg_list'].apply(
            lambda names: any(veg_name in name for name in names)
        )
        candidates = recipes[uses_veg]
        if candidates.empty:
            continue

        # Use real price
        real_price = get_real_price(veg, now.month, now.year)
        if pd.isna(real_price):
            # Without a usable price the cost cannot be rounded; skip this one.
            continue

        rec = candidates.iloc[0]
        total_cost = real_price * total_weight * discount
        matched.append({
            'recipe_name': rec['recipe_name'],
            'total_cost_lkr': round(total_cost),
            'scaled_servings': scaled_servings,
            'reason': f"Uses {veg} - Southern Affordable Traditional Meal"
        })

    if not matched:
        matched.append({
            'recipe_name': "Sri Lankan Mixed Vegetable Curry",
            'total_cost_lkr': round(250 * total_weight),
            'scaled_servings': scaled_servings,
            'reason': "Nutrient-dense traditional staple"
        })

    # Keying by name de-duplicates recipes; the last entry per name wins.
    unique_by_name = {entry['recipe_name']: entry for entry in matched}
    cheapest_first = sorted(unique_by_name.values(), key=lambda entry: entry['total_cost_lkr'])
    return cheapest_first[:num_recs]

# 4. INTEGRATED END-TO-END PIPELINE (pure cosine similarity)
def run_household_pipeline_final(hh_id, district):
    """Rank locally available vegetables for a household and cost matching recipes.

    Vegetables from ``veg_clean`` whose normalized ``Shrt_Desc`` appears in
    ``sri_local_codes`` are standardized on ``available_nutrients`` and scored
    by cosine similarity against a household target vector.

    Args:
        hh_id: ``Family_ID`` of the household in ``hh_member_profiles``.
        district: District name; selects the risk factor and is passed on to
            ``recommend_recipes_safe``.

    Returns:
        ``(recs, meta)`` on success, where ``recs`` is a DataFrame of up to
        five vegetables ordered by descending similarity and ``meta`` is a
        dict with keys ``agg_tee``, ``affordability_gap``, ``recipes`` and
        ``agg_cost``.
        ``(None, message)`` when the household is unknown or when no
        vegetable matches the local codes.
    """
    members = hh_member_profiles[hh_member_profiles['Family_ID'] == hh_id]
    if members.empty:
        return None, "HH Not Found"

    # NOTE(review): only the 'child' (age < 8) and 'adult_male' ratios are
    # applied; the other COTD_RATIOS entries are never used here -- confirm.
    total_weight = sum(
        COTD_RATIOS['child'] if age < 8 else COTD_RATIOS['adult_male']
        for age in members['Age']
    )
    avg_tee = members['TEE'].mean()

    # Target vector based on household needs (simplified for cosine):
    # the energy slot is avg_tee * 0.2 / 5, every other nutrient slot is 1.0.
    target = np.array([avg_tee * 0.2 / 5 if n == 'Energ_Kcal' else 1.0 for n in available_nutrients])

    is_local = veg_clean['Shrt_Desc'].apply(
        lambda desc: clean_string_for_matching(desc) in sri_local_codes
    )
    filtered_veg = veg_clean[is_local].copy()
    if filtered_veg.empty:
        # Nothing to rank; scaling an empty matrix would raise further down.
        return None, "No Local Vegetables Found"

    # Only nutrients for cosine
    nutrient_matrix = filtered_veg[available_nutrients].to_numpy(dtype=float)

    # Normalize and compute cosine similarity
    scaler = StandardScaler()
    veg_matrix = scaler.fit_transform(nutrient_matrix)
    target_scaled = scaler.transform(target.reshape(1, -1))

    scores = cosine_similarity(target_scaled, veg_matrix)[0]
    top_idx = np.argsort(scores)[-5:][::-1]  # five best, highest score first
    recs = filtered_veg.iloc[top_idx].copy()

    # Affordability Refinement
    risk = {'Hambantota': 0.30, 'Batticaloa': 0.51}.get(district, 0.37)
    # Use real prices for agg_cost (average over top recommendations);
    # the clock is read once so month and year come from the same instant.
    now = datetime.now()
    prices = [get_real_price(desc, now.month, now.year) for desc in recs['Shrt_Desc']]
    agg_cost = sum(prices) * total_weight / len(recs)

    meta = {'total_weight': total_weight, 'agg_cost': agg_cost}
    recipes_list = recommend_recipes_safe(recs['Shrt_Desc'].tolist(), meta, district)

    return recs, {
        'agg_tee': avg_tee,
        'affordability_gap': agg_cost * risk,
        'recipes': recipes_list,
        'agg_cost': agg_cost
    }

# 5. FINAL EXECUTION (Hambantota/Tangalle Case)
# Runs the pipeline for the first Hambantota household, prints a summary and
# writes the recommendations and recipes to CSV. Any failure is reported as a
# single "Validation Error" line instead of a traceback.
# NOTE(review): the id is named "tangalle" but the filter is district-wide.
try:
    hambantota_ids = hh_member_profiles.loc[
        hh_member_profiles['District'] == 'Hambantota', 'Family_ID'
    ]
    if hambantota_ids.empty:
        raise LookupError("No households found for district 'Hambantota'")
    tangalle_hh_id = hambantota_ids.iloc[0]

    final_recs, final_meta = run_household_pipeline_final(tangalle_hh_id, "Hambantota")
    if final_recs is None:
        # The pipeline reports failure as (None, message); surface the message
        # rather than failing later on a None subscript.
        raise RuntimeError(final_meta)

    print(f"Success! Final action plan for {tangalle_hh_id} generated.")
    print(f"Priority Veggie: {final_recs['Shrt_Desc'].iloc[0]}")
    if final_meta['recipes']:
        print(f"Top Recipe: {final_meta['recipes'][0]['recipe_name']} (LKR {final_meta['recipes'][0]['total_cost_lkr']})")
    else:
        print("No recipes matched.")

    final_recs.to_csv('household_recs_final.csv', index=False)
    pd.DataFrame(final_meta['recipes']).to_csv('household_recipes_final.csv', index=False)
except Exception as e:
    print(f"Validation Error: {e}")

Success! Final action plan for Hambantota_HH001 generated.
Priority Veggie: ONIONS,RAW
Top Recipe: Chicken Curry (LKR 127)
