In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
import ast
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import warnings
from datetime import datetime
from zoneinfo import ZoneInfo

warnings.filterwarnings('ignore')

# ────────────────────────────────────────────────────────────────
# 1. LOAD ALL DATASETS + NEW PRICE DATASETS
# ────────────────────────────────────────────────────────────────
veggies = pd.read_csv('vegetables_USDA.csv')
users = pd.read_csv('user_profiles_enhanced.csv')
dist_profiles = pd.read_csv('final_datasets/district_profiles_comprehensive.csv')
veg_season = pd.read_csv('vegetable_seasonality_sri_lanka_comprehensive.csv')
recipes = pd.read_csv('sri_lankan_recipes_comprehensive.csv')
hh_member_profiles = pd.read_csv('household_member_profiles.csv')

# New price datasets
wholesale_df = pd.read_csv('wholesale historical data.csv')
market_df = pd.read_csv('vegetable_prices_pruned_features.csv')

core_nutrients = ['Energ_Kcal', 'Protein_(g)', 'Fiber_TD_(g)', 'Iron_(mg)', 
                  'Potassium_(mg)', 'Sodium_(mg)', 'Vit_C_(mg)', 'Vit_A_RAE']
available_nutrients = [col for col in core_nutrients if col in veggies.columns]

for col in available_nutrients:
    veggies[col] = pd.to_numeric(veggies[col], errors='coerce').fillna(0)

veg_clean = veggies[['NDB_No', 'Shrt_Desc'] + available_nutrients].copy()
veg_clean = veg_clean[veg_clean['Energ_Kcal'] > 0].drop_duplicates('NDB_No').reset_index(drop=True)

def clean_string_for_matching(s):
    if pd.isna(s): return ""
    s = str(s).replace('"', '').replace("'", "").replace(',', ' ').replace('RAW', '')
    return "".join(s.split()).upper()

sri_local_codes = set(veg_season['usda_code'].apply(clean_string_for_matching).unique())

# CotD 2024 mapping (Bulletin page 10)
USDA_TO_COTD_MAP = {
    'BEANS, GREEN': 'Bean (yard long)',
    'OKRA, RAW': 'Okra',
    'MORINGA LEAVES, RAW': 'Kathurumurunga',
    'EGGPLANT, RAW': 'Eggplant',
    'SWAMP CABBAGE': 'Kankun',
    'WATERSPINACH, RAW': 'Kankun',
    'BEET GREENS, RAW': 'Mukunuwenna',
    'PLANTAIN, GREEN, RAW': 'Banana blossom',
}

COTD_VEGGIES = {
    'Hambantota': ['Banana blossom', 'Bean (yard long)', 'Kathurumurunga', 'Okra'],
    'Galle': ['Bean (yard long)', 'Eggplant', 'Kathurumurunga', 'Mukunuwenna', 'Okra'],
    'Matara': ['Bean (yard long)', 'Kathurumurunga'],
    # Add more districts as needed
}

# ────────────────────────────────────────────────────────────────
# 2. REAL PRICE LOOKUP FUNCTION
# ────────────────────────────────────────────────────────────────
def get_real_price(veg_name, month=None, year=None):
    if month is None or year is None:
        now = datetime.now(ZoneInfo("Asia/Colombo"))
        month = now.month
        year = now.year
    
    veg_name = veg_name.split(',')[0].upper() if ',' in veg_name else veg_name.upper()
    
    # Primary: Dambulla wholesale
    wholesale_match = wholesale_df[
        (wholesale_df['Vegetable_Name'].str.upper() == veg_name) &
        (wholesale_df['Month'] == month) & (wholesale_df['ISO_Year'] == year)
    ]
    if not wholesale_match.empty:
        return wholesale_match['Avg_Weekly_Price'].mean()
    
    # Fallback: Colombo market
    market_match = market_df[
        (market_df['Vegetable'].str.upper() == veg_name) &
        (market_df['Month'] == month) & (market_df['Year'] == year)
    ]
    if not market_match.empty:
        return market_match['Weekly_Price'].mean()
    
    # Default CotD base (approx daily)
    return 905.0 / 7

# ────────────────────────────────────────────────────────────────
# 3. LTR TRAINING DATA GENERATION (with CotD feature)
# ────────────────────────────────────────────────────────────────
def generate_ltr_training_data(num_users=150):
    X_list, y_list, group_sizes = [], [], []
    scaler = StandardScaler()
    veg_matrix = scaler.fit_transform(veg_clean[available_nutrients])
    
    for i in range(num_users):
        u = users.iloc[i % len(users)]
        u_target = np.array([u.get('TEE', 2000) * 0.2 / 5 if n == 'Energ_Kcal' else 1.0 for n in available_nutrients])
        old_scores = cosine_similarity(u_target.reshape(1, -1), veg_matrix)[0]
        
        top_idx = np.argsort(old_scores)[-10:]
        pos_labels = np.linspace(4.0, 1.0, 10)
        neg_idx = np.random.choice(len(veg_clean), 30, replace=False)
        neg_labels = np.zeros(30)
        
        all_idx = np.concatenate([top_idx, neg_idx])
        labels = np.concatenate([pos_labels, neg_labels])
        user_feats = np.array([u['Age'], u.get('BMI', 22), u.get('TEE', 2000)])
        
        for idx in all_idx:
            v = veg_clean.iloc[idx]
            v_feats = v[available_nutrients].values
            # CotD priority feature (same as inference)
            v_name = clean_string_for_matching(v['Shrt_Desc'])
            cotd_feature = 1 if USDA_TO_COTD_MAP.get(v_name, '') in COTD_VEGGIES.get('Hambantota', []) else 0
            X_list.append(np.concatenate([user_feats, v_feats, [cotd_feature]]))
            y_list.append(labels[np.where(all_idx == idx)[0][0]])
        
        group_sizes.append(len(all_idx))
    
    return np.array(X_list), np.array(y_list), group_sizes

# Train XGBoost LTR
print("Training proper XGBoost LTR model...")
X_train, y_train, groups = generate_ltr_training_data()
dtrain = xgb.DMatrix(X_train, label=y_train)
dtrain.set_group(groups)
params = {
    'objective': 'rank:pairwise',
    'eta': 0.05,
    'max_depth': 6,
    'eval_metric': 'ndcg@10',
    'tree_method': 'hist'
}
ranker = xgb.train(params, dtrain, num_boost_round=200)

# ────────────────────────────────────────────────────────────────
# 4. RECIPE RECOMMENDATION LOGIC
# ────────────────────────────────────────────────────────────────
def recommend_recipes_safe(top_veggies, agg_info, district, num_recs=3):
    matched = []
    recipes['veg_list'] = recipes['vegetables_usda'].apply(
        lambda x: [v.strip().upper() for v in ast.literal_eval(x)] if isinstance(x, str) else []
    )
   
    for veg in top_veggies:
        veg_name = veg.split(',')[0].upper()
        candidates = recipes[recipes['veg_list'].apply(lambda x: any(veg_name in s for s in x))]
       
        if not candidates.empty:
            rec = candidates.iloc[0]
            total_weight = agg_info['total_weight']
            # Use REAL price from historical data
            real_price = get_real_price(veg)
            total_cost = real_price * total_weight
            if district in ['Hambantota', 'Matara', 'Galle']:
                total_cost *= 0.95
            
            matched.append({
                'recipe_name': rec['recipe_name'],
                'total_cost_lkr': round(total_cost),
                'scaled_servings': round(4 * total_weight),
                'reason': f"Uses {veg} - Southern Affordable Traditional Meal"
            })
   
    if not matched:
        matched.append({
            'recipe_name': "Sri Lankan Mixed Vegetable Curry",
            'total_cost_lkr': round(250 * agg_info['total_weight']),
            'scaled_servings': round(4 * agg_info['total_weight']),
            'reason': "Nutrient-dense traditional staple"
        })
   
    return sorted({r['recipe_name']: r for r in matched}.values(), key=lambda x: x['total_cost_lkr'])[:num_recs]

# ────────────────────────────────────────────────────────────────
# 5. END-TO-END HOUSEHOLD PIPELINE
# ────────────────────────────────────────────────────────────────
def run_household_pipeline_final(hh_id, district):
    members = hh_member_profiles[hh_member_profiles['Family_ID'] == hh_id]
    if members.empty:
        return None, "Household not found"
   
    total_weight = sum([0.16 if a < 8 else 0.29 if a >= 30 and members['Gender'].iloc[0] == 'Male' else 0.25 
                        for a in members['Age']])
    avg_tee = members['TEE'].mean()
    avg_bmi = members['BMI'].mean()
    
    user_feats = np.array([members['Age'].mean(), avg_bmi, avg_tee])
    X_test = []
    filtered_veg = veg_clean[veg_clean['Shrt_Desc'].apply(lambda x: clean_string_for_matching(x) in sri_local_codes)].copy()
   
    for _, v in filtered_veg.iterrows():
        v_feats = v[available_nutrients].values
        v_name = clean_string_for_matching(v['Shrt_Desc'])
        cotd_feature = 1 if USDA_TO_COTD_MAP.get(v_name, '') in COTD_VEGGIES.get(district, []) else 0
        X_test.append(np.concatenate([user_feats, v_feats, [cotd_feature]]))
   
    dtest = xgb.DMatrix(np.array(X_test))
    scores = ranker.predict(dtest)
    top_idx = np.argsort(scores)[-5:][::-1]
    recs = filtered_veg.iloc[top_idx].copy()
   
    risk = 0.30 if district == 'Hambantota' else 0.51 if district == 'Batticaloa' else 0.37
    agg_cost = sum(get_real_price(recs.iloc[i]['Shrt_Desc']) for i in range(len(recs))) * total_weight / len(recs)
   
    meta = {'total_weight': total_weight, 'agg_cost': agg_cost}
    recipes_list = recommend_recipes_safe(recs['Shrt_Desc'].tolist(), meta, district)
   
    return recs, {
        'agg_tee': avg_tee,
        'affordability_gap': agg_cost * risk,
        'recipes': recipes_list,
        'agg_cost': agg_cost
    }

# ────────────────────────────────────────────────────────────────
# 6. EXECUTE FOR DEMO
# ────────────────────────────────────────────────────────────────
try:
    tangalle_id = hh_member_profiles[hh_member_profiles['District'] == 'Hambantota']['Family_ID'].iloc[0]
    final_recs, final_meta = run_household_pipeline_final(tangalle_id, "Hambantota")
    
    print("Success! XGBoost LTR + Recipes generated for Tangalle.")
    print(f"Top Recommended Veggie: {final_recs['Shrt_Desc'].iloc[0]}")
    print(f"Daily Affordability Gap: LKR {final_meta['affordability_gap']:.2f}")
    if final_meta['recipes']:
        print(f"Top Actionable Recipe: {final_meta['recipes'][0]['recipe_name']}")
        print(f"Estimated Cost: LKR {final_meta['recipes'][0]['total_cost_lkr']}")
    else:
        print("No recipes matched.")
    
    final_recs.to_csv('household_recs_xgboost_ltr.csv', index=False)
    pd.DataFrame(final_meta['recipes']).to_csv('household_recipes_final.csv', index=False)
except Exception as e:
    print(f"Pipeline Error: {e}")

Training proper XGBoost LTR model...
Success! XGBoost LTR + Recipes generated for Tangalle.
Top Recommended Veggie: CASSAVA,RAW
Daily Affordability Gap: LKR 38.40
Top Actionable Recipe: Spinach Curry
Estimated Cost: LKR 122
