In [20]:
import pandas as pd
import numpy as np
import xgboost as xgb
import ast
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import warnings
from datetime import datetime

warnings.filterwarnings('ignore')

# 1. LOAD & ROBUSTLY CLEAN DATASETS
veggies = pd.read_csv('vegetables_USDA.csv')
users = pd.read_csv('user_profiles_enhanced.csv')
dist_profiles = pd.read_csv('final_datasets/district_profiles_comprehensive.csv')
veg_season = pd.read_csv('vegetable_seasonality_sri_lanka_comprehensive.csv')
recipes = pd.read_csv('sri_lankan_recipes_comprehensive.csv')
hh_member_profiles = pd.read_csv('household_member_profiles.csv')

# Nutrients used for feature engineering
core_nutrients = ['Energ_Kcal', 'Protein_(g)', 'Fiber_TD_(g)', 'Iron_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Vit_C_(mg)', 'Vit_A_RAE']
available_nutrients = [col for col in core_nutrients if col in veggies.columns]

# Ensure data is clean for similarity teacher logic
for col in available_nutrients:
    veggies[col] = pd.to_numeric(veggies[col], errors='coerce').fillna(0)
veg_clean = veggies.drop_duplicates('NDB_No').reset_index(drop=True)

def clean_string_for_matching(s):
    if pd.isna(s): return ""
    s = str(s).replace('"', '').replace("'", "").replace(',', ' ').replace('RAW', '')
    return "".join(s.split()).upper()

sri_local_codes = set(veg_season['usda_code'].apply(clean_string_for_matching).unique())

# 2. PROPER LTR DATA GENERATION (Using Cosine Similarity as a Teacher)
def generate_ltr_training_data(num_users=200):
    """
    Creates a proper LTR training set with (User + Veggie) fused features.
    Labels (relevance) are generated using your existing cosine logic.
    """
    X_list, y_list, group_sizes = [], [], []
    scaler = StandardScaler()
    veg_matrix = scaler.fit_transform(veg_clean[available_nutrients])
    
    for i in range(num_users):
        u = users.iloc[i % len(users)]
        # Target based on user TEE and standard RDA
        u_target = np.array([u.get('TEE', 2000) * 0.2 / 5 if n == 'Energ_Kcal' else 1.0 for n in available_nutrients])
        
        # Ground Truth Teacher Signal
        old_scores = cosine_similarity(u_target.reshape(1, -1), veg_matrix)[0]
        
        top_idx = np.argsort(old_scores)[-10:]
        pos_labels = np.linspace(4.0, 1.0, 10) # Higher rank = higher label
        neg_idx = np.random.choice(len(veg_clean), 30, replace=False)
        neg_labels = np.zeros(30)
        
        all_idx = np.concatenate([top_idx, neg_idx])
        labels = np.concatenate([pos_labels, neg_labels])
        user_feats = np.array([u['Age'], u.get('BMI', 22), u.get('TEE', 2000)])
        
        for idx in all_idx:
            v_feats = veg_clean.iloc[idx][available_nutrients].values
            X_list.append(np.concatenate([user_feats, v_feats]))
            y_list.append(labels[np.where(all_idx == idx)[0][0]])
        group_sizes.append(len(all_idx))
    return np.array(X_list), np.array(y_list), group_sizes

# 3. TRAIN REAL XGBRANKER
print("Training proper XGBoost LTR model...")
X_train, y_train, groups = generate_ltr_training_data()
dtrain = xgb.DMatrix(X_train, label=y_train)
dtrain.set_group(groups)

params = {
    'objective': 'rank:pairwise', # Optimization for relative ranking
    'eta': 0.05,
    'max_depth': 6,
    'eval_metric': 'ndcg@10',
    'tree_method': 'hist'
}
ranker = xgb.train(params, dtrain, num_boost_round=300)

# 4. LAYER 5: RECIPE RECOMMENDATION & SCALING
COTD_RATIOS = {'child': 0.16, 'adolescent_girl': 0.30, 'adult_male': 0.29, 'adult_female': 0.25}

def recommend_recipes(top_veggies, agg_info, district, num_recs=3):
    """Matches recipes to top ranked veggies, filters traditional cuisine, and scales portions."""
    matched_rec = []
    # Use ast.literal_eval for safe parsing of main vegetable strings
    recipes['main_veg_clean'] = recipes['vegetables_usda'].apply(
        lambda x: [clean_string_for_matching(v) for v in ast.literal_eval(x)] if isinstance(x, str) else []
    )
    
    for veg in top_veggies:
        veg_cleaned = clean_string_for_matching(veg)
        candidates = recipes[(recipes['main_veg_clean'].apply(lambda x: veg_cleaned in x)) & 
                             (recipes['cuisine_type'].str.contains('Traditional|Sri Lankan', case=False, na=False))]
        
        if not candidates.empty:
            candidates = candidates.sort_values(['popularity_score', 'traditional_rating'], ascending=False).head(2)
            for _, rec in candidates.iterrows():
                # Scale for household size using total_weight
                scaled_servings = rec['servings'] * agg_info['total_weight']
                total_cost = rec['cost_per_serving_lkr'] * scaled_servings
                
                # Southern district cost refinement
                if district in ['Hambantota', 'Matara', 'Galle']: total_cost *= 0.95 
                
                matched_rec.append({
                    'recipe_name': rec['recipe_name'],
                    'scaled_servings': round(scaled_servings),
                    'total_cost_lkr': round(total_cost, 2),
                    'ingredients': rec['other_ingredients'],
                    'reason': f"Uses {veg} - Southern Affordable Traditional Meal"
                })
    return sorted({r['recipe_name']: r for r in matched_rec}.values(), key=lambda x: x['total_cost_lkr'])[:num_recs]

# 5. END-TO-END PIPELINE (Household -> LTR -> Recipes)
def run_household_pipeline_ltr_final(hh_id, district):
    # Aggregation logic with CotD ratios
    members = hh_member_profiles[hh_member_profiles['Family_ID'] == hh_id]
    if members.empty: return None, "HH Not Found"
    
    total_weight = sum([COTD_RATIOS['child'] if a < 8 else COTD_RATIOS['adult_male'] for a in members['Age']])
    avg_tee, avg_bmi = members['TEE'].mean(), members['BMI'].mean()
    
    # XGBRanker Inference Features
    user_feats = np.array([members['Age'].mean(), avg_bmi, avg_tee])
    X_test = []
    filtered_veg = veg_clean[veg_clean['Shrt_Desc'].apply(lambda x: clean_string_for_matching(x) in sri_local_codes)].copy()
    
    for _, v in filtered_veg.iterrows():
        X_test.append(np.concatenate([user_feats, v[available_nutrients].values]))
        
    dtest = xgb.DMatrix(np.array(X_test))
    scores = ranker.predict(dtest)
    top_idx = np.argsort(scores)[-5:][::-1]
    recs = filtered_veg.iloc[top_idx].copy()
    
    # Affordability Refinement (CotD Page 11)
    risk = 0.30 if district == 'Hambantota' else 0.51 if district == 'Batticaloa' else 0.37
    agg_cost = 905 * total_weight
    
    # Layer 5: Scaling Actionable Recipes
    recipes_list = recommend_recipes(recs['Shrt_Desc'].tolist(), {'total_weight': total_weight}, district)
    
    return recs, {
        'agg_tee': avg_tee,
        'affordability_gap': agg_cost * risk,
        'recipes': recipes_list
    }

# 6. FINAL EXECUTION (Hambantota/Tangalle)
try:
    tangalle_id = hh_member_profiles[hh_member_profiles['District'] == 'Hambantota']['Family_ID'].iloc[0]
    final_recs, final_meta = run_household_pipeline_ltr_final(tangalle_id, "Hambantota")
    
    print("Success! XGBoost LTR + Recipes generated for Tangalle.")
    print(f"Top Recommended Veggie: {final_recs['Shrt_Desc'].iloc[0]}")
    print(f"Top Scaled Recipe: {final_meta['recipes'][0]['recipe_name']} (LKR {final_meta['recipes'][0]['total_cost_lkr']})")
    
    final_recs.to_csv('household_recs_xgboost_ltr.csv', index=False)
    pd.DataFrame(final_meta['recipes']).to_csv('household_recipes_final.csv', index=False)
except Exception as e:
    print(f"Pipeline Error: {e}")

Training proper XGBoost LTR model...
Success! XGBoost LTR + Recipes generated for Tangalle.
Top Recommended Veggie: CASSAVA,RAW
Top Scaled Recipe: Pumpkin Curry Variation 1 (LKR 322.32)
