In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
# Read the base user profiles from a CSV in the working directory.
# Later cells read the 'District' and 'Physical_Activity_Level' columns from it.
user_profiles_df = pd.read_csv(filepath_or_buffer='user_profiles_enhanced.csv')

In [3]:
# Family roles and their characteristics based on PDF (Cost Diet Bulletin 2024).
# Per-role fields:
#   age_range       - inclusive (min, max) bounds; ages are drawn with random.randint
#   gender          - fixed gender for the role; 'Mixed' means Male/Female is picked at random
#   ratio           - per-member weight multiplied by the district daily cost when
#                     household cost is aggregated; the four ratios sum to 1.0
#   calorie_factor  - copied onto each generated member as 'Calorie_Factor'
#   nutrient_factor - copied onto each generated member as 'Nutrient_Factor'
family_roles = {
    'adult_male': {
        'age_range': (30, 59),
        'gender': 'Male',
        'ratio': 0.29,
        'calorie_factor': 1.0,
        'nutrient_factor': 1.0
    },
    'adult_female': {
        'age_range': (30, 59),
        'gender': 'Female',
        'ratio': 0.25,
        'calorie_factor': 0.85,
        'nutrient_factor': 0.9
    },
    'adolescent_girl': {
        'age_range': (14, 15),
        'gender': 'Female',
        'ratio': 0.30,
        'calorie_factor': 0.9,
        'nutrient_factor': 1.1  # Higher for growth
    },
    'child': {
        'age_range': (6, 7),
        'gender': 'Mixed',  # Can be either
        'ratio': 0.16,
        'calorie_factor': 0.6,
        'nutrient_factor': 0.7
    }
}

# District cost data from PDF (page 7), in LKR per day.
# Only Hambantota and Puttalam carry district-specific figures; every other
# district is set to the 905 average used as the default.
# Each district appears exactly once: duplicate keys in a dict literal are
# silently overridden by the last occurrence, which hides copy/paste mistakes.
district_costs = {
    'Hambantota': 856,  # Lowest cost
    'Puttalam': 1024,   # Highest cost
    'Colombo': 905,     # Average (default)
    'Gampaha': 905,
    'Kalutara': 905,
    'Kandy': 905,
    'Matale': 905,
    'Nuwara Eliya': 905,
    'Galle': 905,
    'Matara': 905,
    'Jaffna': 905,
    'Kilinochchi': 905,
    'Mannar': 905,
    'Vavuniya': 905,
    'Mullaitivu': 905,
    'Batticaloa': 905,
    'Ampara': 905,
    'Trincomalee': 905,
    'Kurunegala': 905,
    'Anuradhapura': 905,
    'Polonnaruwa': 905,
    'Badulla': 905,
    'Monaragala': 905,
    'Ratnapura': 905,
    'Kegalle': 905
}

# District non-affordability rates from PDF (page 11), as fractions in [0, 1].
# The first four districts have their own figures; the rest use the 0.37
# national average.
district_non_affordability = {
    'Batticaloa': 0.51,
    'Ampara': 0.48,
    'Nuwara Eliya': 0.48,
    'Anuradhapura': 0.48,
    'Colombo': 0.37,  # National average
    'Gampaha': 0.37,
    'Kalutara': 0.37,
    'Kandy': 0.37,
    'Matale': 0.37,
    'Galle': 0.37,
    'Matara': 0.37,
    'Hambantota': 0.37,
    'Jaffna': 0.37,
    'Kilinochchi': 0.37,
    'Mannar': 0.37,
    'Vavuniya': 0.37,
    'Mullaitivu': 0.37,
    'Trincomalee': 0.37,
    'Kurunegala': 0.37,
    'Puttalam': 0.37,
    'Polonnaruwa': 0.37,
    'Badulla': 0.37,
    'Monaragala': 0.37,
    'Ratnapura': 0.37,
    'Kegalle': 0.37
}

In [4]:
def create_family_member(base_profile, role, member_id, family_id):
    """
    Derive one household member from a shared base profile.

    The base profile is copied, then age, gender, height and weight are
    re-drawn at random for the given role, and BMI, BMR, TEE, weight goal,
    BMI category and vegetable-serving target are recomputed from those values.

    Args:
        base_profile: mapping with a ``copy()`` method (a dict in this
            notebook). It must contain 'Physical_Activity_Level'.
        role: a key of the module-level ``family_roles`` table.
        member_id: suffix used to build the member's 'Member_ID'.
        family_id: identifier stored as 'Family_ID' and used as the
            'Member_ID' prefix.

    Returns:
        The modified copy of ``base_profile``.

    Raises:
        KeyError: if ``role`` is not in ``family_roles`` or the base profile
            lacks a field that is read here.
    """
    person = base_profile.copy()
    spec = family_roles[role]

    # --- Age and gender come from the role definition -----------------------
    youngest, oldest = spec['age_range']
    person['Age'] = random.randint(youngest, oldest)
    if spec['gender'] == 'Mixed':
        person['Gender'] = random.choice(['Male', 'Female'])
    else:
        person['Gender'] = spec['gender']

    # --- Height (cm) / weight (kg) bounds per role --------------------------
    body_bounds = {
        'child': ((110, 125), (18, 25)),
        'adolescent_girl': ((150, 160), (40, 50)),
        'adult_female': ((150, 165), (50, 65)),
        'adult_male': ((160, 180), (60, 80)),
    }
    if role in body_bounds:
        height_bounds, weight_bounds = body_bounds[role]
        # Height is drawn before weight; keep this order so seeded runs match.
        person['Height(cm)'] = random.uniform(*height_bounds)
        person['Weight(kg)'] = random.uniform(*weight_bounds)

    # --- Derived anthropometrics --------------------------------------------
    height_cm = person['Height(cm)']
    weight_kg = person['Weight(kg)']
    height_m = height_cm / 100
    person['Height_m'] = height_m
    bmi = weight_kg / (height_m ** 2)
    person['BMI'] = bmi

    # Mifflin-St Jeor: shared core plus a sex-specific constant (+5 / -161)
    bmr_core = 10 * weight_kg + 6.25 * height_cm - 5 * person['Age']
    sex_offset = 5 if person['Gender'] == 'Male' else -161
    bmr = bmr_core + sex_offset
    person['BMR'] = bmr
    person['TEE'] = bmr * person['Physical_Activity_Level']

    # --- Weight goal ---------------------------------------------------------
    # NOTE(review): the same 18.5 / 25 / 30 BMI cut-offs are applied to every
    # role, including 'child' and 'adolescent_girl' -- confirm this is intended.
    # The comparison is strict, so a BMI of exactly 25 keeps the current weight.
    if bmi > 25:
        goal_delta = -random.uniform(5, 15)
    elif bmi < 18.5:
        goal_delta = random.uniform(3, 8)
    else:
        goal_delta = 0
    person['Dietary_Goals(kg)'] = weight_kg + goal_delta
    person['Weight_Goal_Change_kg'] = goal_delta

    # --- BMI category --------------------------------------------------------
    if bmi < 18.5:
        person['BMI_Category'] = 'Underweight'
    elif bmi < 25:
        person['BMI_Category'] = 'Normal'
    elif bmi < 30:
        person['BMI_Category'] = 'Overweight'
    else:
        person['BMI_Category'] = 'Obese'

    # --- Vegetable servings: adult baseline scaled down per role -------------
    adult_servings = 5
    serving_scale = {'child': 0.6, 'adolescent_girl': 0.9, 'adult_female': 0.85}
    if role in serving_scale:
        # int() truncates toward zero, e.g. 5 * 0.85 = 4.25 -> 4
        person['Veg_Servings_Target'] = int(adult_servings * serving_scale[role])
    else:
        person['Veg_Servings_Target'] = adult_servings

    # --- Family bookkeeping --------------------------------------------------
    person['Family_ID'] = family_id
    person['Member_ID'] = f"{family_id}_{member_id}"
    person['Role'] = role
    person['Role_Ratio'] = spec['ratio']
    person['Calorie_Factor'] = spec['calorie_factor']
    person['Nutrient_Factor'] = spec['nutrient_factor']

    return person

In [5]:
def create_household_dataset(user_profiles, households_per_district=3, seed=None):
    """
    Create a household dataset by generating families from user profiles.

    For every district present in ``user_profiles`` this draws
    ``households_per_district`` base profiles at random and expands each one
    into a family via ``create_family_member``.

    Args:
        user_profiles: DataFrame with a 'District' column plus whatever
            fields ``create_family_member`` reads from a profile.
        households_per_district: number of families generated per district.
        seed: optional integer for reproducible output. When given, the
            global ``random`` module is re-seeded (``create_family_member``
            draws from it) and a dedicated NumPy ``RandomState`` drives the
            base-profile sampling. When ``None`` (default) the current global
            RNG state is used, as before.

    Returns:
        DataFrame with one row per generated family member. It is empty, with
        no columns, if nothing was generated.

    Raises:
        KeyError: if ``user_profiles`` has no 'District' column.
    """
    sampler_state = None
    if seed is not None:
        random.seed(seed)
        sampler_state = np.random.RandomState(seed)

    households = []

    # groupby splits the frame once instead of re-filtering it per district and
    # skips rows whose district is missing (NaN never equals itself, so the old
    # per-district filter produced an empty slice that sample(1) rejected).
    # sort=False keeps districts in order of first appearance.
    for district, district_profiles in user_profiles.groupby('District', sort=False, observed=True):
        for hh in range(households_per_district):
            family_id = f"{district}_HH{hh+1:03d}"

            # Select a base profile for this family
            base_profile = district_profiles.sample(1, random_state=sampler_state).iloc[0].to_dict()

            # Define family composition (most common: 2 adults, 1-2 children)
            if random.random() < 0.6:  # 60% nuclear families
                composition = {
                    'adult_male': 1,
                    'adult_female': 1,
                    'adolescent_girl': random.choice([0, 1]),
                    'child': random.choice([1, 2])
                }
            else:  # 40% extended or different compositions
                composition = {
                    'adult_male': random.choice([1, 2]),
                    'adult_female': random.choice([1, 2]),
                    'adolescent_girl': random.choice([0, 1, 2]),
                    'child': random.choice([0, 1, 2, 3])
                }

            # Create family members; member ids run 1..N within each family
            member_id = 1
            for role, count in composition.items():
                for _ in range(count):
                    member = create_family_member(base_profile, role, member_id, family_id)
                    households.append(member)
                    member_id += 1

    return pd.DataFrame(households)

In [6]:
def calculate_household_aggregates(household_df):
    """
    Calculate aggregate metrics for each household.

    Args:
        household_df: member-level DataFrame with the columns 'Family_ID',
            'District', 'Role', 'TEE', 'BMR', 'BMI', 'Veg_Servings_Target',
            'Restrictions', 'Medical_Conditions' and
            'Cultural_Seasonal_Preferences'.

    Returns:
        DataFrame with one row per family, in order of first appearance:
        size and role counts, energy / BMI / vegetable-serving aggregates,
        joined free-text fields, and the cost / affordability figures derived
        from the module-level ``family_roles``, ``district_costs`` and
        ``district_non_affordability`` tables. An empty input gives an empty
        DataFrame.
    """
    # Nothing to aggregate (also covers the column-less frame an empty
    # generation run produces, which used to raise KeyError on 'Family_ID').
    if household_df.empty:
        return pd.DataFrame([])

    base_cost = 905            # National average from PDF
    default_risk = 0.37        # fallback for districts missing from district_non_affordability
    at_risk_threshold = 0.4    # NOTE(review): source of this cut-off is not shown -- confirm
    gap_cost_share = 0.8       # NOTE(review): gap is measured against 80% of base_cost -- confirm

    # (role key, output column) pairs; the order fixes both the column order
    # and the summation order of the weighted ratio.
    role_columns = (
        ('adult_male', 'Num_Adult_Male'),
        ('adult_female', 'Num_Adult_Female'),
        ('adolescent_girl', 'Num_Adolescent_Girl'),
        ('child', 'Num_Child'),
    )

    def _join_unique(series):
        # str() keeps join from raising if a column holds non-string values
        return '; '.join(map(str, series.dropna().unique()))

    aggregates = []

    # One pass over the groups instead of re-filtering the frame per family
    for family_id, family in household_df.groupby('Family_ID', sort=False, observed=True):
        # Get district from first member
        district = family.iloc[0]['District']
        role_counts = family['Role'].value_counts()

        agg = {
            'Family_ID': family_id,
            'District': district,
            'Household_Size': len(family),
        }
        for role, column in role_columns:
            agg[column] = int(role_counts.get(role, 0))

        agg.update({
            'Total_TEE': family['TEE'].sum(),
            'Avg_TEE': family['TEE'].mean(),
            'Total_Veg_Servings_Target': family['Veg_Servings_Target'].sum(),
            'Avg_Veg_Servings_Target': family['Veg_Servings_Target'].mean(),
            'Total_BMR': family['BMR'].sum(),
            'Avg_BMI': family['BMI'].mean(),
            'Dietary_Restrictions': _join_unique(family['Restrictions']),
            'Medical_Conditions': _join_unique(family['Medical_Conditions']),
            'Cultural_Preferences': _join_unique(family['Cultural_Seasonal_Preferences']),
        })

        # Weighted ratio total (from PDF ratios): member count times role ratio
        agg['Weight_Ratio_Total'] = sum(
            agg[column] * family_roles[role]['ratio'] for role, column in role_columns
        )

        # Expected daily cost: district cost (or national average) scaled by the ratio total
        district_cost = district_costs.get(district, base_cost)
        agg['Expected_Daily_Cost_LKR'] = district_cost * agg['Weight_Ratio_Total']

        # Non-affordability risk is a district-level rate, not a household-level one
        agg['Non_Affordability_Risk'] = district_non_affordability.get(district, default_risk)
        agg['Affordability_Status'] = (
            'Affordable' if agg['Non_Affordability_Risk'] < at_risk_threshold else 'At Risk'
        )

        # Affordability gap, floored at zero
        agg['Affordability_Gap_LKR'] = max(
            0, agg['Expected_Daily_Cost_LKR'] - (base_cost * gap_cost_share)
        )

        aggregates.append(agg)

    return pd.DataFrame(aggregates)

In [7]:
# Seed every RNG the generation step draws from so that Restart & Run All
# (and re-running this cell) reproduces the same households: the family
# builders use the `random` module, and DataFrame.sample falls back to
# NumPy's global RNG when no random_state is passed.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Create the household dataset
print("Creating household dataset...")
household_profiles_df = create_household_dataset(user_profiles_df, households_per_district=2)

# Calculate household aggregates
print("Calculating household aggregates...")
household_aggregates_df = calculate_household_aggregates(household_profiles_df)

# Sanity check: exactly one aggregate row per generated family
assert len(household_aggregates_df) == household_profiles_df['Family_ID'].nunique()

# Save the datasets
household_profiles_df.to_csv('household_member_profiles.csv', index=False)
household_aggregates_df.to_csv('household_aggregates.csv', index=False)

print(f"Created {len(household_profiles_df)} family member profiles")
print(f"Created {len(household_aggregates_df)} household aggregates")
print("\nSample household member:")
print(household_profiles_df[['Family_ID', 'District', 'Age', 'Gender', 'Role', 'Veg_Servings_Target']].head())
print("\nSample household aggregate:")
print(household_aggregates_df[['Family_ID', 'District', 'Household_Size', 'Total_Veg_Servings_Target', 'Expected_Daily_Cost_LKR', 'Affordability_Status']].head())

Creating household dataset...
Calculating household aggregates...
Created 254 family member profiles
Created 50 household aggregates

Sample household member:
           Family_ID     District  Age  Gender          Role  \
0  Polonnaruwa_HH001  Polonnaruwa   55    Male    adult_male   
1  Polonnaruwa_HH001  Polonnaruwa   31  Female  adult_female   
2  Polonnaruwa_HH001  Polonnaruwa    7    Male         child   
3  Polonnaruwa_HH001  Polonnaruwa    7    Male         child   
4  Polonnaruwa_HH002  Polonnaruwa   46    Male    adult_male   

   Veg_Servings_Target  
0                    5  
1                    4  
2                    3  
3                    3  
4                    5  

Sample household aggregate:
           Family_ID     District  Household_Size  Total_Veg_Servings_Target  \
0  Polonnaruwa_HH001  Polonnaruwa               4                         15   
1  Polonnaruwa_HH002  Polonnaruwa               5                         19   
2      Colombo_HH001      Colombo    