## 1 - Data loading and Cleaning

In [None]:
# Install libraries
!pip install torch_geometric

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset
import json
import ast
import networkx as nx
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from PIL import Image
import requests
from io import BytesIO
import cv2
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv, SAGEConv, HeteroConv
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
import umap

# Set style for visualizations
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Optional advanced imports (with error handling)
try:
    import xgboost as xgb
    XGB_AVAILABLE = True
except ImportError:
    XGB_AVAILABLE = False
    print("XGBoost not available. Install with: pip install xgboost")

warnings.filterwarnings('ignore')

In [None]:
def load_and_explore_dataset():
    """Load the MM-Food-100K dataset and print a first overview of it.

    Fetches ``Codatta/MM-Food-100K`` through ``datasets.load_dataset``,
    converts its ``train`` split into a pandas DataFrame and reports the
    shape, column names, ``DataFrame.info()`` summary and the first rows.

    Returns:
        pandas.DataFrame: the ``train`` split as a DataFrame.

    Note:
        ``display`` is the IPython/Jupyter rich-output helper, so this
        function is meant to be run inside a notebook.
    """
    print("Loading dataset from Hugging Face...")

    # Load the dataset and convert it to a DataFrame for easier manipulation
    dataset = load_dataset("Codatta/MM-Food-100K")
    df = pd.DataFrame(dataset['train'])

    print(f"Dataset shape: {df.shape}")
    print("\nDataset columns:")
    print(df.columns.tolist())

    print("\nDataset info:")
    # DataFrame.info() writes its report itself and returns None; wrapping it
    # in print() would append a spurious "None" line to the output.
    df.info()

    print("\nFirst 5 rows:")
    display(df.head())

    return df

# Load the dataset
df_raw = load_and_explore_dataset()

def initial_data_analysis(df):
    """Print a first-pass analysis and extract the nutrition columns.

    Reports missing values and the ``food_type`` distribution, then parses
    the ``nutritional_profile`` column and stores ``calories_kcal``,
    ``protein_g``, ``fat_g`` and ``carbohydrate_g`` as new columns.

    Args:
        df: DataFrame with at least the ``food_type`` and
            ``nutritional_profile`` columns.

    Returns:
        The same DataFrame object, modified in place with the four
        nutrition columns added (NaN where a value could not be extracted).
    """
    print("="*50)
    print("INITIAL DATA ANALYSIS")
    print("="*50)

    # Check for missing values
    print("\nMissing values per column:")
    missing_values = df.isnull().sum()
    print(missing_values[missing_values > 0])

    # Check food type distribution
    print("\nFood type distribution:")
    food_type_counts = df['food_type'].value_counts()
    print(food_type_counts)

    # Basic nutritional statistics
    print("\nNutritional statistics:")
    nutrition_cols = ['calories_kcal', 'protein_g', 'fat_g', 'carbohydrate_g']

    def parse_profile(raw):
        """Return the nutrition profile as a dict ({} when unusable)."""
        if isinstance(raw, dict):
            return raw
        if not isinstance(raw, str):
            # None, NaN or any other unexpected cell type
            return {}
        try:
            parsed = json.loads(raw)
        except ValueError:
            # Not valid JSON (e.g. a Python-repr dict with single quotes).
            # literal_eval copes with that without the quote rewriting that
            # used to corrupt values containing apostrophes.
            try:
                parsed = ast.literal_eval(raw)
            except (ValueError, SyntaxError, TypeError):
                return {}
        return parsed if isinstance(parsed, dict) else {}

    # Parse every profile once, then read the four keys from the result
    profiles = [parse_profile(raw) for raw in df['nutritional_profile']]
    for col in nutrition_cols:
        df[col] = [profile.get(col, np.nan) for profile in profiles]

    # Display basic stats for nutritional values
    print(df[nutrition_cols].describe())

    return df

# Perform initial analysis
df = initial_data_analysis(df_raw)

### 1B. Data Cleaning

In [None]:
import re
import json
import ast
import numpy as np
import pandas as pd

# Nutrition fields read from the ``nutritional_profile`` column.
_NUTRITION_COLUMNS = ['calories_kcal', 'protein_g', 'fat_g', 'carbohydrate_g']

# Patterns used to normalise a single ingredient name.
_LEADING_QUANTITY_RE = re.compile(r'^\d+\s*')   # quantities like "2 "
_PARENTHETICAL_RE = re.compile(r'\s*\(.*\)')    # parentheses content
# A trailing unit must be a separate word: without the mandatory whitespace
# the pattern would eat the last letter of "egg", "basil", "olive oil", ...
_TRAILING_UNIT_RE = re.compile(r'\s+(?:tbsp|tsp|cups?|oz|lbs?|g|kg|ml|l)$')

# Errors raised when a list literal stored as text cannot be parsed.
_PARSE_ERRORS = (ValueError, SyntaxError, TypeError)


def _is_missing(value):
    """Return True for None/NaN scalars; containers are never 'missing'."""
    if value is None:
        return True
    if isinstance(value, (list, tuple, dict, set, np.ndarray)):
        # pd.isna() on a container returns an array, whose truth value is
        # ambiguous and raises ValueError for more than one element.
        return False
    try:
        return bool(pd.isna(value))
    except (TypeError, ValueError):
        return False


def _parse_nutrition_profile(raw):
    """Return the nutrition profile as a dict ({} when it cannot be parsed)."""
    if isinstance(raw, dict):
        return raw
    if not isinstance(raw, str):
        return {}
    try:
        parsed = json.loads(raw)
    except ValueError:
        # Python-repr dicts use single quotes; literal_eval handles them
        # without rewriting quotes (which breaks values with apostrophes).
        try:
            parsed = ast.literal_eval(raw)
        except _PARSE_ERRORS:
            return {}
    return parsed if isinstance(parsed, dict) else {}


def _split_list_text(text):
    """Parse list-like text: a ``[...]`` literal or comma-separated items."""
    if text.startswith('[') and text.endswith(']'):
        try:
            return ast.literal_eval(text)
        except (ValueError, SyntaxError):
            # JSON-only tokens (null/true/false) are not Python literals.
            return json.loads(text)
    return [part.strip() for part in text.split(',')]


def _coerce_to_list(raw):
    """Return the cell as a list, or None when it is missing/unsupported.

    Raises ValueError, SyntaxError or TypeError for malformed ``[...]`` text.
    """
    if _is_missing(raw):
        return None
    if isinstance(raw, str):
        return _split_list_text(raw.strip())
    if isinstance(raw, (list, tuple, np.ndarray)):
        return list(raw)
    return None


def _clean_ingredient_list(ingredient_str):
    """Return the normalised, de-duplicated ingredient names of one record."""
    try:
        ingredients = _coerce_to_list(ingredient_str)
    except _PARSE_ERRORS as e:
        print(f"Error processing ingredients: {e}")
        return []

    cleaned_ingredients = []
    for ingredient in ingredients or []:
        if not isinstance(ingredient, str):
            continue
        ing = ingredient.lower().strip()
        ing = _LEADING_QUANTITY_RE.sub('', ing)
        ing = _PARENTHETICAL_RE.sub('', ing)
        ing = _TRAILING_UNIT_RE.sub('', ing)
        ing = ing.strip()
        if len(ing) > 1:  # drop empty / single-character leftovers
            cleaned_ingredients.append(ing)

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is reproducible across runs (set ordering is not).
    return list(dict.fromkeys(cleaned_ingredients))


def _clean_cooking_methods(method_str):
    """Return the lower-cased, de-duplicated cooking methods of one record."""
    try:
        methods = _coerce_to_list(method_str)
    except _PARSE_ERRORS:
        # Only text can fail to parse; fall back to plain comma splitting.
        methods = method_str.split(',')

    normalised = [m.strip().lower() for m in methods or [] if isinstance(m, str)]
    normalised = [m for m in normalised if m and m != 'unknown']
    return list(dict.fromkeys(normalised))


def _extract_portion_weights(portion_str):
    """Return ``(weights, total_weight)`` parsed from ``"name:weight"`` items."""
    try:
        portions = _coerce_to_list(portion_str)
    except _PARSE_ERRORS as e:
        print(f"Error processing portion sizes: {e}")
        return [], 0

    weights = []
    for portion in portions or []:
        if not (isinstance(portion, str) and ':' in portion):
            continue
        # Keep only digits and dots from the part after the first colon
        digits = re.sub(r'[^\d.]', '', portion.split(':')[1].strip())
        if not digits:
            continue
        try:
            weights.append(float(digits))
        except ValueError:
            continue  # e.g. "1.2.3" or a lone "."

    return weights, sum(weights)


def _iqr_outlier_mask(series, threshold=1.5):
    """Return a boolean mask of values outside ``threshold`` * IQR fences."""
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    return (series < q1 - threshold * iqr) | (series > q3 + threshold * iqr)


def _clean_dish_name(name):
    """Lower-case a dish name and replace punctuation with single spaces."""
    if _is_missing(name) or name == 'unknown_dish':
        return 'unknown_dish'
    name = str(name).strip().lower()
    name = re.sub(r'[^\w\s]', ' ', name)
    name = re.sub(r'\s+', ' ', name)
    return name.strip()


def _standardize_food_type(food_type):
    """Map free-text food types onto the canonical category names."""
    if _is_missing(food_type) or food_type == 'unknown_type':
        return 'unknown'

    food_type = str(food_type).lower().strip()
    type_mapping = {
        'homemade': 'homemade',
        'home made': 'homemade',
        'home-made': 'homemade',
        'restaurant': 'restaurant',
        'restaurant food': 'restaurant',
        'raw': 'raw_vegetables_fruits',
        'raw vegetables': 'raw_vegetables_fruits',
        'raw fruits': 'raw_vegetables_fruits',
        'vegetables': 'raw_vegetables_fruits',
        'fruits': 'raw_vegetables_fruits',
        'packaged': 'packaged_food',
        'packaged food': 'packaged_food',
        'processed': 'packaged_food'
    }
    return type_mapping.get(food_type, food_type)


def comprehensive_data_cleaning(df, output_path='mm_food_100k_cleaned.csv'):
    """
    Comprehensive data cleaning and wrangling pipeline for MM-Food-100K dataset

    Steps: extract the nutrition values, fill missing values, clean the
    ingredient / cooking-method / portion columns, winsorize outliers,
    normalise dish names and food types, compute data-quality flags and
    export the result.

    Args:
        df: raw DataFrame. The code reads the columns ``nutritional_profile``,
            ``food_type``, ``dish_name``, ``ingredients``, ``cooking_method``
            and ``portion_size``. It is not modified.
        output_path: CSV file the cleaned data is written to. Pass ``None``
            to skip the export.

    Returns:
        A new DataFrame holding the cleaned columns followed by the remaining
        original columns. ``has_nutrition_data`` reflects whether all four
        nutrition values were present *before* imputation.
    """
    print("="*60)
    print("COMPREHENSIVE DATA CLEANING & WRANGLING")
    print("="*60)

    # Create a copy to avoid modifying the original
    df_clean = df.copy()
    nutrition_cols = list(_NUTRITION_COLUMNS)

    # Extract nutritional information first (each profile is parsed once) and
    # coerce to numeric right away so that medians below are well defined.
    profiles = df_clean['nutritional_profile'].apply(_parse_nutrition_profile)
    for col in nutrition_cols:
        values = profiles.apply(lambda profile, key=col: profile.get(key, np.nan))
        df_clean[col] = pd.to_numeric(values, errors='coerce')

    # Remember which rows really carried nutrition data; after imputation
    # every row is non-null and the flag would be meaningless.
    nutrition_observed = df_clean[nutrition_cols].notna().all(axis=1)

    # 1. Handle Missing Values
    print("1. Handling missing values...")

    missing_percent = (df_clean.isnull().sum() / len(df_clean)) * 100
    print("Missing values percentage:")
    print(missing_percent[missing_percent > 0].sort_values(ascending=False))

    # Text columns get explicit defaults. food_type is filled before the
    # group-wise imputation so that no row falls outside every group.
    text_defaults = {
        'ingredients': '[]',
        'cooking_method': 'unknown',
        'portion_size': '[]',
        'dish_name': 'unknown_dish',
        'food_type': 'unknown_type',
    }
    for col, default in text_defaults.items():
        if col in df_clean.columns:
            df_clean[col] = df_clean[col].fillna(default)

    # Nutritional data: impute with the median of the same food_type; a group
    # without any observed value falls back to 0.
    for col in nutrition_cols:
        if df_clean[col].isnull().any():
            group_median = df_clean.groupby('food_type')[col].transform('median')
            df_clean[col] = df_clean[col].fillna(group_median.fillna(0))

    # 2. Data Type Conversion and Validation
    print("\n2. Data type conversion and validation...")

    # Safety net: guarantee numeric dtype and no remaining gaps
    for col in nutrition_cols:
        df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')
        df_clean[col] = df_clean[col].fillna(df_clean[col].median())

    # 3. Ingredient List Processing
    print("\n3. Processing ingredient lists...")
    df_clean['ingredients_cleaned'] = df_clean['ingredients'].apply(_clean_ingredient_list)

    # 4. Cooking Methods Processing
    print("\n4. Processing cooking methods...")
    df_clean['cooking_methods_cleaned'] = df_clean['cooking_method'].apply(_clean_cooking_methods)

    # 5. Portion Size Processing
    print("\n5. Processing portion sizes...")
    portion_results = df_clean['portion_size'].apply(_extract_portion_weights)
    df_clean['portion_weights'] = portion_results.apply(lambda x: x[0])
    df_clean['total_weight_g'] = portion_results.apply(lambda x: x[1])

    # 6. Outlier Detection and Handling
    print("\n6. Handling outliers...")

    for col in nutrition_cols + ['total_weight_g']:
        outliers = _iqr_outlier_mask(df_clean[col].dropna())
        print(f"Outliers in {col}: {outliers.sum()} ({outliers.mean()*100:.2f}%)")

        # Cap outliers (winsorization) at the 1st and 99th percentiles
        if outliers.sum() > 0:
            lower_bound = df_clean[col].quantile(0.01)
            upper_bound = df_clean[col].quantile(0.99)
            df_clean[col] = df_clean[col].clip(lower_bound, upper_bound)

    # 7. Text Data Cleaning
    print("\n7. Cleaning text data...")
    df_clean['dish_name'] = df_clean['dish_name'].apply(_clean_dish_name)

    # 8. Food Type Standardization
    print("\n8. Standardizing food types...")
    df_clean['food_type_standardized'] = df_clean['food_type'].apply(_standardize_food_type)

    # 9. Feature Engineering Preparation
    print("\n9. Preparing for feature engineering...")

    # Create flags for data quality
    df_clean['has_nutrition_data'] = nutrition_observed
    df_clean['has_ingredients'] = df_clean['ingredients_cleaned'].apply(lambda x: len(x) > 0)
    df_clean['has_cooking_methods'] = df_clean['cooking_methods_cleaned'].apply(lambda x: len(x) > 0)
    df_clean['has_portion_data'] = df_clean['total_weight_g'].notna() & (df_clean['total_weight_g'] > 0)

    # Calculate data completeness score (share of the four flags that hold)
    df_clean['data_quality_score'] = (
        df_clean['has_nutrition_data'].astype(int) +
        df_clean['has_ingredients'].astype(int) +
        df_clean['has_cooking_methods'].astype(int) +
        df_clean['has_portion_data'].astype(int)
    ) / 4

    # 10. Final Data Quality Report
    print("\n10. Final data quality report:")

    quality_report = {
        'total_records': len(df_clean),
        'records_with_complete_nutrition': df_clean['has_nutrition_data'].sum(),
        'records_with_ingredients': df_clean['has_ingredients'].sum(),
        'records_with_cooking_methods': df_clean['has_cooking_methods'].sum(),
        'records_with_portion_data': df_clean['has_portion_data'].sum(),
        'records_with_high_quality': (df_clean['data_quality_score'] >= 0.75).sum(),
        'average_data_quality_score': df_clean['data_quality_score'].mean()
    }

    for metric, value in quality_report.items():
        if 'average' in metric:
            print(f"{metric}: {value:.3f}")
        else:
            print(f"{metric}: {value}")

    # 11. Export cleaned data
    print("\n11. Exporting cleaned data...")

    # Cleaned columns come first ...
    clean_columns = [
        'dish_name', 'food_type_standardized', 'calories_kcal',
        'protein_g', 'fat_g', 'carbohydrate_g', 'ingredients_cleaned',
        'cooking_methods_cleaned', 'portion_weights', 'total_weight_g',
        'has_nutrition_data', 'has_ingredients', 'has_cooking_methods',
        'has_portion_data', 'data_quality_score'
    ]

    # ... followed by the original columns that were not replaced above
    replaced_columns = {
        'dish_name', 'food_type', 'nutritional_profile', 'ingredients',
        'cooking_method', 'portion_size'
    }
    original_clean_cols = [col for col in df.columns if col not in replaced_columns]
    essential_original_cols = ['sub_dt', 'image_url', 'camera_or_phone_prob', 'food_prob']

    # De-duplicate (the input may already contain the nutrition columns) and
    # keep only columns that actually exist. The copy detaches the result
    # from df_clean so that callers can add columns to it safely.
    final_columns = dict.fromkeys(clean_columns + original_clean_cols + essential_original_cols)
    df_final = df_clean[[col for col in final_columns if col in df_clean.columns]].copy()

    # Save cleaned dataset
    if output_path is not None:
        df_final.to_csv(output_path, index=False)
        print(f"Cleaned dataset saved to '{output_path}'")

    return df_final

# Enhanced load function with comprehensive cleaning
def load_and_clean_dataset():
    """
    Load dataset and apply comprehensive cleaning

    Fetches the ``train`` split of ``Codatta/MM-Food-100K`` through
    ``datasets.load_dataset``, converts it to a DataFrame and runs it
    through ``comprehensive_data_cleaning``.

    Returns:
        pandas.DataFrame: the value returned by ``comprehensive_data_cleaning``.
    """
    print("Loading and cleaning MM-Food-100K dataset...")

    # Load the dataset
    dataset = load_dataset("Codatta/MM-Food-100K")
    df = pd.DataFrame(dataset['train'])

    print(f"Original dataset shape: {df.shape}")

    # Apply comprehensive cleaning
    df_clean = comprehensive_data_cleaning(df)

    print(f"Cleaned dataset shape: {df_clean.shape}")
    print("\nCleaned dataset info:")
    # DataFrame.info() prints its report itself and returns None, so it must
    # not be wrapped in print() (that would emit an extra "None" line).
    df_clean.info()

    return df_clean

# Load and clean the dataset
df = load_and_clean_dataset()

## 2 - EDA

### **Objective**
 This comprehensive EDA provides a solid foundation for understanding the cleaned dataset's characteristics, relationships, and patterns, which is essential for effective feature engineering and model building.

**The analysis reveals:**

    

* Clear relationships between ingredients, cooking methods, and nutritional values
* Patterns in food preparation complexity
* Insights into dietary characteristics across different food types
* Data quality assessment for reliable modeling

### **Key Data EDA Steps:**

**1. Data Quality**

   *  Missing values properly handled

   *  Consistent data types and formats

*   Outliers identified and treated

**2. Nutritional Patterns**

   *  Macronutrient distributions and correlations

* Energy density analysis

*  Portion size characteristics

**3. Ingredient Relationships**

*     Most common ingredients and their frequencies

*     Relationship between ingredient count and nutrition

*     Ingredient diversity patterns

**4. Cooking Method Impact**

 *    Most frequently used cooking techniques

*  Nutritional differences between cooking methods

*     Preparation time analysis

**5. Advanced Metrics**

*     Macronutrient ratios

*     Data quality scores

*     Complexity measures

In [None]:
def _plot_top_counts(top_items, title, figsize):
    """Draw a horizontal bar chart of ``(label, count)`` pairs; skip if empty."""
    if not top_items:
        # zip(*[]) yields nothing, so unpacking it into two names would raise.
        return
    labels, counts = zip(*top_items)
    plt.figure(figsize=figsize)
    sns.barplot(x=list(counts), y=list(labels))
    plt.title(title)
    plt.xlabel('Count')
    plt.tight_layout()
    plt.show()


def _zscore_outlier_mask(series, threshold=3):
    """Return a boolean mask of values more than ``threshold`` std from the mean."""
    z_scores = np.abs((series - series.mean()) / series.std())
    return z_scores > threshold


def comprehensive_eda_cleaned(df):
    """
    Comprehensive EDA using the cleaned dataset

    Prints summary statistics and shows plots for food types, ingredients,
    cooking methods, nutrition, portion sizes, data quality and outliers.

    Args:
        df: cleaned DataFrame. The code reads ``calories_kcal``,
            ``protein_g``, ``fat_g``, ``carbohydrate_g``, ``total_weight_g``,
            ``ingredients_cleaned`` and ``cooking_methods_cleaned``;
            ``food_type_standardized``, ``nutritional_density``,
            ``data_quality_score`` and ``estimated_prep_time`` are used only
            when present.

    Returns:
        dict with the keys ``ingredient_counts``, ``method_counts``,
        ``nutritional_stats``, ``outlier_analysis`` and ``summary_stats``.

    Side effects:
        Adds the columns ``num_ingredients``, ``num_cooking_methods``,
        ``protein_ratio``, ``fat_ratio``, ``carb_ratio`` and
        ``energy_density`` to ``df`` in place.
    """
    print("="*60)
    print("COMPREHENSIVE EDA WITH CLEANED DATA")
    print("="*60)

    # 1. Dataset Overview
    print("1. DATASET OVERVIEW")
    print(f"Dataset shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    print(f"Data types:\n{df.dtypes.value_counts()}")

    # Check for remaining missing values
    missing_values = df.isnull().sum()
    missing_percent = (missing_values / len(df)) * 100
    print(f"\nMissing values:\n{missing_values[missing_values > 0]}")
    print(f"Missing percentage:\n{missing_percent[missing_percent > 0].round(2)}")

    # 2. Basic Statistics
    print("\n2. BASIC STATISTICS")

    macro_cols = ['protein_g', 'fat_g', 'carbohydrate_g']
    nutrition_cols = ['calories_kcal'] + macro_cols + ['total_weight_g']
    print("Nutritional statistics:")
    print(df[nutrition_cols].describe().round(2))

    # Complexity measures: list lengths per record
    df['num_ingredients'] = df['ingredients_cleaned'].apply(len)
    df['num_cooking_methods'] = df['cooking_methods_cleaned'].apply(len)

    print(f"\nAverage ingredients per recipe: {df['num_ingredients'].mean():.2f}")
    print(f"Average cooking methods per recipe: {df['num_cooking_methods'].mean():.2f}")

    # 3. Food Type Analysis
    print("\n3. FOOD TYPE ANALYSIS")

    if 'food_type_standardized' in df.columns:
        food_type_counts = df['food_type_standardized'].value_counts()
        print("Food type distribution:")
        for food_type, count in food_type_counts.items():
            print(f"  {food_type}: {count} ({count/len(df)*100:.1f}%)")

        # Nutritional comparison by food type (2x2 grid, one panel per nutrient)
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        for ax, col in zip(axes.ravel(), nutrition_cols[:4]):
            sns.boxplot(data=df, x='food_type_standardized', y=col, ax=ax)
            ax.set_title(f'{col} by Food Type')
            ax.tick_params(axis='x', rotation=45)
        plt.tight_layout()
        plt.show()

    # 4. Ingredient Analysis
    print("\n4. INGREDIENT ANALYSIS")

    all_ingredients = [ingredient for sublist in df['ingredients_cleaned'] for ingredient in sublist]
    ingredient_counts = Counter(all_ingredients)

    print(f"Total unique ingredients: {len(ingredient_counts)}")
    print(f"Total ingredient instances: {len(all_ingredients)}")

    top_ingredients = ingredient_counts.most_common(20)
    print("\nTop 20 ingredients:")
    for ingredient, count in top_ingredients:
        print(f"  {ingredient}: {count}")

    _plot_top_counts(top_ingredients, 'Top 20 Most Common Ingredients', (12, 8))

    # 5. Cooking Methods Analysis
    print("\n5. COOKING METHODS ANALYSIS")

    all_methods = [method for sublist in df['cooking_methods_cleaned'] for method in sublist]
    method_counts = Counter(all_methods)

    print(f"Total unique cooking methods: {len(method_counts)}")
    print(f"Total method instances: {len(all_methods)}")

    top_methods = method_counts.most_common(15)
    print("\nTop 15 cooking methods:")
    for method, count in top_methods:
        print(f"  {method}: {count}")

    _plot_top_counts(top_methods, 'Top 15 Cooking Methods', (12, 6))

    # 6. Nutritional Analysis
    print("\n6. NUTRITIONAL ANALYSIS")

    # 'nutritional_density' is optional and only plotted when present
    nutrition_cols_extended = nutrition_cols.copy()
    if 'nutritional_density' in df.columns:
        nutrition_cols_extended.append('nutritional_density')

    # Distribution of nutritional values on a 2x3 grid
    fig, axes = plt.subplots(2, 3, figsize=(18, 10))
    hist_axes = axes.ravel()
    for ax, col in zip(hist_axes, nutrition_cols_extended):
        sns.histplot(df[col].dropna(), bins=50, ax=ax, kde=True)
        ax.set_title(f'Distribution of {col}')
        ax.set_xlabel(col)
    # Hide panels that received no data instead of showing empty frames
    for ax in hist_axes[len(nutrition_cols_extended):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

    # Correlation matrix
    cols_for_corr = [col for col in nutrition_cols_extended if col in df.columns]
    if len(cols_for_corr) > 1:
        nutritional_corr = df[cols_for_corr].corr()
        plt.figure(figsize=(10, 8))
        sns.heatmap(nutritional_corr, annot=True, cmap='coolwarm', center=0, fmt='.2f')
        plt.title('Nutritional Values Correlation Matrix')
        plt.tight_layout()
        plt.show()
    else:
        print("\nInsufficient nutritional columns for correlation matrix.")

    # 7. Portion Size Analysis
    print("\n7. PORTION SIZE ANALYSIS")

    if 'total_weight_g' in df.columns:
        print(f"Average portion size: {df['total_weight_g'].mean():.2f}g")
        print(f"Median portion size: {df['total_weight_g'].median():.2f}g")
        print(f"Portion size range: {df['total_weight_g'].min():.2f}g - {df['total_weight_g'].max():.2f}g")

        plt.figure(figsize=(10, 6))
        sns.histplot(df['total_weight_g'].dropna(), bins=50, kde=True)
        plt.title('Distribution of Portion Sizes')
        plt.xlabel('Weight (g)')
        plt.ylabel('Count')
        plt.show()

    # 8. Relationship: Ingredients vs Nutrition
    print("\n8. INGREDIENTS vs NUTRITION RELATIONSHIP")

    # Scatter plots: number of ingredients vs nutritional values
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    for ax, col in zip(axes.ravel(), nutrition_cols[:4]):
        sns.scatterplot(data=df, x='num_ingredients', y=col, alpha=0.6, ax=ax)
        ax.set_title(f'{col} vs Number of Ingredients')
        ax.set_xlabel('Number of Ingredients')
        ax.set_ylabel(col)

    plt.tight_layout()
    plt.show()

    # Correlation analysis (the first entry is num_ingredients with itself)
    ingredient_nutrition_corr = df[['num_ingredients'] + nutrition_cols].corr()
    print("Correlation between ingredient count and nutrition:")
    print(ingredient_nutrition_corr['num_ingredients'].iloc[1:].round(3))

    # 9. Relationship: Cooking Methods vs Nutrition
    print("\n9. COOKING METHODS vs NUTRITION RELATIONSHIP")

    # Average nutrition of the recipes using each of the 8 most common methods
    method_nutrition = {}
    for method, _ in top_methods[:8]:
        method_recipes = df[df['cooking_methods_cleaned'].apply(lambda x: method in x)]
        if len(method_recipes) > 10:  # skip methods with too few recipes
            method_nutrition[method] = {
                'count': len(method_recipes),
                'avg_calories': method_recipes['calories_kcal'].mean(),
                'avg_protein': method_recipes['protein_g'].mean(),
                'avg_fat': method_recipes['fat_g'].mean(),
                'avg_carbs': method_recipes['carbohydrate_g'].mean()
            }

    # Grouped bar chart: four bars per method, centred on the tick
    methods = list(method_nutrition.keys())
    x = np.arange(len(methods))
    width = 0.2
    bar_series = [
        ('avg_calories', 'Calories', -1.5),
        ('avg_protein', 'Protein (g)', -0.5),
        ('avg_fat', 'Fat (g)', 0.5),
        ('avg_carbs', 'Carbs (g)', 1.5),
    ]

    plt.figure(figsize=(14, 8))
    for key, label, offset in bar_series:
        heights = [method_nutrition[m][key] for m in methods]
        plt.bar(x + width * offset, heights, width, label=label, alpha=0.8)

    plt.xlabel('Cooking Methods')
    plt.ylabel('Nutritional Values')
    plt.title('Nutritional Profile by Cooking Method')
    plt.xticks(x, methods, rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.show()

    # 10. Advanced Nutritional Analysis
    print("\n10. ADVANCED NUTRITIONAL ANALYSIS")

    # Macronutrient ratios (only the three macro columns are needed)
    if all(col in df.columns for col in macro_cols):
        total_macros = df['protein_g'] + df['fat_g'] + df['carbohydrate_g']
        # A zero total means all three macros are zero; dividing by 1 keeps
        # the ratios at 0 instead of producing NaN.
        safe_total = total_macros.replace(0, 1)
        df['protein_ratio'] = df['protein_g'] / safe_total
        df['fat_ratio'] = df['fat_g'] / safe_total
        df['carb_ratio'] = df['carbohydrate_g'] / safe_total

        # Plot macronutrient distribution
        plt.figure(figsize=(12, 8))
        for ratio in ['protein_ratio', 'fat_ratio', 'carb_ratio']:
            sns.kdeplot(df[ratio].dropna(), label=ratio.replace('_ratio', '').title())
        plt.title('Distribution of Macronutrient Ratios')
        plt.xlabel('Ratio')
        plt.ylabel('Density')
        plt.legend()
        plt.show()
    else:
        print("\nInsufficient nutritional columns for macronutrient ratio analysis.")

    # Energy density analysis
    if 'total_weight_g' in df.columns and 'calories_kcal' in df.columns:
        # A weight of 0 would otherwise divide by zero; treating it as
        # missing (NaN) keeps the record out of the statistics instead of
        # reporting its raw calories as "kcal per gram".
        df['energy_density'] = df['calories_kcal'] / df['total_weight_g'].replace(0, np.nan)

        plt.figure(figsize=(10, 6))
        sns.histplot(df['energy_density'].dropna(), bins=50, kde=True)
        plt.title('Distribution of Energy Density (calories/gram)')
        plt.xlabel('Energy Density')
        plt.ylabel('Count')
        plt.show()
    else:
        print("\nInsufficient columns for energy density analysis.")

    # 11. Data Quality Assessment
    print("\n11. DATA QUALITY ASSESSMENT")

    if 'data_quality_score' in df.columns:
        print(f"Average data quality score: {df['data_quality_score'].mean():.3f}")
        print(f"Data quality distribution:")
        print(df['data_quality_score'].value_counts().sort_index())

        plt.figure(figsize=(10, 6))
        sns.histplot(df['data_quality_score'], bins=10, kde=True)
        plt.title('Distribution of Data Quality Scores')
        plt.xlabel('Data Quality Score')
        plt.ylabel('Count')
        plt.show()
    else:
        print("\n'data_quality_score' column not found for quality assessment.")

    # 12. Outlier Analysis
    print("\n12. OUTLIER ANALYSIS")

    outlier_results = {}
    for col in nutrition_cols:
        if col in df.columns:
            observed = df[col].dropna()
            outliers = _zscore_outlier_mask(observed)
            outlier_results[col] = {
                'count': outliers.sum(),
                'percentage': (outliers.sum() / len(observed)) * 100
            }
        else:
            outlier_results[col] = {'count': 0, 'percentage': 0.0}

    print("Outlier analysis:")
    for col, results in outlier_results.items():
        print(f"  {col}: {results['count']} outliers ({results['percentage']:.2f}%)")

    # 13. Temporal Patterns (Preparation Time)
    print("\n13. TEMPORAL PATTERNS")

    if 'estimated_prep_time' in df.columns:
        prep_time = df['estimated_prep_time']
        print(f"Preparation time statistics:")
        print(f"  Mean: {prep_time.mean():.2f} minutes")
        print(f"  Median: {prep_time.median():.2f} minutes")
        print(f"  Std: {prep_time.std():.2f} minutes")

        # Preparation time distribution
        plt.figure(figsize=(12, 6))
        plt.subplot(1, 2, 1)
        sns.histplot(prep_time.dropna(), bins=30, kde=True)
        plt.title('Distribution of Preparation Time')
        plt.xlabel('Minutes')

        plt.subplot(1, 2, 2)
        sns.boxplot(y=prep_time.dropna())
        plt.title('Boxplot of Preparation Time')
        plt.ylabel('Minutes')

        plt.tight_layout()
        plt.show()

        # Preparation time vs complexity
        if 'num_ingredients' in df.columns and 'num_cooking_methods' in df.columns:
            complexity_corr = df[['estimated_prep_time', 'num_ingredients', 'num_cooking_methods']].corr()
            print("Correlation between preparation time and complexity:")
            print(complexity_corr['estimated_prep_time'].iloc[1:].round(3))
        else:
            print("\nInsufficient columns for preparation time vs complexity analysis.")
    else:
        print("\n'estimated_prep_time' column not found for temporal analysis.")

    # 14. Comprehensive Summary
    print("\n14. COMPREHENSIVE SUMMARY")

    summary_stats = {
        'total_recipes': len(df),
        'avg_ingredients': df['num_ingredients'].mean(),
        'avg_cooking_methods': df['num_cooking_methods'].mean(),
        'avg_calories': df['calories_kcal'].mean(),
        'avg_protein': df['protein_g'].mean(),
        'avg_fat': df['fat_g'].mean(),
        'avg_carbs': df['carbohydrate_g'].mean(),
        'avg_prep_time': df['estimated_prep_time'].mean() if 'estimated_prep_time' in df.columns else None,
        'data_quality': df['data_quality_score'].mean() if 'data_quality_score' in df.columns else None
    }

    print("Dataset Summary:")
    for key, value in summary_stats.items():
        if value is None:
            continue  # optional column not present
        label = key.replace('_', ' ').title()
        if isinstance(value, float):
            print(f"  {label}: {value:.2f}")
        else:
            print(f"  {label}: {value}")

    return {
        'ingredient_counts': ingredient_counts,
        'method_counts': method_counts,
        'nutritional_stats': df[nutrition_cols].describe(),
        'outlier_analysis': outlier_results,
        'summary_stats': summary_stats
    }

# Run comprehensive EDA on cleaned data
eda_results = comprehensive_eda_cleaned(df)

## 3 - Feature Engineering

### 3.1 Multimodal Feature Extraction

Extracts multimodal features including:

*    Nutritional ratios and distributions

*    Ingredient co-occurrence networks

*    Cultural cuisine clustering

*   Portion size analysis

*    Image quality assessment (placeholder)

In [None]:
def multimodal_feature_extraction(df):
    """Extract multimodal features from the dataset.

    The following columns are added to ``df`` in place:

    * ``protein_calorie_ratio``, ``fat_calorie_ratio``, ``carb_calorie_ratio``:
      grams of each macronutrient per kcal (infinite values become NaN).
    * ``predicted_cuisine``: cuisine whose keyword list matches the most
      ingredients, or ``'unknown'`` when nothing matches.
    * ``unrealistic_portion``: True when ``total_weight_g`` lies more than two
      standard deviations from the mean of its ``food_type_standardized`` group.
    * ``image_quality_score``: random placeholder drawn from [0.7, 1.0).

    Args:
        df: DataFrame with ``protein_g``, ``fat_g``, ``carbohydrate_g``,
            ``calories_kcal``, ``total_weight_g``, ``food_type_standardized``
            and a list-valued ``ingredients_cleaned`` column.

    Returns:
        Tuple ``(df, G)``. ``G`` is an undirected ``networkx.Graph`` with one
        node per distinct ingredient and a weighted edge between every pair of
        the 50 most frequent ingredients that co-occur in at least one recipe.
    """
    from itertools import combinations

    print("="*50)
    print("MULTIMODAL FEATURE EXTRACTION")
    print("="*50)

    # 1. Nutritional Analysis Features
    print("\n1. Nutritional Analysis Features")

    # Calculate nutritional ratios
    df['protein_calorie_ratio'] = df['protein_g'] / df['calories_kcal']
    df['fat_calorie_ratio'] = df['fat_g'] / df['calories_kcal']
    df['carb_calorie_ratio'] = df['carbohydrate_g'] / df['calories_kcal']

    # Division by zero calories yields +/-inf; replace infinities with NaN
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

    # 2. Ingredient Network Analysis
    print("\n2. Ingredient Network Analysis")

    ingredient_lists = df['ingredients_cleaned'].tolist()

    # Count how often each ingredient appears across all recipes. Counting the
    # de-duplicated ingredient list would give every ingredient a count of 1
    # and make the "top" selection arbitrary.
    ingredient_frequency = Counter(
        ingredient for ingredients in ingredient_lists for ingredient in ingredients
    )

    # Only the most frequent ingredients get edges, to reduce complexity
    top_ingredients = [ingredient for ingredient, _ in ingredient_frequency.most_common(50)]
    top_ingredient_set = set(top_ingredients)

    # Count co-occurrences only for pairs of top ingredients. A dense
    # ingredient-by-ingredient matrix is never needed because no other pair
    # is ever read.
    pair_counts = Counter()
    for ingredients in ingredient_lists:
        frequent = [ingredient for ingredient in ingredients if ingredient in top_ingredient_set]
        for ing1, ing2 in combinations(frequent, 2):
            if ing1 != ing2:
                pair_counts[frozenset((ing1, ing2))] += 1

    # Create ingredient graph: every distinct ingredient is a node
    G = nx.Graph()
    G.add_nodes_from(ingredient_frequency)

    for ing1, ing2 in combinations(top_ingredients, 2):
        weight = pair_counts.get(frozenset((ing1, ing2)), 0)
        if weight > 0:
            G.add_edge(ing1, ing2, weight=float(weight))

    # Calculate network metrics
    degree_centrality = nx.degree_centrality(G)

    print(f"Ingredient network created with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")
    print("Top ingredients by degree centrality:")
    for ingredient in sorted(degree_centrality, key=degree_centrality.get, reverse=True)[:10]:
        print(f"{ingredient}: {degree_centrality[ingredient]:.4f}")

    # 3. Cultural Cuisine Clustering
    print("\n3. Cultural Cuisine Clustering")

    # Create features for clustering (simplified approach)
    # In a real scenario, we'd use more sophisticated text embeddings
    cuisine_keywords = {
        'italian': ['pasta', 'tomato', 'basil', 'olive oil', 'parmesan'],
        'mexican': ['tortilla', 'chili', 'avocado', 'lime', 'cilantro'],
        'asian': ['soy sauce', 'ginger', 'garlic', 'rice', 'sesame oil'],
        'indian': ['curry', 'turmeric', 'cumin', 'coriander', 'garam masala'],
        'american': ['cheese', 'beef', 'potato', 'ketchup', 'mayonnaise']
    }

    def detect_cuisine(ingredients):
        """Return the cuisine with the most exact keyword hits, else 'unknown'."""
        scores = {
            cuisine: sum(1 for ingredient in ingredients if ingredient in keywords)
            for cuisine, keywords in cuisine_keywords.items()
        }
        # Ties resolve to the cuisine listed first in cuisine_keywords
        best_cuisine = max(scores, key=scores.get)
        return best_cuisine if scores[best_cuisine] > 0 else 'unknown'

    df['predicted_cuisine'] = df['ingredients_cleaned'].apply(detect_cuisine)

    print("Cuisine distribution:")
    print(df['predicted_cuisine'].value_counts())

    # 4. Portion Size Analysis
    print("\n4. Portion Size Analysis")

    # Calculate realistic serving sizes based on food type
    serving_sizes = df.groupby('food_type_standardized')['total_weight_g'].agg(['mean', 'median', 'std']).round(2)
    print("Serving sizes by food type:")
    print(serving_sizes)

    # Flag unrealistic portion sizes (outside 2 standard deviations)
    def flag_unrealistic_portion(row):
        """Return True when the row's weight is outside mean +/- 2*std of its group."""
        if pd.isna(row['total_weight_g']):
            return False
        food_type = row['food_type_standardized']
        # Rows whose food type has no group statistics are never flagged
        if food_type not in serving_sizes.index:
            return False

        stats = serving_sizes.loc[food_type]
        # std is NaN for single-row groups; those cannot be judged
        if pd.isna(stats['mean']) or pd.isna(stats['std']):
            return False

        lower_bound = stats['mean'] - 2 * stats['std']
        upper_bound = stats['mean'] + 2 * stats['std']
        return row['total_weight_g'] < lower_bound or row['total_weight_g'] > upper_bound

    df['unrealistic_portion'] = df.apply(flag_unrealistic_portion, axis=1)
    print(f"Number of unrealistic portion sizes: {df['unrealistic_portion'].sum()}")

    # 5. Image Quality Assessment (simulated - would need actual images)
    print("\n5. Image Quality Assessment")

    # This would normally require downloading and processing images
    # For demonstration, we'll create placeholder features
    df['image_quality_score'] = np.random.uniform(0.7, 1.0, len(df))  # Placeholder

    print("Image quality score distribution:")
    print(df['image_quality_score'].describe())

    return df, G

# Extract multimodal features. The function adds its columns to df in place
# and also returns the ingredient co-occurrence graph.
df, ingredient_graph = multimodal_feature_extraction(df)

### 3.2 Nutritional Engineering

Engineers advanced nutritional features including:

* Macronutrient ratios

* Health indicators (high-fiber, low-sodium, balanced meal flags)

* Dietary compatibility (vegan, keto, paleo, gluten-free)

* Nutritional density scores

In [None]:
def nutritional_engineering(df):
    """Create advanced nutritional features.

    The following columns are added to ``df`` in place:

    * ``protein_pct``, ``fat_pct``, ``carb_pct``: share of each macronutrient
      in the summed macronutrient grams, in percent (infinities become NaN).
    * ``high_fiber``: True when any ingredient exactly equals one of the
      listed high-fiber ingredients.
    * ``low_sodium``: random placeholder (True with probability 0.3).
    * ``balanced_meal``: True when protein is 15-35 %, fat 20-40 % and
      carbohydrate 35-65 % of the macronutrient grams.
    * ``vegan``, ``keto``, ``paleo``, ``gluten_free``: True when no ingredient
      contains any term restricted by that diet.
    * ``nutritional_density``: capped protein-per-calorie score plus a food
      type bonus and a balanced-meal bonus; NaN when calories are missing or 0.

    A box plot of nutritional density by food type is shown as a side effect.

    Args:
        df: DataFrame with ``protein_g``, ``fat_g``, ``carbohydrate_g``,
            ``calories_kcal``, ``food_type_standardized`` and a list-valued
            ``ingredients_cleaned`` column.

    Returns:
        The same DataFrame object, with the new columns.
    """
    print("="*50)
    print("NUTRITIONAL ENGINEERING")
    print("="*50)

    # 1. Macronutrient Ratios
    print("\n1. Macronutrient Ratios")

    # Calculate macronutrient percentages
    total_macros = df['protein_g'] + df['fat_g'] + df['carbohydrate_g']
    df['protein_pct'] = df['protein_g'] / total_macros * 100
    df['fat_pct'] = df['fat_g'] / total_macros * 100
    df['carb_pct'] = df['carbohydrate_g'] / total_macros * 100

    # Replace infinities with NaN
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

    # 2. Health Indicators
    print("\n2. Health Indicators")

    # High fiber flag (assuming we had fiber data)
    # For demonstration, we'll create a placeholder based on ingredients
    high_fiber_ingredients = ['whole wheat', 'oats', 'beans', 'lentils', 'broccoli',
                             'avocado', 'berries', 'apples', 'nuts', 'seeds']

    def has_high_fiber(ingredients):
        """Return True when any ingredient is exactly a listed high-fiber item."""
        return any(ingredient in high_fiber_ingredients for ingredient in ingredients)

    df['high_fiber'] = df['ingredients_cleaned'].apply(has_high_fiber)

    # Low sodium flag (placeholder)
    df['low_sodium'] = np.random.choice([True, False], len(df), p=[0.3, 0.7])

    # Balanced meal flag (reasonable distribution of macros)
    def is_balanced(row):
        """Return True when all three macro percentages fall in their target band."""
        if pd.isna(row['protein_pct']) or pd.isna(row['fat_pct']) or pd.isna(row['carb_pct']):
            return False
        return (15 <= row['protein_pct'] <= 35 and
                20 <= row['fat_pct'] <= 40 and
                35 <= row['carb_pct'] <= 65)

    df['balanced_meal'] = df.apply(is_balanced, axis=1)

    print(f"High fiber meals: {df['high_fiber'].sum()}")
    print(f"Low sodium meals: {df['low_sodium'].sum()}")
    print(f"Balanced meals: {df['balanced_meal'].sum()}")

    # 3. Dietary Compatibility
    print("\n3. Dietary Compatibility")

    # Define dietary restrictions
    vegan_restricted = ['meat', 'chicken', 'beef', 'pork', 'fish', 'seafood', 'egg',
                       'dairy', 'milk', 'cheese', 'butter', 'honey', 'gelatin']
    keto_restricted = ['sugar', 'honey', 'maple syrup', 'rice', 'pasta', 'bread', 'potato',
                      'corn', 'beans', 'grains', 'fruit juice']
    paleo_restricted = ['dairy', 'legumes', 'grains', 'processed food', 'refined sugar',
                       'vegetable oil', 'soy', 'peanut']
    gluten_restricted = ['wheat', 'barley', 'rye', 'bread', 'pasta', 'cereal', 'couscous']

    def check_diet_compatibility(ingredients, diet_restrictions):
        """Return True when no ingredient name contains a restricted term.

        Substring matching is used so that compound names such as
        'chicken breast' or 'cheddar cheese' are caught; exact equality
        would label them compatible. This is the same rule
        NutritionalRecommender._prepare_data applies to its diet flags.
        It deliberately errs towards exclusion: e.g. 'eggplant' contains
        'egg' and is therefore treated as non-vegan.
        """
        return not any(
            isinstance(ingredient, str) and restricted in ingredient
            for ingredient in ingredients
            for restricted in diet_restrictions
        )

    df['vegan'] = df['ingredients_cleaned'].apply(lambda x: check_diet_compatibility(x, vegan_restricted))
    df['keto'] = df['ingredients_cleaned'].apply(lambda x: check_diet_compatibility(x, keto_restricted))
    df['paleo'] = df['ingredients_cleaned'].apply(lambda x: check_diet_compatibility(x, paleo_restricted))
    df['gluten_free'] = df['ingredients_cleaned'].apply(lambda x: check_diet_compatibility(x, gluten_restricted))

    print("Dietary compatibility counts:")
    print(f"Vegan: {df['vegan'].sum()}")
    print(f"Keto: {df['keto'].sum()}")
    print(f"Paleo: {df['paleo'].sum()}")
    print(f"Gluten-free: {df['gluten_free'].sum()}")

    # 4. Nutritional Density Scores
    print("\n4. Nutritional Density Scores")

    # Bonus applied per food type; types not listed get 0
    food_type_bonuses = {
        'raw_vegetables_fruits': 2,
        'homemade': 1,
        'restaurant': 0,
        'packaged_food': -1
    }

    # Create a simplified nutritional density score
    # In a real scenario, this would incorporate more nutrients
    def calculate_nutritional_density(row):
        """Return the density score for one row, or NaN without usable calories."""
        if pd.isna(row['calories_kcal']) or row['calories_kcal'] == 0:
            return np.nan

        # Base score on protein content (higher protein = more nutrient dense),
        # capped at 10
        protein_score = min(row['protein_g'] / row['calories_kcal'] * 100, 10)

        # Adjust based on food type
        food_type_bonus = food_type_bonuses.get(row['food_type_standardized'], 0)

        # Adjust based on balanced meal flag
        balance_bonus = 2 if row['balanced_meal'] else 0

        return protein_score + food_type_bonus + balance_bonus

    df['nutritional_density'] = df.apply(calculate_nutritional_density, axis=1)

    print("Nutritional density score distribution:")
    print(df['nutritional_density'].describe())

    # Visualize nutritional density by food type
    plt.figure(figsize=(10, 6))
    sns.boxplot(data=df, x='food_type_standardized', y='nutritional_density')
    plt.title('Nutritional Density by Food Type')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    return df

# Perform nutritional engineering; adds the macro percentage, health flag,
# diet flag and nutritional_density columns to df.
df = nutritional_engineering(df)

## 4 - Machine Learning Models

In [None]:
def focused_ml_pipeline(df, target_variable='calories_kcal', test_size=0.2, random_state=42):
    """
    Focused ML pipeline with 2-3 models, comparison, and parameter tuning.

    Steps: select the numeric features present in ``df``, split into train and
    test sets, median-impute and standardise (both fitted on the training set
    only), train the candidate regressors, compare them, grid-search the best
    one, and plot residuals, feature importances and (if ``shap`` is
    installed) SHAP values.

    Args:
        df: DataFrame containing ``target_variable`` and any subset of the
            candidate feature columns listed below.
        target_variable: Name of the column to predict. It is never used as a
            feature, even if it appears in the candidate list.
        test_size: Fraction of rows held out for evaluation.
        random_state: Seed for the split and for the models.

    Returns:
        dict with keys ``results`` (per-model metrics, fitted model and test
        predictions), ``best_model`` (name), ``best_score`` (R²),
        ``feature_importance`` (DataFrame for the best untuned model, or None),
        ``X_test`` (imputed, unscaled test features), ``y_test``, ``scaler``
        and ``imputer``.

    Raises:
        ValueError: If none of the candidate features is usable.
    """
    # Imported locally so the function is self-contained and does not depend
    # on another notebook cell having brought these names into scope.
    from sklearn.base import clone
    from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
    from sklearn.impute import SimpleImputer
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
    from sklearn.model_selection import GridSearchCV, train_test_split

    # XGBoost is optional; the pipeline still runs with the two sklearn models
    try:
        from xgboost import XGBRegressor
    except ImportError:
        XGBRegressor = None
        print("XGBoost not available. Install with: pip install xgboost")

    palette = ['blue', 'green', 'orange', 'red']

    print("="*60)
    print("FOCUSED MACHINE LEARNING PIPELINE")
    print("="*60)

    # 1. Data Preparation
    print("1. PREPARING DATA FOR MACHINE LEARNING...")

    # Select relevant features
    # NOTE(review): 'energy_density' and the *_ratio columns may themselves be
    # derived from calories_kcal upstream; if so they leak the default target
    # and should be removed from this list - confirm against the cell that
    # creates them.
    feature_columns = [
        'protein_g', 'fat_g', 'carbohydrate_g', 'total_weight_g',
        'num_ingredients', 'num_cooking_methods', 'cooking_health_score',
        'protein_ratio', 'fat_ratio', 'carb_ratio', 'energy_density'
    ]

    # Keep only columns that exist, and never feed the target in as a feature
    available_features = [
        col for col in feature_columns
        if col in df.columns and col != target_variable
    ]

    # Remove rows where target is missing
    has_target = df[target_variable].notna()
    X = df.loc[has_target, available_features]
    y = df.loc[has_target, target_variable]

    # 2. Train-Test Split (done before imputation so that the test rows do not
    # influence the imputation statistics)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # A column with no observed training value has no median; SimpleImputer
    # would silently drop it and break the column alignment below.
    available_features = [col for col in available_features if X_train[col].notna().any()]
    if not available_features:
        raise ValueError("No usable feature columns found in the dataframe")

    # Impute missing values
    imputer = SimpleImputer(strategy='median')
    X_train = pd.DataFrame(
        imputer.fit_transform(X_train[available_features]),
        columns=available_features, index=X_train.index
    )
    X_test = pd.DataFrame(
        imputer.transform(X_test[available_features]),
        columns=available_features, index=X_test.index
    )

    print(f"Features used: {available_features}")
    print(f"Target variable: {target_variable}")
    print(f"Final dataset shape: {(len(X), len(available_features))}")

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # 3. Model Selection and Training
    print("\n2. TRAINING AND COMPARING MODELS...")

    # Define models to compare
    models = {
        'Random Forest': RandomForestRegressor(random_state=random_state),
        'Gradient Boosting': GradientBoostingRegressor(random_state=random_state),
    }
    if XGBRegressor is not None:
        models['XGBoost'] = XGBRegressor(random_state=random_state, verbosity=0)

    # Train and evaluate each model
    results = {}
    for name, model in models.items():
        print(f"Training {name}...")
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)

        # Calculate metrics
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)

        results[name] = {
            'model': model,
            'mae': mae,
            'mse': mse,
            'rmse': rmse,
            'r2': r2,
            'predictions': y_pred
        }

        print(f"  {name} - R²: {r2:.4f}, MAE: {mae:.2f}, RMSE: {rmse:.2f}")

    n_models = len(results)

    # 4. Model Comparison
    print("\n3. MODEL COMPARISON")

    # Create comparison table
    comparison_df = pd.DataFrame({
        'Model': list(results.keys()),
        'R² Score': [results[name]['r2'] for name in results],
        'MAE': [results[name]['mae'] for name in results],
        'RMSE': [results[name]['rmse'] for name in results]
    }).sort_values('R² Score', ascending=False)

    print("Model Performance Comparison:")
    print(comparison_df.to_string(index=False))

    # Visual comparison
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))

    # R² Score comparison
    axes[0].bar(comparison_df['Model'], comparison_df['R² Score'], color=palette[:n_models])
    axes[0].set_title('R² Score Comparison')
    axes[0].set_ylabel('R² Score')
    axes[0].tick_params(axis='x', rotation=45)

    # Error metrics comparison
    x_pos = np.arange(len(comparison_df))
    width = 0.25
    axes[1].bar(x_pos - width, comparison_df['MAE'], width, label='MAE', color='red')
    axes[1].bar(x_pos, comparison_df['RMSE'], width, label='RMSE', color='purple')
    axes[1].set_title('Error Metrics Comparison')
    axes[1].set_ylabel('Error')
    axes[1].set_xticks(x_pos)
    axes[1].set_xticklabels(comparison_df['Model'], rotation=45)
    axes[1].legend()

    plt.tight_layout()
    plt.show()

    # 5. Residual Analysis
    print("\n4. RESIDUAL ANALYSIS")

    # squeeze=False keeps a 2-D axes array regardless of the model count
    fig, axes = plt.subplots(1, n_models, figsize=(6 * n_models, 6), squeeze=False)

    for ax, (name, result) in zip(axes[0], results.items()):
        residuals = y_test - result['predictions']

        ax.scatter(result['predictions'], residuals, alpha=0.6)
        ax.axhline(y=0, color='red', linestyle='--')
        ax.set_xlabel('Predicted Values')
        ax.set_ylabel('Residuals')
        ax.set_title(f'{name} - Residual Plot')

        # Add residual statistics
        residual_mean = residuals.mean()
        residual_std = residuals.std()
        ax.text(0.05, 0.95, f'Mean: {residual_mean:.2f}\nStd: {residual_std:.2f}',
                transform=ax.transAxes, verticalalignment='top')

    plt.tight_layout()
    plt.show()

    # 6. Feature Importance Analysis
    print("\n5. FEATURE IMPORTANCE ANALYSIS")

    fig, axes = plt.subplots(1, n_models, figsize=(6 * n_models, 8), squeeze=False)

    for ax, (name, result) in zip(axes[0], results.items()):
        if hasattr(result['model'], 'feature_importances_'):
            model_importance = pd.DataFrame({
                'feature': available_features,
                'importance': result['model'].feature_importances_
            }).sort_values('importance', ascending=True)

            ax.barh(model_importance['feature'], model_importance['importance'])
            ax.set_title(f'{name} - Feature Importance')
            ax.set_xlabel('Importance')

    plt.tight_layout()
    plt.show()

    # 7. Hyperparameter Tuning for Best Model
    print("\n6. HYPERPARAMETER TUNING")

    # Select best model based on R² score
    best_model_name = comparison_df.iloc[0]['Model']
    best_model = results[best_model_name]['model']
    tuned_name = f'Tuned {best_model_name}'

    print(f"Tuning hyperparameters for {best_model_name}...")

    # Define parameter grids for each model type
    param_grids = {
        'Random Forest': {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        },
        'Gradient Boosting': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.05, 0.1],
            'max_depth': [3, 5, 7],
            'subsample': [0.8, 0.9, 1.0]
        },
        'XGBoost': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.05, 0.1],
            'max_depth': [3, 5, 7],
            'subsample': [0.8, 0.9, 1.0],
            'colsample_bytree': [0.8, 0.9, 1.0]
        }
    }

    # Perform grid search with cross-validation
    if best_model_name in param_grids:
        grid_search = GridSearchCV(
            # clone() yields an unfitted copy that keeps every constructor
            # argument of the baseline model (e.g. XGBoost's verbosity=0)
            estimator=clone(best_model),
            param_grid=param_grids[best_model_name],
            cv=5,
            scoring='r2',
            n_jobs=-1,
            verbose=1
        )

        grid_search.fit(X_train_scaled, y_train)

        print(f"Best parameters for {best_model_name}:")
        for param, value in grid_search.best_params_.items():
            print(f"  {param}: {value}")

        # Evaluate tuned model
        tuned_model = grid_search.best_estimator_
        y_pred_tuned = tuned_model.predict(X_test_scaled)

        tuned_r2 = r2_score(y_test, y_pred_tuned)
        tuned_mae = mean_absolute_error(y_test, y_pred_tuned)
        tuned_mse = mean_squared_error(y_test, y_pred_tuned)

        print(f"Tuned {best_model_name} Performance:")
        print(f"  R²: {tuned_r2:.4f} (Before: {results[best_model_name]['r2']:.4f})")
        print(f"  MAE: {tuned_mae:.2f} (Before: {results[best_model_name]['mae']:.2f})")

        # Store the tuned model with the same keys as the baseline entries
        results[tuned_name] = {
            'model': tuned_model,
            'mae': tuned_mae,
            'mse': tuned_mse,
            'rmse': np.sqrt(tuned_mse),
            'r2': tuned_r2,
            'predictions': y_pred_tuned
        }

    # 8. Final Model Evaluation
    print("\n7. FINAL MODEL EVALUATION")

    # Compare original vs tuned model
    if tuned_name in results:
        r2_change = results[tuned_name]['r2'] - results[best_model_name]['r2']
        mae_change = results[tuned_name]['mae'] - results[best_model_name]['mae']

        # Signed formatting: tuning can also make the test metrics worse
        print(f"Improvement after tuning:")
        print(f"  R²: {r2_change:+.4f}")
        print(f"  MAE: {mae_change:+.2f}")

    # 9. Prediction Visualization
    print("\n8. PREDICTION VISUALIZATION")

    plt.figure(figsize=(12, 10))

    # Plot actual vs predicted for best model
    best_predictions = results[best_model_name]['predictions']

    plt.subplot(2, 2, 1)
    plt.scatter(y_test, best_predictions, alpha=0.6)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.title(f'{best_model_name} - Actual vs Predicted')

    # Error distribution
    plt.subplot(2, 2, 2)
    errors = y_test - best_predictions
    sns.histplot(errors, kde=True)
    plt.xlabel('Prediction Error')
    plt.ylabel('Frequency')
    plt.title('Error Distribution')

    # Feature importance for best model
    plt.subplot(2, 2, 3)
    feature_importance = None
    if hasattr(best_model, 'feature_importances_'):
        feature_importance = pd.DataFrame({
            'feature': available_features,
            'importance': best_model.feature_importances_
        }).sort_values('importance', ascending=True)

        plt.barh(feature_importance['feature'], feature_importance['importance'])
        plt.xlabel('Importance')
        plt.title('Feature Importance')

    # Model comparison bar chart
    plt.subplot(2, 2, 4)
    model_names = list(results)
    r2_scores = [results[name]['r2'] for name in model_names]

    plt.bar(model_names, r2_scores, color=palette[:len(model_names)])
    plt.xticks(rotation=45)
    plt.ylabel('R² Score')
    plt.title('Model Comparison (R² Score)')

    plt.tight_layout()
    plt.show()

    # 10. Model Interpretation (SHAP values for best model)
    print("\n9. MODEL INTERPRETATION (SHAP VALUES)")

    try:
        import shap

        # Use the best model for interpretation
        explainer = shap.TreeExplainer(best_model)
        shap_values = explainer.shap_values(X_test_scaled)

        plt.figure(figsize=(12, 8))
        shap.summary_plot(shap_values, X_test_scaled, feature_names=available_features, show=False)
        plt.title(f'SHAP Summary Plot - {best_model_name}')
        plt.tight_layout()
        plt.show()

        # Force plot for a specific instance
        plt.figure(figsize=(12, 6))
        shap.force_plot(explainer.expected_value, shap_values[0], X_test_scaled[0],
                       feature_names=available_features, show=False, matplotlib=True)
        plt.title(f'SHAP Force Plot - First Instance')
        plt.tight_layout()
        plt.show()

    except ImportError:
        print("SHAP not installed. Install with: pip install shap")

    # 11. Final Recommendations
    print("\n10. FINAL RECOMMENDATIONS")

    best_overall = max(results.items(), key=lambda item: item[1]['r2'])
    print(f"Best performing model: {best_overall[0]}")
    print(f"Best R² Score: {best_overall[1]['r2']:.4f}")
    print(f"Best MAE: {best_overall[1]['mae']:.2f}")

    # Key insights
    print("\nKey Insights:")
    print("- Feature importance reveals which factors most influence calorie content")
    print("- Residual analysis shows model bias and error patterns")
    print("- Hyperparameter tuning can significantly improve model performance")

    return {
        'results': results,
        'best_model': best_overall[0],
        'best_score': best_overall[1]['r2'],
        'feature_importance': feature_importance,
        'X_test': X_test,
        'y_test': y_test,
        'scaler': scaler,
        'imputer': imputer
    }

# Run the focused ML pipeline with calories as the regression target; the
# returned dict holds the fitted models, metrics, test split, scaler and imputer.
ml_results = focused_ml_pipeline(df, target_variable='calories_kcal')

## 5 - Advanced ML

### 5.1 Recommendation Systems

This comprehensive recommendation system provides:

  - Diet-Specific Recommendations: Vegan, vegetarian, keto, paleo, gluten-free

  - Ingredient-Based Filtering: Preferred and excluded ingredients

  - Cooking Method Preferences: Preferred and excluded cooking methods

  - Health Goal Optimization: Weight loss, muscle gain, maintenance, etc.

  - Similarity-Based Recommendations: Recipes similar to liked dishes

  - User Profiles: Personalized recommendations based on comprehensive profiles

  - Advanced Filtering: Calorie ranges, prep time, nutritional density

In [None]:
class NutritionalRecommender:
    """
    Advanced recommendation system for recipes based on multiple criteria:
    - Diet type (vegan, keto, paleo, gluten-free, etc.)
    - Ingredient preferences/allergies
    - Cooking method preferences
    - Health preferences (calorie density, nutritional density, etc.)
    """

    def __init__(self, df):
        """Copy ``df`` and precompute derived columns and feature vectors.

        Args:
            df: Recipe DataFrame; it must contain the columns checked by
                ``_prepare_data``.

        Raises:
            ValueError: If a required column is missing (raised by
                ``_prepare_data``).
        """
        # Work on a copy so the caller's DataFrame is not modified.
        self.df = df.copy()
        self._prepare_data()
        self._build_feature_vectors()

    def _prepare_data(self):
        """Prepare data for recommendation system"""
        # Ensure we have the necessary columns
        required_columns = ['ingredients_cleaned', 'cooking_methods_cleaned',
                           'calories_kcal', 'protein_g', 'fat_g', 'carbohydrate_g',
                           'nutritional_density', 'total_weight_g']

        for col in required_columns:
            if col not in self.df.columns:
                raise ValueError(f"Missing required column: {col}")

        # Calculate additional metrics
        self.df['energy_density'] = self.df['calories_kcal'] / self.df['total_weight_g'].replace(0, 1)
        self.df['protein_density'] = self.df['protein_g'] / self.df['total_weight_g'].replace(0, 1)

        # Create diet type flags if not already present
        if 'vegan' not in self.df.columns:
            vegan_restricted = ['meat', 'chicken', 'beef', 'pork', 'fish', 'seafood', 'egg',
                               'dairy', 'milk', 'cheese', 'butter', 'honey', 'gelatin']
            self.df['vegan'] = self.df['ingredients_cleaned'].apply(
                lambda x: not any(any(restr in ing for restr in vegan_restricted) for ing in x)
            )

        if 'vegetarian' not in self.df.columns:
            vegetarian_restricted = ['meat', 'chicken', 'beef', 'pork', 'fish', 'seafood']
            self.df['vegetarian'] = self.df['ingredients_cleaned'].apply(
                lambda x: not any(any(restr in ing for restr in vegetarian_restricted) for ing in x)
            )

    def _build_feature_vectors(self):
        """Build feature vectors for similarity calculations.

        Each recipe becomes one row made of a one-hot ingredient block, a
        one-hot cooking-method block and six rescaled nutritional values.

        Sets:
            self.ingredient_vectorizer: ingredient -> column index.
            self.method_vectorizer: cooking method -> index within the method block.
            self.feature_vectors: raw feature matrix, one row per recipe.
            self.scaler: ``StandardScaler`` fitted on the raw matrix.
            self.scaled_vectors: standardised feature matrix.
        """
        ingredient_lists = self.df['ingredients_cleaned'].tolist()
        method_lists = self.df['cooking_methods_cleaned'].tolist()

        # Sorted vocabularies give a column order that is reproducible across
        # runs; plain set iteration order for strings is not.
        all_ingredients = sorted(
            {ing for sublist in ingredient_lists for ing in sublist}, key=str
        )
        self.ingredient_vectorizer = {ing: idx for idx, ing in enumerate(all_ingredients)}

        all_methods = sorted(
            {method for sublist in method_lists for method in sublist}, key=str
        )
        self.method_vectorizer = {method: idx for idx, method in enumerate(all_methods)}

        # (column, divisor) pairs that bring the nutritional values into a
        # roughly comparable range before standardisation
        nutrition_scales = (
            ('calories_kcal', 1000),
            ('protein_g', 100),
            ('fat_g', 100),
            ('carbohydrate_g', 100),
            ('nutritional_density', 10),
            ('energy_density', 1),
        )

        method_offset = len(self.ingredient_vectorizer)
        nutrition_offset = method_offset + len(self.method_vectorizer)

        # Fill one preallocated matrix instead of building a list of per-row
        # arrays and copying it, which would hold the data twice.
        features = np.zeros((len(self.df), nutrition_offset + len(nutrition_scales)))

        for row_idx, (ingredients, methods) in enumerate(zip(ingredient_lists, method_lists)):
            # Ingredient features (one-hot encoded)
            for ing in ingredients:
                features[row_idx, self.ingredient_vectorizer[ing]] = 1
            # Cooking method features
            for method in methods:
                features[row_idx, method_offset + self.method_vectorizer[method]] = 1

        # Nutritional features (normalized), written column-wise
        for position, (column, divisor) in enumerate(nutrition_scales):
            features[:, nutrition_offset + position] = (
                self.df[column].to_numpy(dtype=float) / divisor
            )

        self.feature_vectors = features
        self.scaler = StandardScaler()
        self.scaled_vectors = self.scaler.fit_transform(self.feature_vectors)

    def recommend_by_diet(self, diet_type, n_recommendations=5, **filters):
        """
        Recommend recipes based on specific diet type
        Supported diets: vegan, vegetarian, keto, paleo, gluten_free
        """
        valid_diets = ['vegan', 'vegetarian', 'keto', 'paleo', 'gluten_free']
        if diet_type not in valid_diets:
            raise ValueError(f"Diet type must be one of: {valid_diets}")

        if diet_type not in self.df.columns:
            raise ValueError(f"Diet information for {diet_type} not available in dataset")

        # Filter by diet
        diet_recipes = self.df[self.df[diet_type] == True]

        if len(diet_recipes) == 0:
            return self._fallback_recommendations(f"No {diet_type} recipes found", n_recommendations)

        # Apply additional filters
        filtered_recipes = self._apply_filters(diet_recipes, **filters)

        if len(filtered_recipes) == 0:
            return self._fallback_recommendations(f"No {diet_type} recipes match your filters", n_recommendations)

        # Sort by nutritional density or other criteria
        recommendations = filtered_recipes.nlargest(n_recommendations, 'nutritional_density')

        return self._format_recommendations(recommendations, f"{diet_type.title()} Recommendations")

    def recommend_by_ingredients(self, preferred_ingredients=None, excluded_ingredients=None,
                               n_recommendations=5, **filters):
        """
        Recommend recipes based on ingredient preferences and exclusions.

        Matching is case-insensitive on the supplied terms and uses substring
        containment against each recipe ingredient.

        Args:
            preferred_ingredients: Terms of which a recipe must contain at
                least one; ``None`` or empty means no requirement.
            excluded_ingredients: Terms a recipe must not contain; ``None`` or
                empty means no exclusion.
            n_recommendations: Maximum number of recipes to return.
            **filters: Forwarded to ``_apply_filters``.

        Returns:
            The output of ``_format_recommendations`` for the best matches, or
            of ``_fallback_recommendations`` when nothing matches.
        """
        # None defaults avoid sharing one mutable list object between calls
        preferred = [ing.lower() for ing in (preferred_ingredients or [])]
        excluded = [ing.lower() for ing in (excluded_ingredients or [])]

        def mentions_any(recipe_ingredients, terms):
            """Return True when any term is a substring of any recipe ingredient."""
            return any(term in ing for ing in recipe_ingredients for term in terms)

        filtered_recipes = self.df.copy()

        # Include preferred ingredients
        if preferred:
            keep = filtered_recipes['ingredients_cleaned'].apply(
                lambda x: mentions_any(x, preferred)
            )
            filtered_recipes = filtered_recipes[keep]

        # Exclude unwanted ingredients
        if excluded:
            keep = filtered_recipes['ingredients_cleaned'].apply(
                lambda x: not mentions_any(x, excluded)
            )
            filtered_recipes = filtered_recipes[keep]

        # Apply additional filters
        filtered_recipes = self._apply_filters(filtered_recipes, **filters)

        if len(filtered_recipes) == 0:
            return self._fallback_recommendations("No recipes match your ingredient preferences", n_recommendations)

        # Score by ingredient match: 2 points per preferred term that is present
        def ingredient_match_score(recipe_ingredients):
            return 2 * sum(
                any(pref_ing in ing for ing in recipe_ingredients)
                for pref_ing in preferred
            )

        # assign() returns a new frame, so the score column is never written
        # onto a slice of another DataFrame
        scored_recipes = filtered_recipes.assign(
            ingredient_score=filtered_recipes['ingredients_cleaned'].apply(ingredient_match_score)
        )

        # Sort by ingredient score and nutritional density
        recommendations = scored_recipes.sort_values(
            ['ingredient_score', 'nutritional_density'], ascending=[False, False]
        ).head(n_recommendations)

        return self._format_recommendations(recommendations, "Ingredient-based Recommendations")

    def recommend_by_cooking_method(self, preferred_methods=None, excluded_methods=None,
                                  n_recommendations=5, **filters):
        """
        Recommend recipes based on cooking method preferences.

        Matching is case-insensitive on the supplied terms and uses substring
        containment against each recipe cooking method.

        Args:
            preferred_methods: Terms of which a recipe must contain at least
                one; ``None`` or empty means no requirement.
            excluded_methods: Terms a recipe must not contain; ``None`` or
                empty means no exclusion.
            n_recommendations: Maximum number of recipes to return.
            **filters: Forwarded to ``_apply_filters``.

        Returns:
            The output of ``_format_recommendations`` for the best matches, or
            of ``_fallback_recommendations`` when nothing matches.
        """
        # None defaults avoid sharing one mutable list object between calls
        preferred = [method.lower() for method in (preferred_methods or [])]
        excluded = [method.lower() for method in (excluded_methods or [])]

        def mentions_any(recipe_methods, terms):
            """Return True when any term is a substring of any recipe method."""
            return any(term in method for method in recipe_methods for term in terms)

        filtered_recipes = self.df.copy()

        # Include preferred methods
        if preferred:
            keep = filtered_recipes['cooking_methods_cleaned'].apply(
                lambda x: mentions_any(x, preferred)
            )
            filtered_recipes = filtered_recipes[keep]

        # Exclude unwanted methods
        if excluded:
            keep = filtered_recipes['cooking_methods_cleaned'].apply(
                lambda x: not mentions_any(x, excluded)
            )
            filtered_recipes = filtered_recipes[keep]

        # Apply additional filters
        filtered_recipes = self._apply_filters(filtered_recipes, **filters)

        if len(filtered_recipes) == 0:
            return self._fallback_recommendations("No recipes match your cooking method preferences", n_recommendations)

        # Score by method match: 1 point per preferred term that is present
        def method_match_score(recipe_methods):
            return sum(
                any(pref_method in method for method in recipe_methods)
                for pref_method in preferred
            )

        # assign() returns a new frame, so the score column is never written
        # onto a slice of another DataFrame
        scored_recipes = filtered_recipes.assign(
            method_score=filtered_recipes['cooking_methods_cleaned'].apply(method_match_score)
        )

        # Sort by method score and nutritional density
        recommendations = scored_recipes.sort_values(
            ['method_score', 'nutritional_density'], ascending=[False, False]
        ).head(n_recommendations)

        return self._format_recommendations(recommendations, "Cooking Method Recommendations")

    def recommend_by_health_goals(self, goal_type, target_calories=None, n_recommendations=5, **filters):
        """
        Recommend recipes based on health goals
        Supported goals: weight_loss, muscle_gain, maintenance, high_protein, low_carb, low_fat
        """
        goal_strategies = {
            'weight_loss': {
                'max_calories': 400,
                'min_protein': 15,
                'max_carbs': 40,
                'sort_by': ['energy_density', 'nutritional_density'],
                'ascending': [True, False]
            },
            'muscle_gain': {
                'min_protein': 25,
                'min_calories': 400,
                'sort_by': ['protein_density', 'calories_kcal'],
                'ascending': [False, False]
            },
            'maintenance': {
                'min_nutritional_density': 5,
                'sort_by': ['nutritional_density', 'cooking_health_score'],
                'ascending': [False, False]
            },
            'high_protein': {
                'min_protein': 20,
                'min_protein_ratio': 0.3,
                'sort_by': ['protein_g', 'protein_density'],
                'ascending': [False, False]
            },
            'low_carb': {
                'max_carbs': 20,
                'sort_by': ['carbohydrate_g', 'nutritional_density'],
                'ascending': [True, False]
            },
            'low_fat': {
                'max_fat': 10,
                'sort_by': ['fat_g', 'nutritional_density'],
                'ascending': [True, False]
            }
        }

        if goal_type not in goal_strategies:
            raise ValueError(f"Goal type must be one of: {list(goal_strategies.keys())}")

        strategy = goal_strategies[goal_type]
        filtered_recipes = self.df.copy()

        # Apply goal-specific filters
        for key, value in strategy.items():
            if key.startswith('min_'):
                col = key[4:]
                if col in filtered_recipes.columns:
                    filtered_recipes = filtered_recipes[filtered_recipes[col] >= value]
            elif key.startswith('max_'):
                col = key[4:]
                if col in filtered_recipes.columns:
                    filtered_recipes = filtered_recipes[filtered_recipes[col] <= value]

        # Apply target calories if specified
        if target_calories:
            filtered_recipes = filtered_recipes[
                filtered_recipes['calories_kcal'].between(
                    target_calories * 0.8, target_calories * 1.2
                )
            ]

        # Apply additional filters
        filtered_recipes = self._apply_filters(filtered_recipes, **filters)

        if len(filtered_recipes) == 0:
            return self._fallback_recommendations(f"No recipes match your {goal_type} goals", n_recommendations)

        # Sort according to strategy
        recommendations = filtered_recipes.sort_values(
            strategy['sort_by'], ascending=strategy['ascending']
        ).head(n_recommendations)

        return self._format_recommendations(recommendations, f"{goal_type.replace('_', ' ').title()} Recommendations")

    def recommend_similar_to_recipe(self, recipe_id, n_recommendations=5, **filters):
        """
        Recommend recipes similar to a given recipe

        Similarity is the cosine similarity between the target's row of
        ``self.scaled_vectors`` and every other row.

        Args:
            recipe_id: Zero-based row position of the target recipe in
                ``self.df``. It is also used to index ``self.scaled_vectors``,
                so the two are assumed to be row-aligned (NOTE(review):
                confirm against the constructor).
            n_recommendations: Maximum number of recipes to return.
            **filters: Extra constraints forwarded to ``_apply_filters``;
                only recipes passing them can be recommended.

        Returns:
            The dict built by ``_format_recommendations`` with a
            ``similarity_score`` column added; when no candidate qualifies,
            the result of ``_fallback_recommendations``.

        Raises:
            ValueError: If ``recipe_id`` is outside ``[0, len(self.df))``.
        """
        if recipe_id >= len(self.df) or recipe_id < 0:
            raise ValueError("Invalid recipe ID")

        # Get target recipe features
        target_vector = self.scaled_vectors[recipe_id]

        # Calculate similarities
        similarities = cosine_similarity([target_vector], self.scaled_vectors)[0]

        # Apply filters
        filtered_labels = self._get_filtered_indices(**filters)

        if len(filtered_labels) == 0:
            return self._fallback_recommendations("No similar recipes match your filters", n_recommendations)

        # _get_filtered_indices returns index *labels*, while argsort and iloc
        # below work with row *positions*. Convert labels to positions so the
        # two agree even when self.df does not use a default RangeIndex. The
        # set also makes each membership test O(1) instead of a list scan.
        allowed_positions = set(
            np.flatnonzero(self.df.index.isin(filtered_labels)).tolist()
        )

        # Get similar recipes (excluding the target itself)
        similar_indices = []
        for idx in np.argsort(similarities)[::-1]:
            if idx != recipe_id and idx in allowed_positions:
                similar_indices.append(idx)
            if len(similar_indices) >= n_recommendations:
                break

        if not similar_indices:
            return self._fallback_recommendations("No similar recipes found", n_recommendations)

        recommendations = self.df.iloc[similar_indices].copy()
        recommendations['similarity_score'] = similarities[similar_indices]

        return self._format_recommendations(recommendations, "Similar Recipes")

    def _apply_filters(self, recipes, **filters):
        """Apply additional filters to recipes"""
        filtered = recipes.copy()

        # Calorie range
        if 'min_calories' in filters:
            filtered = filtered[filtered['calories_kcal'] >= filters['min_calories']]
        if 'max_calories' in filters:
            filtered = filtered[filtered['calories_kcal'] <= filters['max_calories']]

        # Protein range
        if 'min_protein' in filters:
            filtered = filtered[filtered['protein_g'] >= filters['min_protein']]
        if 'max_protein' in filters:
            filtered = filtered[filtered['protein_g'] <= filters['max_protein']]

        # Preparation time
        if 'max_prep_time' in filters and 'estimated_prep_time' in filtered.columns:
            filtered = filtered[filtered['estimated_prep_time'] <= filters['max_prep_time']]

        # Nutritional density
        if 'min_nutritional_density' in filters:
            filtered = filtered[filtered['nutritional_density'] >= filters['min_nutritional_density']]

        # Food type
        if 'food_type' in filters and 'food_type_standardized' in filtered.columns:
            filtered = filtered[filtered['food_type_standardized'] == filters['food_type']]

        return filtered

    def _get_filtered_indices(self, **filters):
        """Get indices of recipes that match the filters"""
        filtered = self._apply_filters(self.df, **filters)
        return filtered.index.tolist()

    def _fallback_recommendations(self, message, n_recommendations):
        """Provide fallback recommendations when no matches are found"""
        print(f"Warning: {message}. Showing general recommendations instead.")

        # Fall back to general healthy recommendations
        fallback = self.df.nlargest(n_recommendations, 'nutritional_density')
        return self._format_recommendations(fallback, "General Healthy Recommendations (Fallback)")

    def _format_recommendations(self, recommendations, title):
        """Format recommendations for display"""
        result = {
            'title': title,
            'count': len(recommendations),
            'recipes': []
        }

        for _, recipe in recommendations.iterrows():
            recipe_info = {
                'id': recipe.name if hasattr(recipe, 'name') else None,
                'dish_name': recipe.get('dish_name_cleaned', recipe.get('dish_name', 'Unknown')),
                'calories': round(recipe['calories_kcal'], 1),
                'protein': round(recipe['protein_g'], 1),
                'carbs': round(recipe['carbohydrate_g'], 1),
                'fat': round(recipe['fat_g'], 1),
                'nutritional_density': round(recipe.get('nutritional_density', 0), 2),
                'ingredients': recipe['ingredients_cleaned'][:5],  # Top 5 ingredients
                'cooking_methods': recipe['cooking_methods_cleaned'],
                'prep_time': round(recipe.get('estimated_prep_time', 0)) if 'estimated_prep_time' in recipe else 'Unknown'
            }

            # Add similarity score if available
            if 'similarity_score' in recipe:
                recipe_info['similarity'] = round(recipe['similarity_score'], 3)

            result['recipes'].append(recipe_info)

        return result

    def print_recommendations(self, recommendations):
        """Print recommendations in a user-friendly format"""
        print(f"\n{'='*60}")
        print(f"{recommendations['title']}")
        print(f"{'='*60}")
        print(f"Found {recommendations['count']} recipes\n")

        for i, recipe in enumerate(recommendations['recipes'], 1):
            print(f"{i}. {recipe['dish_name']}")
            print(f"   Calories: {recipe['calories']} kcal")
            print(f"   Protein: {recipe['protein']}g | Carbs: {recipe['carbs']}g | Fat: {recipe['fat']}g")
            print(f"   Nutritional Density: {recipe['nutritional_density']}")
            print(f"   Prep Time: {recipe['prep_time']} minutes")
            print(f"   Cooking Methods: {', '.join(recipe['cooking_methods'])}")
            print(f"   Key Ingredients: {', '.join(recipe['ingredients'])}")
            if 'similarity' in recipe:
                print(f"   Similarity Score: {recipe['similarity']}")
            print()

# Example usage and demonstration
def demonstrate_recommendation_system(df):
    """
    Run one example of each recommendation mode and print the results.

    Builds a ``NutritionalRecommender`` from ``df``, then exercises the diet,
    ingredient, cooking-method, health-goal and similar-recipe recommenders
    in turn. The similar-recipe example is skipped when ``df`` has no rows.

    Returns the recommender that was built.
    """
    divider = "=" * 60

    print("Initializing Recommendation System...")
    engine = NutritionalRecommender(df)

    print("\n" + divider)
    print("RECOMMENDATION SYSTEM DEMONSTRATION")
    print(divider)

    # 1. Diet-based recommendations
    print("\n1. VEGAN RECOMMENDATIONS")
    vegan_picks = engine.recommend_by_diet('vegan', n_recommendations=3, max_calories=500)
    engine.print_recommendations(vegan_picks)

    # 2. Ingredient-based recommendations
    print("\n2. CHICKEN AND VEGETABLE RECIPES")
    wanted_ingredients = ['chicken', 'vegetable', 'broccoli']
    unwanted_ingredients = ['cheese', 'cream']
    ingredient_picks = engine.recommend_by_ingredients(
        preferred_ingredients=wanted_ingredients,
        excluded_ingredients=unwanted_ingredients,
        n_recommendations=3,
        min_protein=20,
    )
    engine.print_recommendations(ingredient_picks)

    # 3. Cooking method recommendations
    print("\n3. GRILLED OR BAKED RECIPES")
    wanted_methods = ['grill', 'bake', 'roast']
    unwanted_methods = ['fry', 'deep fry']
    method_picks = engine.recommend_by_cooking_method(
        preferred_methods=wanted_methods,
        excluded_methods=unwanted_methods,
        n_recommendations=3,
        max_calories=600,
    )
    engine.print_recommendations(method_picks)

    # 4. Health goal recommendations: weight loss
    print("\n4. WEIGHT LOSS RECIPES")
    weight_loss_picks = engine.recommend_by_health_goals(
        'weight_loss', n_recommendations=3, max_prep_time=30
    )
    engine.print_recommendations(weight_loss_picks)

    # 5. Health goal recommendations: muscle gain
    print("\n5. HIGH PROTEIN RECIPES FOR MUSCLE GAIN")
    muscle_gain_picks = engine.recommend_by_health_goals(
        'muscle_gain', n_recommendations=3, min_protein=30
    )
    engine.print_recommendations(muscle_gain_picks)

    # 6. Similar recipes, using the first recipe in the dataset as the target
    print("\n6. RECIPES SIMILAR TO FIRST RECIPE")
    if len(df) > 0:
        similar_picks = engine.recommend_similar_to_recipe(
            0, n_recommendations=3, max_calories=800
        )
        engine.print_recommendations(similar_picks)

    return engine

# Run the demonstration on `df` (expected to be defined earlier in the file)
# and bind the recommender it returns to `nutrition_recommender`.
nutrition_recommender = demonstrate_recommendation_system(df)