In [2]:
import os
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import joblib
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import logging

# Machine Learning Libraries
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
    mean_squared_error,
)
from sklearn.utils import class_weight

# TensorFlow and Keras
import tensorflow as tf
from tensorflow.keras.applications.mobilenet_v2 import MobileNetV2, preprocess_input
from tensorflow.keras.layers import Dense, Dropout, InputLayer
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping

# NLTK for Text Preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Surprise Library for Recommender System
from surprise import Dataset as SurpriseDataset
from surprise import Reader, SVD, accuracy
from surprise.model_selection import train_test_split as surprise_train_test_split

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# --------------------------- Configuration --------------------------- #
# All paths below are relative, so they resolve against the current working
# directory of the process that runs this script.

# Input CSV files read by load_data().
CORE_RECIPE_PATH = 'core-data_recipe.csv'
RAW_RECIPE_PATH = 'raw-data_recipe.csv'
CORE_TRAIN_PATH = 'core-data-train_rating.csv'
CORE_VALID_PATH = 'core-data-valid_rating.csv'
CORE_TEST_PATH = 'core-data-test_rating.csv'
RAW_INTERACTION_PATH = 'raw-data_interaction.csv'

# Image directories; extract_image_features() looks for "<recipe_id>.jpg"
# inside them.
CORE_IMAGE_DIR = 'core-data-images'
RAW_IMAGE_DIR = 'raw-data-images'

# Output artifacts. The image-feature, TF-IDF and scaler files double as
# caches: when one already exists it is loaded instead of being recomputed,
# so delete it to force a rebuild. The remaining files are only written.
CORE_IMAGE_FEATURES_PATH = 'core_image_features.npy'
RAW_IMAGE_FEATURES_PATH = 'raw_image_features.npy'
TFIDF_VECTORIZER_PATH = 'tfidf_vectorizer.pkl'
SCALER_PATH = 'scaler.pkl'
HEALTH_MODEL_PATH = 'healthiness_model.h5'
LE_USER_PATH = 'le_user.pkl'
LE_RECIPE_PATH = 'le_recipe.pkl'
SVD_MODEL_PATH = 'svd_model.pkl'

# Keys looked up in each parsed 'nutritions' dict; they also become the names
# of the nutrient columns (and of the '<name>_scaled' feature columns).
# 'calories' must stay in this list because the healthiness label is derived
# from it. Update to match the keys actually present in the data.
TARGET_NUTRIENTS = ['calories', 'protein', 'fat', 'carbohydrates', 'fiber']

# Number of images per batch during feature extraction.
IMAGE_BATCH_SIZE = 32  # Adjust based on available memory

# Minimum severity emitted by the root logger.
LOG_LEVEL = logging.INFO

# --------------------------- Logging Configuration --------------------------- #
# Send every log record both to "training.log" and to the console stream.
# NOTE: logging.basicConfig() does nothing if the root logger already has
# handlers (e.g. when this cell is re-run in the same notebook kernel).
# NOTE: the pipeline also print()s most messages it logs, so they show up
# twice on the console.
logging.basicConfig(
    level=LOG_LEVEL,
    format='%(asctime)s:%(levelname)s:%(message)s',
    handlers=[
        logging.FileHandler("training.log"),
        logging.StreamHandler()
    ]
)

# --------------------------- TensorFlow GPU Configuration --------------------------- #
def configure_tensorflow():
    """
    Report the GPUs TensorFlow can see, or hide CUDA devices when there are none.

    With at least one physical GPU, the physical/logical GPU counts are
    printed and logged. Without any, CUDA_VISIBLE_DEVICES is set to '-1'
    and a CPU-fallback message is printed and logged.
    """
    physical_devices = tf.config.experimental.list_physical_devices('GPU')

    if not physical_devices:
        # No GPU detected: hide CUDA devices so TensorFlow stays on the CPU.
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
        cpu_message = "No CUDA-capable GPU detected. TensorFlow is set to use CPU."
        print(cpu_message)
        logging.info(cpu_message)
        return

    try:
        # Memory growth is intentionally left at TensorFlow's default. If it is
        # ever enabled, it has to happen here, before the GPUs are initialized.
        logical_devices = tf.config.experimental.list_logical_devices('GPU')
        summary = f"{len(physical_devices)} Physical GPU(s), {len(logical_devices)} Logical GPU(s) found."
        print(summary)
        logging.info(summary)
    except RuntimeError as err:
        failure = f"RuntimeError during GPU configuration: {err}"
        print(failure)
        logging.error(failure)


# Run the device configuration once, at import time.
configure_tensorflow()

# --------------------------- Utility Functions --------------------------- #
# Lazily-populated cache of the English stopword set (see _english_stop_words).
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """
    Return the NLTK English stopwords as a frozenset, loading them only once.

    The corpus is read on first use rather than at import time because it is
    downloaded later in the pipeline (nltk.download('stopwords')).
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text):
    """
    Normalize free text for TF-IDF vectorization.

    The text is lowercased, stripped of every character that is not an ASCII
    letter or whitespace, tokenized with NLTK's word_tokenize, and filtered
    against the English stopword list.

    Args:
        text: The raw text. Any non-string value (e.g. NaN) is treated as
            missing.

    Returns:
        The remaining tokens joined by single spaces, or '' for non-string
        input.
    """
    if not isinstance(text, str):
        return ''
    # Lowercase, then drop everything except letters and whitespace
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    # The stopword set is cached: this function runs once per recipe row, and
    # re-reading the corpus on every call dominated the runtime.
    stop_words = _english_stop_words()
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    return ' '.join(tokens)

def _coerce_nutrient_amount(raw_amount):
    """
    Convert one raw nutrient amount to a float.

    Strings are reduced to their digits and dots before conversion. NaN is
    returned for None, for strings with no numeric content, and for anything
    float() rejects. A genuine zero is kept as 0.0 instead of being treated
    as missing.
    """
    if raw_amount is None:
        return np.nan
    if isinstance(raw_amount, str):
        raw_amount = re.sub(r'[^\d\.]', '', raw_amount)
        if not raw_amount:
            return np.nan
    try:
        return float(raw_amount)
    except (TypeError, ValueError):
        # e.g. "1.2.3" after stripping, or a nested container
        return np.nan


def parse_nutritions_nested(df):
    """
    Expand the 'nutritions' column into one float column per target nutrient.

    Each cell is expected to hold the string form of a Python dict that maps
    nutrient names either to a nested dict with an 'amount' key or directly
    to a number. Cells that are not strings, cannot be parsed, or do not
    evaluate to a dict contribute NaN for every nutrient. NaNs are then
    replaced by the column mean.

    Args:
        df: DataFrame containing a 'nutritions' column.

    Returns:
        A new DataFrame with 'nutritions' removed and the TARGET_NUTRIENTS
        columns appended (float dtype). The input frame is not modified.
    """
    print("Parsing 'nutritions' column with nested dictionary structure...")
    logging.info("Parsing 'nutritions' column with nested dictionary structure...")

    def parse_record(raw_value):
        """Safely evaluate one cell; return {} when it is unusable."""
        if not isinstance(raw_value, str):
            return {}
        try:
            # Python 3 accepts the legacy u'...' prefix, so the string is
            # evaluated as-is. Stripping "u'" textually would also eat the
            # last letter of any quoted token that ends in "u".
            parsed = ast.literal_eval(raw_value)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
            logging.error(f"Error parsing nutritions: {e} for string: {raw_value}")
            return {}
        if not isinstance(parsed, dict):
            logging.error(
                f"Error parsing nutritions: expected a dict, got "
                f"{type(parsed).__name__} for string: {raw_value}"
            )
            return {}
        return parsed

    def extract_nutrients(nutrition_dict):
        """Pick the target nutrients out of one parsed dict."""
        nutrient_values = {}
        for nutrient in TARGET_NUTRIENTS:
            if nutrient not in nutrition_dict:
                logging.warning(f"Nutrient '{nutrient}' not found in nutrition data. Assigning NaN.")
                nutrient_values[nutrient] = np.nan
                continue
            nutrition_info = nutrition_dict[nutrient]
            if isinstance(nutrition_info, dict):
                nutrient_values[nutrient] = _coerce_nutrient_amount(nutrition_info.get('amount'))
            elif isinstance(nutrition_info, (int, float)):
                nutrient_values[nutrient] = nutrition_info
            else:
                logging.warning(f"Unexpected format for nutrient '{nutrient}'. Assigning NaN.")
                nutrient_values[nutrient] = np.nan
        return nutrient_values

    # Build the nutrient frame explicitly so that an empty input still yields
    # the expected columns, and so rows stay aligned with df's index.
    rows = [extract_nutrients(parse_record(value)) for value in df['nutritions']]
    nutritions_df = pd.DataFrame(rows, index=df.index, columns=TARGET_NUTRIENTS).astype(float)

    # Handle missing values by filling NaNs with the mean of each nutrient
    nutritions_df = nutritions_df.fillna(nutritions_df.mean())

    # Replace the raw column with the extracted nutrient columns
    df = pd.concat([df.drop(columns='nutritions'), nutritions_df], axis=1)

    print("Columns after parsing 'nutritions':")
    print(df.columns.tolist())
    logging.info(f"Columns after parsing 'nutritions': {df.columns.tolist()}")

    print("Parsing 'nutritions' column completed.")
    logging.info("Parsing 'nutritions' column completed.")
    return df

def extract_image_features(recipes_df, image_dir, output_file, model, batch_size=32):
    """
    Extract CNN features for every recipe image and save them to a .npy file.

    Images are looked up as "<image_dir>/<recipe_id>.jpg", decoded, resized to
    224x224, passed through MobileNetV2's preprocess_input and fed to `model`
    in batches via tf.data. Recipes without an image file keep an all-zero
    feature row, so the result always has one row per row of `recipes_df`.

    Args:
        recipes_df: DataFrame with a 'recipe_id' column.
        image_dir: Directory containing the JPEG files.
        output_file: Path the feature matrix is saved to with np.save.
        model: Model used for feature extraction (called through
            model.predict).
        batch_size: Number of images per batch.

    Returns:
        float32 array of shape (len(recipes_df), feature_dim).
    """
    print(f"Starting image feature extraction from {image_dir}...")
    logging.info(f"Starting image feature extraction from {image_dir}...")

    # Collect the images that exist, remembering each one's row position so
    # its features can be written back to the right row.
    image_paths = []
    valid_indices = []
    for row_pos, recipe_id in enumerate(recipes_df['recipe_id']):
        img_path = os.path.join(image_dir, f"{recipe_id}.jpg")
        if os.path.exists(img_path):
            image_paths.append(img_path)
            valid_indices.append(row_pos)
        else:
            logging.warning(f"Image not found: {img_path}")

    def load_and_preprocess_image(path):
        """Read one JPEG and convert it into a model-ready tensor."""
        img = tf.io.read_file(path)
        img = tf.image.decode_jpeg(img, channels=3)
        img = tf.image.resize(img, [224, 224])
        img = preprocess_input(img)
        return img

    features = []
    if image_paths:
        dataset = tf.data.Dataset.from_tensor_slices(image_paths)
        dataset = dataset.map(load_and_preprocess_image, num_parallel_calls=tf.data.AUTOTUNE)
        dataset = dataset.batch(batch_size)
        dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)

        # Ceiling division, so tqdm can display a real progress bar.
        total_batches = -(-len(image_paths) // batch_size)
        for batch_images in tqdm(dataset, desc="Extracting Features", total=total_batches):
            features.extend(model.predict(batch_images, verbose=0))

        features = np.array(features, dtype=np.float32)
        feature_dim = features.shape[1]
    else:
        # Nothing to run through the model: an empty path list cannot be
        # mapped as strings, and there is no batch to infer the width from.
        # Assumes `model` is built and exposes `output_shape` (true for the
        # Keras Sequential model this file constructs).
        no_images_msg = f"No images found in {image_dir}; saving all-zero features."
        print(no_images_msg)
        logging.warning(no_images_msg)
        feature_dim = int(model.output_shape[-1])

    # Create a full features array with zeros for missing images
    full_features = np.zeros((len(recipes_df), feature_dim), dtype=np.float32)
    if valid_indices:
        full_features[valid_indices] = features

    # Save features to a numpy file
    np.save(output_file, full_features)
    print(f"Image features saved to {output_file}.")
    logging.info(f"Image features saved to {output_file}.")

    return full_features

def load_or_extract_image_features(recipes_df, image_dir, output_file, model, batch_size=32):
    """
    Return image features for `recipes_df`, reusing the cached file when valid.

    If `output_file` exists and holds exactly one row per row of
    `recipes_df`, it is loaded and returned as float32. If the file is
    missing, or its row count no longer matches the DataFrame (a stale cache
    from different data), the features are extracted again with
    extract_image_features(), which also overwrites the file.

    Args:
        recipes_df: DataFrame with a 'recipe_id' column.
        image_dir: Directory containing the recipe images.
        output_file: Path of the .npy cache file.
        model: Feature-extraction model, used only when extraction is needed.
        batch_size: Batch size forwarded to extract_image_features().

    Returns:
        float32 feature array with len(recipes_df) rows.
    """
    if os.path.exists(output_file):
        print(f"Loading image features from {output_file}...")
        logging.info(f"Loading image features from {output_file}...")
        image_features = np.load(output_file).astype(np.float32)
        if image_features.shape[0] == len(recipes_df):
            return image_features
        # Rows are matched to recipes by position only, so a cache with a
        # different length would silently pair features with wrong recipes.
        stale_msg = (
            f"Cached image features in {output_file} have {image_features.shape[0]} rows "
            f"but {len(recipes_df)} recipes were given; re-extracting."
        )
        print(stale_msg)
        logging.warning(stale_msg)
    return extract_image_features(recipes_df, image_dir, output_file, model, batch_size)

def load_data():
    """
    Read every input CSV into a pandas DataFrame.

    Returns:
        Tuple (core_recipes, raw_recipes, core_train, core_valid, core_test,
        raw_interactions), in that order.
    """
    def announce(message):
        # Progress goes to stdout and to the log.
        print(message)
        logging.info(message)

    announce("Loading Core Recipe Data...")
    core_recipes = pd.read_csv(CORE_RECIPE_PATH)
    announce(f"Core Recipes Shape: {core_recipes.shape}")

    announce("Loading Raw Recipe Data...")
    # low_memory=False makes pandas read the file in one pass instead of
    # inferring column types chunk by chunk.
    raw_recipes = pd.read_csv(RAW_RECIPE_PATH, low_memory=False)
    announce(f"Raw Recipes Shape: {raw_recipes.shape}")

    announce("Loading Interaction Data...")
    interaction_paths = (CORE_TRAIN_PATH, CORE_VALID_PATH, CORE_TEST_PATH, RAW_INTERACTION_PATH)
    core_train, core_valid, core_test, raw_interactions = [
        pd.read_csv(csv_path) for csv_path in interaction_paths
    ]

    return core_recipes, raw_recipes, core_train, core_valid, core_test, raw_interactions

def preprocess_data(core_recipes, raw_recipes, core_train, core_valid, core_test, raw_interactions):
    """
    Clean the raw frames, encode user/recipe IDs, and preprocess text.

    Steps, in order:
      1. Drop recipe rows missing 'recipe_id', 'recipe_name', 'ingredients'
         or 'nutritions', and interaction rows with any missing value.
      2. Cast 'user_id' and 'recipe_id' to str in every frame, then fit one
         LabelEncoder per ID type and add '*_id_encoded' columns.
      3. Add an 'ingredients_clean' column to both recipe frames.
      4. Persist the encoders to LE_USER_PATH / LE_RECIPE_PATH.
      5. Expand the 'nutritions' column into per-nutrient columns.

    The six input frames are modified in place (rows dropped, columns added).
    The two recipe frames are additionally replaced by new objects in step 5,
    so callers must use the returned frames.

    Returns:
        Tuple (core_recipes, raw_recipes, core_train, core_valid, core_test,
        raw_interactions, le_user, le_recipe).
    """
    recipe_frames = [core_recipes, raw_recipes]
    interaction_frames = [core_train, core_valid, core_test, raw_interactions]

    print("Handling Missing Values...")
    logging.info("Handling Missing Values...")

    required_recipe_cols = ['recipe_id', 'recipe_name', 'ingredients', 'nutritions']
    for frame in recipe_frames:
        frame.dropna(subset=required_recipe_cols, inplace=True)
    for frame in interaction_frames:
        frame.dropna(inplace=True)

    print("Missing values handled.")
    logging.info("Missing values handled.")

    print("Encoding Categorical Variables...")
    logging.info("Encoding Categorical Variables...")

    # Convert IDs to strings in EVERY frame. The recipe tables must be cast
    # too: fitting the encoder on their raw (typically integer) IDs and then
    # transforming the string IDs of the interactions fails with
    # "y contains previously unseen labels".
    # NOTE(review): an ID column that pandas read as float would stringify as
    # e.g. '123.0' — confirm the CSVs hold integer or string IDs.
    for frame in interaction_frames:
        frame['user_id'] = frame['user_id'].astype(str)
        frame['recipe_id'] = frame['recipe_id'].astype(str)
    for frame in recipe_frames:
        frame['recipe_id'] = frame['recipe_id'].astype(str)

    # Initialize Label Encoders
    le_user = LabelEncoder()
    le_recipe = LabelEncoder()

    combined_user_ids = pd.concat(
        [frame['user_id'] for frame in interaction_frames]
    ).drop_duplicates()
    le_user.fit(combined_user_ids)

    catalogue_recipe_ids = pd.concat(
        [frame['recipe_id'] for frame in recipe_frames]
    ).drop_duplicates()
    interaction_recipe_ids = pd.concat(
        [frame['recipe_id'] for frame in interaction_frames]
    ).drop_duplicates()

    # Interactions may reference recipes absent from both recipe tables (for
    # instance rows dropped above). Include them in the encoder so that
    # transform() cannot fail, and report how many there are.
    orphan_count = int((~interaction_recipe_ids.isin(catalogue_recipe_ids)).sum())
    if orphan_count:
        logging.warning(
            f"{orphan_count} recipe ID(s) appear in interactions but not in the recipe tables."
        )
    combined_recipe_ids = pd.concat(
        [catalogue_recipe_ids, interaction_recipe_ids]
    ).drop_duplicates()
    le_recipe.fit(combined_recipe_ids)

    # Encode user and recipe IDs in the interaction frames
    for frame in interaction_frames:
        frame['user_id_encoded'] = le_user.transform(frame['user_id'])
    for frame in interaction_frames:
        frame['recipe_id_encoded'] = le_recipe.transform(frame['recipe_id'])

    # Encode recipe IDs in recipes DataFrames
    for frame in recipe_frames:
        frame['recipe_id_encoded'] = le_recipe.transform(frame['recipe_id'])

    print("Categorical variables encoded.")
    logging.info("Categorical variables encoded.")

    print("Preprocessing Text Data...")
    logging.info("Preprocessing Text Data...")

    # Download the NLTK resources used by preprocess_text()
    nltk.download('punkt')
    nltk.download('stopwords')

    # Preprocess ingredients
    for frame in recipe_frames:
        frame['ingredients_clean'] = frame['ingredients'].apply(preprocess_text)

    print("Text data preprocessed.")
    logging.info("Text data preprocessed.")

    # Save encoders for future use
    joblib.dump(le_user, LE_USER_PATH)
    joblib.dump(le_recipe, LE_RECIPE_PATH)
    print("Label encoders saved.")
    logging.info("Label encoders saved.")

    # Parse 'nutritions' column (returns new frames)
    core_recipes = parse_nutritions_nested(core_recipes)
    raw_recipes = parse_nutritions_nested(raw_recipes)

    return (
        core_recipes,
        raw_recipes,
        core_train,
        core_valid,
        core_test,
        raw_interactions,
        le_user,
        le_recipe,
    )

def feature_engineering(core_recipes, raw_recipes):
    """
    Vectorize ingredients, scale nutritional features, and extract image features.

    * TF-IDF: a vectorizer saved at TFIDF_VECTORIZER_PATH is reused if
      present; otherwise one is fitted on the core recipes (max 1000 terms)
      and saved. Both frames are transformed with it.
    * Scaling: the TARGET_NUTRIENTS columns are coerced to numeric,
      mean-imputed per frame, and standardized with a scaler loaded from
      SCALER_PATH or fitted on the core recipes. The results are written
      back to both frames as '<nutrient>_scaled' columns (in place).
    * Images: features come from MobileNetV2 (ImageNet weights, global
      average pooling) followed by a Dense(1024) projection, loaded from
      the .npy caches when available.

    Args:
        core_recipes: Core recipe frame with 'ingredients_clean',
            'recipe_id' and the TARGET_NUTRIENTS columns.
        raw_recipes: Raw recipe frame with the same columns.

    Returns:
        Tuple (ingredient_tfidf_core, ingredient_tfidf_raw,
        core_image_features, raw_image_features, tfidf_vectorizer, scaler).
    """
    print("Vectorizing Ingredients with TF-IDF...")
    logging.info("Vectorizing Ingredients with TF-IDF...")

    # Check if TF-IDF vectorizer exists
    if os.path.exists(TFIDF_VECTORIZER_PATH):
        print(f"Loading TF-IDF Vectorizer from {TFIDF_VECTORIZER_PATH}...")
        logging.info(f"Loading TF-IDF Vectorizer from {TFIDF_VECTORIZER_PATH}...")
        tfidf_vectorizer = joblib.load(TFIDF_VECTORIZER_PATH)
        ingredient_tfidf_core = tfidf_vectorizer.transform(core_recipes['ingredients_clean'])
        ingredient_tfidf_raw = tfidf_vectorizer.transform(raw_recipes['ingredients_clean'])
    else:
        # Fit on the core recipes only; the raw recipes reuse that vocabulary
        tfidf_vectorizer = TfidfVectorizer(max_features=1000)
        ingredient_tfidf_core = tfidf_vectorizer.fit_transform(core_recipes['ingredients_clean'])
        ingredient_tfidf_raw = tfidf_vectorizer.transform(raw_recipes['ingredients_clean'])
        # Save the vectorizer
        joblib.dump(tfidf_vectorizer, TFIDF_VECTORIZER_PATH)
        print(f"TF-IDF Vectorizer saved to {TFIDF_VECTORIZER_PATH}.")
        logging.info(f"TF-IDF Vectorizer saved to {TFIDF_VECTORIZER_PATH}.")

    print("Ingredient vectorization completed.")
    logging.info("Ingredient vectorization completed.")

    print("Scaling Nutritional Features...")
    logging.info("Scaling Nutritional Features...")

    # Convert to numeric if not already
    core_recipes[TARGET_NUTRIENTS] = core_recipes[TARGET_NUTRIENTS].apply(pd.to_numeric, errors='coerce')
    raw_recipes[TARGET_NUTRIENTS] = raw_recipes[TARGET_NUTRIENTS].apply(pd.to_numeric, errors='coerce')

    # Fill missing values with each frame's own column means
    core_recipes[TARGET_NUTRIENTS] = core_recipes[TARGET_NUTRIENTS].fillna(core_recipes[TARGET_NUTRIENTS].mean())
    raw_recipes[TARGET_NUTRIENTS] = raw_recipes[TARGET_NUTRIENTS].fillna(raw_recipes[TARGET_NUTRIENTS].mean())

    # Check if scaler exists
    if os.path.exists(SCALER_PATH):
        print(f"Loading Scaler from {SCALER_PATH}...")
        logging.info(f"Loading Scaler from {SCALER_PATH}...")
        scaler = joblib.load(SCALER_PATH)
        core_recipes_scaled = scaler.transform(core_recipes[TARGET_NUTRIENTS])
        raw_recipes_scaled = scaler.transform(raw_recipes[TARGET_NUTRIENTS])
    else:
        # Fit on the core recipes only; the raw recipes reuse those statistics
        scaler = StandardScaler()
        core_recipes_scaled = scaler.fit_transform(core_recipes[TARGET_NUTRIENTS])
        raw_recipes_scaled = scaler.transform(raw_recipes[TARGET_NUTRIENTS])
        # Save the scaler
        joblib.dump(scaler, SCALER_PATH)
        print(f"Scaler saved to {SCALER_PATH}.")
        logging.info(f"Scaler saved to {SCALER_PATH}.")

    # Add scaled features back to DataFrame
    for i, col in enumerate(TARGET_NUTRIENTS):
        core_recipes[f'{col}_scaled'] = core_recipes_scaled[:, i]
        raw_recipes[f'{col}_scaled'] = raw_recipes_scaled[:, i]

    print("Nutritional features scaled.")
    logging.info("Nutritional features scaled.")

    print("Extracting Image Features...")
    logging.info("Extracting Image Features...")

    # Every variable of a model that runs under a distribution strategy has
    # to be created inside that strategy's scope, so the MobileNetV2 backbone
    # is built inside the scope together with the projection layer.
    strategy = tf.distribute.MirroredStrategy()
    with strategy.scope():
        # MobileNetV2 without the classification head, with global average pooling
        base_model = MobileNetV2(weights='imagenet', include_top=False, pooling='avg', input_shape=(224, 224, 3))
        model = Sequential([
            base_model,
            # NOTE(review): this projection is never trained, so it is a fixed
            # random mapping of the backbone features. The initializer is
            # seeded so that feature files written in different runs (e.g. a
            # cached core file and a freshly extracted raw file) share the
            # same projection.
            Dense(
                1024,
                activation='relu',
                kernel_initializer=tf.keras.initializers.GlorotUniform(seed=42),
            ),
        ])

    # Extract or load image features for core and raw datasets
    core_image_features = load_or_extract_image_features(
        core_recipes,
        CORE_IMAGE_DIR,
        CORE_IMAGE_FEATURES_PATH,
        model,
        batch_size=IMAGE_BATCH_SIZE
    )
    raw_image_features = load_or_extract_image_features(
        raw_recipes,
        RAW_IMAGE_DIR,
        RAW_IMAGE_FEATURES_PATH,
        model,
        batch_size=IMAGE_BATCH_SIZE
    )

    print("Image feature extraction completed.")
    logging.info("Image feature extraction completed.")

    return (
        ingredient_tfidf_core,
        ingredient_tfidf_raw,
        core_image_features,
        raw_image_features,
        tfidf_vectorizer,
        scaler,
    )

def build_and_evaluate_svd(core_train, core_valid, core_test):
    """
    Train a Surprise SVD recommender on the pooled core interactions.

    The three interaction frames are concatenated, re-split 80/20 with a
    fixed seed, and an SVD model is fitted on the larger part. RMSE is
    measured on the held-out part and the fitted model is saved to
    SVD_MODEL_PATH.

    Returns:
        Tuple (svd_model, rmse).
    """
    def announce(message):
        # Progress goes to stdout and to the log.
        print(message)
        logging.info(message)

    announce("Building and Evaluating SVD Recommender System...")

    # Pool the provided splits into a single interaction table
    pooled = pd.concat([core_train, core_valid, core_test], ignore_index=True)

    # The rating scale is taken from the observed minimum and maximum
    ratings = pooled['rating']
    reader = Reader(rating_scale=(ratings.min(), ratings.max()))
    surprise_columns = ['user_id_encoded', 'recipe_id_encoded', 'rating']
    data = SurpriseDataset.load_from_df(pooled[surprise_columns], reader)

    # Fresh, seeded 80/20 split of the pooled data
    trainset, testset = surprise_train_test_split(data, test_size=0.2, random_state=42)

    announce("Training SVD Model...")

    svd_settings = {
        'n_factors': 100,
        'n_epochs': 20,
        'lr_all': 0.005,
        'reg_all': 0.4,
        'random_state': 42,
    }
    svd_model = SVD(**svd_settings)
    svd_model.fit(trainset)
    logging.info("SVD model training completed.")

    # Score the held-out interactions
    held_out_predictions = svd_model.test(testset)
    rmse = accuracy.rmse(held_out_predictions, verbose=True)

    # Persist the fitted model
    joblib.dump(svd_model, SVD_MODEL_PATH)
    announce(f"SVD Model saved to '{SVD_MODEL_PATH}'.")

    announce(f"SVD Recommender System RMSE: {rmse:.3f}")

    return svd_model, rmse

def _save_healthiness_confusion_matrix(cm, output_path):
    """Draw a 2x2 confusion matrix as a heatmap and save it to output_path."""
    plt.figure(figsize=(6, 5))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=['Unhealthy', 'Healthy'],
        yticklabels=['Unhealthy', 'Healthy']
    )
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.savefig(output_path)
    plt.close()


def _save_healthiness_roc_curve(y_true, y_score, roc_auc, output_path):
    """Plot the ROC curve with the chance diagonal and save it to output_path."""
    fpr, tpr, _ = roc_curve(y_true, y_score)
    plt.figure(figsize=(6, 5))
    plt.plot(fpr, tpr, label=f'ROC Curve (area = {roc_auc:.4f})')
    plt.plot([0, 1], [0, 1], 'k--')  # Diagonal line
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve for Healthiness Prediction')
    plt.legend(loc='lower right')
    plt.savefig(output_path)
    plt.close()


def _save_healthiness_training_history(history, output_path):
    """Plot train/validation accuracy and loss per epoch and save the figure."""
    plt.figure(figsize=(12, 4))

    # Accuracy Plot
    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'], label='Train Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title('Model Accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(loc='lower right')

    # Loss Plot
    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'], label='Train Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Model Loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(loc='upper right')

    plt.tight_layout()
    plt.savefig(output_path)
    plt.close()


def build_healthiness_model(core_recipes):
    """
    Build, train, evaluate and save the Keras healthiness classifier.

    A recipe is labelled healthy (1) when its 'calories' value is below 500.
    The label is stored in a new 'is_healthy' column of `core_recipes` (the
    frame is modified in place). Features are the '<nutrient>_scaled'
    columns for every entry of TARGET_NUTRIENTS. Data is split 60/20/20 into
    train/validation/test with a fixed seed.

    Side effects: writes 'healthiness_confusion_matrix.png',
    'training_history.png', 'healthiness_roc_curve.png' (only when ROC AUC is
    defined) and the trained model at HEALTH_MODEL_PATH.

    NOTE(review): the label is a threshold on 'calories' while
    'calories_scaled' is one of the input features, so the task is close to
    trivially separable — confirm this is intended.

    Args:
        core_recipes: Frame with a 'calories' column and the scaled nutrient
            columns.

    Returns:
        The trained Keras model.
    """
    def announce(message):
        # Progress goes to stdout and to the log.
        print(message)
        logging.info(message)

    announce("Defining Healthiness Metric...")
    # Define target variable (healthy if calories < 500)
    core_recipes['is_healthy'] = (core_recipes['calories'] < 500).astype(int)

    announce("Preparing Features and Target...")
    feature_cols = [f'{col}_scaled' for col in TARGET_NUTRIENTS]
    X = core_recipes[feature_cols].values.astype(np.float32)
    y = core_recipes['is_healthy'].values

    # 60% train, then the remaining 40% is halved into validation and test
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=0.4, random_state=42
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=42
    )
    announce(
        f"Training Samples: {X_train.shape[0]}, Validation Samples: {X_val.shape[0]}, "
        f"Testing Samples: {X_test.shape[0]}"
    )

    announce("Building Keras Model...")

    # Utilize all available GPUs for the model
    strategy = tf.distribute.MirroredStrategy()
    with strategy.scope():
        # Define the neural network architecture
        model = Sequential([
            InputLayer(input_shape=(X_train.shape[1],)),
            Dense(256, activation='relu'),
            Dropout(0.4),
            Dense(128, activation='relu'),
            Dropout(0.3),
            Dense(64, activation='relu'),
            Dropout(0.2),
            Dense(1, activation='sigmoid')
        ])

        # Compile the model
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
            loss='binary_crossentropy',
            metrics=['accuracy']
        )

    announce("Training Keras Model with Early Stopping...")
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True
    )

    # Balanced class weights, keyed by the actual class label rather than by
    # position in the array of unique labels.
    class_weights_dict = None
    train_classes = np.unique(y_train)
    if len(train_classes) > 1:
        balanced_weights = class_weight.compute_class_weight(
            class_weight='balanced',
            classes=train_classes,
            y=y_train
        )
        class_weights_dict = {
            int(label): float(weight)
            for label, weight in zip(train_classes, balanced_weights)
        }
        logging.info(f"Computed class weights: {class_weights_dict}")
    else:
        logging.info("Only one class present in y_train. No class weights computed.")

    # Train the model
    history = model.fit(
        X_train,
        y_train,
        epochs=100,
        batch_size=256,
        validation_data=(X_val, y_val),
        callbacks=[early_stopping],
        class_weight=class_weights_dict,
        verbose=1
    )

    announce("Model Training Completed.")

    announce("Evaluating Model on Test Set...")
    loss, accuracy_val = model.evaluate(X_test, y_test, verbose=0)
    announce(f"Test Accuracy: {accuracy_val:.4f}")

    # Generate classification report
    y_pred_prob = model.predict(X_test).flatten()
    y_pred = (y_pred_prob >= 0.5).astype(int)
    report = classification_report(y_test, y_pred)
    print("Classification Report:")
    print(report)
    logging.info("Classification Report:")
    logging.info(report)

    # ROC AUC is undefined when the test labels contain a single class;
    # roc_auc_score raises in that case, which would abort the run before
    # the trained model is saved.
    roc_auc = None
    if len(np.unique(y_test)) > 1:
        roc_auc = roc_auc_score(y_test, y_pred_prob)
        announce(f"ROC AUC Score: {roc_auc:.4f}")
    else:
        roc_skip_msg = "ROC AUC is undefined because y_test contains a single class. Skipping ROC analysis."
        print(roc_skip_msg)
        logging.warning(roc_skip_msg)

    # Fix the label set so the matrix is always 2x2 and lines up with the
    # two tick labels, even if a class is absent from the test data.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    _save_healthiness_confusion_matrix(cm, 'healthiness_confusion_matrix.png')
    announce("Confusion matrix plot saved as 'healthiness_confusion_matrix.png'.")

    if roc_auc is not None:
        _save_healthiness_roc_curve(y_test, y_pred_prob, roc_auc, 'healthiness_roc_curve.png')
        announce("ROC curve plot saved as 'healthiness_roc_curve.png'.")

    announce("Plotting Training History...")
    _save_healthiness_training_history(history, 'training_history.png')
    announce("Training history plot saved as 'training_history.png'.")

    announce("Saving Keras Model...")
    model.save(HEALTH_MODEL_PATH)
    announce(f"Healthiness prediction model saved to {HEALTH_MODEL_PATH}.")

    return model

# --------------------------- Main Execution --------------------------- #
def main():
    """Run every pipeline stage in order and report the outcome.

    The stages are ``load_data`` -> ``preprocess_data`` ->
    ``feature_engineering`` -> ``build_and_evaluate_svd`` ->
    ``build_healthiness_model``. Any ``Exception`` raised by a stage is
    printed and logged together with its traceback; it is not re-raised,
    so this function always returns ``None``.
    """
    try:
        # Step 1: Load Data
        (
            recipes_core,
            recipes_raw,
            ratings_train,
            ratings_valid,
            ratings_test,
            interactions_raw,
        ) = load_data()

        # Step 2: Preprocess Data
        # Eight values come back: the six tables (rebound here) followed by
        # two encoder objects that this function does not use afterwards.
        (
            recipes_core,
            recipes_raw,
            ratings_train,
            ratings_valid,
            ratings_test,
            interactions_raw,
            _user_encoder,
            _recipe_encoder,
        ) = preprocess_data(
            recipes_core,
            recipes_raw,
            ratings_train,
            ratings_valid,
            ratings_test,
            interactions_raw,
        )

        # Step 3: Feature Engineering
        # The six returned artefacts are unpacked (so a wrong arity still
        # fails loudly) but none of them is consumed by the later steps here.
        (
            _tfidf_core,
            _tfidf_raw,
            _image_feats_core,
            _image_feats_raw,
            _tfidf_vectorizer,
            _scaler,
        ) = feature_engineering(recipes_core, recipes_raw)

        # Step 4: Build and Evaluate Recommender System (Using SVD)
        _svd_model, rmse = build_and_evaluate_svd(
            ratings_train, ratings_valid, ratings_test
        )
        rmse_report = f"SVD Recommender System RMSE: {rmse:.3f}"
        print(rmse_report)
        logging.info(rmse_report)

        # Step 5: Build Healthiness Prediction Model
        _health_model = build_healthiness_model(recipes_core)

        done_report = "All tasks completed successfully."
        print(done_report)
        logging.info(done_report)
    except Exception as err:
        # Top-level boundary: surface the failure on stdout and in the log
        # (with traceback) rather than letting it propagate.
        failure_report = f"An error occurred: {err}"
        print(failure_report)
        logging.exception(failure_report)


# Only start the pipeline when this file is executed directly.
if __name__ == '__main__':
    main()


2024-09-23 23:29:46,591:INFO:1 Physical GPU(s), 1 Logical GPU(s) found.
2024-09-23 23:29:46,594:INFO:Loading Core Recipe Data...


1 Physical GPU(s), 1 Logical GPU(s) found.
Loading Core Recipe Data...


2024-09-23 23:29:48,734:INFO:Core Recipes Shape: (45630, 6)
2024-09-23 23:29:48,735:INFO:Loading Raw Recipe Data...


Core Recipes Shape: (45630, 6)
Loading Raw Recipe Data...


2024-09-23 23:30:09,176:INFO:Raw Recipes Shape: (49698, 9)
2024-09-23 23:30:09,177:INFO:Loading Interaction Data...


Raw Recipes Shape: (49698, 9)
Loading Interaction Data...


2024-09-23 23:30:14,067:INFO:Handling Missing Values...


Handling Missing Values...


2024-09-23 23:30:14,533:INFO:Missing values handled.
2024-09-23 23:30:14,534:INFO:Encoding Categorical Variables...


Missing values handled.
Encoding Categorical Variables...


2024-09-23 23:30:31,828:INFO:Categorical variables encoded.
2024-09-23 23:30:31,829:INFO:Preprocessing Text Data...


Categorical variables encoded.
Preprocessing Text Data...


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mbpd1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mbpd1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2024-09-23 23:31:05,531:INFO:Text data preprocessed.


Text data preprocessed.


2024-09-23 23:31:06,046:INFO:Label encoders saved.
2024-09-23 23:31:06,047:INFO:Parsing 'nutritions' column with nested dictionary structure...


Label encoders saved.
Parsing 'nutritions' column with nested dictionary structure...


2024-09-23 23:31:40,684:INFO:Columns after parsing 'nutritions': ['recipe_id', 'recipe_name', 'image_url', 'ingredients', 'cooking_directions', 'recipe_id_encoded', 'ingredients_clean', 'calories', 'protein', 'fat', 'carbohydrates', 'fiber']
2024-09-23 23:31:40,685:INFO:Parsing 'nutritions' column completed.


Columns after parsing 'nutritions':
['recipe_id', 'recipe_name', 'image_url', 'ingredients', 'cooking_directions', 'recipe_id_encoded', 'ingredients_clean', 'calories', 'protein', 'fat', 'carbohydrates', 'fiber']
Parsing 'nutritions' column completed.
Parsing 'nutritions' column with nested dictionary structure...


2024-09-23 23:31:41,140:INFO:Parsing 'nutritions' column with nested dictionary structure...
2024-09-23 23:32:17,842:INFO:Columns after parsing 'nutritions': ['recipe_id', 'recipe_name', 'aver_rate', 'image_url', 'review_nums', 'ingredients', 'cooking_directions', 'reviews', 'recipe_id_encoded', 'ingredients_clean', 'calories', 'protein', 'fat', 'carbohydrates', 'fiber']
2024-09-23 23:32:17,843:INFO:Parsing 'nutritions' column completed.


Columns after parsing 'nutritions':
['recipe_id', 'recipe_name', 'aver_rate', 'image_url', 'review_nums', 'ingredients', 'cooking_directions', 'reviews', 'recipe_id_encoded', 'ingredients_clean', 'calories', 'protein', 'fat', 'carbohydrates', 'fiber']
Parsing 'nutritions' column completed.


2024-09-23 23:32:18,283:INFO:Vectorizing Ingredients with TF-IDF...
2024-09-23 23:32:18,284:INFO:Loading TF-IDF Vectorizer from tfidf_vectorizer.pkl...


Vectorizing Ingredients with TF-IDF...
Loading TF-IDF Vectorizer from tfidf_vectorizer.pkl...


2024-09-23 23:32:19,402:INFO:Ingredient vectorization completed.
2024-09-23 23:32:19,403:INFO:Scaling Nutritional Features...
2024-09-23 23:32:19,439:INFO:Loading Scaler from scaler.pkl...
2024-09-23 23:32:19,453:INFO:Nutritional features scaled.
2024-09-23 23:32:19,456:INFO:Extracting Image Features...


Ingredient vectorization completed.
Scaling Nutritional Features...
Loading Scaler from scaler.pkl...
Nutritional features scaled.
Extracting Image Features...
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


2024-09-23 23:32:21,027:INFO:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
2024-09-23 23:32:21,574:INFO:Loading image features from core_image_features.npy...
2024-09-23 23:32:21,708:INFO:Loading image features from raw_image_features.npy...


Loading image features from core_image_features.npy...
Loading image features from raw_image_features.npy...


2024-09-23 23:32:21,852:INFO:Image feature extraction completed.
2024-09-23 23:32:21,853:INFO:Building and Evaluating SVD Recommender System...


Image feature extraction completed.
Building and Evaluating SVD Recommender System...


2024-09-23 23:32:25,383:INFO:Training SVD Model...


Training SVD Model...


2024-09-23 23:32:39,875:INFO:SVD model training completed.


RMSE: 0.7961


2024-09-23 23:32:58,770:INFO:SVD Model saved to 'svd_model.pkl'.
2024-09-23 23:32:58,771:INFO:SVD Recommender System RMSE: 0.796


SVD Model saved to 'svd_model.pkl'.
SVD Recommender System RMSE: 0.796


2024-09-23 23:32:59,091:INFO:SVD Recommender System RMSE: 0.796
2024-09-23 23:32:59,092:INFO:Defining Healthiness Metric...
2024-09-23 23:32:59,095:INFO:Preparing Features and Target...
2024-09-23 23:32:59,109:INFO:Training Samples: 27378, Validation Samples: 9126, Testing Samples: 9126
2024-09-23 23:32:59,110:INFO:Building Keras Model...


SVD Recommender System RMSE: 0.796
Defining Healthiness Metric...
Preparing Features and Target...
Training Samples: 27378, Validation Samples: 9126, Testing Samples: 9126
Building Keras Model...
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


2024-09-23 23:32:59,112:INFO:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


2024-09-23 23:32:59,327:INFO:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


2024-09-23 23:32:59,335:INFO:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


2024-09-23 23:32:59,344:INFO:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


2024-09-23 23:32:59,349:INFO:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
2024-09-23 23:32:59,358:INFO:Training Keras Model with Early Stopping...
2024-09-23 23:32:59,370:INFO:Computed class weights: {0: 2.9337762537505356, 1: 0.6027210285311729}


Training Keras Model with Early Stopping...
Epoch 1/100
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


2024-09-23 23:33:00,263:INFO:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


2024-09-23 23:33:00,271:INFO:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


2024-09-23 23:33:00,278:INFO:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


2024-09-23 23:33:00,300:INFO:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


2024-09-23 23:33:00,310:INFO:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


2024-09-23 23:33:01,639:INFO:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100


2024-09-23 23:33:52,860:INFO:Model Training Completed.
2024-09-23 23:33:52,860:INFO:Evaluating Model on Test Set...


Model Training Completed.
Evaluating Model on Test Set...


2024-09-23 23:33:54,893:INFO:Test Accuracy: 0.9977


Test Accuracy: 0.9977


2024-09-23 23:33:57,442:INFO:Classification Report:
2024-09-23 23:33:57,442:INFO:              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1530
           1       1.00      1.00      1.00      7596

    accuracy                           1.00      9126
   macro avg       0.99      1.00      1.00      9126
weighted avg       1.00      1.00      1.00      9126

2024-09-23 23:33:57,446:INFO:ROC AUC Score: 1.0000


Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1530
           1       1.00      1.00      1.00      7596

    accuracy                           1.00      9126
   macro avg       0.99      1.00      1.00      9126
weighted avg       1.00      1.00      1.00      9126

ROC AUC Score: 1.0000


2024-09-23 23:33:57,729:INFO:Confusion matrix plot saved as 'healthiness_confusion_matrix.png'.


Confusion matrix plot saved as 'healthiness_confusion_matrix.png'.
ROC curve plot saved as 'healthiness_roc_curve.png'.


2024-09-23 23:33:57,904:INFO:ROC curve plot saved as 'healthiness_roc_curve.png'.
2024-09-23 23:33:57,905:INFO:Plotting Training History...


Plotting Training History...


2024-09-23 23:33:58,251:INFO:Training history plot saved as 'training_history.png'.
2024-09-23 23:33:58,252:INFO:Saving Keras Model...
2024-09-23 23:33:58,306:INFO:Healthiness prediction model saved to healthiness_model.h5.
2024-09-23 23:33:58,307:INFO:All tasks completed successfully.


Training history plot saved as 'training_history.png'.
Saving Keras Model...
Healthiness prediction model saved to healthiness_model.h5.
All tasks completed successfully.
