In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.multioutput import MultiOutputClassifier
import joblib
import warnings
import os
import sys

# Silence every warning category for the whole process so the training log
# stays readable.
# NOTE(review): this also hides deprecation and data-quality warnings raised
# by pandas / scikit-learn -- consider narrowing the filter.
warnings.filterwarnings('ignore')

# Plot style: seaborn "whitegrid" theme, and a 12x8 default figure size for
# figures that do not pass their own `figsize`.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)

# =====================================================================
# GOOGLE COLAB SETUP
# =====================================================================

def setup_colab_environment():
    """
    Work out whether the code runs inside Google Colab and mount Drive if so.

    Detection is done by importing ``google.colab``; an ImportError anywhere
    in the Colab branch is treated as "running locally".

    Returns:
        tuple: ``(base_path, in_colab)``.
            base_path: ``'/content/drive/MyDrive'`` (the 'My Drive' folder)
                in Colab, ``''`` otherwise.
            in_colab: ``True`` in Colab, ``False`` otherwise.
    """
    rule = "=" * 70
    try:
        # The import itself is the detection step; the module is not used.
        import google.colab
        in_colab = True

        for line in ("\n" + rule, "üîó GOOGLE COLAB DETECTED", rule):
            print(line)

        from google.colab import drive
        print("\nüìÅ Mounting Google Drive...")
        drive.mount('/content/drive')
        print("‚úì Google Drive mounted successfully!")

        # Everything the pipeline reads or writes lives under 'My Drive'.
        drive_root = '/content/drive/MyDrive'
        print(f"‚úì Working base path (My Drive): {drive_root}")
    except ImportError:
        # Not in Colab: fall back to paths relative to the working directory.
        print("\nüíª Running locally")
        return '', False

    return drive_root, in_colab


# Resolve the runtime environment once, at module level (GLOBAL).
# BASE_PATH is '/content/drive/MyDrive' in Colab and '' otherwise; IN_COLAB is
# the matching boolean. NOTE: this call prints to stdout and, in Colab, mounts
# Google Drive as a side effect of running this module.
BASE_PATH, IN_COLAB = setup_colab_environment()

# Dataset CSV location. A relative value is joined onto BASE_PATH by
# FoodRecommendationModel.load_data(); include the subfolder here (for example
# 'data/food_health_dataset.csv') if the file is not directly in My Drive.
DATASET_NAME = 'food_health_dataset.csv'  # Must exist under BASE_PATH


class FoodRecommendationModel:
    """Food Recommendation ML Model Trainer"""

    def __init__(self, dataset_path=DATASET_NAME):
        """Create an empty trainer.

        Nothing is read or fitted here; every attribute except
        ``dataset_path`` starts as ``None`` and is filled in by the pipeline
        methods.

        Args:
            dataset_path: CSV path; resolved in ``load_data``.
        """
        self.dataset_path = dataset_path

        # Raw dataset (set by load_data).
        self.df = None

        # Column names used as model inputs / outputs (set by preprocess_data).
        self.feature_cols = self.target_cols = None

        # Train / test partitions (set by preprocess_data).
        self.X_train = self.X_test = None
        self.y_train = self.y_test = None

        # Standardized copies of the feature partitions (set by preprocess_data).
        self.X_train_scaled = self.X_test_scaled = None

        # Fitted scaler (preprocess_data) and selected model (train_models).
        self.scaler = None
        self.model = None

    def load_data(self):
        """Read the dataset CSV into ``self.df``.

        An absolute ``dataset_path`` is used as given; a relative one is
        joined onto ``BASE_PATH`` when ``BASE_PATH`` is non-empty.

        Returns:
            bool: ``True`` once the file has been read and summarized,
            ``False`` if the file is missing or any other exception occurs
            while loading (the problem is printed, not raised).
        """
        rule = "=" * 70
        print("\n" + rule)
        print("üìä LOADING DATASET")
        print(rule)

        # Work out where the CSV actually lives.
        path_is_absolute = os.path.isabs(self.dataset_path)
        if path_is_absolute or not BASE_PATH:
            csv_path = self.dataset_path
        else:
            csv_path = os.path.join(BASE_PATH, self.dataset_path)

        try:
            self.df = pd.read_csv(csv_path)
            summary_lines = (
                "‚úì Dataset loaded successfully!",
                f"  - Path: {csv_path}",
                f"  - Shape: {self.df.shape}",
                f"  - Food items: {len(self.df)}",
                "\nüìã First few rows:",
            )
            for line in summary_lines:
                print(line)
            print(self.df.head())
            return True

        except FileNotFoundError:
            print("‚úó Error: Dataset file not found!")
            print(f"  Looking for: {csv_path}")
            if not IN_COLAB:
                print(f"  Please make sure '{self.dataset_path}' exists in the current directory.")
                return False
            # In Colab, point the user at the expected Drive location.
            print("\nüí° Tip: Make sure this file is in your Google Drive:")
            print(f"  My Drive / {self.dataset_path}")
            return False

        except Exception as err:
            # Best-effort loader: report anything else and signal failure.
            print(f"‚úó Error loading dataset: {err!s}")
            return False

    def explore_data(self):
        """Print an exploratory summary of ``self.df`` and save the plots.

        Reports, in order: ``DataFrame.info()``, ``DataFrame.describe()``,
        the columns that contain missing values, and the ``category`` value
        counts (when that column exists). Finishes by calling
        ``_create_visualizations()``.

        Expects ``self.df`` to have been populated by ``load_data``.
        """
        print("\n" + "="*70)
        print("üîç EXPLORATORY DATA ANALYSIS")
        print("="*70)

        print("\nüìä Dataset Info:")
        print("-" * 70)
        # DataFrame.info() writes its report to stdout itself and returns
        # None, so wrapping it in print() emitted a stray "None" line.
        self.df.info()

        print("\nüìà Statistical Summary:")
        print("-" * 70)
        print(self.df.describe())

        # Check for missing values
        missing = self.df.isnull().sum()
        if missing.any():
            print("\n‚ö† Missing Values:")
            # Only list the columns that actually have gaps.
            print(missing[missing > 0])
        else:
            print("\n‚úì No missing values found!")

        # Category distribution
        if 'category' in self.df.columns:
            print("\nüìä Category Distribution:")
            print(self.df['category'].value_counts())
        else:
            print("\n‚ö† 'category' column not found in dataset!")

        # Visualizations
        self._create_visualizations()

    def _create_visualizations(self):
        """Create and save the exploratory plots for ``self.df``.

        One private helper per figure; each helper skips its figure when the
        columns it needs are absent. Files are written under ``BASE_PATH``
        (or the current directory when ``BASE_PATH`` is empty).
        """
        print("\nüìä Creating visualizations...")

        self._plot_score_distributions()
        self._plot_correlation_heatmap()
        self._plot_category_distribution()

    @staticmethod
    def _save_current_figure(filename):
        """Save the current matplotlib figure as ``filename`` and close it."""
        viz_path = os.path.join(BASE_PATH if BASE_PATH else '.', filename)
        try:
            plt.savefig(viz_path, dpi=300, bbox_inches='tight')
            print(f"  ‚úì Saved: {viz_path}")
        finally:
            # Close even when saving fails so figures do not pile up in
            # memory across repeated runs.
            plt.close()

    def _plot_score_distributions(self):
        """Plot a 2x3 grid of histograms, one per health suitability score."""
        score_titles = {
            'diabetes_score': 'Diabetes',
            'hypertension_score': 'Hypertension',
            'heart_disease_score': 'Heart Disease',
            'cholesterol_score': 'Cholesterol',
            'obesity_score': 'Obesity',
            'kidney_score': 'Kidney Disease',
        }
        if not any(col in self.df.columns for col in score_titles):
            return

        fig, axes = plt.subplots(2, 3, figsize=(18, 10))
        fig.suptitle('Distribution of Health Suitability Scores',
                     fontsize=16, fontweight='bold')

        # axes.flat walks the grid row by row, so every score keeps a fixed
        # panel position regardless of which columns are present.
        for ax, (col, title) in zip(axes.flat, score_titles.items()):
            if col not in self.df.columns:
                # Hide the panel rather than leaving an empty, unlabeled axes.
                ax.set_visible(False)
                continue
            ax.hist(
                self.df[col],
                bins=10,
                edgecolor='black',
                alpha=0.7,
                color='steelblue'
            )
            ax.set_title(f'{title} Suitability Scores', fontweight='bold')
            ax.set_xlabel('Score (1-10)')
            ax.set_ylabel('Frequency')
            ax.grid(alpha=0.3)

        plt.tight_layout()
        self._save_current_figure('health_scores_distribution.png')

    def _plot_correlation_heatmap(self):
        """Plot the correlation matrix of the nutritional feature columns."""
        correlation_cols = [c for c in ['calories', 'protein', 'carbohydrates']
                            if c in self.df.columns]
        # A correlation matrix needs at least two columns to be meaningful.
        if len(correlation_cols) < 2:
            return

        plt.figure(figsize=(12, 8))
        correlation_matrix = self.df[correlation_cols].corr()
        sns.heatmap(
            correlation_matrix,
            annot=True,
            fmt='.2f',
            cmap='coolwarm',
            center=0,
            square=True,
            linewidths=1
        )
        plt.title('Correlation Matrix of Nutritional Features',
                  fontsize=14, fontweight='bold', pad=20)
        plt.tight_layout()
        self._save_current_figure('correlation_matrix.png')

    def _plot_category_distribution(self):
        """Plot a bar chart of the number of food items per category."""
        if 'category' not in self.df.columns:
            return

        plt.figure(figsize=(12, 6))
        category_counts = self.df['category'].value_counts()
        positions = range(len(category_counts))
        plt.bar(positions, category_counts.values, color='steelblue')
        plt.xlabel('Category', fontweight='bold')
        plt.ylabel('Count', fontweight='bold')
        plt.title('Food Items by Category',
                  fontsize=14, fontweight='bold', pad=20)
        plt.xticks(positions, category_counts.index, rotation=45, ha='right')
        plt.grid(axis='y', alpha=0.3)
        plt.tight_layout()
        self._save_current_figure('category_distribution.png')

    def preprocess_data(self, threshold=7, test_size=0.2, random_state=42):
        """Encode, label, split and scale ``self.df`` ready for training.

        Steps: label-encode ``category`` into ``category_encoded``; derive one
        binary ``*_suitable`` column per ``*_score`` column
        (1 when ``score >= threshold``); split features/targets into train and
        test partitions; fit a ``StandardScaler`` on the training features and
        apply it to both partitions.

        Populates ``feature_cols``, ``target_cols``, ``X_train``, ``X_test``,
        ``y_train``, ``y_test``, ``X_train_scaled``, ``X_test_scaled``,
        ``scaler`` and ``label_encoder``, and adds the derived columns to
        ``self.df``.

        Args:
            threshold: Minimum score that counts as "suitable".
            test_size: Passed to ``train_test_split`` as ``test_size``.
            random_state: Seed passed to ``train_test_split``.

        Raises:
            ValueError: If ``category``, any score column or any nutritional
                feature column is missing from the dataset.
        """
        print("\n" + "="*70)
        print("‚öô DATA PREPROCESSING")
        print("="*70)

        score_label_pairs = [
            ('diabetes_score', 'diabetes_suitable'),
            ('hypertension_score', 'hypertension_suitable'),
            ('heart_disease_score', 'heart_disease_suitable'),
            ('cholesterol_score', 'cholesterol_suitable'),
            ('obesity_score', 'obesity_suitable'),
            ('kidney_score', 'kidney_suitable')
        ]
        raw_feature_cols = ['calories', 'protein', 'carbohydrates']

        # Validate every required column before touching self.df, so a bad
        # dataset does not leave it half-modified.
        if 'category' not in self.df.columns:
            raise ValueError("Dataset must contain a 'category' column.")
        for score_col, _ in score_label_pairs:
            if score_col not in self.df.columns:
                raise ValueError(f"Missing required column in dataset: {score_col}")
        for col in raw_feature_cols:
            if col not in self.df.columns:
                raise ValueError(f"Missing feature column: {col}")

        # Encode categorical variable. The fitted encoder is kept on the
        # instance: without it the integer codes cannot be reproduced for new
        # inputs after this method returns.
        self.label_encoder = LabelEncoder()
        self.df['category_encoded'] = self.label_encoder.fit_transform(self.df['category'])
        print("\n‚úì Category encoding completed")
        print(f"  - Categories: {list(self.label_encoder.classes_)}")

        # Create binary suitability labels (score >= threshold is suitable)
        for score_col, label_col in score_label_pairs:
            self.df[label_col] = (self.df[score_col] >= threshold).astype(int)

        print(f"\n‚úì Binary labels created (threshold: {threshold})")

        # Define feature and target columns. Targets are derived from the
        # pairs above so the two lists cannot drift apart.
        self.feature_cols = raw_feature_cols + ['category_encoded']
        self.target_cols = [label_col for _, label_col in score_label_pairs]

        X = self.df[self.feature_cols]
        y = self.df[self.target_cols]

        print("\n‚úì Features prepared:")
        print(f"  - Feature shape: {X.shape}")
        print(f"  - Features: {self.feature_cols}")
        print("\n‚úì Targets prepared:")
        print(f"  - Target shape: {y.shape}")
        print(f"  - Targets: {self.target_cols}")

        # Train-test split
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

        print("\n‚úì Data split completed:")
        print(f"  - Training set: {self.X_train.shape[0]} samples")
        print(f"  - Testing set: {self.X_test.shape[0]} samples")

        # Feature scaling: fit on the training partition only, then reuse the
        # same statistics for the test partition.
        self.scaler = StandardScaler()
        self.X_train_scaled = self.scaler.fit_transform(self.X_train)
        self.X_test_scaled = self.scaler.transform(self.X_test)

        print("\n‚úì Feature scaling completed")

    def train_models(self):
        """Fit every candidate classifier and keep the highest-scoring one.

        Each base estimator is wrapped in ``MultiOutputClassifier``, fitted on
        the scaled training split and scored with ``accuracy_score`` on the
        scaled test split. The winner is stored in ``self.model``.

        Returns:
            tuple: ``(results, best_model_name)``; ``results`` maps each model
            name to a dict with keys ``'model'``, ``'accuracy'`` and
            ``'predictions'``.
        """
        rule = "=" * 70
        print("\n" + rule)
        print("ü§ñ MODEL TRAINING")
        print(rule)

        forest = RandomForestClassifier(
            n_estimators=100,
            max_depth=10,
            min_samples_split=5,
            random_state=42,
        )
        boosting = GradientBoostingClassifier(
            n_estimators=100,
            max_depth=5,
            learning_rate=0.1,
            random_state=42,
        )
        tree = DecisionTreeClassifier(max_depth=8, random_state=42)
        candidates = [
            ('Random Forest', forest),
            ('Gradient Boosting', boosting),
            ('Decision Tree', tree),
        ]

        results = {}
        for label, estimator in candidates:
            print(f"\nüîÑ Training {label}...")
            wrapped = MultiOutputClassifier(estimator)
            wrapped.fit(self.X_train_scaled, self.y_train)

            test_pred = wrapped.predict(self.X_test_scaled)
            score = accuracy_score(self.y_test, test_pred)
            results[label] = {
                'model': wrapped,
                'accuracy': score,
                'predictions': test_pred,
            }

            print(f"  ‚úì {label} accuracy: {score:.4f}")

        # NOTE(review): candidates are ranked on the same test split that
        # evaluate_model() reports on, so the reported score of the winner is
        # likely optimistic.
        best_model_name, best_entry = max(
            results.items(), key=lambda item: item[1]['accuracy']
        )
        self.model = best_entry['model']

        print(f"\nüèÜ Best Model: {best_model_name}")
        print(f"  - Accuracy: {best_entry['accuracy']:.4f}")

        return results, best_model_name

    def evaluate_model(self):
        """Print evaluation metrics for ``self.model`` on the test split.

        Prints the overall accuracy across all targets, the accuracy of each
        individual target, and a classification report per target. When the
        fitted per-target estimators expose ``feature_importances_`` it also
        calls ``_plot_feature_importance()``.

        Expects ``preprocess_data`` and ``train_models`` to have run.
        """
        print("\n" + "="*70)
        print("üìä MODEL EVALUATION")
        print("="*70)

        y_pred = self.model.predict(self.X_test_scaled)
        overall_accuracy = accuracy_score(self.y_test, y_pred)

        print(f"\nüéØ Overall Accuracy: {overall_accuracy:.4f}")
        print("\n" + "-"*70)
        print("Accuracy per Health Condition:")
        print("-" * 70)

        # Per-condition accuracy
        for idx, condition in enumerate(self.target_cols):
            acc = accuracy_score(self.y_test.iloc[:, idx], y_pred[:, idx])
            print(f"  {condition:30s}: {acc:.4f}")

        print("\n" + "-"*70)
        print("Detailed Classification Reports:")
        print("-" * 70)

        for idx, condition in enumerate(self.target_cols):
            print(f"\n{condition}:")
            print(
                classification_report(
                    self.y_test.iloc[:, idx],
                    y_pred[:, idx],
                    # Pin both classes explicitly: without `labels`, a target
                    # whose test rows and predictions contain a single class
                    # makes the two target_names mismatch and raises
                    # ValueError.
                    labels=[0, 1],
                    target_names=['Not Suitable', 'Suitable'],
                    # A class with no samples/predictions scores 0 instead of
                    # triggering an undefined-metric warning.
                    zero_division=0
                )
            )

        # Feature importance (RandomForest / Tree / GB)
        if hasattr(self.model.estimators_[0], 'feature_importances_'):
            self._plot_feature_importance()

    def _plot_feature_importance(self):
        """Plot and save the model's feature importances as a bar chart.

        ``self.model`` holds one fitted estimator per target column, so the
        importances are averaged over all of them; reading only the first
        estimator would describe a single target while the chart is titled as
        the importance for the recommendations as a whole.

        The image is written to ``feature_importance.png`` under ``BASE_PATH``
        (or the current directory when ``BASE_PATH`` is empty).
        """
        print("\nüìä Creating feature importance visualization...")

        per_target = [est.feature_importances_ for est in self.model.estimators_]
        importances = np.mean(per_target, axis=0)
        feature_importance_df = pd.DataFrame({
            'feature': self.feature_cols,
            'importance': importances
        }).sort_values('importance', ascending=False)

        plt.figure(figsize=(10, 6))
        try:
            plt.barh(
                feature_importance_df['feature'],
                feature_importance_df['importance'],
                color='steelblue'
            )
            plt.xlabel('Importance', fontweight='bold')
            plt.title('Feature Importance for Food Recommendations',
                      fontsize=14, fontweight='bold', pad=20)
            plt.grid(axis='x', alpha=0.3)
            plt.tight_layout()
            viz_path = os.path.join(
                BASE_PATH if BASE_PATH else '.',
                'feature_importance.png'
            )
            plt.savefig(viz_path, dpi=300, bbox_inches='tight')
            print(f"  ‚úì Saved: {viz_path}")
        finally:
            # Release the figure even if drawing or saving fails.
            plt.close()

    def save_models(self, output_dir='models'):
        """Persist the selected model, the scaler and the feature list.

        Three joblib files are written into ``output_dir`` (created if
        needed). In Colab the directory is placed under ``BASE_PATH``.

        Args:
            output_dir: Destination directory for the ``.pkl`` files.
        """
        rule = "=" * 70
        print("\n" + rule)
        print("üíæ SAVING MODELS")
        print(rule)

        # In Colab everything is stored inside the mounted Drive folder.
        target_dir = os.path.join(BASE_PATH, output_dir) if IN_COLAB else output_dir
        os.makedirs(target_dir, exist_ok=True)

        model_path = os.path.join(target_dir, 'food_recommendation_model.pkl')
        scaler_path = os.path.join(target_dir, 'feature_scaler.pkl')
        features_path = os.path.join(target_dir, 'feature_columns.pkl')

        artifacts = (
            (self.model, model_path),
            (self.scaler, scaler_path),
            (self.feature_cols, features_path),
        )
        for artifact, destination in artifacts:
            joblib.dump(artifact, destination)

        print("\n‚úì Models saved successfully!")
        print(f"  - Model:   {model_path}")
        print(f"  - Scaler:  {scaler_path}")
        print(f"  - Features:{features_path}")

        if not IN_COLAB:
            return

        # Show the location the way the user sees it in the Drive UI.
        print("\nüìÅ Models are in your Google Drive under:")
        print(f"  My Drive / {os.path.relpath(target_dir, BASE_PATH)}")

    def test_prediction(self):
        """Run the trained model on a few hand-written foods and print results.

        For each sample the features are scaled with ``self.scaler`` and the
        per-condition prediction (1 = suitable) is printed.

        Expects ``preprocess_data`` and ``train_models`` to have run.
        """
        print("\n" + "="*70)
        print("üß™ TESTING PREDICTIONS")
        print("="*70)

        # Example test cases.
        # NOTE(review): the category codes below are illustrative literals;
        # they are not looked up from the encoder fitted in preprocess_data,
        # so adjust them to match your own category encoding.
        test_foods = [
            {
                'name': 'Grilled Chicken Salad',
                'calories': 250,
                'protein': 35,
                'carbohydrates': 15,
                'category_encoded': 6  # Example code
            },
            {
                'name': 'Fried Chicken Burger',
                'calories': 680,
                'protein': 28,
                'carbohydrates': 52,
                'category_encoded': 2
            },
            {
                'name': 'Vegetable Soup',
                'calories': 120,
                'protein': 5,
                'carbohydrates': 18,
                'category_encoded': 8
            }
        ]

        for food in test_foods:
            print(f"\nüçΩ  {food['name']}")
            print(f"  Nutritional Info: {food['calories']} cal, "
                  f"{food['protein']}g protein, "
                  f"{food['carbohydrates']}g carbs")

            # Build the row from self.feature_cols so the column order always
            # matches training, and pass a DataFrame so the scaler (fitted on
            # a DataFrame) receives the same feature names it was fitted with
            # instead of an unnamed array.
            features = pd.DataFrame(
                [[food[col] for col in self.feature_cols]],
                columns=self.feature_cols
            )

            features_scaled = self.scaler.transform(features)
            predictions = self.model.predict(features_scaled)[0]

            print("  Suitability:")
            for idx, condition in enumerate(self.target_cols):
                status = "‚úì Suitable" if predictions[idx] == 1 else "‚úó Not Suitable"
                print(f"    {condition:30s}: {status}")


def main():
    """Run the whole training pipeline from loading to sample predictions.

    Order: load -> explore -> preprocess -> train -> evaluate -> save ->
    sample predictions. Exits the process with status 1 when the dataset
    cannot be loaded.
    """
    rule = "=" * 70

    print("\n" + rule)
    print("üçΩ  OLIVE FOODS - ML MODEL TRAINING")
    print(rule)
    print("\nHealth-Based Food Recommendation System")
    print("Training machine learning model for personalized food recommendations")
    print(rule)

    trainer = FoodRecommendationModel(dataset_path=DATASET_NAME)

    # Nothing else can run without data, so bail out early.
    loaded = trainer.load_data()
    if not loaded:
        print("\n‚ùå Training aborted due to data loading error.")
        raise SystemExit(1)

    trainer.explore_data()
    trainer.preprocess_data()
    # The returned (results, best_model_name) pair is not needed here; the
    # selected model is kept on the trainer itself.
    trainer.train_models()
    trainer.evaluate_model()
    trainer.save_models(output_dir='Olive-Foods-ML-models')
    trainer.test_prediction()

    closing_lines = (
        "\n" + rule,
        "‚úÖ MODEL TRAINING COMPLETED SUCCESSFULLY!",
        rule,
        "\nüì¶ Next Steps:",
        "  1. Download the .pkl files from your Google Drive",
        "  2. Place them in your Flask app's 'models' directory",
        "  3. Start your Flask ML service and connect it to your MERN backend",
        "\n" + rule,
    )
    for line in closing_lines:
        print(line)


if __name__ == "__main__":
    main()



üîó GOOGLE COLAB DETECTED

üìÅ Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
‚úì Google Drive mounted successfully!
‚úì Working base path (My Drive): /content/drive/MyDrive

üçΩ  OLIVE FOODS - ML MODEL TRAINING

Health-Based Food Recommendation System
Training machine learning model for personalized food recommendations

üìä LOADING DATASET
‚úì Dataset loaded successfully!
  - Path: /content/drive/MyDrive/food_health_dataset.csv
  - Shape: (5000, 12)
  - Food items: 5000

üìã First few rows:
   food_id             food_name         category  calories  protein  \
0        1      Greek Veggie Mix  Salads & Greens       150       12   
1        2        Grilled Quinoa  Salads & Greens       190       18   
2        3  Avocado Super Greens  Salads & Greens       150       10   
3        4         Fresh Spinach  Salads & Greens       171       20   
4        5  Herbed Chicken Sa