<a href="https://colab.research.google.com/github/BRV12G/Final_year_Project/blob/main/Naive_bayes_using_updated_diet_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the dataset
file_path = '/content/nutrition_dataset_with_fiber_water_intake.csv'  # Update this path
data = pd.read_csv(file_path)

# Inspect column names
print("Dataset Columns:", data.columns.tolist())

# Drop the Person ID column as it's not useful for classification
data = data.drop(columns=['Person ID'])

# Encode categorical (object-dtype) variables as integer codes. The fitted
# encoders are kept so user input can be encoded and predictions decoded later.
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

# Targets to predict; one independent classifier is trained per entry.
output_features = [
    'Health Status', 'BMI Values', 'BMI Class', 'Calories (kcal)', 'Carbohydrates (g)',
    'Proteins (g)', 'Fats (g)', 'Vitamin A (mcg)', 'Vitamin C (mg)',
    'Vitamin D (mcg)', 'Sodium (mg)', 'Potassium (mg)', 'Magnesium (mg)',
    'Iron (mg)', 'Zinc (mg)', 'Fiber Intake (g)', 'Water Intake (L)'
]

# GaussianNB needs discrete class labels, so every target that is not already
# an integer column is label-encoded (each distinct value becomes one class).
# `is_integer_dtype` is used instead of comparing against the string 'int':
# that comparison depends on the platform's default integer width and can
# re-encode the int64 codes produced above, overwriting the label -> code
# encoder with a code -> code one.
for feature in output_features:
    if feature in data.columns and not pd.api.types.is_integer_dtype(data[feature]):
        label_encoders[feature] = LabelEncoder()
        data[feature] = label_encoders[feature].fit_transform(data[feature])

# Define input features
input_features = [
    'Gender', 'Age', 'Occupation', 'Sleep Duration', 'Quality of Sleep',
    'Activity Level', 'Stress Level', 'Blood Pressure Category',
    'Systolic', 'Diastolic', 'Heart Rate', 'Daily Steps',
    'Height (cm)', 'Weight (kg)'
]

# Split the data into features (X) and targets (y)
X = data[input_features]
y = data[output_features]

# Train-test split (80/20, fixed seed so runs are reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate one model per output feature.
# models:  target name -> fitted GaussianNB
# results: target name -> {'accuracy', 'classification_report', 'confusion_matrix'}
models = {}
results = {}

for target in output_features:
    # Define target-specific training and testing data
    y_train_target = y_train[target]
    y_test_target = y_test[target]

    # Train a Gaussian Naive Bayes classifier
    model = GaussianNB()
    model.fit(X_train, y_train_target)
    models[target] = model

    # Test the model
    y_pred = model.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test_target, y_pred)
    results[target] = {
        'accuracy': accuracy,
        'classification_report': classification_report(y_test_target, y_pred, zero_division=0),
        'confusion_matrix': confusion_matrix(y_test_target, y_pred)
    }

    # Print evaluation metrics (the full report is stored in `results`)
    print(f"\nTarget: {target}")
    print(f"Accuracy: {accuracy}")
    print("Confusion Matrix:\n", results[target]['confusion_matrix'])
# Function to predict multiple outputs for new user inputs
def predict_user_input():
    """Prompt for one person's details and print each model's prediction.

    Reads the module-level `label_encoders`, `models`, `data` and
    `input_features`. Categorical answers are encoded with the encoders
    fitted during training; an answer the encoder has never seen is replaced
    by the most frequent category of that column. Predictions for targets
    that were label-encoded are decoded back to their original values.

    Raises:
        ValueError: if a numeric prompt receives a non-numeric answer.
    """
    # Gather user input
    print("\nEnter the following details:")
    user_data = {
        'Gender': input("Gender (Male/Female): "),
        'Age': int(input("Age: ")),
        'Occupation': input("Occupation: "),
        # float() so fractional values such as 6.5 hours are accepted
        'Sleep Duration': float(input("Sleep Duration (hours): ")),
        'Quality of Sleep': input("Quality of Sleep (Excellent/Good/Fair/Poor): "),
        'Activity Level': input("Activity Level (Low/Medium/High): "),
        'Stress Level': input("Stress Level (Low/Medium/High): "),
        'Blood Pressure Category': input("Blood Pressure Category (Normal/Prehypertension/Hypertension): "),
        'Systolic': int(input("Systolic Blood Pressure: ")),
        'Diastolic': int(input("Diastolic Blood Pressure: ")),
        'Heart Rate': int(input("Heart Rate: ")),
        'Daily Steps': int(input("Daily Steps: ")),
        'Height (cm)': float(input("Height (cm): ")),
        'Weight (kg)': float(input("Weight (kg): "))
    }

    # Convert input to a one-row DataFrame, pinning the column order to the
    # one the models were trained with.
    input_df = pd.DataFrame([user_data])[input_features]

    # Encode categorical features
    for column, encoder in label_encoders.items():
        if column not in input_df.columns:
            continue
        try:
            input_df[column] = encoder.transform(input_df[column])
        except ValueError:
            # Unseen label: fall back to the most frequent category.
            # `data[column]` already holds encoded integer codes, so its mode
            # is assigned directly; feeding it back through
            # `encoder.transform` would raise again because the encoder only
            # knows the original labels.
            print(f"Unrecognised value for {column}; using the most frequent category instead.")
            input_df[column] = data[column].mode()[0]

    # Predict outputs for each target feature
    predictions = {}
    for target, model in models.items():
        prediction = model.predict(input_df)
        if target in label_encoders:  # Decode categorical outputs
            predictions[target] = label_encoders[target].inverse_transform(prediction)[0]
        else:
            predictions[target] = prediction[0]

    # Display predictions
    print("\nPredicted Outputs:")
    for key, value in predictions.items():
        print(f"{key}: {value}")

# Run prediction function
predict_user_input()





Dataset Columns: ['Person ID', 'Gender', 'Age', 'Occupation', 'Sleep Duration', 'Quality of Sleep', 'Activity Level', 'Stress Level', 'Weight (kg)', 'Height (cm)', 'Blood Pressure Category', 'Systolic', 'Diastolic', 'Heart Rate', 'Daily Steps', 'Health Status', 'BMI Values', 'BMI Class', 'Calories (kcal)', 'Carbohydrates (g)', 'Proteins (g)', 'Fats (g)', 'Vitamin A (mcg)', 'Vitamin C (mg)', 'Vitamin D (mcg)', 'Sodium (mg)', 'Potassium (mg)', 'Magnesium (mg)', 'Iron (mg)', 'Zinc (mg)', 'Fiber Intake (g)', 'Water Intake (L)']

Target: Health Status
Accuracy: 0.736
Confusion Matrix:
 [[ 728   22]
 [1034 2216]]

Target: BMI Values
Accuracy: 0.93125
Confusion Matrix:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

Target: BMI Class
Accuracy: 0.787
Confusion Matrix:
 [[1822    0  460  103]
 [  27    0   78    0]
 [ 180    1 1281    0]
 [   3    0    0   45]]

Target: Calories (kcal)
Accuracy: 0.00875
Confusion Matrix:


KeyboardInterrupt: Interrupted by user

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the dataset
file_path = '/content/nutrition_dataset_with_fiber_water_intake.csv'  # Update this path
data = pd.read_csv(file_path)

# Inspect column names
print("Dataset Columns:", data.columns.tolist())

# Drop the Person ID column as it's not useful for classification
data = data.drop(columns=['Person ID'])

# Encode categorical (object-dtype) variables as integer codes. The fitted
# encoders are kept so user input can be encoded and predictions decoded later.
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

# Targets to predict; one independent classifier is trained per entry.
output_features = [
    'Health Status', 'BMI Values', 'BMI Class', 'Calories (kcal)', 'Carbohydrates (g)',
    'Proteins (g)', 'Fats (g)', 'Vitamin A (mcg)', 'Vitamin C (mg)',
    'Vitamin D (mcg)', 'Sodium (mg)', 'Potassium (mg)', 'Magnesium (mg)',
    'Iron (mg)', 'Zinc (mg)', 'Fiber Intake (g)', 'Water Intake (L)'
]

# GaussianNB needs discrete class labels, so every target that is not already
# an integer column is label-encoded (each distinct value becomes one class).
# `is_integer_dtype` replaces the comparison against the string 'int', whose
# result depends on the platform's default integer width and could re-encode
# the int64 codes produced above, clobbering the label -> code encoder.
for feature in output_features:
    if feature in data.columns and not pd.api.types.is_integer_dtype(data[feature]):
        label_encoders[feature] = LabelEncoder()
        data[feature] = label_encoders[feature].fit_transform(data[feature])

# Define input features
input_features = [
    'Gender', 'Age', 'Occupation', 'Sleep Duration', 'Quality of Sleep',
    'Activity Level', 'Stress Level', 'Blood Pressure Category',
    'Systolic', 'Diastolic', 'Heart Rate', 'Daily Steps',
    'Height (cm)', 'Weight (kg)'
]

# Split the data into features (X) and targets (y)
X = data[input_features]
y = data[output_features]

# Train-test split (80/20, fixed seed so runs are reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate one model per output feature.
# models:  target name -> fitted GaussianNB
# results: target name -> {'accuracy', 'classification_report', 'confusion_matrix'}
models = {}
results = {}
overall_accuracy_sum = 0  # Running total used for the mean accuracy below

for target in output_features:
    # Define target-specific training and testing data
    y_train_target = y_train[target]
    y_test_target = y_test[target]

    # Train a Gaussian Naive Bayes classifier
    model = GaussianNB()
    model.fit(X_train, y_train_target)
    models[target] = model

    # Test the model
    y_pred = model.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test_target, y_pred)
    overall_accuracy_sum += accuracy
    results[target] = {
        'accuracy': accuracy,
        'classification_report': classification_report(y_test_target, y_pred, zero_division=0),
        'confusion_matrix': confusion_matrix(y_test_target, y_pred)
    }

    # Print evaluation metrics (the full report is stored in `results`)
    print(f"\nTarget: {target}")
    print(f"Accuracy: {accuracy}")
    print("Confusion Matrix:\n", results[target]['confusion_matrix'])

# Unweighted mean of the per-target accuracies
overall_accuracy = overall_accuracy_sum / len(output_features)
print(f"\nOverall Model Accuracy (Mean Accuracy across targets): {overall_accuracy:.4f}")

# Function to predict multiple outputs for new user inputs
def predict_user_input():
    """Prompt for one person's details and print each model's prediction.

    Reads the module-level `label_encoders`, `models`, `data` and
    `input_features`. Categorical answers are encoded with the encoders
    fitted during training; an answer the encoder has never seen is replaced
    by the most frequent category of that column. Predictions for targets
    that were label-encoded are decoded back to their original values.

    Raises:
        ValueError: if a numeric prompt receives a non-numeric answer.
    """
    # Gather user input
    print("\nEnter the following details:")
    user_data = {
        'Gender': input("Gender (Male/Female): "),
        'Age': int(input("Age: ")),
        'Occupation': input("Occupation: "),
        # float() so fractional values such as 6.5 hours are accepted
        'Sleep Duration': float(input("Sleep Duration (hours): ")),
        'Quality of Sleep': input("Quality of Sleep (Excellent/Good/Fair/Poor): "),
        'Activity Level': input("Activity Level (Low/Medium/High): "),
        'Stress Level': input("Stress Level (Low/Medium/High): "),
        'Blood Pressure Category': input("Blood Pressure Category (Normal/Prehypertension/Hypertension): "),
        'Systolic': int(input("Systolic Blood Pressure: ")),
        'Diastolic': int(input("Diastolic Blood Pressure: ")),
        'Heart Rate': int(input("Heart Rate: ")),
        'Daily Steps': int(input("Daily Steps: ")),
        'Height (cm)': float(input("Height (cm): ")),
        'Weight (kg)': float(input("Weight (kg): "))
    }

    # Convert input to a one-row DataFrame, pinning the column order to the
    # one the models were trained with.
    input_df = pd.DataFrame([user_data])[input_features]

    # Encode categorical features
    for column, encoder in label_encoders.items():
        if column not in input_df.columns:
            continue
        try:
            input_df[column] = encoder.transform(input_df[column])
        except ValueError:
            # Unseen label: fall back to the most frequent category.
            # `data[column]` already holds encoded integer codes, so its mode
            # is assigned directly; feeding it back through
            # `encoder.transform` would raise again because the encoder only
            # knows the original labels.
            print(f"Unrecognised value for {column}; using the most frequent category instead.")
            input_df[column] = data[column].mode()[0]

    # Predict outputs for each target feature
    predictions = {}
    for target, model in models.items():
        prediction = model.predict(input_df)
        if target in label_encoders:  # Decode categorical outputs
            predictions[target] = label_encoders[target].inverse_transform(prediction)[0]
        else:
            predictions[target] = prediction[0]

    # Display predictions
    print("\nPredicted Outputs:")
    for key, value in predictions.items():
        print(f"{key}: {value}")

# Run prediction function
predict_user_input()


Dataset Columns: ['Person ID', 'Gender', 'Age', 'Occupation', 'Sleep Duration', 'Quality of Sleep', 'Activity Level', 'Stress Level', 'Weight (kg)', 'Height (cm)', 'Blood Pressure Category', 'Systolic', 'Diastolic', 'Heart Rate', 'Daily Steps', 'Health Status', 'BMI Values', 'BMI Class', 'Calories (kcal)', 'Carbohydrates (g)', 'Proteins (g)', 'Fats (g)', 'Vitamin A (mcg)', 'Vitamin C (mg)', 'Vitamin D (mcg)', 'Sodium (mg)', 'Potassium (mg)', 'Magnesium (mg)', 'Iron (mg)', 'Zinc (mg)', 'Fiber Intake (g)', 'Water Intake (L)']

Target: Health Status
Accuracy: 0.736
Confusion Matrix:
 [[ 728   22]
 [1034 2216]]

Target: BMI Values
Accuracy: 0.93125
Confusion Matrix:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

Target: BMI Class
Accuracy: 0.787
Confusion Matrix:
 [[1822    0  460  103]
 [  27    0   78    0]
 [ 180    1 1281    0]
 [   3    0    0   45]]

Target: Calories (kcal)
Accuracy: 0.00875
Confusion Matrix:


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LinearRegression

# Load the dataset
file_path = '/content/nutrition_dataset_with_fiber_water_intake.csv'  # Update this path
data = pd.read_csv(file_path)

# Inspect column names
print("Dataset Columns:", data.columns.tolist())

# Drop the Person ID column as it's not useful for classification or regression
data = data.drop(columns=['Person ID'])

# Handle missing values: replace them with each column's most frequent value.
# On a mixed-type frame the imputer hands back an object array, so the
# original dtypes are restored afterwards. Without this every column looks
# like an object column and would be label-encoded below, numeric ones
# included.
original_dtypes = data.dtypes.to_dict()
imputer = SimpleImputer(strategy='most_frequent')
data = pd.DataFrame(imputer.fit_transform(data), columns=data.columns).astype(original_dtypes)

# Ensure no invalid or infinite values: turn +/-inf into NaN, then drop those rows
data = data.replace([np.inf, -np.inf], np.nan).dropna()

# Encode categorical (object-dtype) variables; encoders are kept for decoding
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

# Add new feature: BMI (Body Mass Index) = weight [kg] / (height [m]) ** 2.
# Zero heights become NaN first so the division cannot produce +/-inf.
data['Height (cm)'] = data['Height (cm)'].replace(0, np.nan)
data['BMI'] = data['Weight (kg)'] / ((data['Height (cm)'] / 100) ** 2)
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection, which pandas warns will stop working in pandas 3.0.
data['BMI'] = data['BMI'].replace([np.inf, -np.inf], np.nan)
data = data.dropna(subset=['BMI'])  # Drop rows with NaN BMI

# Numerical input features to standardize
numerical_features = [
    'Age', 'Sleep Duration', 'Systolic', 'Diastolic', 'Heart Rate',
    'Daily Steps', 'Height (cm)', 'Weight (kg)', 'BMI'
]
scaler = StandardScaler()

# Define input and output features
input_features = [
    'Gender', 'Age', 'Occupation', 'Sleep Duration', 'Quality of Sleep',
    'Activity Level', 'Stress Level', 'Blood Pressure Category',
    'Systolic', 'Diastolic', 'Heart Rate', 'Daily Steps',
    'Height (cm)', 'Weight (kg)', 'BMI'
]
categorical_targets = ['Health Status', 'BMI Class']
continuous_targets = [
    'BMI Values', 'Calories (kcal)', 'Carbohydrates (g)', 'Proteins (g)', 'Fats (g)',
    'Vitamin A (mcg)', 'Vitamin C (mg)', 'Vitamin D (mcg)', 'Sodium (mg)', 'Potassium (mg)',
    'Magnesium (mg)', 'Iron (mg)', 'Zinc (mg)', 'Fiber Intake (g)', 'Water Intake (L)'
]

# Split data
X = data[input_features]
y = data[categorical_targets + continuous_targets]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize after the split: the scaler is fitted on the training rows only
# so no statistics from the test rows leak into training.
X_train = X_train.copy()
X_test = X_test.copy()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# Handle class imbalance with SMOTE for categorical targets.
# Each target gets its own resampled training set because the class balance
# differs from one target to the next.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = {}, {}
for target in categorical_targets:
    X_train_resampled[target], y_train_resampled[target] = smote.fit_resample(X_train, y_train[target])

# Train models
# models:  target name -> fitted estimator (GaussianNB or LinearRegression)
# results: target name -> dict of evaluation metrics
models = {}
results = {}

# Train classifiers for categorical targets
total_samples = 0
correct_predictions = 0

for target in categorical_targets:
    model = GaussianNB()
    model.fit(X_train_resampled[target], y_train_resampled[target])
    models[target] = model
    y_pred = model.predict(X_test)

    # Evaluate the model
    results[target] = {
        'accuracy': accuracy_score(y_test[target], y_pred),
        'classification_report': classification_report(y_test[target], y_pred, zero_division=0),
        'confusion_matrix': confusion_matrix(y_test[target], y_pred)
    }

    # Accumulate the pooled accuracy. Confusion matrices are NOT summed across
    # targets: they have different shapes (one row/column per class of the
    # target), and adding them into a fixed 2x2 array raised a broadcast
    # ValueError as soon as a target had more than two classes.
    cm = results[target]['confusion_matrix']
    total_samples += len(y_test[target])
    correct_predictions += np.trace(cm)  # diagonal = correctly classified rows

    print(f"\nCategorical Target: {target}")
    print(f"Accuracy: {results[target]['accuracy']}")
    print("Confusion Matrix:\n", cm)

# Calculate overall accuracy for classification (pooled over all categorical targets)
overall_accuracy = correct_predictions / total_samples
print("\nOverall Model Accuracy (Categorical Targets):", overall_accuracy)

# Train regressors for continuous targets
for target in continuous_targets:
    model = LinearRegression()
    model.fit(X_train, y_train[target])
    models[target] = model
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test[target], y_pred)
    results[target] = {'mean_squared_error': mse}
    print(f"\nContinuous Target: {target}")
    print(f"Mean Squared Error: {mse}")

# User Input Prediction
def predict_user_input():
    """Prompt for every input feature and return the models' predictions.

    Reads the module-level `input_features`, `numerical_features`,
    `categorical_targets`, `continuous_targets`, `label_encoders`, `scaler`
    and `models`.

    An answer that matches a label known to that feature's encoder is encoded
    with it; anything else is parsed as a number, so numeric codes are still
    accepted. BMI is not prompted for: it is derived from the entered height
    and weight with the same formula used to build the training feature.

    Returns:
        dict mapping each target name to its predicted value. Categorical
        targets are decoded back to their original labels when an encoder
        exists for them.

    Raises:
        ValueError: if an answer is neither a known label nor a number, or if
            the height is not greater than zero.
    """
    user_data = {}
    for feature in input_features:
        if feature == 'BMI':
            continue  # derived below from height and weight
        raw = input(f"Enter value for {feature}: ").strip()
        encoder = label_encoders.get(feature)
        if encoder is not None and raw in encoder.classes_.tolist():
            user_data[feature] = encoder.transform([raw])[0]
        else:
            user_data[feature] = float(raw)

    if 'BMI' in input_features:
        height_m = user_data['Height (cm)'] / 100
        if height_m <= 0:
            raise ValueError("Height (cm) must be greater than zero to compute BMI")
        user_data['BMI'] = user_data['Weight (kg)'] / height_m ** 2

    # Pin the column order to the one the models were trained with.
    user_data_df = pd.DataFrame([user_data])[input_features]
    user_data_df[numerical_features] = scaler.transform(user_data_df[numerical_features])  # Standardize
    predictions = {}

    # Predict for each target
    for target in categorical_targets:
        predicted = models[target].predict(user_data_df)[0]
        encoder = label_encoders.get(target)
        if encoder is not None:  # decode only targets that were label-encoded
            predicted = encoder.inverse_transform([int(predicted)])[0]
        predictions[target] = predicted
    for target in continuous_targets:
        predictions[target] = models[target].predict(user_data_df)[0]
    return predictions

print("\n--- Enter User Data for Prediction ---")
user_predictions = predict_user_input()
print("\nPredicted Values for User Input:", user_predictions)




Dataset Columns: ['Person ID', 'Gender', 'Age', 'Occupation', 'Sleep Duration', 'Quality of Sleep', 'Activity Level', 'Stress Level', 'Weight (kg)', 'Height (cm)', 'Blood Pressure Category', 'Systolic', 'Diastolic', 'Heart Rate', 'Daily Steps', 'Health Status', 'BMI Values', 'BMI Class', 'Calories (kcal)', 'Carbohydrates (g)', 'Proteins (g)', 'Fats (g)', 'Vitamin A (mcg)', 'Vitamin C (mg)', 'Vitamin D (mcg)', 'Sodium (mg)', 'Potassium (mg)', 'Magnesium (mg)', 'Iron (mg)', 'Zinc (mg)', 'Fiber Intake (g)', 'Water Intake (L)']


  data.replace([np.inf, -np.inf], np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['BMI'].replace([np.inf, -np.inf], np.nan, inplace=True)



Categorical Target: Health Status
Accuracy: 0.69425
Confusion Matrix:
 [[ 750    0]
 [1223 2027]]


ValueError: operands could not be broadcast together with shapes (2,2) (4,4) (2,2) 