<a href="https://colab.research.google.com/github/BRV12G/Final_year_Project/blob/main/XGBOOST_DIET_NEW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# https://chatgpt.com/share/677799d5-0314-800b-a1c8-2b674de1094e

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputRegressor
import xgboost as xgb

# Load the nutrition dataset (path of the file as uploaded to the Colab runtime).
data = pd.read_csv('/content/nutrition_dataset_with_fiber_water_intake (1).csv')

# Columns the model predicts: one regression target per nutrient / intake value.
target_columns = [
    'Calories (kcal)', 'Carbohydrates (g)', 'Proteins (g)', 'Fats (g)', 'Vitamin A (mcg)',
    'Vitamin C (mg)', 'Vitamin D (mcg)', 'Sodium (mg)', 'Potassium (mg)', 'Magnesium (mg)',
    'Iron (mg)', 'Zinc (mg)', 'Fiber Intake (g)', 'Water Intake (L)'
]

# Separate features and targets in a single step: the inputs are every column
# that is neither a target nor 'Health Status'.
features = data.drop(columns=target_columns + ['Health Status'])
targets = data[target_columns]

# Column roles: the listed columns are one-hot encoded; every other feature
# column except the 'Person ID' identifier is treated as numeric.
categorical_columns = [
    'Gender',
    'Occupation',
    'Blood Pressure Category',
    'Activity Level',
    'Quality of Sleep',
    'Stress Level',
    'BMI Class',
]
numerical_columns = (
    features
    .drop(columns=categorical_columns + ['Person ID'])
    .columns
    .tolist()
)

# Base regressor; MultiOutputRegressor below wraps it to handle all targets.
xgb_model = xgb.XGBRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    objective='reg:squarederror',
    random_state=42,
)

# Preprocessing: standardize numeric inputs, one-hot encode categoricals
# (dropping the first level of each). Columns assigned to neither transformer
# are discarded via remainder='drop'.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop='first', sparse_output=False)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns),
    ],
    remainder='drop',
)

# Single estimator that goes from the raw feature frame to predictions.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', MultiOutputRegressor(xgb_model)),
])

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
)

# Fit preprocessing + model on the training rows, then predict the held-out rows.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Report the evaluation metrics on the held-out rows.
for label, metric in (
    ("Mean Absolute Error (MAE):", mean_absolute_error),
    ("Mean Squared Error (MSE):", mean_squared_error),
    ("R² Score:", r2_score),
):
    print(label, metric(y_test, y_pred))

# Persist the whole fitted pipeline (preprocessing included) for later use.
joblib.dump(pipeline, 'xgboost_multioutput_model.pkl')


Mean Absolute Error (MAE): 0.7218572497367859
Mean Squared Error (MSE): 6.992604732513428
R² Score: 0.9968385696411133


['xgboost_multioutput_model.pkl']

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputRegressor
from sklearn.impute import SimpleImputer
import xgboost as xgb

# Load dataset (path of the file as uploaded to the Colab runtime).
data = pd.read_csv('/content/nutrition_dataset_with_fiber_water_intake (1).csv')

# 1. Drop 'Person ID' and 'Health Status' columns
data = data.drop(columns=['Person ID', 'Health Status'])

# 2. Check for missing values and handle them
# Fill missing numerical values with the mean, and categorical values with the mode.
# The filled column is assigned back explicitly: calling fillna(inplace=True) on
# data[column] operates on an intermediate object, which pandas warns about and
# which stops updating `data` from pandas 3.0 onwards.
for column in data.columns:
    # Decide by "is numeric" rather than "dtype == object" so text columns that
    # use a dedicated string dtype are still filled with their mode.
    if pd.api.types.is_numeric_dtype(data[column]):
        fill_value = data[column].mean()
    else:
        fill_value = data[column].mode()[0]
    data[column] = data[column].fillna(fill_value)
# 3. Handle outliers using the IQR method
def handle_outliers(df, columns):
    """Clip the given columns of ``df`` to their 1.5 * IQR fences.

    For every column name in ``columns`` the 25th and 75th percentiles are
    computed, and values outside ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` are
    replaced by the nearest fence.

    The frame is modified in place; the same object is returned so the call
    can be used in an assignment.
    """
    for name in columns:
        series = df[name]
        q1, q3 = series.quantile(0.25), series.quantile(0.75)
        spread = q3 - q1
        # Values beyond the fences are pulled back to the fence, not dropped,
        # so the number of rows is unchanged.
        df[name] = np.clip(series, q1 - 1.5 * spread, q3 + 1.5 * spread)
    return df

# Numeric input columns; these are the ones that get IQR-based outlier clipping.
numerical_columns = [
    'Age', 'Sleep Duration', 'Weight (kg)', 'Height (cm)', 'Systolic',
    'Diastolic', 'Heart Rate', 'Daily Steps', 'BMI Values'
]
data = handle_outliers(data, numerical_columns)

# Separate inputs and outputs
input_columns = numerical_columns + [
    'Gender', 'Occupation', 'Quality of Sleep', 'Activity Level',
    'Stress Level', 'Blood Pressure Category', 'BMI Class'
]
output_columns = [
    'Calories (kcal)', 'Carbohydrates (g)', 'Proteins (g)', 'Fats (g)',
    'Vitamin A (mcg)', 'Vitamin C (mg)', 'Vitamin D (mcg)', 'Sodium (mg)',
    'Potassium (mg)', 'Magnesium (mg)', 'Iron (mg)', 'Zinc (mg)',
    'Fiber Intake (g)', 'Water Intake (L)'
]
# X is modified column-by-column during encoding and scaling, so take an
# explicit copy: assigning into a bare slice of `data` triggers pandas'
# SettingWithCopyWarning and may not update the object you expect.
X = data[input_columns].copy()
y = data[output_columns]

# 4. Preprocess categorical and numerical data
# NOTE(review): every encoder and scaler in this step is fitted on the full X,
# before the train/test split that follows — confirm this is intended.

# Gender becomes a 0/1 indicator: Male -> 0, Female -> 1.
X['Gender'] = X['Gender'].map({'Male': 0, 'Female': 1})

# Replace each of these categorical columns with integer codes, keeping the
# fitted encoder per column so the mapping can be looked up afterwards.
label_columns = [
    'Quality of Sleep',
    'Activity Level',
    'Stress Level',
    'Blood Pressure Category',
    'BMI Class',
]
label_encoders = {}
for col in label_columns:
    encoder = LabelEncoder()
    X[col] = encoder.fit_transform(X[col])
    label_encoders[col] = encoder

# Occupation is expanded into indicator columns (first level dropped).
X = pd.get_dummies(X, columns=['Occupation'], drop_first=True)

# Numeric columns are standardized and then rescaled with MinMaxScaler.
scaler = Pipeline([
    ('standardize', StandardScaler()),
    ('normalize', MinMaxScaler()),
])
X[numerical_columns] = scaler.fit_transform(X[numerical_columns])

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# 5. Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 6. Train: the XGBoost regressor is wrapped in MultiOutputRegressor so that
# all output columns are predicted.
xgb_model = xgb.XGBRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    objective='reg:squarederror',
    random_state=42,
)
multioutput_model = MultiOutputRegressor(xgb_model)
multioutput_model.fit(X_train, y_train)

# 7. Evaluate on the held-out rows.
y_pred = multioutput_model.predict(X_test)
for label, metric in (
    ("Mean Absolute Error (MAE):", mean_absolute_error),
    ("Mean Squared Error (MSE):", mean_squared_error),
    ("R² Score:", r2_score),
):
    print(label, metric(y_test, y_pred))

# 8. Save the fitted model for later use. Only the model is written here; the
# label encoders and scaler fitted during preprocessing are not part of it.
joblib.dump(multioutput_model, 'xgboost_multioutput_model.pkl')







# import pickle

# # Save the preprocessing pipeline
# with open('preprocessing_pipeline.pkl', 'wb') as file:
#     pickle.dump(preprocessor, file)

# # # Save the trained model
# # with open('xgboost_model.pkl', 'wb') as file:
# #     pickle.dump(model, file)

# # Load preprocessing pipeline
# with open('preprocessing_pipeline.pkl', 'rb') as file:
#     preprocessor = pickle.load(file)

# # Load the trained model
# with open('xgboost_model.pkl', 'rb') as file:
#     model = pickle.load(file)

# import numpy as np
# import pandas as pd

# def get_user_prediction():
#     # Collect user inputs
#     user_data = {
#         "Gender": input("Enter Gender (Male/Female): "),
#         "Age": float(input("Enter Age: ")),
#         "Occupation": input("Enter Occupation: "),
#         "Sleep Duration": float(input("Enter Sleep Duration (hours): ")),
#         "Quality of Sleep": input("Enter Quality of Sleep (Excellent/Good/Fair): "),
#         "Activity Level": input("Enter Activity Level (Low/Medium/High): "),
#         "Stress Level": input("Enter Stress Level (Low/Medium/High): "),
#         "Weight (kg)": float(input("Enter Weight (kg): ")),
#         "Height (cm)": float(input("Enter Height (cm): ")),
#         "Blood Pressure Category": input("Enter Blood Pressure Category (Normal/Prehypertension/Hypertension): "),
#         "Systolic": float(input("Enter Systolic Blood Pressure: ")),
#         "Diastolic": float(input("Enter Diastolic Blood Pressure: ")),
#         "Heart Rate": float(input("Enter Heart Rate: ")),
#         "Daily Steps": float(input("Enter Daily Steps: ")),
#         "BMI Values": float(input("Enter BMI Value: ")),
#         "BMI Class": input("Enter BMI Class (Underweight/Normal/Overweight/Obese): "),
#     }

#     # Convert user data to DataFrame
#     user_df = pd.DataFrame([user_data])

#     # Preprocess the user input
#     processed_user_input = preprocessor.transform(user_df)

#     # Predict using the model
#     predictions = model.predict(processed_user_input)

#     # Display predictions
#     output_columns = [
#         "Calories (kcal)", "Carbohydrates (g)", "Proteins (g)", "Fats (g)",
#         "Vitamin A (mcg)", "Vitamin C (mg)", "Vitamin D (mcg)", "Sodium (mg)",
#         "Potassium (mg)", "Magnesium (mg)", "Iron (mg)", "Zinc (mg)",
#         "Fiber Intake (g)", "Water Intake (L)"
#     ]
#     predicted_values = dict(zip(output_columns, predictions[0]))

#     print("\nPredicted Nutritional Needs:")
#     for key, value in predicted_values.items():
#         print(f"{key}: {value:.2f}")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].mean(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

Mean Absolute Error (MAE): 0.7579558491706848
Mean Squared Error (MSE): 7.934813022613525
R² Score: 0.9969084858894348


EOFError: Ran out of input