<a href="https://colab.research.google.com/github/BRV12G/Final_year_Project/blob/main/KNN_on_new_diet_dataset_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import classification_report, accuracy_score, recall_score, mean_squared_error, r2_score

# Load the dataset
file_path = '/content/nutrition_dataset_with_fiber_water_intake.csv'
data = pd.read_csv(file_path)

# Remove leading and trailing spaces from column names
data.columns = data.columns.str.strip()

# Define input and output columns
input_columns = [
    "Gender", "Age", "Occupation", "Sleep Duration",
    "Quality of Sleep", "Activity Level", "Stress Level",
    "Blood Pressure Category", "Systolic", "Diastolic", "Heart Rate",
    "Daily Steps", "Height (cm)", "Weight (kg)"
]
output_columns = [
    "Health Status", "BMI Values", "BMI Class", "Calories (kcal)",
    "Carbohydrates (g)", "Proteins (g)", "Fats (g)", "Vitamin A (mcg)",
    "Vitamin C (mg)", "Vitamin D (mcg)", "Sodium (mg)", "Potassium (mg)",
    "Magnesium (mg)", "Iron (mg)", "Zinc (mg)", "Fiber Intake (g)",
    "Water Intake (L)"
]

# Filter the dataset for input and output columns
X = data[input_columns]
y = data[output_columns]

# Encode categorical variables in inputs
label_encoders = {}
for col in ["Gender", "Occupation", "Quality of Sleep", "Activity Level", "Stress Level", "Blood Pressure Category"]:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

# Standardize numerical input columns
scaler = StandardScaler()
X[X.select_dtypes(include=['float64', 'int64']).columns] = scaler.fit_transform(X.select_dtypes(include=['float64', 'int64']))

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the appropriate model for each output column
knn_models = {}
overall_accuracy = []
overall_recall = []

for col in output_columns:
    if y_train[col].dtype in ['float64', 'int64']:  # Use KNeighborsRegressor for continuous
        knn = KNeighborsRegressor(n_neighbors=5)
        metric = 'r2_score'
    else:  # Use KNeighborsClassifier for categorical
        knn = KNeighborsClassifier(n_neighbors=5)
        metric = 'accuracy_score'

    # Train the model
    knn.fit(X_train, y_train[col])
    knn_models[col] = {'model': knn, 'metric': metric}

    # Evaluate the model
    y_pred = knn.predict(X_test)
    if metric == 'r2_score':
        score = r2_score(y_test[col], y_pred)
        overall_accuracy.append(score)
        print(f"{col} - R-squared Score: {score:.2f}")
    else:
        accuracy = accuracy_score(y_test[col], y_pred)
        recall = recall_score(y_test[col], y_pred, average='weighted', zero_division=0)
        overall_accuracy.append(accuracy)
        overall_recall.append(recall)
        print(f"{col} - Accuracy Score: {accuracy:.2f}, Recall Score: {recall:.2f}")

# Calculate the overall metrics
average_accuracy = sum(overall_accuracy) / len(overall_accuracy)
average_recall = sum(overall_recall) / len(overall_recall) if overall_recall else 0

# Output the overall metrics
print(f"\nOverall Model Accuracy (Unweighted Average): {average_accuracy * 100:.2f}%")
print(f"Overall Model Recall (Unweighted Average): {average_recall * 100:.2f}%")

# Interactive user input for predictions
def predict_for_user():
    user_input = {}
    for col in input_columns:
        if col in label_encoders:
            options = label_encoders[col].classes_
            print(f"Select {col} ({', '.join(options)}):")
            value = input(f"{col}: ").strip()
            user_input[col] = label_encoders[col].transform([value])[0]
        else:
            value = float(input(f"Enter {col} (numeric): ").strip())
            user_input[col] = value

    # Preprocess and scale user input
    user_input = scaler.transform(pd.DataFrame([user_input]))

    # Make predictions
    predictions = {}
    for col in output_columns:
        predictions[col] = knn_models[col]['model'].predict(user_input)[0]
    return predictions

# Run interactive prediction
print("\nProvide user details for prediction:")
predictions = predict_for_user()
print("\nPredictions for the user:")
for key, value in predictions.items():
    print(f"{key}: {value}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_index

Health Status - Accuracy Score: 0.79, Recall Score: 0.79
BMI Values - R-squared Score: 0.75
BMI Class - Accuracy Score: 0.80, Recall Score: 0.80
Calories (kcal) - R-squared Score: 0.91
Carbohydrates (g) - R-squared Score: 0.89
Proteins (g) - R-squared Score: 0.89
Fats (g) - R-squared Score: 0.89
Vitamin A (mcg) - R-squared Score: 1.00
Vitamin C (mg) - R-squared Score: 0.94
Vitamin D (mcg) - R-squared Score: 0.33
Sodium (mg) - R-squared Score: 1.00
Potassium (mg) - R-squared Score: 0.96
Magnesium (mg) - R-squared Score: 0.87
Iron (mg) - R-squared Score: 1.00
Zinc (mg) - R-squared Score: 1.00
Fiber Intake (g) - R-squared Score: 0.89
Water Intake (L) - R-squared Score: 0.87

Overall Model Accuracy (Unweighted Average): 86.95%
Overall Model Recall (Unweighted Average): 79.59%

Provide user details for prediction:
Select Gender (Female, Male):
Gender: Male
Enter Age (numeric): 65
Select Occupation (Artist, Athlete, Doctor, Engineer, Labourer, Manager, Scientist, Teacher):
Occupation: Manage

