In [5]:
# Mount Google Drive so files under MyDrive are readable from the Colab VM.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix




In [7]:

# 1. LOAD & PREPARE DATA FOR CLASSIFICATION
# Location of the raw CSV on the mounted Drive (adjust as needed).
DATA_PATH = '/content/drive/MyDrive/NutritionDS.csv'

# Read the full dataset; later cells filter it down to a single question.
df = pd.read_csv(DATA_PATH)


In [8]:
# Keep only the rows that answer the adult-obesity question.
target_question = 'Percent of adults aged 18 years and older who have obesity'
is_target_question = df['Question'].eq(target_question)

# .copy() gives an independent frame, so adding columns later does not
# touch `df` or trigger SettingWithCopyWarning.
df_filtered = df.loc[is_target_question].copy()


In [9]:
# *** CRITICAL STEP: CREATE CLASSES ***
# The original 'Data_Value' is a continuous percentage (e.g. 34.5).
# We convert it to a binary class (0 or 1) by thresholding at the median:
# 0 = "Lower Obesity Rate"  (at or below the median)
# 1 = "Higher Obesity Rate" (strictly above the median)
median_val = df_filtered['Data_Value'].median()  # NaN values are skipped

# A missing Data_Value compares as False against the median, which would
# silently label that row as class 0. Keep the label missing instead, so
# the dropna() below really removes rows that have no measured rate.
has_value = df_filtered['Data_Value'].notna()
df_filtered['Obesity_Class'] = (
    (df_filtered['Data_Value'] > median_val).astype('Int64').where(has_value)
)

print(f"Threshold used for classification: {median_val}%")
print("Class 0: Rate <= Threshold | Class 1: Rate > Threshold\n")

feature_cols = ['YearStart', 'LocationDesc', 'Stratification1']
target_col = 'Obesity_Class'

# Drop rows with a missing feature or a missing label
df_model = df_filtered[feature_cols + [target_col]].dropna()

X = df_model[feature_cols]
# No missing labels remain, so a plain integer dtype is safe for scikit-learn.
y = df_model[target_col].astype(int)

# Stratify so train and test keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)




Threshold used for classification: 31.5%
Class 0: Rate <= Threshold | Class 1: Rate > Threshold



In [10]:

# 2. DEFINE PREPROCESSOR
# Column groups routed to each transformer.
numerical_features = ['YearStart']
categorical_features = ['LocationDesc', 'Stratification1']

# Numeric columns are standardized; categorical columns are one-hot encoded
# into a dense array, ignoring categories that were not seen during fit.
year_scaler = StandardScaler()
category_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

preprocessor = ColumnTransformer(transformers=[
    ('num', year_scaler, numerical_features),
    ('cat', category_encoder, categorical_features),
])



In [11]:

# 3. DEFINE THE 3 CLASSIFIERS

def make_classification_pipeline(classifier):
    """Return a Pipeline that runs `preprocessor` and then `classifier`.

    Every pipeline built here references the same `preprocessor` object,
    so fitting any one of them re-fits that shared transformer.
    """
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', classifier),
    ])


# Model 1: Random Forest Classifier
rf_pipeline = make_classification_pipeline(
    RandomForestClassifier(n_estimators=100, random_state=42)
)

# Model 2: Logistic Regression (the classification counterpart of Linear Regression)
log_pipeline = make_classification_pipeline(
    LogisticRegression(max_iter=1000)
)

# Model 3: SVC (Support Vector Classifier) with an RBF kernel
svc_pipeline = make_classification_pipeline(
    SVC(kernel='rbf', C=1.0)
)