In [10]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, accuracy_score

In [6]:
import pandas as pd
df = pd.read_csv("obesity.csv")
print(df.shape)
df.head(3)

(2111, 19)


Unnamed: 0,Gender,Age,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,Automobile,Bike,Motorbike,Public_Transportation,Walking,NObeyesdad
0,0,21.0,1,0,2.0,3.0,1,0,2.0,0,0.0,1.0,0,0,0,0,1,0,0
1,0,21.0,1,0,3.0,3.0,1,1,3.0,1,3.0,0.0,1,0,0,0,1,0,0
2,1,23.0,1,0,2.0,3.0,1,0,2.0,0,2.0,1.0,2,0,0,0,1,0,0


In [26]:
y = df['NObeyesdad']
X = df[['Age', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SCC', 'FAF']]

In [33]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the models to compare
models = {
    'Logistic Regression': LogisticRegression(max_iter=200, random_state=42),
    'SVM': SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42),
    'Neural Network': MLPClassifier(hidden_layer_sizes=(50,), max_iter=1000, alpha=0.0001, solver='adam', random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

In [34]:
# Define a function to evaluate models
def evaluate_model(model, X_train, y_train):
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])
    scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')
    return scores

In [35]:
# Evaluate each model
results = {}
for name, model in models.items():
    scores = evaluate_model(model, X_train, y_train)
    results[name] = {
        'Mean Accuracy': np.mean(scores),
        'Std Accuracy': np.std(scores)
    }

# Display the results
results_df = pd.DataFrame(results).T
results_df

Unnamed: 0,Mean Accuracy,Std Accuracy
Logistic Regression,0.761001,0.01522
SVM,0.763694,0.018777
Neural Network,0.763033,0.015609
Gradient Boosting,0.82126,0.022246
Random Forest,0.819908,0.011565


In [36]:
# Optionally, you can also fit and evaluate on the test set for final performance
for name, model in models.items():
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Test Accuracy: {accuracy:.2f}")

Logistic Regression Test Accuracy: 0.80
SVM Test Accuracy: 0.80
Neural Network Test Accuracy: 0.79
Gradient Boosting Test Accuracy: 0.83
Random Forest Test Accuracy: 0.82


# Hyper Parameter Tuning

In [24]:
from sklearn.model_selection import GridSearchCV

In [25]:
# Initialize the model
gbm = GradientBoostingClassifier()

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],              # Number of boosting stages to be run
    'learning_rate': [0.01, 0.1, 0.2],            # Step size for each iteration
    'max_depth': [3, 4, 5],                       # Maximum depth of the individual trees
    'min_samples_split': [2, 5, 10],              # Minimum number of samples required to split an internal node
    'subsample': [0.8, 0.9, 1.0]                 # Fraction of samples used for fitting each base learner
}

# Set up Grid Search
grid_search = GridSearchCV(estimator=gbm,
                           param_grid=param_grid,
                           cv=5,                       # Number of folds in cross-validation
                           scoring='accuracy',        # Scoring metric
                           n_jobs=-1,                  # Use all available processors
                           verbose=1)                  # Verbosity level

# Fit Grid Search
grid_search.fit(X, y)

# Print the best parameters and best score
print("Best parameters found:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best parameters found: {'learning_rate': 0.1, 'max_depth': 4, 'min_samples_split': 10, 'n_estimators': 200, 'subsample': 0.9}
Best cross-validation score: 0.7494022609884261
