In [106]:
import pandas                      as pd
import numpy                       as np
from sklearn.preprocessing         import StandardScaler, OneHotEncoder
from sklearn.linear_model          import LogisticRegression
from sklearn.model_selection       import train_test_split, cross_val_score, StratifiedKFold
from sklearn.multiclass            import OneVsOneClassifier, OneVsRestClassifier
from sklearn.compose               import ColumnTransformer
from sklearn.pipeline              import Pipeline
import warnings

warnings.filterwarnings(
    "ignore",
    message=r"^Found unknown categories",
    module=r"sklearn\.preprocessing\._encoders"
)

# Remote CSV holding the obesity-level dataset (the URL is split across two
# literals purely for readability; the resulting string is unchanged).
file_path = (
    "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/"
    "GkDzb7bWrtvGXdPOfk6CIg/Obesity-level-prediction-dataset.csv"
)

# Read the whole file into a DataFrame and preview the first five rows.
data = pd.read_csv(filepath_or_buffer=file_path)
data.head(n=5)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [107]:
# Separate the target column from the predictors.
target_column = "NObeyesdad"
X = data.drop(columns=[target_column])

# "Unnamed: 0" is the name pandas assigns to a CSV column with an empty
# header; in the preview of `data` it simply counts rows (0, 1, 2, ...).
# A row counter is not a property of the person, and it would leak the label
# if the file happens to be ordered by class, so it must not be a feature.
# errors="ignore" keeps this cell working when the column is already absent.
# NOTE: cross-validation scores recorded with this column included are stale
# and need to be regenerated by re-running the notebook.
X = X.drop(columns=["Unnamed: 0"], errors="ignore")
y = data[target_column]

# Hold out 20% of the rows for testing. stratify=y keeps the class
# proportions the same in both parts; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [108]:
# Partition the feature columns by dtype so each group can be routed to its
# own transformer.
# select_dtypes is used instead of comparing each dtype against object ('O'):
# text columns are not always stored as object (e.g. pandas string or
# categorical dtypes), and with the equality test such columns would be
# classified as numerical and handed to the scaler.
# Column order is preserved in both lists.
numerical_Features = X_train.select_dtypes(include="number").columns.tolist()
categorical_features = X_train.select_dtypes(exclude="number").columns.tolist()

In [112]:
# Column-wise preprocessing: standardize the numerical columns and one-hot
# encode the categorical ones.
numeric_step = ('scaler', StandardScaler(), numerical_Features)

# drop='first' removes one indicator column per feature. Combined with
# handle_unknown="ignore", a category not seen during fit is encoded as all
# zeros, i.e. the same encoding as the dropped first category. scikit-learn
# warns about this ("Found unknown categories ..."); that warning is silenced
# by the filter installed at the top of the notebook.
categorical_step = (
    'encoder',
    OneHotEncoder(sparse_output=False, drop='first', handle_unknown="ignore"),
    categorical_features,
)

preprocess = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [113]:
def _make_logreg_pipeline(multiclass_wrapper):
    """Build a preprocessing + logistic-regression pipeline.

    Parameters
    ----------
    multiclass_wrapper : callable
        Multiclass meta-estimator class (OneVsRestClassifier or
        OneVsOneClassifier) that wraps the binary logistic regression.

    Returns
    -------
    Pipeline
        Unfitted pipeline with the steps 'preprocessor' and 'model'. The
        'preprocessor' step is the shared module-level `preprocess` object.
    """
    base_estimator = LogisticRegression(max_iter=1000)
    return Pipeline(steps=[
        ('preprocessor', preprocess),
        ('model', multiclass_wrapper(base_estimator)),
    ])


# One-vs-Rest strategy.
pipeline_OvR = _make_logreg_pipeline(OneVsRestClassifier)

# One-vs-One strategy.
pipeline_OvO = _make_logreg_pipeline(OneVsOneClassifier)

In [114]:
# Cross-validate both strategies on the training split only, using shuffled,
# stratified 20-fold CV and macro-averaged F1 as the metric.
k_fold = StratifiedKFold(n_splits=20, shuffle=True, random_state=42)

# Arguments shared by both evaluations, so the two runs are set up identically.
cv_settings = dict(X=X_train, y=y_train, cv=k_fold, scoring='f1_macro')

# Mean of the per-fold macro-F1 scores for each strategy.
f1_macro_OvR = cross_val_score(pipeline_OvR, **cv_settings).mean()
f1_macro_OvO = cross_val_score(pipeline_OvO, **cv_settings).mean()

# Report as percentages rounded to two decimals.
print(f"f1 macro score for OvR strategy is {np.round(100 * f1_macro_OvR, 2)}% ")
print(f"f1 macro score for OvO strategy is {np.round(100 * f1_macro_OvO, 2)}% ")

f1 macro score for OvR strategy is 75.11% 
f1 macro score for OvO strategy is 93.21% 
