In [None]:
import pandas as pd

# Read the training data relative to the working directory instead of a
# hard-coded absolute Colab path, so the notebook is not tied to /content.
train_df = pd.read_csv("train.csv")


In [None]:
# Step1
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report, log_loss

# Name of the label column the models predict.
target_column = "NObeyesdad"

# Input CSV files, resolved relative to the current working directory.
train, test = "train.csv", "test.csv"

train_df = pd.read_csv(train)

In [None]:
#Step2

# Label vector and feature frame (every column except the label).
y = train_df[target_column]
X = train_df.drop(columns=[target_column])

# Discard "id" when it is present; errors="ignore" makes a missing column a no-op.
X = X.drop(columns=["id"], errors="ignore")

# Group the remaining columns by dtype: numeric dtypes vs. object dtype.
categorical_features = list(X.select_dtypes(include=["object"]).columns)
numeric_features = list(X.select_dtypes(include=[np.number]).columns)

# 80/20 hold-out split, stratified on the label, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Summary of the column grouping used by the preprocessing step.
FEATURE_CONFIG = dict(
    NUMERIC=numeric_features,
    CATEGORICAL=categorical_features,
    TARGET=target_column,
)

In [None]:
# Step3

def create_preprocessor(numeric_cols, categorical_cols):
    """Build the column-wise preprocessing step for the pipeline.

    Parameters
    ----------
    numeric_cols : list
        Columns routed to ``StandardScaler``.
    categorical_cols : list
        Columns routed to ``OneHotEncoder`` configured with
        ``handle_unknown="ignore"``.

    Returns
    -------
    ColumnTransformer
        Unfitted transformer; columns in neither list are dropped
        (``remainder="drop"``).
    """
    scaling_step = ("num", StandardScaler(), numeric_cols)
    encoding_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
    return ColumnTransformer(
        transformers=[scaling_step, encoding_step],
        remainder="drop",
    )

# Preprocessor for the numeric/categorical column lists currently in scope.
preprocessor = create_preprocessor(
    numeric_cols=numeric_features,
    categorical_cols=categorical_features,
)

In [None]:
# Step4

# Baseline: L2-regularised logistic regression fitted with the SAGA solver.
# `multi_class` is deliberately not passed: the parameter is deprecated in
# scikit-learn 1.5+, and with a non-liblinear solver on a target with more
# than two classes the default already fits a multinomial model.
baseline_clf = LogisticRegression(
    penalty="l2",
    solver="saga",
    max_iter=2000,
    random_state=42
)


# Preprocessing and classifier are fitted together so the scaler/encoder
# statistics come from the training split only.
baseline_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("clf", baseline_clf)
])

baseline_pipeline.fit(X_train, y_train)
y_val_pred = baseline_pipeline.predict(X_val)
y_val_proba = baseline_pipeline.predict_proba(X_val)

val_acc = accuracy_score(y_val, y_val_pred)
# Pass the fitted class order explicitly so the probability columns are matched
# to labels by the model, not inferred from whichever labels occur in y_val.
val_logloss = log_loss(y_val, y_val_proba, labels=baseline_pipeline.classes_)
macro_f1 = f1_score(y_val, y_val_pred, average="macro")

print(f"Validation accuracy: {val_acc:.4f}")
print(f"Validation log loss: {val_logloss:.4f}")
print(f"Validation macro F1: {macro_f1:.4f}")

print(classification_report(y_val, y_val_pred))

# Persist the whole fitted pipeline (preprocessing + model) for later inference.
joblib.dump(baseline_pipeline, "logistic_regression_model.pkl")



Validation accuracy: 0.8680
Validation log loss: 0.3998
Validation macro F1: 0.8532
                     precision    recall  f1-score   support

Insufficient_Weight       0.89      0.95      0.92       505
      Normal_Weight       0.87      0.82      0.84       617
     Obesity_Type_I       0.81      0.85      0.83       582
    Obesity_Type_II       0.93      0.96      0.95       650
   Obesity_Type_III       1.00      1.00      1.00       809
 Overweight_Level_I       0.75      0.71      0.73       485
Overweight_Level_II       0.73      0.70      0.72       504

           accuracy                           0.87      4152
          macro avg       0.85      0.85      0.85      4152
       weighted avg       0.87      0.87      0.87      4152



['logistic_regression_model.pkl']

In [None]:
# 2nd model

# Rebuild the label vector and feature frame from the raw training data.
y = train_df[target_column]
X = train_df.drop(columns=[target_column])

# This variant discards "Weight" as well as "id"; errors="ignore" makes any
# column that is not present a no-op.
X = X.drop(columns=["id", "Weight"], errors="ignore")

# Group the remaining columns by dtype: numeric dtypes vs. object dtype.
categorical_features = list(X.select_dtypes(include=["object"]).columns)
numeric_features = list(X.select_dtypes(include=[np.number]).columns)

# 80/20 hold-out split, stratified on the label, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Summary of the column grouping used by the preprocessing step.
FEATURE_CONFIG = dict(
    NUMERIC=numeric_features,
    CATEGORICAL=categorical_features,
    TARGET=target_column,
)

preprocessor = create_preprocessor(
    numeric_cols=numeric_features,
    categorical_cols=categorical_features,
)

# Same L2 / SAGA logistic regression as the first model, fitted on the
# feature set prepared above in this cell.
# `multi_class` is deliberately not passed: the parameter is deprecated in
# scikit-learn 1.5+, and with a non-liblinear solver on a target with more
# than two classes the default already fits a multinomial model.
baseline_clf = LogisticRegression(
    penalty="l2",
    solver="saga",
    max_iter=2000,
    random_state=42
)


# Preprocessing and classifier are fitted together so the scaler/encoder
# statistics come from the training split only.
baseline_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("clf", baseline_clf)
])

baseline_pipeline.fit(X_train, y_train)
y_val_pred = baseline_pipeline.predict(X_val)
y_val_proba = baseline_pipeline.predict_proba(X_val)

val_acc = accuracy_score(y_val, y_val_pred)
# Pass the fitted class order explicitly so the probability columns are matched
# to labels by the model, not inferred from whichever labels occur in y_val.
val_logloss = log_loss(y_val, y_val_proba, labels=baseline_pipeline.classes_)
macro_f1 = f1_score(y_val, y_val_pred, average="macro")

print(f"Validation accuracy: {val_acc:.4f}")
print(f"Validation log loss: {val_logloss:.4f}")
print(f"Validation macro F1: {macro_f1:.4f}")

print(classification_report(y_val, y_val_pred))

# Saved under a separate file name so the first model's artefact is not overwritten.
joblib.dump(baseline_pipeline, "logistic_regression_2_model.pkl")



Validation accuracy: 0.6344
Validation log loss: 0.9474
Validation macro F1: 0.5824
                     precision    recall  f1-score   support

Insufficient_Weight       0.58      0.67      0.62       505
      Normal_Weight       0.48      0.43      0.45       617
     Obesity_Type_I       0.51      0.64      0.57       582
    Obesity_Type_II       0.65      0.89      0.75       650
   Obesity_Type_III       0.95      1.00      0.97       809
 Overweight_Level_I       0.53      0.29      0.38       485
Overweight_Level_II       0.48      0.25      0.33       504

           accuracy                           0.63      4152
          macro avg       0.60      0.60      0.58      4152
       weighted avg       0.62      0.63      0.61      4152



['logistic_regression_2_model.pkl']

In [None]:
# Score the test file with the first saved pipeline and write the submission CSV.
# The path comes from the `test` constant defined with the other config values,
# so test data is located the same way as the training data.
test_df_original = pd.read_csv(test)

# Read the ids up front: the submission requires them, so a missing "id" column
# fails here with a KeyError instead of after prediction has already run.
test_ids = test_df_original["id"]
test_df = test_df_original.drop(columns=["id"])

# NOTE: joblib.load unpickles arbitrary objects; only load artefacts that this
# notebook produced itself.
model = joblib.load("logistic_regression_model.pkl")

test_pred = model.predict(test_df)

submission = pd.DataFrame({
    "id": test_ids,
    target_column: test_pred
})

submission.to_csv("submission.csv", index=False)