We are using CatBoost model to train and predict the obesity of individuals.

In [1]:
# Part 1: Install CatBoost
!pip install catboost --quiet


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Part 2: Imports & basic setup
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

from catboost import CatBoostClassifier, Pool

RANDOM_STATE = 42
TARGET_COL = "NObeyesdad"

pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 120)


In [3]:
# Part 3: Load data
train_path = "sample_data/train.csv"
test_path = "sample_data/test.csv"
sub_path = "sample_data/sample_submission.csv"

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
sample_submission = pd.read_csv(sub_path)

print("Train shape:", train_df.shape)
print("Test shape :", test_df.shape)
print("\nColumns in train:")
print(train_df.columns.tolist())

train_df.head()


Train shape: (20758, 18)
Test shape : (13840, 17)

Columns in train:
['id', 'Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS', 'NObeyesdad']


Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


In [4]:
# Part 4: Basic preprocessing – split features/target & detect categorical columns
# Separate target
y = train_df[TARGET_COL]
X = train_df.drop(columns=[TARGET_COL])

# If there is an 'id' column that should not be used as a feature, drop it.
if "id" in X.columns:
    X = X.drop(columns=["id"])
    print("Dropped 'id' from features.")

# Same for test (keep a copy of ids if needed)
test_ids = None
if "id" in test_df.columns:
    test_ids = test_df["id"].copy()
    test_df = test_df.drop(columns=["id"])
    print("Dropped 'id' from test features and stored as test_ids.")

# Detect categorical vs numeric features
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()

print("\nNumeric columns:", numeric_cols)
print("Categorical columns:", categorical_cols)

print("\nTarget distribution:")
print(y.value_counts())


Dropped 'id' from features.
Dropped 'id' from test features and stored as test_ids.

Numeric columns: ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
Categorical columns: ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']

Target distribution:
NObeyesdad
Obesity_Type_III       4046
Obesity_Type_II        3248
Normal_Weight          3082
Obesity_Type_I         2910
Insufficient_Weight    2523
Overweight_Level_II    2522
Overweight_Level_I     2427
Name: count, dtype: int64


In [5]:
# Part 5: Train/Validation split
# Stratify by target to keep class distribution similar
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y
)

print("X_train:", X_train.shape)
print("X_val  :", X_val.shape)

# For CatBoost, we pass categorical feature indices (positions, not names)
cat_feature_indices = [X_train.columns.get_loc(col) for col in categorical_cols]
print("\nCategorical feature indices (CatBoost):", cat_feature_indices)


X_train: (16606, 16)
X_val  : (4152, 16)

Categorical feature indices (CatBoost): [0, 4, 5, 8, 9, 11, 14, 15]


In [6]:
# Part 6: Create CatBoost Pools
# Pools hold data + info about categorical columns for CatBoost
train_pool = Pool(
    data=X_train,
    label=y_train,
    cat_features=cat_feature_indices
)

val_pool = Pool(
    data=X_val,
    label=y_val,
    cat_features=cat_feature_indices
)


In [7]:
# Part 7: Define and train CatBoostClassifier
# Multi-class classification for obesity levels
model = CatBoostClassifier(
    loss_function="MultiClass",      # multi-class classification
    eval_metric="TotalF1",           # good metric for imbalanced multi-class
    learning_rate=0.1,
    depth=6,
    l2_leaf_reg=3.0,
    random_seed=RANDOM_STATE,
    iterations=1000,                 # max iterations (trees)
    early_stopping_rounds=100,       # stop if no improvement on val
    verbose=100                      # print every 100 iterations
)

model.fit(
    train_pool,
    eval_set=val_pool,
    use_best_model=True
)


0:	learn: 0.6962536	test: 0.7092343	best: 0.7092343 (0)	total: 434ms	remaining: 7m 13s
100:	learn: 0.9025213	test: 0.8952932	best: 0.8952936 (99)	total: 37.5s	remaining: 5m 33s
200:	learn: 0.9167682	test: 0.9047369	best: 0.9056865 (180)	total: 1m	remaining: 4m 1s
300:	learn: 0.9222686	test: 0.9066090	best: 0.9068982 (268)	total: 1m 22s	remaining: 3m 12s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.9068982148
bestIteration = 268

Shrink model to first 269 iterations.


<catboost.core.CatBoostClassifier at 0x7c576dc64740>

In [8]:
# Part 8: Evaluation on validation set
# CatBoost model.predict returns class labels directly for MultiClass
y_val_pred = model.predict(val_pool)

# model.predict returns an array of shape (n_samples, 1), so flatten:
y_val_pred = y_val_pred.reshape(-1)

val_acc = accuracy_score(y_val, y_val_pred)
val_macro_f1 = f1_score(y_val, y_val_pred, average="macro")

print(f"Validation Accuracy : {val_acc:.4f}")
print(f"Validation Macro-F1 : {val_macro_f1:.4f}")

print("\nClassification report:")
print(classification_report(y_val, y_val_pred))

print("\nConfusion matrix:")
cm = confusion_matrix(y_val, y_val_pred, labels=model.classes_)
cm_df = pd.DataFrame(cm, index=model.classes_, columns=model.classes_)
cm_df


Validation Accuracy : 0.9073
Validation Macro-F1 : 0.8972

Classification report:
                     precision    recall  f1-score   support

Insufficient_Weight       0.93      0.96      0.95       505
      Normal_Weight       0.90      0.88      0.89       617
     Obesity_Type_I       0.88      0.89      0.89       582
    Obesity_Type_II       0.96      0.97      0.97       650
   Obesity_Type_III       0.99      1.00      1.00       809
 Overweight_Level_I       0.81      0.76      0.79       485
Overweight_Level_II       0.80      0.82      0.81       504

           accuracy                           0.91      4152
          macro avg       0.90      0.90      0.90      4152
       weighted avg       0.91      0.91      0.91      4152


Confusion matrix:


Unnamed: 0,Insufficient_Weight,Normal_Weight,Obesity_Type_I,Obesity_Type_II,Obesity_Type_III,Overweight_Level_I,Overweight_Level_II
Insufficient_Weight,485,19,0,0,0,1,0
Normal_Weight,34,541,1,0,0,33,8
Obesity_Type_I,1,0,519,17,5,8,32
Obesity_Type_II,0,0,20,630,0,0,0
Obesity_Type_III,0,0,2,1,806,0,0
Overweight_Level_I,1,39,13,0,0,371,61
Overweight_Level_II,0,3,35,5,0,46,415


In [9]:
# Part 9: Train final model on full training data
# (re-fit using best params and best iteration)
# We re-create Pools using the full train set X, y
full_pool = Pool(
    data=X,
    label=y,
    cat_features=[X.columns.get_loc(col) for col in categorical_cols]
)

# Get the best iteration from the previous fit (if early stopping was used)
best_iter = model.get_best_iteration()
print("Best iteration from validation training:", best_iter)

# Rebuild a new model with the same params but limit iterations to best_iter
final_model = CatBoostClassifier(
    loss_function="MultiClass",
    eval_metric="TotalF1",
    learning_rate=model.get_params()["learning_rate"],
    depth=model.get_params()["depth"],
    l2_leaf_reg=model.get_params()["l2_leaf_reg"],
    random_seed=RANDOM_STATE,
    iterations=best_iter if best_iter > 0 else model.get_params()["iterations"],
    verbose=100
)

final_model.fit(
    full_pool,
    verbose=100
)

# Optionally save the final model
final_model.save_model("catboost_obesity_model.cbm")
print("Final CatBoost model saved to catboost_obesity_model.cbm")


Best iteration from validation training: 268
0:	learn: 0.7249645	total: 264ms	remaining: 1m 10s
100:	learn: 0.9034887	total: 27.7s	remaining: 45.7s
200:	learn: 0.9162604	total: 54.2s	remaining: 18.1s
267:	learn: 0.9202500	total: 1m 17s	remaining: 0us
Final CatBoost model saved to catboost_obesity_model.cbm


In [10]:
# Part 10: Predict on test set and create submission
# Build Pool for test data
test_pool = Pool(
    data=test_df,
    cat_features=[test_df.columns.get_loc(col) for col in categorical_cols]
)

test_preds = final_model.predict(test_pool)
test_preds = test_preds.reshape(-1)  # flatten

# Prepare submission
submission = sample_submission.copy()
submission[TARGET_COL] = test_preds

# If competition expects exactly the same column order as sample_submission,
# we keep those columns:
submission.to_csv("catboost_submission.csv", index=False)

print("Submission file 'catboost_submission.csv' has been created.")
submission.head()


Submission file 'catboost_submission.csv' has been created.


Unnamed: 0,id,NObeyesdad
0,20758,Obesity_Type_II
1,20759,Overweight_Level_I
2,20760,Obesity_Type_III
3,20761,Obesity_Type_I
4,20762,Obesity_Type_III
