In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

df_train = pd.read_csv("/content/drive/MyDrive/ait-511-course-project-1-obesity-risk/train.csv")

df_train['BMI'] = df_train['Weight'] / (df_train['Height'] ** 2)

X = df_train.drop(['id', 'WeightCategory'], axis=1)
y = df_train['WeightCategory']

le = LabelEncoder()
y_encoded = le.fit_transform(y)

X_encoded = pd.get_dummies(X, drop_first=True)

X_train, X_val, y_train, y_val = train_test_split(
    X_encoded, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

xgb_model = XGBClassifier(
    objective='multi:softmax',
    num_class=len(le.classes_),
    eval_metric='merror',
    use_label_encoder=False,
    random_state=42,
    n_jobs=-1
)

param_grid = {
    'n_estimators': [300, 400, 500],
    'learning_rate': [0.05, 0.1, 0.3,],
    'max_depth': [3,5,7],
    'subsample': [0.7,0.8]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv,
    verbose=1,
    n_jobs=-1
)

print("Starting XGBoost Hyperparameter Tuning...")
grid_search.fit(X_train, y_train)

print("\nBest Parameters found by Grid Search:")
print(grid_search.best_params_)

print(f"\nBest Cross-Validation Accuracy: {grid_search.best_score_:.4f}")

best_xgb_model = grid_search.best_estimator_
y_pred = best_xgb_model.predict(X_val)

final_accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy with Tuned XGBoost: {final_accuracy:.4f}")

Starting XGBoost Hyperparameter Tuning...
Fitting 5 folds for each of 54 candidates, totalling 270 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Best Parameters found by Grid Search:
{'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 500, 'subsample': 0.7}

Best Cross-Validation Accuracy: 0.9047
Validation Accuracy with Tuned XGBoost: 0.9038


In [2]:
x_test = pd.read_csv("/content/drive/MyDrive/ait-511-course-project-1-obesity-risk/test.csv")
test_ids = x_test['id']
x_test = x_test.drop(['id'], axis=1)
x_test['BMI'] = x_test['Weight'] / (x_test['Height'] ** 2)
x_test_encoded = pd.get_dummies(x_test, drop_first=True)
y_pred = best_xgb_model.predict(x_test_encoded)
y_test_pred = le.inverse_transform(y_pred)
submission = pd.DataFrame({
    'id': test_ids,
    'WeightCategory': y_test_pred
})
submission.to_csv('/content/drive/MyDrive/ait-511-course-project-1-obesity-risk/submission_xg_bmi.csv', index=False)