In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import optuna


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv(r"D:\kaggle datasets\playground-series-s3e24\train.csv")
x_test = pd.read_csv(r"D:\kaggle datasets\playground-series-s3e24\test.csv")

In [3]:
df["BMI"] = df["weight(kg)"] / ((df["height(cm)"] / 100) ** 2)
df["waist_height_ratio"] = df["waist(cm)"] / df["height(cm)"]
df["cholesterol_hdl_ratio"] = df["Cholesterol"] / df["HDL"]
df["ldl_hdl_ratio"] = df["LDL"] / df["HDL"]
df["triglyceride_hdl_ratio"] = df["triglyceride"] / df["HDL"]

X = df.drop(columns=["id", "smoking"])
y = df["smoking"]

In [4]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_valid, y_train, y_valid = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

In [5]:
def objective(trial):
    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "use_label_encoder": False,
        "n_estimators": trial.suggest_int("n_estimators", 200, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True)
    }

    model = XGBClassifier(**params, random_state=42, n_jobs=-1)
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)
    preds = model.predict(X_valid)
    return accuracy_score(y_valid, preds)

In [6]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

print("Best parameters:", study.best_params)
print("Best accuracy:", study.best_value)

[I 2025-11-01 22:12:44,927] A new study created in memory with name: no-name-69f4bd95-fc74-4cc8-af4d-5088cc9def2b
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-11-01 22:12:50,454] Trial 0 finished with value: 0.7790405626020344 and parameters: {'n_estimators': 342, 'max_depth': 9, 'learning_rate': 0.07218246409406744, 'subsample': 0.9518897198398747, 'colsample_bytree': 0.8747359454675382, 'gamma': 3.9460379698034576, 'reg_alpha': 0.05566013410648711, 'reg_lambda': 0.004314220754202066}. Best is trial 0 with value: 0.7790405626020344.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-11-01 22:12:57,153] Trial 1 finished with value: 0.7743940725857089 and parameters: {'n_estimators': 208, 'max_depth': 10, 'learning_rate': 0.010311761256626117, 'subsample': 0.77495333588314, 'colsample_bytree': 0.5883652559960402, 'gamma': 2.552049448567733, 'reg_alpha': 9.292374066826354e-06, 'r

Best parameters: {'n_estimators': 630, 'max_depth': 9, 'learning_rate': 0.03341911483903381, 'subsample': 0.9040191061016054, 'colsample_bytree': 0.5406749741409635, 'gamma': 0.7037362553151528, 'reg_alpha': 1.0903560608329809e-07, 'reg_lambda': 4.888810781048282e-07}
Best accuracy: 0.7828393821424087


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-11-01 18:29:33,510] Trial 13 finished with value: 0.780641717945498 and parameters: {'n_estimators': 1000, 'max_depth': 9, 'learning_rate': 0.02174841055861922, 'subsample': 0.5168068022803978, 'colsample_bytree': 0.6908573976077236, 'gamma': 0.18712320021873552, 'reg_alpha': 2.908801506010217e-06, 'reg_lambda': 5.630348494561821}. Best is trial 11 with value: 0.7807045083511239.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-11-01 18:29:47,400] Trial 14 finished with value: 0.7826196157227175 and parameters: {'n_estimators': 813, 'max_depth': 8, 'learning_rate': 0.025305793337317705, 'subsample': 0.506184905340631, 'colsample_bytree': 0.7262529425308694, 'gamma': 0.03496716930861399, 'reg_alpha': 9.334512747637693e-06, 'reg_lambda': 5.887406174231734}. Best is trial 14 with value: 0.7826196157227175.
Parameters: { "use_label_enc

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-11-01 18:32:33,510] Trial 26 finished with value: 0.7803591611201809 and parameters: {'n_estimators': 830, 'max_depth': 5, 'learning_rate': 0.024646654963985718, 'subsample': 0.6414443505867148, 'colsample_bytree': 0.7898579336791169, 'gamma': 1.7600908631682974, 'reg_alpha': 4.3961687982297903e-07, 'reg_lambda': 0.0002197013094604724}. Best is trial 14 with value: 0.7826196157227175.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-11-01 18:32:42,707] Trial 27 finished with value: 0.7801079994976767 and parameters: {'n_estimators': 676, 'max_depth': 7, 'learning_rate': 0.015107659377525885, 'subsample': 0.7541185462936127, 'colsample_bytree': 0.8779725683888379, 'gamma': 1.1400240153914807, 'reg_alpha': 8.04535256236618e-06, 'reg_lambda': 1.4602151095094718}. Best is trial 14 with value: 0.7826196157227175.
Parameters: { "use_labe

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-11-01 18:34:40,083] Trial 39 finished with value: 0.7796998618611076 and parameters: {'n_estimators': 861, 'max_depth': 7, 'learning_rate': 0.09300816820335069, 'subsample': 0.7057804395907151, 'colsample_bytree': 0.8499489621456732, 'gamma': 0.8021546657199212, 'reg_alpha': 0.04805940169053744, 'reg_lambda': 0.0006632726875053539}. Best is trial 14 with value: 0.7826196157227175.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-11-01 18:34:52,965] Trial 40 finished with value: 0.780955669973628 and parameters: {'n_estimators': 911, 'max_depth': 7, 'learning_rate': 0.012882801840016792, 'subsample': 0.8163374605391676, 'colsample_bytree': 0.9422041257344227, 'gamma': 3.175379639252462, 'reg_alpha': 0.0005045640647385102, 'reg_lambda': 2.2167842235169804e-06}. Best is trial 14 with value: 0.7826196157227175.
Parameters: { "use_label

In [7]:
best_params = study.best_params
final_model = XGBClassifier(**best_params, random_state=42, n_jobs=-1)
final_model.fit(X_train, y_train)


feature_names = X.columns
importances = final_model.feature_importances_

importance_df = pd.DataFrame({
    "feature": feature_names,
    "importance": importances
}).sort_values(by="importance", ascending=False)


In [11]:
import joblib
x_test["BMI"] = x_test["weight(kg)"] / ((x_test["height(cm)"] / 100) ** 2)
x_test["waist_height_ratio"] = x_test["waist(cm)"] / x_test["height(cm)"]
x_test["cholesterol_hdl_ratio"] = x_test["Cholesterol"] / x_test["HDL"]
x_test["ldl_hdl_ratio"] = x_test["LDL"] / x_test["HDL"]
x_test["triglyceride_hdl_ratio"] = x_test["triglyceride"] / x_test["HDL"]
x_test = x_test.drop(columns=["id"])

joblib.dump(final_model, "model2.pkl")
loaded_model = joblib.load("model2.pkl")
pred = loaded_model.predict(x_test)
print(pred)

[1 1 1 ... 1 1 1]


In [12]:
submission_df = pd.DataFrame({
    'id':x_test['id'],
    'smoking': pred
})
# file_path = r"D:\kaggle datasets\playground s5ep10\playground-series-s5e10\my_submission5.csv"
# submission_df.to_csv(file_path,index=False)

KeyError: 'id'