In [9]:
import pandas as pd
import numpy as np
import xgboost as xgb
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import gc

In [10]:
TRAIN_PATH = 'dataset/train.csv'
TEST_PATH = 'dataset/test.csv'
SUBMISSION_PATH = 'dataset/final_submission.csv'
N_TRIALS = 50

In [11]:
def load_and_prep_data(train_path, test_path):
    print("Loading and preparing data...")
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    test_ids = test_df['id']
    test_df['y'] = np.nan
    full_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

    binary_map = {'yes': 1, 'no': 0}
    for col in ['default', 'housing', 'loan']:
        full_df[col] = full_df[col].map(binary_map)
    full_df['was_previously_contacted'] = (full_df['pdays'] != -1).astype(int)
    for col in full_df.select_dtypes(include='object').columns:
        full_df[col] = full_df[col].astype('category')
    
    return full_df, test_ids

full_df, test_ids = load_and_prep_data(TRAIN_PATH, TEST_PATH)
display(full_df.head())

Loading and preparing data...


Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y,was_previously_contacted
0,0,42,technician,married,secondary,0,7,0,0,cellular,25,aug,117,3,-1,0,unknown,0.0,0
1,1,38,blue-collar,married,secondary,0,514,0,0,unknown,18,jun,185,1,-1,0,unknown,0.0,0
2,2,36,blue-collar,married,secondary,0,602,1,0,unknown,14,may,111,2,-1,0,unknown,0.0,0
3,3,27,student,single,secondary,0,34,1,0,unknown,28,may,10,2,-1,0,unknown,0.0,0
4,4,26,technician,married,secondary,0,889,1,0,cellular,3,feb,902,1,-1,0,unknown,1.0,0


In [12]:
train_df = full_df[full_df['y'].notna()]
test_df = full_df[full_df['y'].isna()]

features = [col for col in train_df.columns if col not in ['id', 'y']]
X = train_df[features]
y = train_df['y']
X_test = test_df[features]

print("Train shape:", X.shape)
print("Test shape:", X_test.shape)

Train shape: (750000, 17)
Test shape: (250000, 17)


In [13]:
def objective(trial, X, y):
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'booster': 'gbtree',
        'n_estimators': 2000,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 4, 8),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'lambda': trial.suggest_float('lambda', 0, 5),
        'alpha': trial.suggest_float('alpha', 0, 5),
        'use_label_encoder': False,
        'enable_categorical': True,
        'seed': 42,
        'n_jobs': -1
    }
    
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = []

    for train_idx, val_idx in skf.split(X, y):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model = xgb.XGBClassifier(**params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            early_stopping_rounds=100,
            verbose=False
        )
        
        preds = model.predict_proba(X_val)[:, 1]
        fold_scores.append(roc_auc_score(y_val, preds))
        
    return np.mean(fold_scores)


In [14]:
study = optuna.create_study(direction='maximize')
study.optimize(lambda trial: objective(trial, X, y), n_trials=N_TRIALS)

print("\n--- Optuna Study Complete ---")
print(f"Best Trial AUC: {study.best_value}")
print("Best hyperparameters:")
print(study.best_params)


[I 2025-08-20 15:06:08,172] A new study created in memory with name: no-name-7e187e12-a1bc-4372-ad04-db9b2ad6b709
[W 2025-08-20 15:06:08,275] Trial 0 failed with parameters: {'learning_rate': 0.025547857785640362, 'max_depth': 8, 'subsample': 0.772092041665337, 'colsample_bytree': 0.949600578196718, 'gamma': 4.62997535237995, 'lambda': 0.6832655307917246, 'alpha': 1.404886004923539} because of the following error: TypeError("XGBClassifier.fit() got an unexpected keyword argument 'early_stopping_rounds'").
Traceback (most recent call last):
  File "c:\Users\bharg\miniconda3\Lib\site-packages\optuna\study\_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\bharg\AppData\Local\Temp\ipykernel_56668\1726472384.py", line 2, in <lambda>
    study.optimize(lambda trial: objective(trial, X, y), n_trials=N_TRIALS)
                                 ^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\bharg\AppData\Local\Temp\ipykernel_5666

TypeError: XGBClassifier.fit() got an unexpected keyword argument 'early_stopping_rounds'