In [8]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import gc

# Input/output locations, relative to the notebook's working directory.
TRAIN_PATH = "dataset/train.csv"
TEST_PATH = "dataset/test.csv"
# Destination of the prediction CSV written by the submission cell.
SUBMISSION_PATH = "dataset/submission_xgboost.csv"

In [9]:
def load_data(train_path, test_path):
    """Read the train and test CSVs and stack them into one frame.

    Test rows receive a ``y`` column filled with NaN, so after stacking the
    rows without a label can be recognised by ``y`` being missing.

    Parameters
    ----------
    train_path : str or path-like
        Location of the training CSV.
    test_path : str or path-like
        Location of the test CSV; it must contain an ``id`` column.

    Returns
    -------
    tuple
        ``(full_df, test_ids)`` where ``full_df`` holds the training rows
        followed by the test rows under a fresh 0..n-1 index, and
        ``test_ids`` is the ``id`` column of the test file.
    """
    print("Loading data...")
    labelled, unlabelled = (pd.read_csv(path) for path in (train_path, test_path))

    # Keep the ids of the rows we will have to predict for.
    test_ids = unlabelled['id']

    # Placeholder target so both frames share the same columns.
    unlabelled = unlabelled.assign(y=np.nan)
    full_df = pd.concat(
        [labelled, unlabelled],
        axis=0,
        ignore_index=True,
    )

    print(f"Combined data shape: {full_df.shape}")
    return full_df, test_ids

# Load both CSVs into one combined frame and keep the test ids for the submission.
full_df, test_ids = load_data(TRAIN_PATH, TEST_PATH)
# Preview the first rows of the combined frame.
full_df.head()


Loading data...
Combined data shape: (1000000, 18)


Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,0,42,technician,married,secondary,no,7,no,no,cellular,25,aug,117,3,-1,0,unknown,0.0
1,1,38,blue-collar,married,secondary,no,514,no,no,unknown,18,jun,185,1,-1,0,unknown,0.0
2,2,36,blue-collar,married,secondary,no,602,yes,no,unknown,14,may,111,2,-1,0,unknown,0.0
3,3,27,student,single,secondary,no,34,yes,no,unknown,28,may,10,2,-1,0,unknown,0.0
4,4,26,technician,married,secondary,no,889,yes,no,cellular,3,feb,902,1,-1,0,unknown,1.0


In [10]:
def feature_engineer(df):
    """Return an engineered copy of ``df``; the input frame is not modified.

    Transformations applied:

    * ``default``, ``housing`` and ``loan`` (each only if present) are mapped
      from ``'yes'``/``'no'`` to ``1``/``0``.
    * ``was_previously_contacted`` is added: 1 where ``pdays`` is not -1,
      otherwise 0.
    * Every column still of ``object`` dtype is cast to ``category``.

    Working on a copy keeps the notebook cell idempotent: re-running it on
    the same input gives the same result instead of re-mapping columns that
    were already encoded in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least a ``pdays`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformations above applied.
    """
    print("Starting feature engineering...")
    out = df.copy()

    binary_map = {'yes': 1, 'no': 0}
    for col in ['default', 'housing', 'loan']:
        # Skip columns that are already numeric: pushing 0/1 through
        # binary_map would replace every value with NaN.
        if col in out.columns and not pd.api.types.is_numeric_dtype(out[col]):
            out[col] = out[col].map(binary_map)

    out['was_previously_contacted'] = (out['pdays'] != -1).astype(int)

    for col in out.select_dtypes(include='object').columns:
        out[col] = out[col].astype('category')

    print("Feature engineering complete.")
    return out

# Run the feature-engineering step on the combined train+test frame.
engineered_df = feature_engineer(full_df)
# Preview the engineered columns.
engineered_df.head()


Starting feature engineering...
Feature engineering complete.


Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y,was_previously_contacted
0,0,42,technician,married,secondary,0,7,0,0,cellular,25,aug,117,3,-1,0,unknown,0.0,0
1,1,38,blue-collar,married,secondary,0,514,0,0,unknown,18,jun,185,1,-1,0,unknown,0.0,0
2,2,36,blue-collar,married,secondary,0,602,1,0,unknown,14,may,111,2,-1,0,unknown,0.0,0
3,3,27,student,single,secondary,0,34,1,0,unknown,28,may,10,2,-1,0,unknown,0.0,0
4,4,26,technician,married,secondary,0,889,1,0,cellular,3,feb,902,1,-1,0,unknown,1.0,0


In [11]:
def train_xgboost_model(full_df, test_ids, n_splits=5):
    """Train XGBoost with stratified K-fold CV and return averaged test predictions.

    Rows of ``full_df`` with a non-null ``y`` are used for training; rows
    with a null ``y`` are treated as the test set. Every column except
    ``id`` and ``y`` is used as a feature. For each fold a booster is trained
    with early stopping on the held-out part, scored by ROC AUC, and used to
    predict the test rows; the test predictions are averaged over folds.

    Parameters
    ----------
    full_df : pandas.DataFrame
        Combined train/test frame containing ``id`` and ``y`` columns.
    test_ids : any
        Accepted for interface compatibility; not used by this function.
    n_splits : int, default 5
        Number of stratified folds.

    Returns
    -------
    numpy.ndarray
        One averaged prediction per test row, in the row order of the test
        part of ``full_df``.
    """
    print("Starting model training with XGBoost (GPU)...")

    train_df = full_df[full_df['y'].notna()].copy()
    test_df = full_df[full_df['y'].isna()].copy()

    features = [col for col in train_df.columns if col not in ['id', 'y']]
    X = train_df[features]
    y = train_df['y']
    X_test = test_df[features]

    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'tree_method': 'hist',  # Use 'hist' instead of deprecated 'gpu_hist'
        'device': 'cuda',       # Tell XGBoost to use GPU
        'learning_rate': 0.01,
        'max_depth': 5,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'reg_alpha': 0.1,
        'reg_lambda': 0.1,
        'gamma': 0.1,
        'scale_pos_weight': 1,
        'seed': 42
    }

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    oof_preds = np.zeros(len(train_df))
    test_preds = np.zeros(len(test_df))
    fold_scores = []

    # The test matrix is identical for every fold, so build it once.
    dtest = xgb.DMatrix(X_test, enable_categorical=True)

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"--- Fold {fold+1}/{n_splits} ---")
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
        dval = xgb.DMatrix(X_val, label=y_val, enable_categorical=True)

        model = xgb.train(
            params,
            dtrain,
            num_boost_round=2000,
            evals=[(dval, "validation")],
            early_stopping_rounds=100,
            verbose_eval=100
        )

        # xgb.train hands back the booster from the LAST round, not the best
        # one, so restrict prediction to the trees up to the best iteration
        # found by early stopping.
        best_range = (0, model.best_iteration + 1)

        val_preds = model.predict(dval, iteration_range=best_range)
        oof_preds[val_idx] = val_preds
        fold_auc = roc_auc_score(y_val, val_preds)
        fold_scores.append(fold_auc)
        print(f"Fold {fold+1} AUC: {fold_auc:.5f}")

        test_preds += model.predict(dtest, iteration_range=best_range) / n_splits

        # Release per-fold objects before the next fold allocates its own.
        del X_train, y_train, X_val, y_val, dtrain, dval, model
        gc.collect()

    del dtest
    gc.collect()

    overall_auc = roc_auc_score(y, oof_preds)
    print(f"\nOverall OOF AUC: {overall_auc:.5f}")
    print(f"Mean Fold AUC: {np.mean(fold_scores):.5f} (+/- {np.std(fold_scores):.5f})")

    return test_preds

# Cross-validated training; the result is the fold-averaged prediction for each test row.
final_predictions = train_xgboost_model(engineered_df, test_ids)


Starting model training with XGBoost (GPU)...
--- Fold 1/5 ---
[0]	validation-auc:0.71673
[100]	validation-auc:0.95499
[200]	validation-auc:0.95721
[300]	validation-auc:0.95861
[400]	validation-auc:0.95972
[500]	validation-auc:0.96066
[600]	validation-auc:0.96150
[700]	validation-auc:0.96216
[800]	validation-auc:0.96274
[900]	validation-auc:0.96328
[1000]	validation-auc:0.96378
[1100]	validation-auc:0.96421
[1200]	validation-auc:0.96456
[1300]	validation-auc:0.96487
[1400]	validation-auc:0.96512
[1500]	validation-auc:0.96535
[1600]	validation-auc:0.96559
[1700]	validation-auc:0.96577
[1800]	validation-auc:0.96593
[1900]	validation-auc:0.96607
[1999]	validation-auc:0.96620
Fold 1 AUC: 0.96620
--- Fold 2/5 ---
[0]	validation-auc:0.71385
[100]	validation-auc:0.95455
[200]	validation-auc:0.95648
[300]	validation-auc:0.95773
[400]	validation-auc:0.95873
[500]	validation-auc:0.95953
[600]	validation-auc:0.96035
[700]	validation-auc:0.96095
[800]	validation-auc:0.96154
[900]	validation-auc:0.

In [12]:
def generate_submission(test_ids, predictions, file_path):
    """Write an ``id``/``y`` submission CSV and return a preview of it.

    Parameters
    ----------
    test_ids : sequence or pandas.Series
        Identifiers, one per predicted row.
    predictions : sequence or numpy.ndarray
        Predicted values matching ``test_ids``.
    file_path : str or path-like
        Where the CSV is written (without the index column).

    Returns
    -------
    pandas.DataFrame
        The first rows of the submission frame, as given by ``head()``.
    """
    print("Generating submission file...")

    columns = {'id': test_ids, 'y': predictions}
    submission = pd.DataFrame(columns)
    submission.to_csv(file_path, index=False)

    print(f"Submission file saved to {file_path}")
    preview = submission.head()
    return preview

# Write the submission CSV; the returned preview is displayed as the cell output.
generate_submission(test_ids, final_predictions, SUBMISSION_PATH)


Generating submission file...
Submission file saved to dataset/submission_xgboost.csv


Unnamed: 0,id,y
0,750000,0.004563
1,750001,0.143179
2,750002,0.000465
3,750003,0.000158
4,750004,0.034046
