In [29]:
# STEP 1 — IMPORTS
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

import xgboost as xgb


In [30]:
# STEP 2 — LOAD DATA
# Read the training and test tables from CSV.
train = pd.read_csv(r"C:\Users\Anannya\demo-project\data\training_data.csv")
test = pd.read_csv(r"C:\Users\Anannya\demo-project\data\test_data.csv")

# Split the training table: `y` is the "target" column, `X` is every other column.
y = train["target"]
X = train.drop(columns="target")

# Later cells work on `test_df`, a copy, so the loaded `test` frame stays as read.
test_df = test.copy()

In [31]:
# STEP 3 — FEATURE TYPE IDENTIFICATION
# Each column of X lands in exactly one bucket; the checks run in this order:
#   1. object dtype               -> categorical
#   2. exactly 2 distinct values  -> binary
#   3. any numeric dtype          -> numeric
#   4. anything else              -> unclassified (reported, never silently lost)

numeric_cols = []
binary_cols = []
categorical_cols = []
unclassified_cols = []

for col in X.columns:
    column = X[col]

    if column.dtype == 'object':
        categorical_cols.append(col)

    # nunique() ignores NaN by default, so a two-valued column with
    # missing entries is still treated as binary.
    elif column.nunique() == 2:
        binary_cols.append(col)

    # is_numeric_dtype accepts every integer/float width (int32, float32,
    # nullable ints, ...) instead of only the literal 'int64'/'float64'.
    elif pd.api.types.is_numeric_dtype(column):
        numeric_cols.append(col)

    else:
        unclassified_cols.append(col)

print("Numeric Columns:", numeric_cols)
print("Binary Columns:", binary_cols)
print("Categorical Columns:", categorical_cols)

# Only printed when something fell through, so the usual output is unchanged.
if unclassified_cols:
    print("Unclassified Columns:", unclassified_cols)


Numeric Columns: ['feature_1', 'feature_2', 'feature_3', 'feature_7', 'feature_9', 'feature_10', 'feature_12', 'feature_13', 'feature_17', 'feature_23', 'feature_24', 'feature_25', 'feature_26', 'feature_29', 'feature_31', 'feature_33', 'feature_34', 'feature_35', 'feature_36', 'feature_37', 'feature_38', 'feature_40', 'feature_43', 'feature_45', 'feature_47', 'feature_48', 'feature_50']
Binary Columns: ['feature_4', 'feature_5', 'feature_6', 'feature_8', 'feature_11', 'feature_14', 'feature_15', 'feature_16', 'feature_18', 'feature_19', 'feature_20', 'feature_21', 'feature_22', 'feature_27', 'feature_28', 'feature_30', 'feature_32', 'feature_39', 'feature_41', 'feature_42', 'feature_44', 'feature_46', 'feature_49']
Categorical Columns: ['id']


In [32]:
# Show the feature names currently held by X as a plain Python list
print(list(X.columns))


['id', 'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9', 'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_14', 'feature_15', 'feature_16', 'feature_17', 'feature_18', 'feature_19', 'feature_20', 'feature_21', 'feature_22', 'feature_23', 'feature_24', 'feature_25', 'feature_26', 'feature_27', 'feature_28', 'feature_29', 'feature_30', 'feature_31', 'feature_32', 'feature_33', 'feature_34', 'feature_35', 'feature_36', 'feature_37', 'feature_38', 'feature_39', 'feature_40', 'feature_41', 'feature_42', 'feature_43', 'feature_44', 'feature_45', 'feature_46', 'feature_47', 'feature_48', 'feature_49', 'feature_50']


In [41]:
# STEP 4 — CLEAN DATA & DEFINE FEATURE SET

# Exclude the `id` column from both frames. errors='ignore' turns the drop
# into a no-op when the column is already gone, so the cell can be re-run.
# The trailing .copy() leaves each name bound to an independent frame.
X = X.drop(columns=['id'], errors='ignore').copy()
test_df = test_df.drop(columns=['id'], errors='ignore').copy()

# No encoding step follows; report how many feature columns remain.
print("Dropped id if present. Final feature count:", X.shape[1])


Dropped id if present. Final feature count: 50


In [34]:
# Zero-filled buffers: one out-of-fold slot per training row,
# one averaged-prediction slot per test row.
oof_preds = np.zeros(X.shape[0])
test_preds = np.zeros(test_df.shape[0])

# Stratified 5-fold splitter; shuffling is seeded so the folds are repeatable.
kf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)


In [35]:
# Keyword arguments handed to xgb.XGBClassifier for every fold.
xgb_params = dict(
    n_estimators=1000,
    learning_rate=0.02,
    max_depth=5,
    subsample=0.85,
    colsample_bytree=0.85,
    objective='binary:logistic',
    eval_metric='auc',
    tree_method='hist',
    random_state=42,
)

In [36]:
# Sanity check: show the column index currently carried by the test frame
test_columns = test_df.columns
print(test_columns)


Index(['id', 'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5',
       'feature_6', 'feature_7', 'feature_8', 'feature_9', 'feature_10',
       'feature_11', 'feature_12', 'feature_13', 'feature_14', 'feature_15',
       'feature_16', 'feature_17', 'feature_18', 'feature_19', 'feature_20',
       'feature_21', 'feature_22', 'feature_23', 'feature_24', 'feature_25',
       'feature_26', 'feature_27', 'feature_28', 'feature_29', 'feature_30',
       'feature_31', 'feature_32', 'feature_33', 'feature_34', 'feature_35',
       'feature_36', 'feature_37', 'feature_38', 'feature_39', 'feature_40',
       'feature_41', 'feature_42', 'feature_43', 'feature_44', 'feature_45',
       'feature_46', 'feature_47', 'feature_48', 'feature_49', 'feature_50'],
      dtype='object')


In [43]:
# STEP 5 — CROSS-VALIDATED TRAINING
# Trains one XGBoost model per fold, fills the out-of-fold predictions for the
# held-out rows, and averages the per-fold test predictions.

# Take the fold count from the splitter instead of hard-coding it.
n_folds = kf.get_n_splits()

# Predict on the test frame using exactly the training columns, in training
# order, so extra columns added to test_df elsewhere cannot break the model.
feature_cols = X.columns

# Start from clean accumulators: `test_preds +=` below would otherwise stack
# on top of values left over from a previous run of this cell.
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(test_df))

for fold, (tr, val) in enumerate(kf.split(X, y)):
    print(f"\n========== Fold {fold+1} ==========")

    # kf.split yields positional indices, so select by position with .iloc
    # (label-based y[tr] is only correct when y has a default RangeIndex).
    X_tr, X_val = X.iloc[tr], X.iloc[val]
    y_tr, y_val = y.iloc[tr], y.iloc[val]

    # Train model
    model = xgb.XGBClassifier(**xgb_params)
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        verbose=False
    )

    # Probability of the positive class for the held-out rows
    oof_preds[val] = model.predict_proba(X_val)[:, 1]

    # Test predictions averaged over the folds
    test_preds += model.predict_proba(test_df[feature_cols])[:, 1] / n_folds

    # Fold metrics (Gini = 2 * AUC - 1)
    fold_auc = roc_auc_score(y_val, oof_preds[val])
    fold_gini = 2 * fold_auc - 1

    print(f"Fold AUC:  {fold_auc:.5f} | Fold Gini: {fold_gini:.5f}")



Fold AUC:  0.63550 | Fold Gini: 0.27099

Fold AUC:  0.63582 | Fold Gini: 0.27164

Fold AUC:  0.63732 | Fold Gini: 0.27464

Fold AUC:  0.64317 | Fold Gini: 0.28634

Fold AUC:  0.64452 | Fold Gini: 0.28904


In [44]:
# STEP 6 — FINAL OUT-OF-FOLD METRICS
# roc_auc_score comes from the imports cell at the top of the notebook, so it
# is not re-imported here.

# Score all out-of-fold predictions at once against the full target.
overall_auc = roc_auc_score(y, oof_preds)
overall_gini = 2 * overall_auc - 1  # Gini = 2 * AUC - 1

print("\n================ FINAL METRICS ================")
print(f"OOF AUC:  {overall_auc:.6f}")
print(f"OOF Gini: {overall_gini:.6f}")



OOF AUC:  0.639253
OOF Gini: 0.278506


In [45]:
# ==============================
# XGBOOST TEST PROBABILITIES
# ==============================

# Build a separate output frame rather than adding a column to test_df:
# test_df is the feature matrix used for prediction, and an extra
# "target_probability" column there would break a re-run of the training cell.
# NOTE(review): test_df has had `id` dropped in the cleaning step, so the saved
# rows carry no identifier — confirm whether `id` should be written as well.
test_with_prob = test_df.assign(target_probability=test_preds)

# Save to CSV with desired name
test_with_prob.to_csv(
    r"C:\Users\Anannya\demo-project\data\xgboost_test_prob.csv",
    index=False
)

print("XGBoost test probabilities saved successfully!")
print(test_with_prob.head())


XGBoost test probabilities saved successfully!
   feature_1  feature_2  feature_3  feature_4  feature_5  feature_6  \
0          0         14          1          1          0          0   
1          0          9          1          1          0          0   
2          0          9          1          1          0          0   
3          0          8          1          1          0          1   
4          0          7          1          1          1          0   

   feature_7  feature_8  feature_9  feature_10  ...  feature_42  feature_43  \
0          1        NaN        3.0         0.2  ...         1.0         0.8   
1         11        NaN        2.0         0.7  ...         0.0         0.9   
2         14        NaN        3.0         0.2  ...         1.0         0.9   
3          1        1.0        3.0         0.4  ...         1.0         0.7   
4          0        0.0        3.0         0.4  ...         1.0         0.6   

   feature_44  feature_45  feature_46  feature_47  