In [1]:
import pandas as pd
import numpy as np


In [2]:
train = pd.read_csv('./data/dftrain.csv')
test = pd.read_csv('./data/dftest.csv')
extra = pd.read_csv('./data/bank-full.csv', delimiter=';')
print(train.head())


   id  age          job  marital  education default  balance housing loan  \
0   0   42   technician  married  secondary      no        7      no   no   
1   1   38  blue-collar  married  secondary      no      514      no   no   
2   2   36  blue-collar  married  secondary      no      602     yes   no   
3   3   27      student   single  secondary      no       34     yes   no   
4   4   26   technician  married  secondary      no      889     yes   no   

    contact  day month  duration  campaign  pdays  previous poutcome  y  
0  cellular   25   aug       117         3     -1         0  unknown  0  
1   unknown   18   jun       185         1     -1         0  unknown  0  
2   unknown   14   may       111         2     -1         0  unknown  0  
3   unknown   28   may        10         2     -1         0  unknown  0  
4  cellular    3   feb       902         1     -1         0  unknown  1  


In [3]:
train.isnull().sum()

id           0
age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 18 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   id         750000 non-null  int64 
 1   age        750000 non-null  int64 
 2   job        750000 non-null  object
 3   marital    750000 non-null  object
 4   education  750000 non-null  object
 5   default    750000 non-null  object
 6   balance    750000 non-null  int64 
 7   housing    750000 non-null  object
 8   loan       750000 non-null  object
 9   contact    750000 non-null  object
 10  day        750000 non-null  int64 
 11  month      750000 non-null  object
 12  duration   750000 non-null  int64 
 13  campaign   750000 non-null  int64 
 14  pdays      750000 non-null  int64 
 15  previous   750000 non-null  int64 
 16  poutcome   750000 non-null  object
 17  y          750000 non-null  int64 
dtypes: int64(9), object(9)
memory usage: 103.0+ MB


In [5]:
# EDA - Correlation Analysis
import matplotlib.pyplot as plt
import seaborn as sns

# Select numerical features for correlation
num_features = train.select_dtypes(include=[np.number]).columns.tolist()

print("=" * 60)
print("CORRELATION WITH TARGET")
print("=" * 60)
correlations = train[num_features].corr()['y'].sort_values(ascending=False)
print(correlations)

# Heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(train[num_features].corr(), annot=False, cmap='coolwarm', center=0)
plt.title('Feature Correlation Heatmap')
plt.tight_layout()
plt.show()


ModuleNotFoundError: No module named 'seaborn'

In [None]:
# EDA - Categorical Features
categorical_cols = train.select_dtypes(include=['object']).columns.tolist()

print("=" * 60)
print("CATEGORICAL FEATURES")
print("=" * 60)

for col in categorical_cols:
    print(f"\n{col.upper()}:")
    print(train[col].value_counts().head(10))
    print(f"Unique values: {train[col].nunique()}")


In [None]:
# EDA - Numerical Features Distribution
import matplotlib.pyplot as plt
import seaborn as sns

numerical_cols = train.select_dtypes(include=[np.number]).columns.tolist()
if 'y' in numerical_cols:
    numerical_cols.remove('y')

print("=" * 60)
print("NUMERICAL FEATURES STATISTICS")
print("=" * 60)
print(train[numerical_cols].describe())

# Plot distributions
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
axes = axes.ravel()

for idx, col in enumerate(numerical_cols[:6]):
    axes[idx].hist(train[col], bins=50, edgecolor='black', alpha=0.7)
    axes[idx].set_title(f'{col} Distribution')
    axes[idx].set_xlabel(col)
    axes[idx].set_ylabel('Frequency')

plt.tight_layout()
plt.show()


In [6]:
# EDA - Dataset Overview
print("=" * 60)
print("DATASET OVERVIEW")
print("=" * 60)
print(f"\nTrain shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"Extra shape: {extra.shape}")

print("\n" + "=" * 60)
print("TARGET DISTRIBUTION")
print("=" * 60)
print(train['y'].value_counts())
print(f"\nTarget balance: {train['y'].value_counts(normalize=True).to_dict()}")


DATASET OVERVIEW

Train shape: (750000, 18)
Test shape: (250000, 17)
Extra shape: (45211, 17)

TARGET DISTRIBUTION
y
0    659512
1     90488
Name: count, dtype: int64

Target balance: {0: 0.8793493333333333, 1: 0.12065066666666667}


In [7]:
# Feature engineering with trigonometric features
train['_duration_sin'] = np.sin(2*np.pi * train['duration'] / 540).astype('float32')
train['_duration_cos'] = np.cos(2*np.pi * train['duration'] / 540).astype('float32')
train['_balance_log'] = (np.sign(train['balance']) * np.log1p(np.abs(train['balance']))).astype('float32')
train['_balance_sin'] = np.sin(2*np.pi * train['balance'] / 1000).astype('float32')
train['_balance_cos'] = np.cos(2*np.pi * train['balance'] / 1000).astype('float32')
train['_age_sin'] = np.sin(2*np.pi * train['age'] / 10).astype('float32')
train['_pdays_sin'] = np.sin(2*np.pi * train['pdays'] / 7).astype('float32')

test['_duration_sin'] = np.sin(2*np.pi * test['duration'] / 540).astype('float32')
test['_duration_cos'] = np.cos(2*np.pi * test['duration'] / 540).astype('float32')
test['_balance_log'] = (np.sign(test['balance']) * np.log1p(np.abs(test['balance']))).astype('float32')
test['_balance_sin'] = np.sin(2*np.pi * test['balance'] / 1000).astype('float32')
test['_balance_cos'] = np.cos(2*np.pi * test['balance'] / 1000).astype('float32')
test['_age_sin'] = np.sin(2*np.pi * test['age'] / 10).astype('float32')
test['_pdays_sin'] = np.sin(2*np.pi * test['pdays'] / 7).astype('float32')

# Convert target
train['y'] = (train['y'] == 'yes').astype('int32')
print(f"Target distribution: {train['y'].value_counts().to_dict()}")
print(f"Features created: {[c for c in train.columns if c.startswith('_')]}")


Target distribution: {0: 750000}
Features created: ['_duration_sin', '_duration_cos', '_balance_log', '_balance_sin', '_balance_cos', '_age_sin', '_pdays_sin']


In [8]:
# Prepare features - select only numerical and categorical columns
%pip install xgboost
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

# Identify columns
CATS = []
NUMS = []
for c in train.columns:
    if c == 'y':
        continue
    if train[c].dtype == 'object':
        CATS.append(c)
    else:
        NUMS.append(c)

print(f"Categorical: {CATS}")
print(f"Numerical: {NUMS}")

# Encode categorical features
for c in CATS:
    le = LabelEncoder()
    train[c] = le.fit_transform(train[c].astype(str))
    test[c] = le.transform(test[c].astype(str))

X_train = train[CATS + NUMS]
y_train = train['y']
X_test = test[CATS + NUMS]

print(f"\nTraining shape: {X_train.shape}")
print(f"Test shape: {X_test.shape}")


[H[2J[32m
╭──────────────────────────────────────────────────────────╮
│                                                          │
│  "As long as I’m alive, there are infinite chances.     │
│   You can’t give up. That’s what it means to be a pirate!"│
│                                                          │
│                     — Monkey D. Luffy                    │
╰──────────────────────────────────────────────────────────╯
[m
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Categorical: ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
Numerical: ['id', 'age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous', '_duration_sin', '_duration_cos', '_balance_log', '_balance_sin', '_balance_cos', '_age_sin', '_pdays_sin']

Training shape: (750000, 24)
Test shape: (250000, 24)


In [11]:
# XGBoost Training with Out-of-Fold (OOF) Predictions
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')

# Initialize OOF arrays
oof_preds = np.zeros(len(X_train))
test_preds = np.zeros(len(X_test))
fold_scores = []

# 5-Fold Cross-Validation
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

print("Training XGBoost with 5-Fold CV...")
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    X_tr = X_train.iloc[train_idx]
    X_val = X_train.iloc[val_idx]
    y_tr = y_train.iloc[train_idx].values
    y_val = y_train.iloc[val_idx].values
    
    model = xgb.XGBClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric='logloss',
        early_stopping_rounds=10,
        base_score=0.5,
        verbosity=0
    )
    
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        verbose=False
    )
    
    # OOF predictions
    oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]
    
    # Test predictions
    test_preds += model.predict_proba(X_test)[:, 1] / n_folds
    
    # Fold score
    fold_score = roc_auc_score(y_val, oof_preds[val_idx])
    fold_scores.append(fold_score)
    print(f"Fold {fold+1}/{n_folds} - AUC: {fold_score:.6f}")

# Overall OOF score
oof_score = roc_auc_score(y_train, oof_preds)
print(f"\nOverall OOF AUC: {oof_score:.6f}")
print(f"Mean Fold AUC: {np.mean(fold_scores):.6f} (+/- {np.std(fold_scores):.6f})")


Training XGBoost with 5-Fold CV...
Fold 1/5 - AUC: nan
Fold 2/5 - AUC: nan
Fold 3/5 - AUC: nan
Fold 4/5 - AUC: nan
Fold 5/5 - AUC: nan

Overall OOF AUC: nan
Mean Fold AUC: nan (+/- nan)
