# ***Loading Data***

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import (
    RandomizedSearchCV,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.preprocessing import OneHotEncoder, StandardScaler


In [None]:
# Load the training and test splits from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))


# ***Pre-Processing***

In [None]:
# Keep the test ids aside before pre_processing drops the 'id' column;
# they are needed again when the submission frame is built.
test_ids =test['id']

def pre_processing(df):
    """Return a model-ready version of ``df`` without modifying the input.

    Removes the identifier columns ('id', 'CustomerId', 'Surname') and
    replaces 'Geography' and 'Gender' with dummy indicator columns, omitting
    the first level of each categorical column.
    """
    identifier_cols = ['id', 'CustomerId', 'Surname']
    categorical_cols = ['Geography', 'Gender']
    # DataFrame.drop returns a new frame, so the caller's frame stays intact.
    features = df.drop(columns=identifier_cols)
    return pd.get_dummies(features, columns=categorical_cols, drop_first=True)

# Build the model inputs: features come from pre_processing, the target is
# the raw 'Exited' column of the training frame.
X_train = pre_processing(train.drop(columns='Exited'))
y_train = train['Exited']
X_test = pre_processing(test)

# Train and test are dummy-encoded independently, so their columns can differ
# when a category is absent from one split. Use the training columns as the
# reference: join='left' keeps every training feature, adds any column that
# is missing from the test frame filled with 0, and drops test-only columns.
# With join='inner' such features would be silently dropped from both frames
# and fill_value would never take effect.
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)



# ***CLUSTERING***

In [None]:
# Columns that are standardized and passed to KMeans to derive the 'cluster' feature.
cluster_features = ['Age', 'Balance', 'EstimatedSalary', 'CreditScore']

In [None]:
# Fit the scaler on the training clustering columns only, then apply the
# same fitted transform to both splits.
scaler_cluster = StandardScaler().fit(X_train[cluster_features])
X_train_cluster_scaled = scaler_cluster.transform(X_train[cluster_features])
X_test_cluster_scaled = scaler_cluster.transform(X_test[cluster_features])

In [None]:
# Learn 5 clusters on the scaled training features. Training rows take the
# labels found during fitting; test rows are assigned by the fitted model.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=2025).fit(X_train_cluster_scaled)
X_train['cluster'] = kmeans.labels_
X_test['cluster'] = kmeans.predict(X_test_cluster_scaled)

In [None]:
# One-hot encode the cluster label into a dense array. The encoder is fitted
# on the training labels only and then applied to both splits.
cluster_label_col = ['cluster']
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ohe.fit(X_train[cluster_label_col])
train_cluster_ohe = ohe.transform(X_train[cluster_label_col])
test_cluster_ohe = ohe.transform(X_test[cluster_label_col])

In [None]:
def _with_cluster_dummies(frame, encoded, names):
    """Swap the 'cluster' column of ``frame`` for the one-hot columns in ``encoded``.

    The index is reset so the frame and the freshly built dummy frame line up
    row by row when concatenated side by side.
    """
    base = frame.drop(columns='cluster').reset_index(drop=True)
    dummies = pd.DataFrame(encoded, columns=names)
    return pd.concat([base, dummies], axis=1)


# One column name per encoded cluster category.
cluster_cols = [f'cluster_{i}' for i in range(train_cluster_ohe.shape[1])]
X_train = _with_cluster_dummies(X_train, train_cluster_ohe, cluster_cols)
X_test = _with_cluster_dummies(X_test, test_cluster_ohe, cluster_cols)

In [None]:
# Standardize the complete feature matrix using statistics learned from the
# training split only; the same fitted scaler is applied to the test split.
scaler_final = StandardScaler().fit(X_train)
X_train_scaled = scaler_final.transform(X_train)
X_test_scaled = scaler_final.transform(X_test)

# ***MODEL TRAINING***

In [None]:
# Base estimator handed to the hyper-parameter search; the fixed random_state
# keeps repeated runs comparable. RandomizedSearchCV is imported in the
# notebook's top import cell alongside the other scikit-learn names.
hgb_base = HistGradientBoostingClassifier(random_state=298)

In [None]:
# Candidate values sampled by the randomized search.
param_dist = dict(
    max_iter=[100, 200, 300, 400, 500],
    learning_rate=[0.01, 0.05, 0.08, 0.1, 0.2],
    max_depth=[3, 5, 7, 9, None],
    min_samples_leaf=[10, 20, 30, 40, 50],
    l2_regularization=[0.0, 0.1, 0.5, 1.0],
)

# 25 sampled candidates, each scored by 5-fold cross-validated ROC AUC.
search_settings = {
    'n_iter': 25,
    'cv': 5,
    'scoring': 'roc_auc',
    'random_state': 7691,
    'n_jobs': -1,
    'verbose': 2,
}
random_search = RandomizedSearchCV(hgb_base, param_dist, **search_settings)
random_search.fit(X_train_scaled, y_train)

# Keep the refitted best candidate for the prediction step.
best_model = random_search.best_estimator_

print(f"\nBest  CV Score (ROC AUC): {random_search.best_score_:.5f}")
print(f"Best Parameters Found: {random_search.best_params_}")

Fitting 5 folds for each of 25 candidates, totalling 125 fits

Best Kaggle CV Score (ROC AUC): 0.92891
Best Parameters Found: {'min_samples_leaf': 40, 'max_iter': 300, 'max_depth': 3, 'learning_rate': 0.08, 'l2_regularization': 0.5}


In [None]:
# Column 1 of predict_proba is taken as the probability written to 'Exited'.
positive_class_col = 1
test_preds_prob = best_model.predict_proba(X_test_scaled)[:, positive_class_col]

# Pair every saved test id with its predicted probability and write the file.
submission = pd.DataFrame({'id': test_ids}).assign(Exited=test_preds_prob)
submission.to_csv('submission_tuned_clustered.csv', index=False)


Saved perfectly tuned submission!
