In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading the data


In [None]:
# Both competition files live in the same read-only input folder.
DATA_DIR = '/kaggle/input/mlp-term-2-2025-kaggle-assignment-2'

train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

# Preview the first rows of the training data.
train_df.head()

# Criteria 1
## Identify data types of different columns

In [None]:
# Show the dtype pandas assigned to each training column.
train_df.dtypes

# Criteria 2
## Present descriptive statistics of numerical columns

In [None]:
# Summary statistics from describe(), transposed so each feature is a row.
train_df.describe().transpose()

# Criteria 3
## Identify and handle the missing values

In [None]:
# Missing-value count per column in the training set.
train_df.isna().sum()

In [None]:
# Missing-value count per column in the test set.
test_df.isna().sum()

# We will impute missing values
* credit_score:- 9556 & 3185 null values in train and test df respectively, will impute using median as it is more robust to outliers.
* country:- 6021 & 4606 null values in train and test df respectively, will impute using mode because it is a categorical feature.
* acc_balance:- 7257 & 5251 null values in train and test df respectively, will impute using median.
* prod_count:- 4863 & 1717 null values in train and test df respectively, will impute using median

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
# Numeric columns are filled with the training-set median (more robust to
# outliers than the mean); the categorical column is filled with its most
# frequent value.
median_cols = ['credit_score', 'acc_balance', 'prod_count']
mode_col = ['country']

median_imputer = SimpleImputer(strategy='median')
mode_imputer = SimpleImputer(strategy='most_frequent')

# Legacy names kept so any cell that still refers to them keeps working.
mean_col, mean_imputer = median_cols, median_imputer

# Fit on the training data only, then reuse the learned statistics on the test
# set so the fill values never depend on test rows.
train_df[median_cols] = median_imputer.fit_transform(train_df[median_cols])
train_df[mode_col] = mode_imputer.fit_transform(train_df[mode_col])

test_df[median_cols] = median_imputer.transform(test_df[median_cols])
test_df[mode_col] = mode_imputer.transform(test_df[mode_col])


In [None]:
# Re-check missing-value counts per column after imputation.
train_df.isna().sum()

# Criteria 4
## Identify and handle duplicates

In [None]:
# Drop the row identifier before looking for duplicates (presumably it is unique
# per row and would otherwise hide identical records — verify against the data).
# errors="ignore" keeps this cell re-runnable once 'id' has already been removed.
train_df = train_df.drop(columns="id", errors="ignore")

# Report how many duplicate rows exist *before* removing them; counting only
# afterwards would always print 0.
print("Duplicates found:", train_df.duplicated().sum())

train_df = train_df.drop_duplicates().reset_index(drop=True)
print("Duplicates:", train_df.duplicated().sum())

# Criteria 5
## Identify and handle outliers

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

numerical_cols = ['credit_score','age','tenure','estimated_salary','acc_balance']

# One boxplot per feature, stacked vertically in a single tall figure.
fig, axes = plt.subplots(nrows=len(numerical_cols), ncols=1, figsize=(15, 20))
for ax, col in zip(axes, numerical_cols):
    sns.boxplot(x=train_df[col], ax=ax)
    ax.set_title(f'Boxplot for {col}')
fig.tight_layout()
plt.show()


## Keeping the outliers
* credit_score
    
      1. Lower outliers below ~400.
      2. A low credit score is not an error — it's a valid and important indicator of financial risk.
      3. Customers with poor credit might be more likely to exit or churn — this pattern could help the model.
* age

      1. Outliers above ~60–90 years.
      2. Senior customers may behave differently in retention
      3. Age extremes are informative, not noise. 

# Criteria 6
## Present at least three visualizations and provide insights for the same

In [None]:
# Customer counts per country, split into bars by exit_status.
country_ax = sns.countplot(data=train_df, x='country', hue='exit_status')
country_ax.set_title('Churn Rate by Country')
plt.show()

## Observation
* France has the largest number of customers, but a lower proportion of churners compared to its total customer base.

* Germany has a relatively high churn rate, as the number of churned customers (exit_status = 1) is closer in proportion to those who stayed.

* Spain has moderate churn counts but lower than France and Germany overall.

In [None]:
# Histogram (with KDE overlay) of age, restricted to rows where exit_status == 1.
churned_age = train_df.loc[train_df['exit_status'] == 1, 'age']
age_ax = sns.histplot(churned_age, bins=30, kde=True)
age_ax.set_title("Age Distribution of Exiting Customers")
age_ax.set_xlabel("Age")
plt.show()

## Observation
* The number of exits peaks around age 45 — this age group accounts for the most churned customers.

* The distribution is right-skewed, indicating fewer churners among older customers (age 60+).

* Very young customers (under 30) also churn less frequently than middle-aged ones.

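* Middle-aged customers (35–50 years old) account for most of the churned customers in this distribution. Because the plot shows only customers who exited, confirming that this group is also the most *likely* to churn requires comparing it against the age distribution of all customers.
This group likely has more financial products, responsibilities, or service expectations, making them more sensitive to dissatisfaction.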

In [None]:
# Overlaid credit-score histograms (with KDE), one colour per exit_status value.
fig, score_ax = plt.subplots(figsize=(10, 5))
sns.histplot(
    data=train_df,
    x='credit_score',
    hue='exit_status',
    bins=30,
    kde=True,
    palette='Set1',
    alpha=0.6,
    ax=score_ax,
)
score_ax.set_title("Credit Score Distribution: Exited vs Stayed")
score_ax.set_xlabel("Credit Score")
score_ax.set_ylabel("Customer Count")
plt.show()

## Observation
1. Stayed customers (exit_status = 0): 
    * Have a wide distribution centered around scores of 650–700.
    * Count drops off after 750 and below 600.
2. Exited customers (exit_status = 1):
    * Also concentrated in the 600–700 range, but with fewer customers across all score bands.
    * Interestingly, many churned customers also have decent credit scores, meaning good credit alone doesn't guarantee retention.

# Criteria 7
## Scale Numerical features and Encode Categorical features

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Feature groups shared by both preprocessing variants.
num_cols = ['credit_score', 'age', 'tenure', 'acc_balance', 'prod_count', 'estimated_salary']
cat_cols = ['country', 'gender', 'has_card', 'is_active']

# Variant for tree-based models: numeric columns pass through unscaled and
# two-level categoricals collapse to a single indicator column.
tree_preprocessor = ColumnTransformer(transformers=[
    ('num', 'passthrough', num_cols),
    ('cat', OneHotEncoder(drop='if_binary', handle_unknown='ignore'), cat_cols),
])

# Variant for scale-sensitive models: numeric columns are standardised and every
# category level gets its own indicator column.
mlp_preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
])

## Reasoning for scaling numerical features
Numerical features like credit_score, age, tenure, etc., vary in scale — some are in hundreds (like credit_score), while others are in single digits (like tenure or prod_count). This variation can negatively impact models that are sensitive to feature scale:
* Logistic Regression, SVM, KNN, SGD, and Neural Networks.

Scaling ensures that all features contribute equally to the learning process and prevents dominance of high-magnitude features.

## Reason for choosing StandardScaler
StandardScaler standardizes features by removing the mean and scaling to unit variance (i.e., it transforms the distribution to have mean = 0 and standard deviation = 1).
We chose StandardScaler because it works well with most models that assume or benefit from normalized inputs.

## Reason for encoding categorical features
Machine learning models can’t interpret textual or categorical variables directly. These need to be converted into numeric values. If left unencoded, models like logistic regression or tree ensembles can’t use them effectively.

## Reason for choosing OneHotEncoder
OneHotEncoder creates a new binary column for each category. This is the safest and most general encoding method for nominal (unordered) categorical variables. It works best for non-ordinal categorical variables, and it avoids misleading numerical ordering.

# Criteria 8
## Model Building (at least 7)

In [None]:
!pip install xgboost

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier


# Separate the features from the target and hold out a stratified 20% validation split.
X = train_df.drop('exit_status', axis=1)
y = train_df['exit_status']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Candidate classifiers. Every estimator that accepts a seed is seeded so the
# table printed below is reproducible across runs.
models = {
    "Logistic Regression": LogisticRegression(max_iter=500, class_weight='balanced', random_state=42),
    "Decision Tree": DecisionTreeClassifier(class_weight='balanced', random_state=42),
    "Random Forest": RandomForestClassifier(class_weight='balanced', random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    # scale_pos_weight = negatives / positives re-weights the minority class.
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                              scale_pos_weight=(y == 0).sum() / (y == 1).sum(), random_state=42),
    "KNN": KNeighborsClassifier(),
    # random_state was missing here, which made this row change from run to run.
    "Neural Net": MLPClassifier(max_iter=300, early_stopping=True, random_state=42),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=42),
    "SGDClassifier": SGDClassifier(loss='log_loss', max_iter=1000, random_state=42, class_weight='balanced')
}

# Models listed here get the standardising preprocessor; all others get the
# pass-through (tree) preprocessor.
scaled_models = [
    "Logistic Regression", "KNN", "Neural Net", "SGDClassifier"
]

print("Model Performance on Validation Set:")
print("-" * 60)

for name, model in models.items():
    preprocessor = mlp_preprocessor if name in scaled_models else tree_preprocessor

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)
    f1 = f1_score(y_val, y_pred)

    # ROC AUC needs class probabilities; report "N/A" for estimators without them.
    if hasattr(pipeline.named_steps['classifier'], 'predict_proba'):
        y_prob = pipeline.predict_proba(X_val)[:, 1]
        roc_auc = roc_auc_score(y_val, y_prob)
        auc_text = str(round(roc_auc, 4))
    else:
        y_prob = None
        roc_auc = auc_text = "N/A"

    print(f"{name:<20} | F1 Score: {f1:.4f} | ROC AUC: {auc_text}")


In [None]:
# Feature groups and the two preprocessing variants used by the ensemble members.
num_cols = ['credit_score', 'age', 'tenure', 'acc_balance', 'prod_count', 'estimated_salary']
cat_cols = ['country', 'gender', 'has_card', 'is_active']

tree_preprocessor = ColumnTransformer(transformers=[
    ('num', 'passthrough', num_cols),
    ('cat', OneHotEncoder(drop='if_binary', handle_unknown='ignore'), cat_cols),
])

mlp_preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
])

# The three base learners, all seeded.
xgb = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    scale_pos_weight=(y == 0).sum() / (y == 1).sum(),
    random_state=42,
)
gb = GradientBoostingClassifier(random_state=42)
mlp = MLPClassifier(max_iter=300, early_stopping=True, random_state=42)

# Each learner is paired with the preprocessing variant it needs.
xgb_pipe = Pipeline(steps=[('preprocessor', tree_preprocessor), ('classifier', xgb)])
gb_pipe = Pipeline(steps=[('preprocessor', tree_preprocessor), ('classifier', gb)])
mlp_pipe = Pipeline(steps=[('preprocessor', mlp_preprocessor), ('classifier', mlp)])

# Soft voting averages the members' predicted class probabilities.
voting_clf = VotingClassifier(
    estimators=[('xgb', xgb_pipe), ('gb', gb_pipe), ('mlp', mlp_pipe)],
    voting='soft',
)
voting_clf.fit(X_train, y_train)

# Evaluate on the held-out validation split.
y_prob_vote = voting_clf.predict_proba(X_val)[:, 1]
y_pred_vote = voting_clf.predict(X_val)

roc_auc_vote = roc_auc_score(y_val, y_prob_vote)
f1_vote = f1_score(y_val, y_pred_vote)

print(f"VotingClassifier        | F1 Score: {f1_vote:.4f} | ROC AUC: {roc_auc_vote:.4f}")


# Criteria 9
## Hyperparameter Tuning on any 3 of the models

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold, RandomizedSearchCV
from scipy.stats import randint, uniform
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import BaggingClassifier

# Shared 5-fold stratified splitter (shuffled, seeded) passed as `cv` to the
# randomized searches in this notebook.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# NOTE(review): this ignores every warning for the rest of the session, not only
# the noisy ones — consider narrowing the filter to specific categories.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Pipeline to tune: standardised numerics + one-hot categoricals feeding an MLP
# with early stopping.
mlp_pipe = Pipeline(steps=[
    ('preprocessor', mlp_preprocessor),
    ('classifier', MLPClassifier(random_state=42, early_stopping=True, max_iter=1000)),
])

# Search space. scipy's uniform(loc, scale) samples from [loc, loc + scale].
mlp_param_dist = dict(
    classifier__hidden_layer_sizes=[(100,), (100, 50), (150, 100), (128, 64)],
    classifier__activation=['relu', 'tanh'],
    classifier__alpha=uniform(0.0001, 0.01),               # 0.0001 .. 0.0101
    classifier__learning_rate_init=uniform(0.001, 0.009),  # 0.001 .. 0.010
    classifier__solver=['adam', 'sgd'],
    classifier__batch_size=[64, 128, 'auto'],
)

# 30 random draws, scored by F1 with the shared stratified CV splitter.
mlp_search = RandomizedSearchCV(
    estimator=mlp_pipe,
    param_distributions=mlp_param_dist,
    n_iter=30,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

In [None]:
# Pipeline to tune: pass-through numerics + one-hot categoricals feeding XGBoost,
# with the positive class re-weighted by the negatives / positives ratio.
xgb_base = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    scale_pos_weight=(y == 0).sum() / (y == 1).sum(),
    random_state=42,
)
xgb_pipe = Pipeline(steps=[
    ('preprocessor', tree_preprocessor),
    ('classifier', xgb_base),
])

# Search space. scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers in [low, high).
xgb_param_dist = dict(
    classifier__n_estimators=randint(100, 300),
    classifier__learning_rate=uniform(0.01, 0.2),     # 0.01 .. 0.21
    classifier__max_depth=randint(3, 10),
    classifier__subsample=uniform(0.7, 0.3),          # 0.7 .. 1.0
    classifier__colsample_bytree=uniform(0.6, 0.4),   # 0.6 .. 1.0
    classifier__gamma=uniform(0, 0.3),
    classifier__reg_alpha=uniform(0, 0.2),
    classifier__reg_lambda=uniform(0.5, 1.5),         # 0.5 .. 2.0
)

# 30 random draws, scored by F1 with the shared stratified CV splitter.
xgb_search = RandomizedSearchCV(
    estimator=xgb_pipe,
    param_distributions=xgb_param_dist,
    n_iter=30,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

In [None]:
# Pipeline to tune: pass-through numerics + one-hot categoricals feeding a
# seeded gradient-boosting classifier.
gb_pipe = Pipeline(steps=[
    ('preprocessor', tree_preprocessor),
    ('classifier', GradientBoostingClassifier(random_state=42)),
])

# Search space. scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers in [low, high).
gb_param_dist = dict(
    classifier__n_estimators=randint(100, 250),
    classifier__learning_rate=uniform(0.01, 0.2),   # 0.01 .. 0.21
    classifier__max_depth=randint(3, 10),
    classifier__subsample=uniform(0.7, 0.3),        # 0.7 .. 1.0
    classifier__min_samples_split=randint(2, 10),
    classifier__min_samples_leaf=randint(1, 5),
    classifier__max_features=['sqrt', 'log2', None],
)

# 30 random draws, scored by F1 with the shared stratified CV splitter.
gb_search = RandomizedSearchCV(
    estimator=gb_pipe,
    param_distributions=gb_param_dist,
    n_iter=30,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

In [None]:
# Run the three randomized searches on the training split and report, for each,
# the best mean cross-validated F1 and the parameters that produced it.
xgb_search.fit(X_train, y_train)
print("\nBest XGB F1 Score:", xgb_search.best_score_)
print("Best XGB Params:", xgb_search.best_params_)

gb_search.fit(X_train, y_train)
print("\nBest Gradient Boost F1 Score:", gb_search.best_score_)
print("Best GB Params:", gb_search.best_params_)

mlp_search.fit(X_train, y_train)
print("\nBest MLP F1 Score:", mlp_search.best_score_)
# Previously omitted: report the winning MLP configuration like the other two.
print("Best MLP Params:", mlp_search.best_params_)

## Checking tuned models on Validation set

In [None]:
from sklearn.metrics import f1_score, roc_auc_score

# Best pipeline found by each search (already refit on the full training split).
xgb_tuned, gb_tuned, mlp_tuned = (
    search.best_estimator_ for search in (xgb_search, gb_search, mlp_search)
)

tuned_models = {
    'XGBoost': xgb_tuned,
    'Gradient Boosting': gb_tuned,
    'MLP': mlp_tuned,
}

# Score each tuned pipeline on the held-out validation split.
for name, model in tuned_models.items():
    y_prob = model.predict_proba(X_val)[:, 1]
    y_pred = model.predict(X_val)

    roc_auc = roc_auc_score(y_val, y_prob)
    f1 = f1_score(y_val, y_pred)

    print(f"{name} (Tuned) - F1 Score: {f1:.4f}, ROC AUC: {roc_auc:.4f}")

## Trying voting classifier on tuned models

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, roc_auc_score

# Pull the tuned classifier out of each best pipeline and pair it with the
# preprocessing variant it needs.
best_xgb = xgb_search.best_estimator_.named_steps['classifier']
best_gb = gb_search.best_estimator_.named_steps['classifier']
best_mlp = mlp_search.best_estimator_.named_steps['classifier']

xgbVC_pipe = Pipeline(steps=[('preprocessor', tree_preprocessor), ('classifier', best_xgb)])
gbVC_pipe = Pipeline(steps=[('preprocessor', tree_preprocessor), ('classifier', best_gb)])
mlpVC_pipe = Pipeline(steps=[('preprocessor', mlp_preprocessor), ('classifier', best_mlp)])

# Weighted soft vote: XGBoost counts most, then gradient boosting, then the MLP.
voting_clf_tuned = VotingClassifier(
    estimators=[('xgb', xgbVC_pipe), ('gb', gbVC_pipe), ('mlp', mlpVC_pipe)],
    voting='soft',
    weights=[6, 5, 4],
)
voting_clf_tuned.fit(X_train, y_train)

# Label a customer as exiting when the ensemble probability reaches 0.40
# instead of the default 0.50.
y_prob_vote = voting_clf_tuned.predict_proba(X_val)[:, 1]
y_pred_vote = (y_prob_vote >= 0.40).astype(int)

roc_auc_vote = roc_auc_score(y_val, y_prob_vote)
f1_vote = f1_score(y_val, y_pred_vote)

print("VotingClassifier (Tuned Models + Separate Preprocessing)")
print(f"F1 Score : {f1_vote:.4f}")
print(f"ROC AUC  : {roc_auc_vote:.4f}")


VotingClassifier (Tuned Models + Separate Preprocessing)
F1 Score : 0.6425
ROC AUC  : 0.8791

In [None]:
import numpy as np
from sklearn.metrics import f1_score

# Positive-class probabilities from the tuned ensemble on the validation split.
voting_probs = voting_clf_tuned.predict_proba(X_val)[:, 1]

# Fallback values, kept only if no candidate threshold scores above zero.
best_thresh_vote, best_f1_vote = 0.5, 0

# Score every cut-off from 0.30 up to (but excluding) 0.70 in steps of 0.01.
candidate_thresholds = np.arange(0.3, 0.7, 0.01)
candidate_scores = [
    f1_score(y_val, (voting_probs >= cutoff).astype(int))
    for cutoff in candidate_thresholds
]

# argmax returns the first maximum, i.e. the lowest threshold among ties.
top = int(np.argmax(candidate_scores))
if candidate_scores[top] > best_f1_vote:
    best_f1_vote = candidate_scores[top]
    best_thresh_vote = candidate_thresholds[top]

print(f"[Voting] Best Threshold: {best_thresh_vote:.2f}, Best F1 Score: {best_f1_vote:.4f}")


Best Threshold: 0.45, Best F1 Score: 0.6463


# Criteria 10
## Comparison of model performances

In [None]:
import pandas as pd

# Validation F1 per model. NOTE(review): these are hard-coded numbers, presumably
# copied from an earlier run of the cells above — re-check them after any re-run.
validation_f1_scores = {
    # Baseline models
    'Logistic Regression': 0.5460,
    'Decision Tree': 0.5102,
    'Random Forest': 0.5921,
    'Gradient Boosting': 0.6051,
    'XGBoost': 0.6243,
    'KNN': 0.5778,
    'Neural Net (MLP)': 0.5968,
    'AdaBoost': 0.5993,
    'SGDClassifier': 0.5374,
    'VotingClassifier': 0.6349,
    # Tuned models
    'XGBoost (Tuned)': 0.6305,
    'Gradient Boosting (Tuned)': 0.6102,
    'MLPClassifier (Tuned)': 0.6010,
    'VotingClassifier (Tuned)': 0.6463,
}

# One row per model, best score first.
f1_comparison_df = (
    pd.Series(validation_f1_scores, name='F1 Score')
    .to_frame()
    .sort_values('F1 Score', ascending=False)
)

print("F1 Score Comparison Across Models (Validation Set Only):")
print(f1_comparison_df)

In [None]:
# Score the test set with the tuned ensemble and apply the threshold selected on
# the validation split (best_thresh_vote), so the cut-off used and the cut-off
# reported below cannot drift apart.
test_probs = voting_clf_tuned.predict_proba(test_df)[:, 1]
test_preds = (test_probs >= best_thresh_vote).astype(int)  # Use best threshold

submission_df = pd.DataFrame({
    'id': test_df['id'],
    'exit_status': test_preds
})

submission_df.to_csv('submission.csv', index=False)
print(f"Submission file created using threshold {best_thresh_vote:.2f} and tuned ensemble.")
