In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# **Data types of different columns**

In [None]:
# Load the competition training set, then print its per-column summary via DataFrame.info().
train_csv_path = "/kaggle/input/mlp-term-2-2025-kaggle-assignment-2/train.csv"
df = pd.read_csv(train_csv_path)
df.info()

# **Descriptive statistics of numerical columns**

In [None]:
# Display pandas' default descriptive statistics for the training frame.
df.describe()

In [None]:
# Display the column labels of the training frame.
df.columns

# **Visualizing Data**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings

# Silence the seaborn/pandas FutureWarning that mentions `use_inf_as_na`
warnings.filterwarnings("ignore", category=FutureWarning, message=".*use_inf_as_na.*")

# Identifier columns and the target are left out of the per-column plots
df_viz = df.drop(columns=['id', 'customer_id','exit_status','last_name'])

# Seaborn theme used by every figure below
sns.set(style="whitegrid")


def _plot_column(frame, col):
    """Draw and show one figure for `col`.

    Object-dtype columns get a count plot restricted to their 10 most frequent
    values (x labels rotated 45 degrees); every other column gets a 30-bin
    histogram with a KDE overlay.
    """
    plt.figure(figsize=(10, 5))
    values = frame[col]

    if values.dtype != 'object':
        sns.histplot(data=frame, x=col, kde=True, bins=30)
        plt.title(f"Distribution of '{col}'")
    else:
        most_common = values.value_counts().index[:10]
        sns.countplot(data=frame, x=col, order=most_common)
        plt.title(f"Count Plot of '{col}'")
        plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()


# One figure per remaining column
for col in df_viz.columns:
    _plot_column(df_viz, col)

# **Identify and handle duplicates**

In [None]:
# Two rows count as duplicates when they agree on all of these columns
# (the id / customer_id / exit_status columns are deliberately not part of the key).
subset_cols = [
    'last_name', 'credit_score', 'country', 'gender',
    'age', 'tenure', 'acc_balance', 'prod_count',
    'has_card', 'is_active', 'estimated_salary',
]

# Mark every repeat of an earlier row (the first occurrence is not marked)
repeat_mask = df.duplicated(subset=subset_cols)
duplicates = df.loc[repeat_mask]
print(f"Number of duplicate rows before droping: {len(duplicates)}")

# Keep only the unmarked rows, i.e. the first occurrence of each key
df = df.loc[~repeat_mask].reset_index(drop=True)

# Re-check on the de-duplicated frame
duplicates = df.loc[df.duplicated(subset=subset_cols)]
print(f"Number of duplicate rows after droping: {len(duplicates)}")

# **Handling Null Values**

In [None]:
# Number of missing cells in each row
null_counts = df.isna().sum(axis=1)
too_sparse = null_counts.gt(2)

# Rows with more than 2 NaNs
rows_with_many_nulls = df.loc[too_sparse]
print(f"Number of rows with more than 2 nulls before droping: {len(rows_with_many_nulls)}")

# Keep only rows with at most 2 NaNs
df = df.loc[~too_sparse].reset_index(drop=True)

# Re-count on the filtered frame to confirm none are left
null_counts = df.isna().sum(axis=1)
rows_with_many_nulls = df.loc[null_counts.gt(2)]
print(f"Number of rows with more than 2 nulls after droping: {len(rows_with_many_nulls)}")

# **Handling outliers**

In [None]:
# Numeric columns that get the 1.5 * IQR outlier filter
target_cols = [
    'credit_score', 'age', 'tenure',
    'acc_balance', 'prod_count', 'estimated_salary',
]
df_cleaned = df.copy()

# Columns are filtered one after another, so each column's quartiles are computed on the
# rows that survived the previous columns' filters.
for col in target_cols:
    series = df_cleaned[col]
    q_low = series.quantile(0.25)
    q_high = series.quantile(0.75)
    whisker = 1.5 * (q_high - q_low)

    # A row is kept when its value is missing or lies inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
    in_range = series.between(q_low - whisker, q_high + whisker)
    keep = series.isna() | in_range
    outliers = (~keep).sum()

    print(f"{col}: {outliers} outliers removed")
    df_cleaned = df_cleaned[keep]

df_cleaned = df_cleaned.reset_index(drop=True)
print("Final shape after removing outliers:", df_cleaned.shape)

In [None]:
# Continue with the outlier-filtered frame. This is a plain rebinding, not a copy:
# after this line df and df_cleaned refer to the same DataFrame object.
df = df_cleaned

# **Dropping unnecessary columns**

In [None]:
# Build the modelling frame from df_cleaned without the identifier columns.
# A new frame is derived instead of dropping in place: df is the same object as
# df_cleaned at this point, so an in-place drop would also strip df_cleaned, and
# re-running the cell would raise KeyError because the columns are already gone.
df = df_cleaned.drop(columns=['id', 'customer_id', 'last_name'])

# **Train Test split**

In [None]:
# Separate the target from the features without mutating df. `df.pop` removes the
# column in place, so a second run of this cell would fail with KeyError.
y = df['exit_status']
features = df.drop(columns=['exit_status'])

# 80/20 split with a fixed seed so the partition is reproducible.
x_train, x_val, y_train, y_val = train_test_split(features, y, random_state=39, test_size=0.2)
print("x_train shape:", x_train.shape)
print("x_val shape:", x_val.shape)
print("y_train shape:", y_train.shape)
print("y_val shape:", y_val.shape)

# **Imputing missing values**

In [None]:
def _imputer_for(column):
    """Return the imputer for one training column: mode for object dtype, mean otherwise."""
    strategy = "most_frequent" if x_train[column].dtype == 'O' else "mean"
    return SimpleImputer(strategy=strategy)


# One (name, imputer, [column]) entry per feature, named "v0", "v1", ... in column order.
list_tran_1 = [
    ("v{}".format(position), _imputer_for(column), [column])
    for position, column in enumerate(x_train.columns)
]
print(list_tran_1)

In [None]:
# Imputation stage: configured to return a pandas DataFrame, then fitted on the
# training split only.
tran_1 = ColumnTransformer(list_tran_1).set_output(transform="pandas")
x_train_tran_1 = tran_1.fit_transform(x_train)

# **Encoding Categorical Columns and Scaling Numerical Columns**

In [None]:
print(x_train_tran_1.columns)
# Print the first rows explicitly: a bare `.head()` that is not the last expression of
# a cell is evaluated and thrown away, so the preview never appeared.
print(x_train_tran_1.head())
cat_cols = x_train_tran_1.select_dtypes(include=['object']).columns

# Print unique values for each categorical column
print("\nUnique values in categorical columns:")
for col in cat_cols:
    print(f"\n{col}:")
    print(x_train_tran_1[col].unique())

In [None]:
# Column groups, written with the "v<position>__" prefixes carried by the columns of the
# imputed training frame.
num_cols = [
    'v0__credit_score',
    'v3__age',
    'v4__tenure',
    'v5__acc_balance',
    'v6__prod_count',
    'v7__has_card',
    'v8__is_active',
    'v9__estimated_salary',
]
nominal_cols = ['v1__country']
ordinal_cols = ['v2__gender']
# Explicit category order for the ordinal encoder (one inner list per ordinal column)
ordinal_values = [['Male', 'Female']]

gender_encoder = OrdinalEncoder(categories=ordinal_values)
country_encoder = OneHotEncoder(handle_unknown='ignore')
numeric_scaler = StandardScaler()

# Encoding / scaling stage
tran_2 = ColumnTransformer(transformers=[
    ('ordinal', gender_encoder, ordinal_cols),
    ('onehot', country_encoder, nominal_cols),
    ('scaler', numeric_scaler, num_cols),
])

In [None]:
# Fit the encoding/scaling stage on the imputed training features and transform them
# in the same call.
x_train_tran_2 = tran_2.fit_transform(x_train_tran_1)

In [None]:
# Push the validation split through both stages using transform() only, so the
# transformers are applied as already fitted and are not re-fitted here.
x_val_tran_1 = tran_1.transform(x_val)
x_val_tran_2 = tran_2.transform(x_val_tran_1)

# **Training 7 different models**

In [None]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report
)
import pandas as pd
import numpy as np

# --- Models (7) ---
# Every estimator that accepts a seed uses random_state=39 for reproducibility.
# NOTE(review): `use_label_encoder` is no longer passed to XGBClassifier; it is a
# deprecated option that recent XGBoost releases ignore with a warning. Confirm against
# the installed XGBoost version.
cls_models = {
    'Logistic': LogisticRegression(random_state=39, max_iter=1000),
    'XGBoost':  XGBClassifier(random_state=39, eval_metric='logloss'),
    'DecisionTree': DecisionTreeClassifier(random_state=39),
    'RandomForest': RandomForestClassifier(random_state=39),
    'GradientBoosting': GradientBoostingClassifier(random_state=39),
    'KNN': KNeighborsClassifier(),
    'SVM': SVC(random_state=39, probability=True)  # enable predict_proba for ROC-AUC
}

# One metrics dict per model is appended here by the evaluation loop
cls_results = []

# Helper: ROC-AUC that yields NaN instead of raising
def safe_roc_auc(y_true, y_proba):
    """Compute ROC-AUC, returning NaN if the computation fails for any reason.

    Parameters
    ----------
    y_true : true labels, passed through to roc_auc_score unchanged.
    y_proba : array of scores. A 1-D array or a single-column 2-D array is scored
        with a plain roc_auc_score call; a 2-D array with any other column count is
        scored one-vs-rest with macro averaging.

    Returns
    -------
    The value returned by roc_auc_score, or np.nan when any Exception is raised
    while inspecting `y_proba` or computing the score.
    """
    try:
        # The shape is only inspected when the array is not 1-D (short-circuit `and`).
        needs_ovr = y_proba.ndim != 1 and y_proba.shape[1] != 1
        if needs_ovr:
            return roc_auc_score(y_true, y_proba, multi_class='ovr', average='macro')
        return roc_auc_score(y_true, y_proba)
    except Exception:
        return np.nan

def _auc_scores(model, features, n_rows):
    """Return the scores fed to ROC-AUC for one split.

    Uses predict_proba when the model has it (only the second column when there are
    exactly two columns), otherwise decision_function, otherwise an all-NaN array
    of length `n_rows`.
    """
    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(features)
        # for binary, roc_auc_score expects prob of positive class
        return proba[:, 1] if proba.shape[1] == 2 else proba
    if hasattr(model, "decision_function"):
        return model.decision_function(features)
    return np.full(n_rows, np.nan)


def _split_metrics(y_true, y_pred, y_score):
    """Return (accuracy, precision, recall, F1, ROC-AUC) for one split.

    Precision, recall and F1 are macro-averaged with zero_division=0.
    """
    macro = {'average': 'macro', 'zero_division': 0}
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, **macro),
        recall_score(y_true, y_pred, **macro),
        f1_score(y_true, y_pred, **macro),
        safe_roc_auc(y_true, y_score),
    )


train_keys = ('Acc_Train', 'Prec_Train', 'Rec_Train', 'F1_Train', 'AUC_Train')
val_keys = ('Acc_Val', 'Prec_Val', 'Rec_Val', 'F1_Val', 'AUC_Val')

for name, model in cls_models.items():
    # Fit on the training split only
    model.fit(x_train_tran_2, y_train)

    # One result row per model: train metrics first, then validation metrics
    row = {'Model': name}
    for keys, features, target in (
        (train_keys, x_train_tran_2, y_train),
        (val_keys, x_val_tran_2, y_val),
    ):
        predictions = model.predict(features)
        scores = _auc_scores(model, features, len(target))
        row.update(zip(keys, _split_metrics(target, predictions, scores)))

    cls_results.append(row)

# Results table, best validation accuracy first
cls_results_df = pd.DataFrame(cls_results).sort_values(by='Acc_Val', ascending=False)
print(cls_results_df)

# **Comparison between the 7 models**

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Uses 'cls_results_df' from the classification-metrics cell (already created).
# Move 'Model' into the index so the bars can be labelled by model name. The move is
# guarded and not in place, so re-running this cell does not raise KeyError once
# 'Model' is already the index.
if 'Model' in cls_results_df.columns:
    cls_results_df = cls_results_df.set_index('Model')

# Common parameters
bar_width = 0.35
index = np.arange(len(cls_results_df))


def plot_train_vs_val(train_col, val_col, train_label, val_label,
                      title, ylabel, train_color, val_color):
    """Draw one grouped bar chart comparing a train metric with its validation metric.

    Parameters
    ----------
    train_col, val_col : column names in cls_results_df holding the two metrics.
    train_label, val_label : legend entries for the two bar groups.
    title, ylabel : figure title and y-axis label.
    train_color, val_color : matplotlib colours for the two bar groups.

    One pair of bars is drawn per model (row of cls_results_df) and each bar is
    annotated with its height to two decimals.
    """
    plt.figure(figsize=(10, 6))
    train_bars = plt.bar(index, cls_results_df[train_col], bar_width,
                         label=train_label, color=train_color)
    val_bars = plt.bar(index + bar_width, cls_results_df[val_col], bar_width,
                       label=val_label, color=val_color)

    plt.xticks(index + bar_width / 2, cls_results_df.index, rotation=45)
    plt.title(title)
    plt.ylabel(ylabel)
    plt.ylim(0, 1.05)
    plt.legend()
    plt.grid(axis='y', linestyle='--', alpha=0.5)

    for bar in list(train_bars) + list(val_bars):
        height = bar.get_height()
        # A metric can be NaN (safe_roc_auc returns NaN on failure); a text label
        # cannot be placed at a non-finite position, so such bars are not annotated.
        if not np.isfinite(height):
            continue
        plt.text(bar.get_x() + bar.get_width() / 2, height + 0.01, f"{height:.2f}",
                 ha='center', va='bottom', fontsize=9)

    plt.tight_layout()
    plt.show()


# (train column, val column, train legend, val legend, title, y label, train colour, val colour)
metric_plots = [
    ('Acc_Train', 'Acc_Val', 'Train Accuracy', 'Validation Accuracy',
     'Accuracy: Train vs Validation', 'Accuracy', 'skyblue', 'navy'),
    ('Prec_Train', 'Prec_Val', 'Train Precision', 'Validation Precision',
     'Precision: Train vs Validation', 'Precision', 'lightcoral', 'darkred'),
    ('Rec_Train', 'Rec_Val', 'Train Recall', 'Validation Recall',
     'Recall: Train vs Validation', 'Recall', 'lightgreen', 'green'),
    ('F1_Train', 'F1_Val', 'Train F1', 'Validation F1',
     'F1 Score: Train vs Validation', 'F1 Score', 'orchid', 'purple'),
    ('AUC_Train', 'AUC_Val', 'Train AUC', 'Validation AUC',
     'ROC-AUC: Train vs Validation', 'AUC', 'gold', 'darkgoldenrod'),
]

for plot_spec in metric_plots:
    plot_train_vs_val(*plot_spec)

# **Hyperparameter tuning of XGBClassifier, GradientBoostingClassifier and RandomForestClassifier to find the best model and its best hyperparameters**

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, make_scorer

# F1 scorer (for binary or multi-class)
# NOTE(review): tuning and model comparison here use weighted F1, while the final-model
# cell calls f1_score with its default averaging. Confirm which one matches the
# competition metric.
f1_scorer = make_scorer(f1_score, average='weighted')


def _grid_search(estimator, param_grid):
    """Run a 5-fold grid search scored by weighted F1 on the training split.

    Returns the fitted GridSearchCV object; its best_estimator_ and best_params_
    are read by the code below and by the following cell.
    """
    search = GridSearchCV(estimator, param_grid, cv=5, scoring=f1_scorer, n_jobs=-1, verbose=1)
    search.fit(x_train_tran_2, y_train)
    return search


# XGBoost Classifier
xgb_params = {
    'n_estimators': [200, 300],
    'max_depth': [5, 7],
    'learning_rate': [0.1, 0.5],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1],
    'reg_lambda': [1, 0.1]
}
# NOTE(review): `use_label_encoder` is no longer passed; it is a deprecated option that
# recent XGBoost releases ignore with a warning. Confirm against the installed version.
xgb = XGBClassifier(random_state=39, eval_metric='logloss')
grid_xgb = _grid_search(xgb, xgb_params)
xgb_best = grid_xgb.best_estimator_

# Gradient Boosting Classifier
gb_params = {
    'n_estimators': [200, 300],
    'max_depth': [5, 7],
    'learning_rate': [0.1]
}
gb = GradientBoostingClassifier(random_state=39)
grid_gb = _grid_search(gb, gb_params)
gb_best = grid_gb.best_estimator_

# Random Forest Classifier
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [5, 7]
}
rf = RandomForestClassifier(random_state=39)
grid_rf = _grid_search(rf, rf_params)
rf_best = grid_rf.best_estimator_

# Compare on Validation Set
models = {
    'XGBoost': xgb_best,
    'GradientBoosting': gb_best,
    'RandomForest': rf_best
}

# Score each tuned model once; the same numbers feed both the printout and the
# best-model pick (previously every model predicted on the validation set twice).
val_f1_scores = {
    name: f1_score(y_val, tuned.predict(x_val_tran_2), average='weighted')
    for name, tuned in models.items()
}

print("\nModel Comparison on Validation Set:")
for name, f1 in val_f1_scores.items():
    print(f"{name:17s} | F1 Score: {f1:.4f}")

# Best model by F1 (on ties, max() keeps the first entry in dict order)
best_model_name = max(val_f1_scores, key=val_f1_scores.get)
print(f"\nBest Model: {best_model_name}")

In [None]:
# Report the winning hyperparameter combination of each grid search.
best_param_reports = (
    ("XGBoost Best Params:", grid_xgb),
    ("Gradient Boosting Best Params:", grid_gb),
    ("Random Forest Best Params:", grid_rf),
)
for heading, fitted_search in best_param_reports:
    print(heading, fitted_search.best_params_)

# **Final Model**

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import f1_score

# --- Train XGBoost Classifier ---
# Hyperparameters are fixed by hand, so this cell does not need the grid search to run.
# NOTE(review): presumably these are the values reported by the grid search; confirm
# they still match grid_xgb.best_params_.
# NOTE(review): `use_label_encoder` is no longer passed; it is a deprecated option that
# recent XGBoost releases ignore with a warning. Confirm against the installed version.
model = XGBClassifier(
    colsample_bytree=1,
    learning_rate=0.1,
    max_depth=5,
    n_estimators=200,
    reg_lambda=0.1,
    subsample=0.8,
    random_state=39,
    eval_metric='logloss'
)

model.fit(x_train_tran_2, y_train)

# --- Predictions ---
y_pred_train = model.predict(x_train_tran_2)
y_pred_val = model.predict(x_val_tran_2)

# --- F1 Score ---
# f1_score is called with its default averaging here (not the weighted average used
# during tuning).
f1_train = f1_score(y_train, y_pred_train)
f1_val = f1_score(y_val, y_pred_val)

# The second score is computed on the validation split (x_val / y_val), not on the
# competition test file, so it is labelled "Validation".
print(f"XGBoost F1 Score (Train): {f1_train:.4f}")
print(f"XGBoost F1 Score (Validation): {f1_val:.4f}")

# **Submission**

In [None]:
# Read the competition test file and set the `id` column aside for the submission.
test_raw = pd.read_csv("/kaggle/input/mlp-term-2-2025-kaggle-assignment-2/test.csv")
id_col = test_raw['id']

# Same identifier columns removed as for the training frame
x_test = test_raw.drop(columns=['id', 'customer_id', 'last_name'])

# Apply the already-fitted transformers (transform only), then predict with the final model
x_test_tran_1 = tran_1.transform(x_test)
x_test_tran_2 = tran_2.transform(x_test_tran_1)
y_pred = model.predict(x_test_tran_2)

# Two-column submission file: id and predicted exit_status
submission = pd.DataFrame({'id': id_col, 'exit_status': y_pred})
submission.to_csv("/kaggle/working/submission.csv", index=False)