In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# NOTE(review): login() presumably asks for Kaggle credentials interactively,
# which would block an unattended "Run All" -- confirm before automating.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# The value returned by competition_download is stored here but is not read
# anywhere else in this notebook: the data-loading cell builds its file names
# from a hard-coded "/kaggle/input/..." prefix instead.
mlp_term_2_2025_kaggle_assignment_2_path = kagglehub.competition_download('mlp-term-2-2025-kaggle-assignment-2')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Blanket filter: every warning category is ignored from this point on, which
# also hides library warnings raised during model fitting and searching.
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Data Loading

In [None]:
# Directory prefix (with trailing slash) that holds the competition CSV files.
PATH_TO_DATA = "/kaggle/input/mlp-term-2-2025-kaggle-assignment-2/"

# Read the train and test splits from that directory.
train_df, test_df = (
    pd.read_csv(f"{PATH_TO_DATA}{csv_name}") for csv_name in ("train.csv", "test.csv")
)

In [None]:
# Keep untouched deep copies of the raw frames; the submission cell later reads
# the original 'id' column from test_df_copy. (DataFrame.copy() is deep by default.)
train_df_copy, test_df_copy = train_df.copy(), test_df.copy()


# Data Preprocessing

In [None]:
# --- 3. Initial Data Inspection & Preprocessing ---
# Print a quick textual overview of the training frame: first rows, shape,
# column dtypes / non-null counts, and summary statistics.
print("--- Initial Data Inspection ---")
print("Train DataFrame Head:", train_df.head(10), sep="\n")
print("\nTrain DataFrame Shape:", train_df.shape, sep="\n")
print("\nTrain DataFrame Info:")
train_df.info()  # info() writes its report directly to stdout
train_summary_stats = train_df.describe().rename(index={"50%": "median"})
print("\nTrain DataFrame Descriptive Statistics:", train_summary_stats, sep="\n")

# Drop unnecessary columns

In [None]:
# Columns removed from both splits before any further processing.
cols_to_remove = ['id', 'customer_id', 'last_name']
train_df, test_df = (
    frame.drop(columns=cols_to_remove) for frame in (train_df, test_df)
)

# Fixing Missing Data

In [None]:
# Per-column count of missing cells, then the same count as a fraction of all rows.
missing_counts = train_df.isnull().sum()
print('\nMissing rows in Train DataFrame (before imputation):')
print(missing_counts)
print('\nMissing rows ratio in Train DataFrame (before imputation):')
print(missing_counts / train_df.shape[0])

# Impute missing values in numerical columns with the mean and add a missing-value indicator

In [None]:
from sklearn.impute import SimpleImputer


def impute_with_indicator(train_frame, test_frame, col, strategy):
    """Impute `col` in both frames and add a `<col>_missing_indicator` column.

    The indicator is computed directly from `isna()` instead of relying on
    `SimpleImputer(add_indicator=True)`: that option only emits an indicator
    column when the fitted data actually contained missing values, so indexing
    `[:, 1]` on its output raises IndexError for a fully populated column (and
    on any re-run of this cell, once the NaNs have been filled).

    Parameters
    ----------
    train_frame, test_frame : pandas.DataFrame
        Frames that are modified in place.
    col : str
        Name of the column to impute.
    strategy : str
        Strategy forwarded to `SimpleImputer` (e.g. 'mean', 'most_frequent').

    Returns
    -------
    SimpleImputer
        The imputer fitted on `train_frame[col]`.
    """
    indicator_col = f'{col}_missing_indicator'
    for frame in (train_frame, test_frame):
        # Record missingness BEFORE the values are overwritten. An existing
        # indicator is kept so that re-running the cell does not reset it to
        # all zeros.
        if indicator_col not in frame.columns:
            # Stored as 0.0 / 1.0 floats for every column, including 'country'.
            frame[indicator_col] = frame[col].isna().astype(float)

    # Fit on train only, then apply the same fill value to both splits.
    imputer = SimpleImputer(strategy=strategy)
    train_frame[col] = imputer.fit_transform(train_frame[[col]]).ravel()
    test_frame[col] = imputer.transform(test_frame[[col]]).ravel()
    return imputer


# Numerical columns: fill with the training mean.
numerical_impute_cols = ['credit_score', 'acc_balance', 'prod_count']
for col in numerical_impute_cols:
    mean_imputer = impute_with_indicator(train_df, test_df, col, strategy='mean')

# Categorical 'country' column: fill with the most frequent training value.
country_imputer = impute_with_indicator(train_df, test_df, 'country', strategy='most_frequent')


# Handling duplicate rows

In [None]:
# Count fully duplicated rows (every occurrence after the first), drop them,
# then report the count again as a sanity check.
num_dups = train_df.duplicated().sum()
print(f'\nNumber of duplicate rows before dropping: {num_dups}')
train_df = train_df.drop_duplicates()
remaining_dups = train_df.duplicated().sum()
print(f'Number of duplicate rows after dropping: {remaining_dups}')

# Exploratory Data Analysis

In [None]:
print("\n--- Exploratory Data Analysis (EDA) ---")

# Graph 1: one histogram (with KDE overlay) per numerical feature on a 2x3 grid.
numerical_features_for_hist = ['credit_score', 'age', 'tenure', 'acc_balance', 'prod_count', 'estimated_salary']
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 10))
axes = axes.flatten()
for hist_ax, col in zip(axes, numerical_features_for_hist):
    sns.histplot(data=train_df, x=col, kde=True, ax=hist_ax)
    hist_ax.set_title(f'Distribution of {col}')
plt.tight_layout()
plt.suptitle('Histograms of Numerical Features', y=1.02, fontsize=16)
plt.show()

# Graph 2: Count plots for categorical features.
categorical_features_for_count = ['country', 'gender', 'has_card', 'is_active', 'exit_status']
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()
for count_ax, col in zip(axes, categorical_features_for_count):
    sns.countplot(data=train_df, x=col, ax=count_ax)
    count_ax.set_title(f'Count of {col}')
    count_ax.tick_params(axis='x', rotation=45)  # Rotate labels for better readability
# The grid has more cells than features; remove the leftover empty axes.
# (Doing this inside the plotting loop never fires, because the loop only
# visits as many axes as there are features.)
for unused_ax in axes[len(categorical_features_for_count):]:
    fig.delaxes(unused_ax)
plt.tight_layout()
plt.suptitle('Count Plots of Categorical Features', y=1.02, fontsize=16)
plt.show()

# Graph 3: Pair plot of a few numerical features, coloured by exit_status.
print("\nGenerating Pair Plot (may take some time for large datasets)...")
pair_plot_columns = ['credit_score', 'age', 'acc_balance', 'exit_status']
# Above 10,000 rows, plot a fixed-seed sample of 5,000 rows instead of the full frame.
pair_plot_source = (
    train_df.sample(n=5000, random_state=42) if train_df.shape[0] > 10000 else train_df
)
sns.pairplot(pair_plot_source[pair_plot_columns], hue='exit_status', diag_kind='kde')
plt.suptitle('Pair Plot of Selected Numerical Features by Exit Status', y=1.02, fontsize=16)
plt.show()

# Graph 4: Correlation heatmap over the numeric columns.
print("\nGenerating Correlation Heatmap...")
plt.figure(figsize=(10, 8))
# Keep numeric-dtype columns only, leaving out the *_missing_indicator helper
# columns so they do not clutter the heatmap.
numerical_cols_for_corr = [
    col
    for col in train_df.select_dtypes(include=np.number).columns.tolist()
    if '_missing_indicator' not in col
]
numerical_correlation_matrix = train_df[numerical_cols_for_corr].corr()
sns.heatmap(
    numerical_correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    linewidths=.5,
)
plt.title('Correlation Heatmap of Numerical Features')
plt.show()


# Outlier Detection

In [None]:
print("\n--- Outlier Detection using Z-score Method ---")
from scipy.stats import zscore
zscore_threshold = 3
numerical_cols_for_outliers_check = ['credit_score', 'age', 'tenure', 'acc_balance', 'prod_count', 'estimated_salary']

for col in numerical_cols_for_outliers_check:
    # Work on the NaN-free values throughout. The z-scores are positional with
    # respect to this series, so the outliers must be selected from it as well;
    # applying those positions to the original column would point at the wrong
    # rows whenever any NaN had been dropped.
    col_values = train_df[col].dropna()
    col_zscores = np.abs(np.asarray(zscore(col_values)))
    # Rows whose absolute z-score exceeds the threshold.
    outlier_values = col_values[col_zscores > zscore_threshold]
    num_outliers = len(outlier_values)
    # Percentage is relative to all training rows, including rows with NaN in this column.
    percent_outliers = (num_outliers / train_df.shape[0]) * 100

    print(f'\nAnalyzing column: {col}')
    print(f'Number of outliers (Z-score > {zscore_threshold}): {num_outliers}')
    print(f'Percentage of outliers: {percent_outliers:.2f}%')
    print('Sample of outlier values:', outlier_values.head())


# Preprocessing Pipeline Setup for Modeling

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.feature_selection import VarianceThreshold
import tempfile  # scratch directory for the pipeline's transformer cache

# Fresh temporary directory handed to Pipeline(memory=...) below.
cachedir1 = tempfile.mkdtemp()

# Column groups, one per preprocessing branch.
categorical_cols = ['country', 'gender']
numerical_cols = ['credit_score', 'age', 'tenure', 'acc_balance', 'prod_count', 'estimated_salary', 'has_card', 'is_active']
# Indicator columns are discovered by name from the current training frame.
missing_indicator_cols = list(
    filter(lambda column_name: '_missing_indicator' in column_name, train_df.columns)
)

# Numerical branch: standard scaling only.
numeric_transformer = StandardScaler()
numeric_pipeline = Pipeline(steps=[('scaler', numeric_transformer)])

# Categorical branch: one-hot encoding; unseen categories are ignored at
# transform time and two-level features collapse to a single column.
one_hot_categorical_transformer = OneHotEncoder(handle_unknown='ignore', drop='if_binary')

# Route each column group to its branch; indicators pass through untouched.
column_branches = [
    ('num', numeric_pipeline, numerical_cols),
    ('cat', one_hot_categorical_transformer, categorical_cols),
    ('missing_indicator', 'passthrough', missing_indicator_cols),
]
preprocessor_one_hot = ColumnTransformer(transformers=column_branches)

# Full preprocessing: column transforms followed by removal of zero-variance
# features, with fitted steps cached under cachedir1.
var_thresh_preprocessor_one_hot = Pipeline(
    steps=[
        ('preprocessor', preprocessor_one_hot),
        ('var_thresh', VarianceThreshold(threshold=0.0)),
    ],
    memory=cachedir1,
)

# Every model type is mapped to the very same preprocessing pipeline object.
preprocessor_pipeline_map = dict.fromkeys(
    (
        'logistic_regression',
        'perceptron',
        'random_forest_classifier',
        'gradient_boosting_classifier',
        'ada_boost_classifier',
        'decision_tree_classifier',
        'knn_classifier',
        'mlp_classifier',
    ),
    var_thresh_preprocessor_one_hot,
)


In [None]:
# Define X_train, y_train, X_test
# Target is the 'exit_status' column; every remaining column is a feature.
y_train = train_df['exit_status']
X_train = train_df.drop('exit_status', axis=1)
# The test frame is used as-is (no copy is made).
X_test = test_df

In [None]:
# Fit the preprocessor to training data and inspect features
# Fit the shared preprocessing pipeline on the training features, then list
# the feature names it produces.
processed_feature_names = var_thresh_preprocessor_one_hot.fit(X_train).get_feature_names_out()
print('\nNo. of features after preprocessing:', len(processed_feature_names))
print('Features after preprocessing:', processed_feature_names)

# Model Definitions

In [None]:
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# Candidate estimators keyed by the same names used in preprocessor_pipeline_map.
model_estimators = dict(
    logistic_regression=LogisticRegression(max_iter=1000, random_state=42),
    perceptron=Perceptron(random_state=42),
    random_forest_classifier=RandomForestClassifier(random_state=42),
    gradient_boosting_classifier=GradientBoostingClassifier(random_state=42),
    ada_boost_classifier=AdaBoostClassifier(random_state=42),
    decision_tree_classifier=DecisionTreeClassifier(random_state=42),
    knn_classifier=KNeighborsClassifier(n_neighbors=3),
    mlp_classifier=MLPClassifier(
        hidden_layer_sizes=(64, 32),
        activation='relu',
        solver='adam',
        alpha=0.0001,
        learning_rate='adaptive',
        max_iter=200,
        random_state=42,
    ),
)

# Wrap each estimator in a two-step pipeline: preprocessing ('pre') + estimator ('model').
modeling_pipelines = {}
for m_name, m_head in model_estimators.items():
    modeling_pipelines[m_name] = Pipeline(
        steps=[('pre', preprocessor_pipeline_map[m_name]), ('model', m_head)]
    )

print("\nModels initialized for evaluation:")
print(list(modeling_pipelines))

print("\nTarget variable (exit_status) distribution in training data:")
target_distribution = train_df['exit_status'].value_counts()
print(target_distribution)


# This function is used to handle imbalanced datasets by oversampling the minority class.
def data_oversampler(X, y, group_rules, random_state=None):
    """Return X and y with extra, randomly re-drawn rows for selected groups.

    Every original row is kept once. For each `(rule_func, multiplier)` pair,
    `int(group_size * multiplier)` additional rows are drawn with replacement
    from the rows where `rule_func(y)` is True. The combined rows are shuffled
    and the index of both outputs is reset.

    Rows are addressed by position rather than by index label, so the result
    is correct even when `y.index` contains repeated labels (label-based
    lookup would return every row sharing a label).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows, aligned positionally with `y`.
    y : pandas.Series
        Target values.
    group_rules : iterable of (callable, float)
        `callable(y)` must return a boolean mask over `y`; the float is the
        oversampling multiplier for that group.
    random_state : int or None, default None
        Seed for a private `numpy.random.RandomState`. When None, NumPy's
        global random state is used (so `np.random.seed` still applies).

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The oversampled, shuffled features and target with a fresh RangeIndex.
    """
    # np.random (module) and RandomState expose the same choice/shuffle API.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    extra_positions = []
    for rule_func, multiplier in group_rules:
        group_positions = np.flatnonzero(np.asarray(rule_func(y), dtype=bool))
        num_extra = int(len(group_positions) * multiplier)
        if num_extra == 0:
            # Nothing to draw (empty group or multiplier rounds down to zero).
            continue
        # Randomly sample with replacement to oversample this group.
        extra_positions.extend(rng.choice(group_positions, size=num_extra, replace=True))

    combined_positions = np.concatenate(
        [np.arange(len(y)), np.asarray(extra_positions, dtype=int)]
    )
    rng.shuffle(combined_positions)  # mix original and oversampled rows

    X_oversampled = X.iloc[combined_positions].reset_index(drop=True)
    y_oversampled = y.iloc[combined_positions].reset_index(drop=True)
    return X_oversampled, y_oversampled

# K-Fold Cross-Validation and Initial Model Evaluation

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.base import clone

# Stratified 5-fold CV keeps the exit_status class ratio the same in every
# fold, which matters for an imbalanced target. It also matches what
# RandomizedSearchCV(cv=5) uses for classifiers in the tuning cell.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Oversample the positive class (exit_status == 1) with 1.4x extra rows.
oversampling_rules = [(lambda x: x == 1, 1.4)]

# data_oversampler draws from NumPy's global random state; seed it so that
# "Restart & Run All" reproduces the same folds and scores.
np.random.seed(42)

model_performance_scores = {
    'model_name': [],
    'f1_score': []
}

print("\n--- Starting K-Fold Cross-Validation for Initial Model Evaluation ---")
# Iterate over a snapshot of the keys because the dict values are replaced below.
for current_model_name in list(modeling_pipelines):
    current_model_pipeline = modeling_pipelines[current_model_name]

    fold_f1_scores = []
    for fold, (train_split_idx, val_split_idx) in enumerate(kf.split(X_train, y_train)):
        X_training, X_validation = X_train.iloc[train_split_idx], X_train.iloc[val_split_idx]
        y_training, y_validation = y_train.iloc[train_split_idx], y_train.iloc[val_split_idx]

        model_instance = clone(current_model_pipeline)  # fresh, unfitted copy per fold

        # Oversample only the training part of the fold; the validation part
        # keeps its original class distribution.
        X_training_sampled, y_training_sampled = data_oversampler(X_training, y_training, oversampling_rules)

        model_instance.fit(X_training_sampled, y_training_sampled)
        y_predicted = model_instance.predict(X_validation)
        fold_f1_scores.append(f1_score(y_validation, y_predicted))

    # Rank models by their MEAN fold score. The maximum over folds is an
    # optimistic estimate and is not comparable with the mean CV score
    # (best_score_) that the tuning cell records for tuned models.
    mean_fold_f1_score = float(np.mean(fold_f1_scores))

    # Keep a model refitted on the full (oversampled) training set rather than
    # the single fold model that happened to score best on its validation
    # split. This costs one extra fit per model type.
    final_model_for_type = clone(current_model_pipeline)
    X_full_sampled, y_full_sampled = data_oversampler(X_train, y_train, oversampling_rules)
    final_model_for_type.fit(X_full_sampled, y_full_sampled)
    modeling_pipelines[current_model_name] = final_model_for_type

    print(f'Model: {current_model_name} | F1 Scores across folds: {fold_f1_scores}, | Avg F1: {np.mean(fold_f1_scores):.4f} +/- {np.std(fold_f1_scores):.4f}')

    model_performance_scores['model_name'].append(current_model_name)
    model_performance_scores['f1_score'].append(mean_fold_f1_score)

print("\n--- Initial Model Performance Summary (before Hyperparameter Tuning) ---")
initial_perf_df = pd.DataFrame(model_performance_scores).sort_values(by='f1_score', ascending=False)
print(initial_perf_df)

# Hyperparameter Tuning for Top 3 Best Models

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Get the names of the top 3 models for Hyperparameter Tuning
top_3_models_for_hpt = initial_perf_df['model_name'].head(3).tolist()
print(f"\n--- Starting Hyperparameter Tuning for Top 3 Models: {top_3_models_for_hpt} ---")


def run_randomized_search(model_key, display_name, param_grid):
    """Tune one pipeline with RandomizedSearchCV and record the result.

    Parameters
    ----------
    model_key : str
        Key of the baseline pipeline in `modeling_pipelines`.
    display_name : str
        Human-readable model name used in the progress messages.
    param_grid : dict
        Parameter distributions, keyed with the pipeline's `model__` prefix.

    Returns
    -------
    RandomizedSearchCV
        The fitted search object. It is also stored in `modeling_pipelines`
        under `<model_key>_tuned`, and its `best_score_` is recorded in
        `model_performance_scores`.
    """
    print(f"\nPerforming Randomized Search for {display_name} Classifier...")
    search = RandomizedSearchCV(
        clone(modeling_pipelines[model_key]),
        param_grid,
        cv=5,
        scoring='f1',
        n_jobs=-1,
        random_state=42,
        verbose=1,
    )
    # NOTE(review): the search is fitted on X_train without the oversampling
    # applied in the baseline K-fold cell, so tuned and baseline F1 scores are
    # not measured under the same training regime.
    search.fit(X_train, y_train)
    print(f'{display_name} best score/params:', search.best_score_, search.best_params_)

    tuned_key = f'{model_key}_tuned'
    # The fitted search object itself is stored; it can be used for prediction directly.
    modeling_pipelines[tuned_key] = search

    # Update the entry in place when it already exists, so re-running this
    # cell does not append duplicate rows to the performance table.
    recorded_names = model_performance_scores['model_name']
    recorded_scores = model_performance_scores['f1_score']
    if tuned_key in recorded_names:
        recorded_scores[recorded_names.index(tuned_key)] = search.best_score_
    else:
        recorded_names.append(tuned_key)
        recorded_scores.append(search.best_score_)
    return search


# Gradient Boosting Classifier HPT
if 'gradient_boosting_classifier' in top_3_models_for_hpt:
    gb_hyperparam_grid = {
        'model__n_estimators': np.linspace(100, 300, 10, dtype=int),
        'model__learning_rate': np.logspace(-2, -0.5, 10),
        'model__max_depth': np.arange(3, 30),
        'model__min_samples_split': np.linspace(2, 20, 10, dtype=int),
        'model__min_samples_leaf': np.linspace(1, 20, 10, dtype=int),
        'model__subsample': np.linspace(0.5, 1.0, 6),
        'model__loss': ['log_loss', 'exponential']
    }
    gb_rand_search = run_randomized_search('gradient_boosting_classifier', 'Gradient Boosting', gb_hyperparam_grid)

# AdaBoost Classifier HPT
if 'ada_boost_classifier' in top_3_models_for_hpt:
    ada_hyperparam_grid = {
        'model__n_estimators': np.linspace(50, 500, 10, dtype=int),
        'model__learning_rate': np.logspace(-2, 0, 20),
        # NOTE(review): 'SAMME.R' appears to be deprecated/removed in recent
        # scikit-learn releases -- confirm against the installed version.
        'model__algorithm': ['SAMME', 'SAMME.R']
    }
    ada_rand_search = run_randomized_search('ada_boost_classifier', 'AdaBoost', ada_hyperparam_grid)

# Random Forest Classifier HPT
if 'random_forest_classifier' in top_3_models_for_hpt:
    rf_hyperparam_grid = {
        'model__min_samples_split': np.arange(2, 20),
        'model__min_samples_leaf': np.arange(1, 10),
        'model__n_estimators': np.linspace(100, 600, 10, dtype=int),
        'model__max_features': ['sqrt', 'log2', 0.5, 0.7],
    }
    rf_rand_search = run_randomized_search('random_forest_classifier', 'Random Forest', rf_hyperparam_grid)

# Final Model Performance Summary

In [None]:
# Rank every recorded model (baseline and tuned) by F1 score, best first.
final_model_performance_df = pd.DataFrame(model_performance_scores).sort_values(
    by=['f1_score'], axis=0, ascending=False
)
print("\n--- Final Model Performance Summary (including tuned models) ---")
print(final_model_performance_df)

# The top row names the model used for the submission.
best_model_for_submission_name = final_model_performance_df['model_name'].iloc[0]
print(f'\nBest performing Model selected for submission: {best_model_for_submission_name}')
best_model_for_submission = modeling_pipelines[best_model_for_submission_name]

# Generating Predictions and Creating the Submission File

In [None]:
print("\n--- Generating Predictions for Submission ---")
# Score the test features with the selected model.
y_predictions_on_test = best_model_for_submission.predict(X_test)

# Pair each prediction with the original 'id' kept in the untouched test copy.
submission_df = pd.DataFrame({'id': test_df_copy['id']}).assign(
    exit_status=y_predictions_on_test
)

print("\nSubmission DataFrame Head:")
print(submission_df.head())

# Write the submission file without the index column.
submission_df.to_csv('submission.csv', index=False)
print("\nSubmission file 'submission.csv' created successfully.")