In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from imblearn.ensemble import EasyEnsembleClassifier

## What Question is Being Answered?

The problem aims to predict if an account holder will fail to make a payment next month based on their payment history and other banking details. This is a binary classification task since the outcomes are either 'Default' or 'Not Default'. Thus, the target variable is categorical with two categories.

## Dataset EDA

In [None]:
# Workbook to load, relative to the notebook's working directory
file = 'default of credit card clients.xls'

# The descriptive column names live on the sheet's second row. Parsing with header=1
# uses that row as the header directly, so it is never read as a data row and the
# columns keep their numeric dtypes (promoting a data row to header afterwards leaves
# every column as object dtype, which breaks later arithmetic and correlations).
df = pd.read_excel(file, header=1)

# Drop the first column from the DataFrame
df = df.drop(columns=df.columns[0])

# Shorten the target column name
df = df.rename(columns={"default payment next month": "Y"})

# Display the first rows with the final column names
df.head(5)

In [None]:
# Inspect column names, non-null counts and dtypes of the freshly loaded frame
df.info()

In [None]:
# Summary statistics for each column of the loaded frame
df.describe()

In [None]:
# Monthly source columns. The bill / payment amounts are suffixed 1-6; the PAY_* columns
# are named PAY_0, PAY_2, ..., PAY_6 (there is no PAY_1).
month_ids = range(1, 7)
pay_status_cols = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
bill_cols = [f'BILL_AMT{i}' for i in month_ids]
pay_amt_cols = [f'PAY_AMT{i}' for i in month_ids]

# Average Payment Delay: row-wise mean of the six PAY_* columns
df['avg_payment_delay'] = df[pay_status_cols].mean(axis=1)

# Outstanding Balance for each month: billed amount minus amount paid
for i in month_ids:
    df[f'outstanding_balance_{i}'] = df[f'BILL_AMT{i}'] - df[f'PAY_AMT{i}']

# Credit Utilization for each month: billed amount as a fraction of the credit limit
for i in month_ids:
    df[f'credit_utilization_{i}'] = df[f'BILL_AMT{i}'] / df['LIMIT_BAL']

# Debt Ratio for each month: share of the bill left unpaid
for i in month_ids:
    bill = df[f'BILL_AMT{i}']
    # Avoid division by zero: blank out zero bills *before* dividing, then define the
    # ratio as 0 for those rows (nothing was billed, so nothing is owed).
    ratio = (bill - df[f'PAY_AMT{i}']) / bill.where(bill != 0)
    df[f'debt_ratio_{i}'] = ratio.mask(bill == 0, 0)

# Replacing any infinity values with NaN (run after all ratios so it covers every one)
df = df.replace([np.inf, -np.inf], np.nan)

# Age Binning: right=False makes the bins left-closed, i.e. [20, 30), [30, 40), ...
# Ages outside 20-79 fall in no bin and become NaN.
age_bins = [20, 30, 40, 50, 60, 70, 80]
age_labels = ['20-30', '30-40', '40-50', '50-60', '60-70', '70-80']
df['age_group'] = pd.cut(df['AGE'], bins=age_bins, labels=age_labels, right=False)

# Consistent Bill Amount: standard deviation of the six monthly bills
# (a lower value means a steadier bill from month to month)
df['consistent_bill_amount'] = df[bill_cols].std(axis=1)

# The raw monthly columns and AGE are now represented by the engineered features
columns_to_drop = ['AGE'] + pay_status_cols + bill_cols + pay_amt_cols

# Drop the specified columns
df = df.drop(columns=columns_to_drop)

df.head(10)

In [None]:
# Re-inspect columns and dtypes now that the engineered features replaced the raw monthly columns
df.info()

In [None]:
# Check for missing values in the DataFrame: number of null entries per column.
# The feature-engineering step turns infinities into NaN, so any nulls it
# introduced would show up here.
df.isnull().sum()

In [None]:
# Count of each class in 'Y' column, ordered by class label (0, 1) instead of by
# frequency. The count annotations and the hard-coded tick labels below are positional,
# so they only line up with the bars when the counts are in label order.
y_counts = df['Y'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(8, 6))  # Set the figure size
sns.barplot(x=y_counts.index, y=y_counts.values, palette="muted", ax=ax)  # Create a barplot

ax.set_title('Default Payment Next Month (Y)', fontsize=14)  # Title and its fontsize
ax.set_xlabel('Default Payment', fontsize=12)  # x-axis label and its fontsize
ax.set_ylabel('Count', fontsize=12)  # y-axis label and its fontsize

# Annotate the heights of bars for better visual understanding
for position, count in enumerate(y_counts.values):
    ax.text(position, count, str(count), ha='center', va='bottom', fontsize=12)

# Replace 0,1 labels with No and Yes for better understanding
ax.set_xticks([0, 1])
ax.set_xticklabels(['(0) No', '(1) Yes'])
plt.show()

In [None]:
# Schema check before the correlation analysis.
# NOTE(review): df is not modified between the previous df.info() call and this one,
# so this output repeats it.
df.info()

In [None]:
# Compute correlation matrix for all numeric features.
# 'age_group' is a categorical column with string labels and cannot take part in a
# correlation (pandas >= 2.0 raises instead of silently skipping it), so exclude
# category columns and cast the rest to float first.
numeric_features = df.select_dtypes(exclude=['category']).astype(float)
correlation_matrix = numeric_features.corr()

# Set up the figure size
plt.figure(figsize=(18, 15))

# Create a heatmap annotated with every coefficient, using a reduced annotation font size
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", annot_kws={'size': 8})

# Add a title to the plot
plt.title('Correlation Heatmap of All Features', fontsize=14)

# Show the plot
plt.show()


In [None]:
# Compute correlation matrix for all numeric features.
# 'age_group' is a categorical column with string labels and cannot take part in a
# correlation (pandas >= 2.0 raises instead of silently skipping it), so exclude
# category columns and cast the rest to float first.
numeric_features = df.select_dtypes(exclude=['category']).astype(float)
correlation_matrix = numeric_features.corr()

# Set a threshold for significant correlations
threshold = 0.5

# Mask the values that are below the threshold in absolute value
# (seaborn hides the cells where the mask is True)
mask = np.abs(correlation_matrix) < threshold

plt.figure(figsize=(18, 15))

# Create a heatmap with masked insignificant correlations, and reduced font size in annotations
sns.heatmap(correlation_matrix, mask=mask, annot=True, cmap="coolwarm", fmt=".2f", annot_kws={'size': 8})

plt.title('Correlation Heatmap of Significant Features (> 0.5)', fontsize=14)
plt.show()

## Model Selection and Confusion Matrices

In [None]:
# Single seed shared by the split and every stochastic estimator so that the reported
# metrics are reproducible from one run of the notebook to the next.
RANDOM_STATE = 42

# Candidate classifiers, keyed by display name. The name doubles as the pipeline step
# name, which is why the hyperparameter grids use '<display name>__<param>' keys.
models = {
    'Logistic Regression': LogisticRegression(max_iter=10000),
    'Support Vector Machines': LinearSVC(max_iter=10000, C=10, random_state=RANDOM_STATE),
    'Decision Trees': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'AdaBoost': AdaBoostClassifier(random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'EasyEnsemble': EasyEnsembleClassifier(n_estimators=200, random_state=RANDOM_STATE)
}

# Identify categorical and numerical columns
cat_cols = ['SEX', 'EDUCATION', 'MARRIAGE', 'age_group']
numeric_cols = df.columns.drop(cat_cols + ["Y"]).tolist()

# Create a preprocessor: standardise numeric columns, one-hot encode categorical ones.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), cat_cols)])

# Split data into train and test sets (80% / 20%)
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('Y', axis=1), df['Y'], test_size=0.2, random_state=RANDOM_STATE)

# Convert y_train and y_test to integer
y_train = y_train.astype('int')
y_test = y_test.astype('int')

for model_name, model in models.items():
    # Create preprocessing and training pipeline. Fitting does not modify its inputs,
    # so the train/test frames are passed directly rather than copied per model.
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               (model_name, model)])

    # Fit the model
    pipeline.fit(X_train, y_train)

    # Make predictions
    y_pred = pipeline.predict(X_test)

    # Print accuracy
    print(f'Accuracy of {model_name}: {accuracy_score(y_test, y_pred)}')

    # Print confusion matrix
    print(f'Confusion Matrix of {model_name}:\n {confusion_matrix(y_test, y_pred)}')

    # Print classification report
    print(f'Classification Report of {model_name}:\n {classification_report(y_test, y_pred)}')


## What techniques are being used for modeling? (Justifications on Model Selection from High to Low Bias)

1. Logistic Regression: Logistic Regression is our starting point as it's a simple yet effective algorithm for binary classification problems like ours. Despite its simplicity, Logistic Regression can perform well when features have a linear relationship with the log-odds of the outcome (default or not default). Moreover, logistic regression models are very interpretable, which is a bonus when we are first trying to understand our data.

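2. Support Vector Machines: SVMs look for the boundary that separates default from non-default credit card users with the widest margin, and they handle high-dimensional data well, which is relevant given the number of features we might have in a credit card dataset. SVMs can also use kernel functions when the boundary between the classes is not linear. Note that this notebook uses `LinearSVC`, which fits a linear boundary; a kernelized SVM (for example `SVC` with an RBF kernel) would be needed to model non-linearity.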

3. Decision Trees: Decision Trees are chosen for their ability to handle non-linear relationships and their interpretability. They are more flexible than both Logistic Regression and SVMs and do not require any assumptions about the relationship between features and the target variable. Moreover, they can handle both numerical and categorical variables which are common in credit card data.

4. AdaBoost: AdaBoost, an ensemble method, is used to potentially improve the performance of our Decision Tree. By combining multiple weak learners (small decision trees), AdaBoost forms a more robust model that can generalize better to unseen data. It is especially useful if some of our features are weakly associated with the outcome but collectively they can predict the outcome well.

5. Random Forest: Random Forest is another ensemble model that creates a bunch of decision trees and aggregates their predictions. It's less likely to overfit than a single decision tree, which makes it a good choice if we have a lot of features and are worried about overfitting. Given the high-dimensional nature of credit card data, Random Forest is likely to improve our predictions.

6. EasyEnsemble: We finally move to EasyEnsemble when tackling the class imbalance in our dataset. Given that defaults are typically less frequent than non-defaults in credit card datasets, EasyEnsemble helps by creating balanced subsets of data and using an ensemble of classifiers (each trained on a different subset). This approach ensures that our model is exposed to enough default examples during training and hence can generalize better to default cases in unseen data.

# PCA Analysis

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Numeric branch: standardise, then keep as many principal components as are
# needed to explain 95% of the variance
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95)),
])

# Categorical branch: one-hot encoding
categorical_transformer = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Route each group of columns to its branch
column_branches = [
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, cat_cols),
]
preprocessor = ColumnTransformer(transformers=column_branches)

# Learn the transformation on the training rows only, then apply it unchanged to the test rows
X_train_pca = preprocessor.fit_transform(X_train)
X_test_pca = preprocessor.transform(X_test)

# The fitted PCA step is nested inside the column transformer's numeric branch;
# pull it out to read how many components were kept and what each one explains
fitted_pca = preprocessor.named_transformers_['num']['pca']
pca_components = fitted_pca.n_components_
explained_variance = fitted_pca.explained_variance_ratio_

print("Number of PCA components: ", pca_components)
print("Explained variance: ", explained_variance)


In [None]:
# Re-evaluate every candidate model on the PCA-reduced feature matrices
for model_name in models:
    model = models[model_name]

    # Train on the reduced training matrix, then predict the held-out rows
    y_pred = model.fit(X_train_pca, y_train).predict(X_test_pca)

    # Overall accuracy
    print(f'Accuracy of {model_name}: {accuracy_score(y_test, y_pred)}')

    # Confusion matrix
    print(f'Confusion Matrix of {model_name}:\n {confusion_matrix(y_test, y_pred)}')

    # Per-class precision / recall / F1
    print(f'Classification Report of {model_name}:\n {classification_report(y_test, y_pred)}')

# Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter search space for each model.
# Keys follow the '<pipeline step name>__<parameter>' convention; each model is placed
# in its pipeline under its display name, so that name is the prefix.
logistic_regression_grid = {
    'Logistic Regression__C': [0.1, 1.0, 10.0, 100.0],
}

svm_grid = {
    'Support Vector Machines__C': [0.1, 1.0, 10.0, 100.0],
}

decision_tree_grid = {
    'Decision Trees__max_depth': [50, 100, 500, 1000],
    'Decision Trees__min_samples_split': [2, 5, 10],
    'Decision Trees__min_samples_leaf': [1, 2, 5],
}

random_forest_grid = {
    'Random Forest__n_estimators': [50, 100, 200, 500],
    'Random Forest__max_depth': [50, 100, 500],
    'Random Forest__min_samples_split': [2, 5, 10, 20],
    'Random Forest__min_samples_leaf': [1, 2, 5, 10],
}

adaboost_grid = {
    'AdaBoost__n_estimators': [10, 50, 100, 200, 500, 1000],
    'AdaBoost__learning_rate': [0.001, 0.01, 0.1, 1.0, 10.0],
}

easy_ensemble_grid = {
    'EasyEnsemble__n_estimators': [10, 50, 100, 200],
    'EasyEnsemble__sampling_strategy': ['auto', 0.5, 0.7],
}

# Look-up table: model display name -> its search space
param_grid = {
    'Logistic Regression': logistic_regression_grid,
    'Support Vector Machines': svm_grid,
    'Decision Trees': decision_tree_grid,
    'Random Forest': random_forest_grid,
    'AdaBoost': adaboost_grid,
    'EasyEnsemble': easy_ensemble_grid,
}

# Define a function to execute grid search on a model
def perform_grid_search(model_name, model):
    """Tune one model's hyperparameters with 5-fold cross-validated grid search.

    The model is wrapped in a pipeline behind the module-level ``preprocessor`` and the
    search is fitted on the raw training frame ``X_train``. The preprocessor selects its
    columns by name, so it needs the DataFrame rather than the already transformed
    ``X_train_pca`` array; fitting it inside the pipeline also means scaling, PCA and
    encoding are re-learned on each fold's training part only, keeping the validation
    part unseen.

    Parameters
    ----------
    model_name : str
        Display name of the model. Used as the pipeline step name and as the key into
        ``param_grid``, whose entries are prefixed with ``'<model_name>__'``.
    model : estimator
        Unfitted classifier to tune. GridSearchCV works on clones, so this object is
        not fitted by the search.

    Returns
    -------
    GridSearchCV
        The fitted search object (best parameters, best score, refitted best pipeline).
        The best parameters and best recall are also printed.
    """
    # Create the pipeline
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               (model_name, model)])

    # Define the GridSearchCV object; recall is the selection metric
    grid_search = GridSearchCV(pipeline, param_grid[model_name], cv=5, scoring='recall')

    # Fit the search on the raw training data (preprocessing happens inside each fold)
    grid_search.fit(X_train, y_train)

    # Print the best parameters and the corresponding score
    print(f'Model: {model_name}')
    print(f'Best parameters: {grid_search.best_params_}')
    print(f'Best recall: {grid_search.best_score_}\n')

    return grid_search

# Tune every candidate model in turn
for candidate_name in models:
    perform_grid_search(candidate_name, models[candidate_name])


## Complexity of the dataset:

1. **Did we use raw features or apply feature engineering?** 
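We applied feature engineering. We first derived new features from the raw monthly columns (average payment delay, outstanding balance, credit utilization, debt ratio, bill-amount variability, and age groups) and dropped the raw columns. Using PCA, we then changed the scaled numeric features into a smaller set of new features that capture most (95%) of the data's variation. This helps when dealing with big datasets by reducing unnecessary details and potentially improving our model's predictions.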

2. **How did we manage the dataset's size and complexity?** 
We used PCA to handle the dataset's complexity. This method finds and uses the main patterns in the data, reducing the number of features. The number of these main patterns or "principal components" we kept changes the dataset's size.
On top of that, we tweaked settings in our machine learning algorithms, a process called hyperparameter tuning. While this doesn't directly reduce data size, it helps our model work better with the transformed data.
By using PCA and tuning hyperparameters, we aim to make our model predict better. Still, it's essential for us to keep checking how well our model works, especially on new data it hasn't seen before.