In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# pd.read_excel needs xlrd to read legacy .xls files; uncomment to install it:
# %pip install xlrd

In [None]:
# Convert the raw Excel export to CSV, promoting the first parsed data row
# (the sheet's second row) to be the column header.
xls_file = "default_credit_score.xls"
csv_file = "default_credit_score.csv"

sheet = pd.read_excel(xls_file)

# Pull the header row out first, drop it from the data, then attach it as the
# column labels of the re-indexed frame.
header_row = sheet.iloc[0]
df = sheet.iloc[1:].reset_index(drop=True)
df.columns = header_row

df.to_csv(csv_file, index=False)
print(f"File saved as {csv_file} with corrected headers.")

In [None]:
# Reload the dataset from the CSV file and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="default_credit_score.csv")
df.head(n=5)

In [None]:
# Replace the source headers with consistent names. The PAY_ST, BILL_AMT and
# PAY_AMT groups each carry a numeric suffix running from 9 down to 4.
id_and_profile = ['ID', 'LIMIT_BAL', 'GENDER', 'EDUCATION', 'MARRIAGE', 'AGE']
suffixes = range(9, 3, -1)
monthly = [
    f'{prefix}{suffix}'
    for prefix in ('PAY_ST', 'BILL_AMT', 'PAY_AMT')
    for suffix in suffixes
]
df.columns = id_and_profile + monthly + ['DEFAULT_PAYMENT']

In [None]:
# Preview the first rows again to confirm the new column labels were applied.
df.head()

In [None]:
# List the column labels currently attached to the frame.
df.columns

### Exploration

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

In [None]:
# Drop ID so its arbitrary numeric values are not treated as a feature during
# training. Reassigning with errors='ignore' keeps this cell safe to re-run:
# an in-place drop raises KeyError once the column is already gone.
df = df.drop(columns=['ID'], errors='ignore')

### Outliers

In [None]:
# Draw one boxplot per column (target included) on a 5x5 grid to spot outliers.
outlier_fig = plt.figure(figsize=(15, 10))

for position, column in enumerate(df.columns, start=1):
    panel = outlier_fig.add_subplot(5, 5, position)
    sns.boxplot(y=df[column], color="salmon", ax=panel)
    panel.set_title(f"Boxplot of {column}")

outlier_fig.tight_layout()
plt.show()

In [None]:
def cap_outliers(df, cols, threshold=0.95):
    """Cap the upper tail of selected columns at a per-column percentile.

    Every value in ``cols`` that exceeds that column's ``threshold`` quantile
    is replaced by the quantile value itself. Values at or below the limit,
    missing values, and columns not listed in ``cols`` are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified; a capped copy is returned.
    cols : iterable of str
        Names of the columns to cap.
    threshold : float, default 0.95
        Quantile in [0, 1] used as the upper limit (0.95 = 95th percentile).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the requested columns capped.

    Raises
    ------
    KeyError
        If a name in ``cols`` is not a column of ``df``.
    ValueError
        If ``threshold`` is outside [0, 1] (raised by NumPy).
    """
    # Work on a copy so the caller's frame (and anything already displayed
    # from it) is not changed behind their back.
    capped = df.copy()
    for col in cols:
        # nanpercentile ignores missing values; plain percentile would return
        # NaN for such a column and the comparison below would cap nothing.
        upper_limit = np.nanpercentile(capped[col], threshold * 100)
        capped[col] = np.where(capped[col] > upper_limit, upper_limit, capped[col])
    return capped

# Cap the bill and payment amount columns (suffixes 9 down to 4), which show
# the heaviest upper tails.
cols_to_cap = [
    f'{prefix}{suffix}'
    for prefix in ('BILL_AMT', 'PAY_AMT')
    for suffix in range(9, 3, -1)
]
df = cap_outliers(df, cols_to_cap)

In [None]:
# Every column except the target label.
is_feature = df.columns != "DEFAULT_PAYMENT"
numerical_cols = df.columns[is_feature]

# Re-draw the boxplots on the same 5x5 grid to see the effect of the capping.
capped_fig = plt.figure(figsize=(15, 10))

for position, column in enumerate(numerical_cols, start=1):
    panel = capped_fig.add_subplot(5, 5, position)
    sns.boxplot(y=df[column], color='teal', ax=panel)
    panel.set_title(f"Boxplot of {column}")

capped_fig.tight_layout()
plt.show()

### Scaling for KNN 

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Separate the DEFAULT_PAYMENT label from the predictors.
y = df['DEFAULT_PAYMENT']
X = df.drop(columns=['DEFAULT_PAYMENT'])

# Standardise every feature and wrap the resulting array back into a DataFrame
# so the column labels are preserved.
# NOTE(review): the scaler is fitted on all rows before the train/test split;
# consider fitting it on the training rows only.
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Summary statistics of the scaled features as a sanity check.
X_scaled.describe()

In [None]:
# One 50-bin histogram per column to inspect the distributions.
df.hist(bins=50, edgecolor='black', figsize=(15, 12))
plt.gcf().suptitle('Histograms of Features', fontsize=16)
plt.show()

In [None]:
# Reshape to long format (one row per column/value pair) so that all columns
# can be overlaid on a single axis, coloured by column name.
df_melted = pd.melt(df, var_name="Feature", value_name="Value")

overlay_fig, overlay_ax = plt.subplots(figsize=(15, 8))
sns.histplot(
    data=df_melted,
    x="Value",
    hue="Feature",
    bins=50,
    element="step",
    common_norm=False,
    ax=overlay_ax,
)
overlay_ax.set_title('Feature Distributions')
plt.show()

### Splitting 80/20

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both parts, and the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, **split_options)

### Training Model - KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Sweep k = 1..30 and record the test-set error rate (1 - accuracy) for each k.
# NOTE(review): k is selected on the same test set that is later used for the
# final report, which makes the reported accuracy optimistic; consider
# cross-validating on the training split instead.
k_values = range(1, 31)
error_rates = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    error_rates.append(1 - accuracy_score(y_test, y_pred))

# k_min_value is the lowest error rate; knn_n is its 0-based position in
# error_rates (first occurrence on ties) and is reused by the plotting cell.
k_min_value = min(error_rates)
knn_n = error_rates.index(k_min_value)

# Display the selected k itself rather than the list index, so the reader
# does not have to add 1 by hand.
k_values[knn_n]

In [None]:
# Error rate against k, with the lowest-error k marked by a large red dot.
elbow_fig, elbow_ax = plt.subplots(figsize=(12, 6))
elbow_ax.plot(k_values, error_rates, color='grey', linestyle='dashed', marker='o', markersize=8)
elbow_ax.scatter(k_values[knn_n], error_rates[knn_n], s=200, color='red')

elbow_ax.set_xlabel('Number of Neighbors (k)')
elbow_ax.set_ylabel('Error Rate')
elbow_ax.set_title('Elbow Method to Find Optimal k')
elbow_ax.set_xticks(np.arange(1, 31, step=2))
elbow_ax.grid(True)
plt.show()

In [None]:
# Fit the k=14 model on the training split and predict the held-out rows.
knn_14 = KNeighborsClassifier(n_neighbors=14).fit(X_train, y_train)
y_pred_14 = knn_14.predict(X_test)

In [None]:
# Score the k=14 predictions against the held-out labels.
knn_accuracy = accuracy_score(y_test, y_pred_14)
conf_matrix = confusion_matrix(y_test, y_pred_14)
class_report = classification_report(y_test, y_pred_14)

print("Accuracy Score for KNN with 14 neighbors:", round(knn_accuracy, 2))
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)

# Rows of the confusion matrix are the actual classes, columns the predicted
# ones, so the four cells read as TN / FP / FN / TP.
tn = conf_matrix[0, 0]
fp = conf_matrix[0, 1]
fn = conf_matrix[1, 0]
tp = conf_matrix[1, 1]

print("\nObservations:")
print(f"True Negatives(TN) = {tn} values (default payment = 0 / no) were predicted correctly")
print(f"False Positives (FP) = {fp} values (default payment = 0 / no) were predicted incorrectly")
print(f"False Negatives (FN) = {fn} values (default payment = 1 / yes) were predicted incorrectly")
print(f"True Positives (TP) = {tp} values (default payment = 1 / yes) were predicted correctly")

In [None]:
# Accuracy on the rows the model was fitted on versus the held-out rows;
# a large gap between the two points to overfitting.
train_accuracy = accuracy_score(y_train, knn_14.predict(X_train))
test_accuracy = accuracy_score(y_test, knn_14.predict(X_test))

print(f"Training Accuracy based on the KNN Model: {train_accuracy:.4f}")
print(f"Test Accuracy based on the KNN Model: {test_accuracy:.4f}")

### Improving the Model 

#### + Using feature importance from *Random Forest*

In [None]:
# (rows, columns) of the scaled feature matrix; the column count is the number
# of candidate features available for ranking.
X_scaled.shape

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the training split in order to rank the features.
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X_train, y_train)

importances = rf.feature_importances_
feature_names = X_train.columns

# Feature positions ordered from most to least important.
sorted_indices = np.argsort(importances)[::-1]

top_k = 18  # number of highest-ranked features to keep
leading = sorted_indices[:top_k]
top_features = list(feature_names[leading])

# Importances of those features, expressed as percentages.
top_importances = importances[leading] * 100

In [None]:
# Names of the top_k features, ordered from most to least important.
top_features

In [None]:
# Share of the forest's total importance that the top_k features account for.
total_importance = importances.sum()

# top_importances is stored in percent, so scale it back before summing.
top_k_importance_sum = (top_importances / 100).sum()

coverage_percentage = top_k_importance_sum / total_importance * 100

print(f"Top {top_k} features cover: {coverage_percentage:.2f}% of total importance")

In [None]:
# Horizontal bar chart of the top_k feature importances (in percent).
# `hue` mirrors the y variable so the palette colours each bar without
# triggering seaborn's "palette without hue" deprecation warning, and
# legend=False because the y-axis labels already identify every bar.
importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=top_importances,
    y=top_features,
    hue=top_features,
    palette="viridis",
    legend=False,
    ax=importance_ax,
)
importance_ax.set_xlabel("Feature Importance (%)")
importance_ax.set_ylabel("Features")
importance_ax.set_title(f'Top {top_k} Important Features')
importance_ax.grid(axis="x", linestyle="--", alpha=0.7)
plt.show()

### Training KNN Model after feature importance

In [None]:
# Keep only the scaled columns whose names are listed in top_features.
X_fi = X_scaled.loc[:, top_features]
X_fi

In [None]:
# Re-split on the reduced feature set (same 80/20 stratified split and seed).
X_train, X_test, y_train, y_test = train_test_split(X_fi, y, test_size=0.2, random_state=42, stratify=y)

# Sweep k = 1..30 and record the test-set error rate (1 - accuracy) for each k.
# NOTE(review): k is selected on the same test set that is later used for the
# final report, which makes the reported accuracy optimistic.
k_values = range(1, 31)
error_rates = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    error_rates.append(1 - accuracy_score(y_test, y_pred))

# k_min_value is the lowest error rate; knn_n is its 0-based position in
# error_rates (first occurrence on ties) and is reused by the plotting cell.
k_min_value = min(error_rates)
knn_n = error_rates.index(k_min_value)

# Display the selected k itself rather than the list index, so the reader
# does not have to add 1 by hand.
k_values[knn_n]

In [None]:
# Error rate against k for the reduced feature set, with the lowest-error k
# marked by a large red dot.
fi_fig, fi_ax = plt.subplots(figsize=(12, 6))
fi_ax.plot(k_values, error_rates, color='grey', linestyle='dashed', marker='o', markersize=8)
fi_ax.scatter(k_values[knn_n], error_rates[knn_n], s=200, color='red')

fi_ax.set_xlabel('Number of Neighbors (k)')
fi_ax.set_ylabel('Error Rate')
fi_ax.set_title('Elbow Method to Find Optimal k')
fi_ax.set_xticks(np.arange(1, 31, step=2))
fi_ax.grid(True)
plt.show()

In [None]:
# Fit KNN with k=17 on the current training split and predict the held-out rows.
knn_17 = KNeighborsClassifier(n_neighbors=17).fit(X_train, y_train)
y_pred_17 = knn_17.predict(X_test)

# Score the predictions against the held-out labels.
knn_accuracy = accuracy_score(y_test, y_pred_17)
conf_matrix = confusion_matrix(y_test, y_pred_17)
class_report = classification_report(y_test, y_pred_17)

print("Accuracy Score for KNN with 17 neighbors:", round(knn_accuracy, 2))
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)

In [None]:
# Accuracy on the rows the model was fitted on versus the held-out rows;
# a large gap between the two points to overfitting.
train_accuracy = accuracy_score(y_train, knn_17.predict(X_train))
test_accuracy = accuracy_score(y_test, knn_17.predict(X_test))

print(f"Training Accuracy based on the KNN Model: {train_accuracy:.4f}")
print(f"Test Accuracy based on the KNN Model: {test_accuracy:.4f}")

### Improving the Model 

#### + Performing SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

# Class counts of the target before resampling, to see how imbalanced it is.
y.value_counts()

In [None]:
# Balance the target by synthesising extra minority-class samples.
# - random_state pins the synthetic samples so reruns give the same results.
# - The labels are read from df rather than from `y`, because this cell
#   rebinds `y`; resampling from the already-resampled `y` on a second run
#   would fail with an X/y length mismatch.
# NOTE(review): SMOTE runs before the train/test split, so synthetic points
# derived from test rows can end up in the training set; consider resampling
# the training rows only.
smote = SMOTE(sampling_strategy='minority', random_state=42)
X_smote, y = smote.fit_resample(X_scaled, df['DEFAULT_PAYMENT'])

In [None]:
# Class counts after SMOTE; both classes should now be the same size.
y.value_counts()

In [None]:
# (rows, columns) of the resampled feature matrix.
X_smote.shape

In [None]:
# Length of the resampled target; it should match the row count of X_smote.
y.shape

### Training the KNN model after SMOTE

In [None]:
# Display the resampled feature matrix (pandas truncates the preview of large frames).
X_smote

In [None]:
# Standardise the resampled features again and wrap the array back into a
# DataFrame so the column labels are preserved.
# NOTE(review): this refits the shared `scaler` object on X_smote, replacing
# whatever parameters it held from the earlier fit; confirm nothing later
# relies on the original fit.
X_smote_scaled = pd.DataFrame(scaler.fit_transform(X_smote), columns=X_smote.columns)

In [None]:
# 80/20 stratified split of the SMOTE-balanced, re-scaled data.
X_train, X_test, y_train, y_test = train_test_split(X_smote_scaled, y, test_size=0.2, random_state=42, stratify=y)

# Sweep k = 1..30 (the same range as the earlier sweeps and the 1..30 tick
# axis of the elbow plot) and record the test-set error rate for each k.
# NOTE(review): k is selected on the same test set that is later used for the
# final report, which makes the reported accuracy optimistic.
k_values = range(1, 31)
error_rates = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    error_rates.append(1 - accuracy_score(y_test, y_pred))

# k_min_value is the lowest error rate; knn_n is its 0-based position in
# error_rates (first occurrence on ties) and is reused by the plotting cell.
k_min_value = min(error_rates)
knn_n = error_rates.index(k_min_value)

# Display the selected k itself rather than the list index, so the reader
# does not have to add 1 by hand.
k_values[knn_n]

In [None]:
# Error rate against k for the SMOTE-balanced data, with the lowest-error k
# marked by a large red dot.
smote_fig, smote_ax = plt.subplots(figsize=(12, 6))
smote_ax.plot(k_values, error_rates, color='red', linestyle='dashed', marker='o', markersize=8)
smote_ax.scatter(k_values[knn_n], error_rates[knn_n], s=200, color='red')

smote_ax.set_xlabel('Number of Neighbors (k)')
smote_ax.set_ylabel('Error Rate')
smote_ax.set_title('Elbow Method to Find Optimal k')
smote_ax.set_xticks(np.arange(1, 31, step=2))
smote_ax.grid(True)
plt.show()

In [None]:
# Fit KNN with k=17 on the SMOTE-balanced training split and predict the
# held-out rows. The names knn_1 / y_pred_1 are kept unchanged so that any
# cell referring to them keeps working, even though the model uses 17 neighbours.
knn_1 = KNeighborsClassifier(n_neighbors=17)
knn_1.fit(X_train, y_train)
y_pred_1 = knn_1.predict(X_test)

# Score the predictions against the held-out labels.
conf_matrix = confusion_matrix(y_test, y_pred_1)
class_report = classification_report(y_test, y_pred_1)
knn_accuracy = accuracy_score(y_test, y_pred_1)

# Report the neighbour count actually configured on the model; the previous
# hard-coded label said "1 neighbors" while the model was fitted with 17.
print(f"Accuracy Score for KNN with {knn_1.n_neighbors} neighbors:", round(knn_accuracy, 2))

print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)