In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub

# Ask kagglehub for the "laotse/credit-risk-dataset" dataset and keep the value
# it returns so later cells can refer to the download.
laotse_credit_risk_dataset_path = kagglehub.dataset_download(
    "laotse/credit-risk-dataset"
)

print("Data source import complete.")


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found anywhere under the input directory.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Location of the dataset CSV inside the Kaggle input directory.
credit_risk_csv = '/kaggle/input/credit-risk-dataset/credit_risk_dataset.csv'
df = pd.read_csv(credit_risk_csv)

This dataset contains columns simulating credit bureau data. Below is a table with the feature name and explanation.

![{E94256FA-43B3-49FE-97F4-9AA017CBAADB}.png](attachment:c469bab5-65eb-401f-809b-d239f4a7300b.png)

The target variable is `loan_status`. Consequently, our task reduces to a binary classification problem.

In [None]:
# Preview the first rows to sanity-check that the CSV loaded as expected.
df.head()

In [None]:
# Size of the loaded frame as (rows, columns).
df.shape

In [None]:
# Column dtypes and non-null counts; a quick way to spot columns with missing values.
df.info()

In [None]:
# Summary statistics of the frame, useful for spotting implausible ranges.
df.describe()

> Let us view the values of every categorical field, since we will need to either remove or label-encode them later.


In [None]:
# Show how often each category occurs in every string-typed (object) column.
categorical_columns = df.select_dtypes(include=['object']).columns
for column_name in categorical_columns:
    category_counts = df[column_name].value_counts()
    print(category_counts, '\n')

# **Visualization**

In [None]:
# Class balance of the target: print the raw counts, then draw them as bars.
status_counts = df['loan_status'].value_counts()
print(status_counts)
sns.countplot(data=df, x='loan_status')


In [None]:
# Count loans per (person_age, loan_status) pair.
# The counts column is named explicitly so the "count" lookups below work
# regardless of what name value_counts() gives its result in the installed
# pandas version.
# NOTE: despite its name, this frame relates age to loan status, not to income.
person_age_income = (
    df.groupby(["person_age"])["loan_status"]
    .value_counts()
    .reset_index(name="count")
)

# Total number of loans for each age, aligned row-by-row with the frame above.
total_counts = person_age_income.groupby(["person_age"])["count"].transform("sum")

# Share of each loan status within its age group, in percent.
person_age_income["percentage"] = (person_age_income["count"] / total_counts) * 100

print(person_age_income)

In [None]:
# Compare loan status with the person's age.
# person_age_income holds one row per (age, status) pair with a per-age
# percentage, so every age needs its own bar. discrete=True gives each integer
# age a bin of its own; with automatic binning several ages share a bin and
# their percentages are summed, so the bars stop being percentages.
sns.histplot(
    data=person_age_income,
    x="person_age",
    weights="percentage",
    hue="loan_status",
    multiple="stack",
    palette="coolwarm",
    discrete=True,
)

plt.title("Person Age vs. Loan Status")
plt.xlabel("Person age")
plt.ylabel("Percentage")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

The person's age has a few outliers that are already visible in the plot; we will examine them properly at a later stage.

In [None]:
# Loan intent: number of loans per (loan_intent, loan_status) pair.
# The counts column is named explicitly because the plot below reads it as
# weights="count"; relying on the default name only works on pandas versions
# where value_counts() already calls its result "count".
intent_status_group = (
    df.groupby(["loan_intent"])["loan_status"]
    .value_counts()
    .reset_index(name="count")
)
print(intent_status_group)

# Stacked bars: one bar per intent, split by loan status.
sns.histplot(
    data=intent_status_group,
    x="loan_intent",
    weights="count",
    hue="loan_status",
    multiple="stack",
    palette="coolwarm",
)

plt.title("Loan Intent vs. Loan Status")
plt.xlabel("Loan Intent")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Loan grade: number of loans per (loan_grade, loan_status) pair.
# The counts column is named explicitly so the "count" lookups below do not
# depend on the name value_counts() gives its result in the installed pandas.
grade_status_group = (
    df.groupby(["loan_grade"])["loan_status"]
    .value_counts()
    .reset_index(name="count")
)

# Total loans per grade, aligned row-by-row with grade_status_group.
total_counts = grade_status_group.groupby("loan_grade")["count"].transform("sum")

# Share of each loan status within its grade, in percent.
grade_status_group["percentage"] = grade_status_group["count"] * 100 / total_counts
print(grade_status_group)

# Stacked bars of absolute counts: one bar per grade, split by loan status.
sns.histplot(
    data=grade_status_group,
    x="loan_grade",
    weights="count",
    hue="loan_status",
    multiple="stack",
    palette="coolwarm",
)

plt.title("Loan Grade vs. Loan Status")
plt.xlabel("Loan Grade")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Compare the percentage of each loan status within every loan grade.
sns.histplot(
    data=grade_status_group,
    x="loan_grade",
    weights="percentage",
    hue="loan_status",
    multiple="stack",
    palette="coolwarm",
)

plt.title("Loan Grade vs. Loan Status on percentage")
# The x-axis shows loan_grade, so label it as such (it previously read "Loan Intent").
plt.xlabel("Loan Grade")
plt.ylabel("Percentage")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

We observe that as the loan grade worsens (moving from A towards G), the percentage of loans whose status is default increases.

In [None]:
# Let's understand the major purposes of the loans, split by loan status.
plt.figure(figsize=(15, 9))
sns.countplot(data=df, x='loan_intent', hue='loan_status')
# The plot groups by loan_intent, so the title names it (it previously said
# "home ownership", which is not what is drawn).
plt.title("Relationship between loan intent and loan status")
plt.show()

# Data Preprocessing

In [None]:
# Read the CSV again so that preprocessing starts from the file contents
# rather than from whatever state `df` is currently in.
raw_csv_path = '/kaggle/input/credit-risk-dataset/credit_risk_dataset.csv'
df = pd.read_csv(raw_csv_path)

In [None]:
# Number of missing values in each column.
missing_per_column = df.isna().sum()
print(missing_per_column)


In [None]:
# Replace missing values in these two columns with the column mean.
# NOTE(review): the mean is taken over the whole dataset, i.e. before the
# train/test split made later — confirm that is acceptable.
for column in ('person_emp_length', 'loan_int_rate'):
    column_mean = df[column].mean()
    df[column] = df[column].fillna(column_mean)

In [None]:
# Count fully duplicated rows (rows repeating an earlier row in every column).
df.duplicated().sum()

In [None]:
# Remove duplicated rows, keeping the default first occurrence; rebinding `df`
# instead of mutating in place keeps the cell safe to re-run.
df = df.drop_duplicates()

In [None]:
# Drop records with person_age above 100: these are extreme cases or, more
# likely, data-entry errors.
implausible_age = df['person_age'] > 100
df = df.drop(index=df.index[implausible_age])

In [None]:
# Label-encode every categorical (object-dtype) column in place and remember,
# per column, which integer code each original category was given.
# NOTE(review): integer codes imply an ordering that nominal columns may not
# really have — confirm this is acceptable for the models trained later.
from sklearn import preprocessing

levar = {}  # column name -> {category: integer code}
for col in df.select_dtypes(include=['object']).columns:
    LE = preprocessing.LabelEncoder()  # fresh encoder for each column
    df[col] = LE.fit_transform(df[col])
    # classes_[i] is encoded as i, so enumerate rebuilds the mapping with plain
    # Python ints, which print more cleanly than numpy scalars.
    levar[col] = {label: code for code, label in enumerate(LE.classes_)}

for col, mapping in levar.items():
    print(f"Column: {col}")
    print(mapping, '\n')

In [None]:
# Pairwise correlation between all columns, drawn as a heatmap.
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix)

# Model Training

In [None]:
# Separate the target column from the feature columns.
target_column = 'loan_status'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# ratio the same in both parts; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [None]:
# Standardize the features with StandardScaler (each column is centred and
# scaled); note this is not a rescale to the 0-1 range.
scaler = preprocessing.StandardScaler()



# Fit on the training split only, then apply the same fitted scaler to the test split.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn import metrics
def print_metrics(model, y_pred, y_prob, y_true=None):
    """Report how well ``model``'s predictions match the true labels.

    Prints the classification report, the accuracy and the ROC AUC, then
    draws the confusion matrix as an annotated heatmap.

    Parameters
    ----------
    model : fitted classifier
        Only ``model.classes_`` is read, to order and label the
        confusion-matrix axes.
    y_pred : array-like
        Predicted class labels.
    y_prob : array-like
        Scores used for the ROC AUC, e.g. ``model.predict_proba(X)[:, 1]``.
        Do not pass the true labels here.
    y_true : array-like, optional
        Ground-truth labels. Defaults to the notebook-level ``y_test``.

    Returns
    -------
    None
    """
    if y_true is None:
        y_true = y_test

    accuracy = accuracy_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_prob)
    report = classification_report(y_true, y_pred)

    print(report)
    print(f'ACCURACY OF THE MODEL: {accuracy:.2f}')
    # The AUC was computed before but never shown; report it with the rest.
    print(f'ROC AUC OF THE MODEL: {auc:.2f}')

    # Pin the label order so matrix rows and columns match the tick labels below.
    cm = confusion_matrix(y_true, y_pred, labels=model.classes_)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=model.classes_, yticklabels=model.classes_)

    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.show()

In [None]:
# Random Forest classifier
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
# ROC AUC needs model scores. The true labels were passed here before, which
# scores the labels against themselves and says nothing about the model.
y_prob = clf.predict_proba(X_test)[:, 1]

print_metrics(clf, y_pred, y_prob)

In [None]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Seed the tree with the same value as the random forest so that repeated runs
# of the notebook give the same metrics.
decision_tree = DecisionTreeClassifier(random_state=42)

decision_tree.fit(X_train, y_train)

y_pred = decision_tree.predict(X_test)
# Predicted probability of the class at index 1 of classes_, used for ROC AUC.
y_prob = decision_tree.predict_proba(X_test)[:, 1]

print_metrics(decision_tree, y_pred, y_prob)

In [None]:
# K-nearest-neighbours classifier with k = 6
from sklearn.neighbors import KNeighborsClassifier

# fit() returns the estimator itself, so construction and training can be chained.
KNC = KNeighborsClassifier(n_neighbors=6).fit(X_train, y_train)

# Hard class predictions, plus the probability of the class at index 1 of classes_.
y_pred = KNC.predict(X_test)
y_prob = KNC.predict_proba(X_test)[:, 1]

print_metrics(model=KNC, y_pred=y_pred, y_prob=y_prob)