# Loan risk analysis

## Imports and data loading

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

df = pd.read_csv('data/credit_analysis.csv')
df = df.dropna()  # Drop missing values

## Preprocessing

In [2]:
num_cols = df.select_dtypes(include=np.number).columns.tolist()

# Categorical columns
cat_cols = df.select_dtypes(exclude=np.number).columns.tolist()

# Numerical columns
num_cols = df.select_dtypes(include=np.number).columns.tolist()
    
cat_counts = df.value_counts()
threshold = 5  # Set threshold for noise removal

# Replace rare values with 'Other'
rare_values = cat_counts[cat_counts < threshold].index
df = df.apply(lambda x: x.mask(x.isin(rare_values), "Other"))
df.sample(5)

# Remove numerical noise

# Calculate the z-score for each numerical column
z_scores = (df[num_cols] - df[num_cols].mean()) / df[num_cols].std()

# Find the absolute z-score for each value
z_scores = z_scores.abs()

# Remove rows with z-scores greater than 3
df = df[(z_scores < 3).all(axis=1)]

## Model

In [3]:
x = df.drop("Risk Level", axis=1)  # Drop the target column
y = df["Risk Level"]  # Get the target column

# Hot encode categorical columns
label_encoder = LabelEncoder()
x_encoded = x.copy()
for column in x.columns:
    x_encoded[column] = label_encoder.fit_transform(x[column])

x_train, x_test, y_train, y_test = train_test_split(x_encoded, y, test_size=0.2, random_state=42)

rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(x_train, y_train)

# Get feature importances
feature_importances = rf_classifier.feature_importances_

# Create a DataFrame with the feature importances
feature_importance_df = pd.DataFrame({'Feature': x_encoded.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)

                Feature  Importance
5            Debt Level    0.200679
4        Credit History    0.195807
0             Age Range    0.142258
6  Debt-to-Income Ratio    0.131231
2             Intention    0.103243
8                Income    0.082729
1     Employment Status    0.074014
3       Education Level    0.070041
7      Guarantor Status    0.000000


## Accuracy analysis

In [4]:
accuracy = rf_classifier.score(x_test, y_test)
print(f"Accuracy on test set: {accuracy:.4f}")

Accuracy on test set: 0.6721
