In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix

In [10]:
data=pd.read_csv(r"C:\Users\Dell\Desktop\credit_risk_dataset.csv")

In [11]:
data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


# Basic Data Information

In [12]:
data.shape

(32581, 12)

In [13]:
data.columns

Index(['person_age', 'person_income', 'person_home_ownership',
       'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt',
       'loan_int_rate', 'loan_status', 'loan_percent_income',
       'cb_person_default_on_file', 'cb_person_cred_hist_length'],
      dtype='object')

In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  32581 non-null  int64  
 1   person_income               32581 non-null  int64  
 2   person_home_ownership       32581 non-null  object 
 3   person_emp_length           31686 non-null  float64
 4   loan_intent                 32581 non-null  object 
 5   loan_grade                  32581 non-null  object 
 6   loan_amnt                   32581 non-null  int64  
 7   loan_int_rate               29465 non-null  float64
 8   loan_status                 32581 non-null  int64  
 9   loan_percent_income         32581 non-null  float64
 10  cb_person_default_on_file   32581 non-null  object 
 11  cb_person_cred_hist_length  32581 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 3.0+ MB


In [15]:
data.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length
count,32581.0,32581.0,31686.0,32581.0,29465.0,32581.0,32581.0,32581.0
mean,27.7346,66074.85,4.789686,9589.371106,11.011695,0.218164,0.170203,5.804211
std,6.348078,61983.12,4.14263,6322.086646,3.240459,0.413006,0.106782,4.055001
min,20.0,4000.0,0.0,500.0,5.42,0.0,0.0,2.0
25%,23.0,38500.0,2.0,5000.0,7.9,0.0,0.09,3.0
50%,26.0,55000.0,4.0,8000.0,10.99,0.0,0.15,4.0
75%,30.0,79200.0,7.0,12200.0,13.47,0.0,0.23,8.0
max,144.0,6000000.0,123.0,35000.0,23.22,1.0,0.83,30.0


In [16]:
data.isnull().sum()

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              895
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3116
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

In [17]:
emp_median = data['person_emp_length'].median()
data['person_emp_length'] = data['person_emp_length'].fillna(emp_median)
print("Filled missing emp_length with median:", emp_median)

Filled missing emp_length with median: 4.0


In [18]:
int_median = data['loan_int_rate'].median()
data['loan_int_rate'] = data['loan_int_rate'].fillna(int_median)
print("Filled missing loan_int_rate with median:", int_median)

Filled missing loan_int_rate with median: 10.99


In [19]:
data.isnull().sum()

person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_status                   0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64

# Cleaned Sucessfully

# the target (loan_status)

In [21]:
print("Counts (0 = good, 1 = default):")
display(data['loan_status'].value_counts())
print("\nPercentages:")
display(data['loan_status'].value_counts(normalize=True).round(3))

Counts (0 = good, 1 = default):


loan_status
0    25473
1     7108
Name: count, dtype: int64


Percentages:


loan_status
0    0.782
1    0.218
Name: proportion, dtype: float64

In [22]:
# Force numeric where expected
for c in ['person_age','person_income','person_emp_length','loan_amnt','loan_int_rate',
          'loan_percent_income','cb_person_cred_hist_length','loan_status']:
    if c in data.columns:
        data[c] = pd.to_numeric(data[c], errors='coerce')

# Make string columns tidy
for c in ['person_home_ownership','loan_intent','loan_grade','cb_person_default_on_file']:
    if c in data.columns:
        data[c] = data[c].astype(str).str.strip().replace('nan','MISSING')
        
# Map Y/N to 1/0 for default-in-file
if 'cb_person_default_on_file' in data.columns:
    data['cb_person_default_on_file'] = data['cb_person_default_on_file'].map({'Y':1,'N':0}).fillna(0).astype(int)

print("Done cleaning types.")

Done cleaning types.


# DTI (Debt-to-Income)

In [23]:
# Use the existing loan_percent_income if division fails
data['dti'] = data['loan_amnt'] / data['person_income'].replace(0, np.nan)
data['dti'] = data['dti'].fillna(data['loan_percent_income'])
print("Sample dti values:")
display(data[['loan_amnt','person_income','loan_percent_income','dti']].head())

Sample dti values:


Unnamed: 0,loan_amnt,person_income,loan_percent_income,dti
0,35000,59000,0.59,0.59322
1,1000,9600,0.1,0.104167
2,5500,9600,0.57,0.572917
3,35000,65500,0.53,0.534351
4,35000,54400,0.55,0.643382


In [24]:
data['age_group'] = pd.cut(data['person_age'],
                           bins=[0,25,35,50,120],
                           labels=['<25','25-34','35-49','50+'])
display(data[['person_age','age_group']].head())

Unnamed: 0,person_age,age_group
0,22,<25
1,21,<25
2,25,<25
3,23,<25
4,24,<25


# features and turn categories into numbers

In [25]:
features = [
    'person_age','person_income','person_emp_length','loan_amnt','loan_int_rate',
    'loan_percent_income','cb_person_cred_hist_length','dti',
    'person_home_ownership','loan_intent','loan_grade','age_group','cb_person_default_on_file'
]

# Keep only existing features (safe)
features = [f for f in features if f in data.columns]

X = pd.get_dummies(data[features], drop_first=True)  # turn categories into numbers
y = data['loan_status']

print("Feature matrix shape:", X.shape)
display(X.head().T[:20])  # show first 20 feature rows (transpose for readability)

Feature matrix shape: (32581, 26)


Unnamed: 0,0,1,2,3,4
person_age,22,21,25,23,24
person_income,59000,9600,9600,65500,54400
person_emp_length,123.0,5.0,1.0,4.0,8.0
loan_amnt,35000,1000,5500,35000,35000
loan_int_rate,16.02,11.14,12.87,15.23,14.27
loan_percent_income,0.59,0.1,0.57,0.53,0.55
cb_person_cred_hist_length,3,2,3,2,4
dti,0.59322,0.104167,0.572917,0.534351,0.643382
cb_person_default_on_file,1,0,0,0,1
person_home_ownership_OTHER,False,False,False,False,False


# Split into train and test

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.20,
                                                    stratify=y,
                                                    random_state=42)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
print("Train default rate:", y_train.mean(), "Test default rate:", y_test.mean())

Train shape: (26064, 26) Test shape: (6517, 26)
Train default rate: 0.21815531000613875 Test default rate: 0.21819855761853613


# Logistic Regression

In [27]:
model = LogisticRegression(max_iter=1000, class_weight='balanced', solver='liblinear', random_state=42)
model.fit(X_train, y_train)
print("Model trained.")

Model trained.


# Get predictions and evaluate (AUC + simple accuracy)

In [28]:
probs = model.predict_proba(X_test)[:,1]   # probability of default
preds = (probs >= 0.5).astype(int)        # predicted labels with cutoff 0.5

print("AUC (bigger is better, 0.5 = random):", round(roc_auc_score(y_test, probs), 4))
print("Accuracy at 0.5 cutoff:", round(accuracy_score(y_test, preds), 4))
print("Confusion matrix (rows=true, cols=pred):\n", confusion_matrix(y_test, preds))

AUC (bigger is better, 0.5 = random): 0.8158
Accuracy at 0.5 cutoff: 0.7149
Confusion matrix (rows=true, cols=pred):
 [[3565 1530]
 [ 328 1094]]


#  Model Evaluation — Credit Risk Logistic Regression
#  Metrics Observed

AUC (Area Under Curve): 0.8158

AUC ranges from 0.5 (random guessing) to 1.0 (perfect model).

Our score of 0.82 means the model is quite good at ranking risky borrowers higher than safe ones.

Accuracy (at 0.5 cutoff): 71.5%

About 7 out of 10 predictions are correct.

But accuracy can be misleading in imbalanced datasets (if defaults are rare, the model could get high accuracy by always predicting "no default").

# Confusion Matrix

# Breakdown:

True Negatives (3565): Safe borrowers correctly predicted as safe.

False Positives (1530): Safe borrowers wrongly flagged as default → over-warning.

False Negatives (328): Default borrowers wrongly predicted as safe → risky misses.

True Positives (1094): Default borrowers correctly predicted as default.