In [1]:
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd 

In [4]:
data = pd.read_csv('credit_risk_dataset.csv/credit_risk_dataset.csv')
data

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.10,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0,0.11,N,30
32577,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0,0.15,N,19
32578,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,1,0.46,N,28
32579,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0,0.10,N,26


In [6]:
data['person_emp_length'] = data['person_emp_length'].fillna(data['person_emp_length'].median())
data['loan_int_rate'] = data['loan_int_rate'].fillna(data['loan_int_rate'].median())


In [7]:
categorical_features = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']
numerical_features = ['person_age', 'person_income', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length']


In [12]:
X = data[categorical_features + numerical_features]
y = data['loan_status']
y

0        1
1        0
2        1
3        1
4        1
        ..
32576    0
32577    0
32578    1
32579    0
32580    0
Name: loan_status, Length: 32581, dtype: int64

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# Step 3: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Build the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

# Train the model
pipeline.fit(X_train, y_train)

# Step 5: Evaluate the model
y_pred = pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


# Step 6: Predict loan_status
# Example: Predict on the first 5 rows of the test set
predictions = pipeline.predict(X_test[:5])
print("Predicted loan_status for first 5 test samples:", predictions)



Accuracy: 0.8612858677305508
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.95      0.91      5072
           1       0.76      0.54      0.64      1445

    accuracy                           0.86      6517
   macro avg       0.82      0.75      0.77      6517
weighted avg       0.85      0.86      0.85      6517

Predicted loan_status for first 5 test samples: [0 0 0 1 1]


In [23]:
# Add more unseen data
more_unseen_data = pd.DataFrame({
    'person_home_ownership': ['RENT', 'OWN', 'MORTGAGE', 'RENT', 'OWN', 'MORTGAGE', 'RENT', 'OWN', 'MORTGAGE', 'RENT', 'OWN', 'MORTGAGE', 'RENT', 'OWN', 'MORTGAGE', 'RENT', 'OWN', 'MORTGAGE', 'RENT', 'OWN'],
    'loan_intent': ['PERSONAL', 'EDUCATION', 'MEDICAL', 'VENTURE', 'HOMEIMPROVEMENT', 'DEBTCONSOLIDATION', 'PERSONAL', 'EDUCATION', 'MEDICAL', 'VENTURE', 'HOMEIMPROVEMENT', 'DEBTCONSOLIDATION', 'PERSONAL', 'EDUCATION', 'MEDICAL', 'VENTURE', 'HOMEIMPROVEMENT', 'DEBTCONSOLIDATION', 'PERSONAL', 'EDUCATION'],
    'loan_grade': ['B', 'A', 'C', 'D', 'E', 'A', 'B', 'A', 'C', 'D', 'E', 'A', 'B', 'A', 'C', 'D', 'E', 'A', 'B', 'A'],
    'cb_person_default_on_file': ['N', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'Y'],
    'person_age': [30, 22, 45, 35, 50, 27, 40, 29, 33, 31, 28, 26, 32, 24, 36, 38, 34, 25, 37, 23],
    'person_income': [50000, 60000, 70000, 80000, 90000, 100000, 110000, 120000, 130000, 140000, 150000, 160000, 170000, 180000, 190000, 200000, 210000, 220000, 230000, 240000],
    'loan_amnt': [10000, 15000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, 55000, 60000, 65000, 70000, 75000, 80000, 85000, 90000, 95000, 100000, 105000],
    'loan_int_rate': [12.5, 10.0, 15.0, 13.5, 11.0, 9.5, 14.0, 12.0, 10.5, 8.0, 7.5, 16.0, 14.5, 13.0, 11.5, 10.0, 8.5, 7.0, 15.5, 13.0],
    'loan_percent_income': [0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0, 1.05, 1.1, 1.15],
    'cb_person_cred_hist_length': [5, 3, 10, 7, 12, 8, 9, 6, 11, 4, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]
})


# Predict loan_status for the combined unseen data
combined_unseen_predictions = pipeline.predict(more_unseen_data)
print("Predicted loan_status for combined unseen data:", combined_unseen_predictions)

Predicted loan_status for combined unseen data: [0 0 0 1 0 0 0 0 0 1 1 1 1 0 1 1 1 1 1 0]


# What we predict ?

we predict the loan_status 
- **0 :** failed 
- **1 :** delivered