In [1]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix


# Load the dataset


In [3]:
df = pd.read_csv('credit_risk_dataset.csv')
df.head()


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


# One-hot encode categorical variables


In [4]:

df = pd.get_dummies(df, columns=['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file'], drop_first=True)



# Split the data into training and testing sets


In [5]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Define the formula

In [9]:
formula = 'loan_status ~ person_age + person_income + person_emp_length + loan_amnt + loan_int_rate + loan_percent_income + cb_person_cred_hist_length + ' + ' + '.join([col for col in df.columns if 'person_home_ownership_' in col or 'loan_intent_' in col or 'loan_grade_' in col or 'cb_person_default_on_file_' in col])
model = smf.glm(formula=formula, data=train_df, family=sm.families.Binomial()).fit()


# Predict probabilities for the test set


In [10]:
test_df['pred_prob'] = model.predict(test_df)

# Convert probabilities to binary predictions (threshold = 0.5)
test_df['pred_loan_status'] = test_df['pred_prob'].apply(lambda x: 1 if x > 0.5 else 0)



# Evaluation metrics

In [11]:
# Calculate accuracy
accuracy = accuracy_score(test_df['loan_status'], test_df['pred_loan_status'])
print("Accuracy:", accuracy)

# Print confusion matrix
conf_matrix = confusion_matrix(test_df['loan_status'], test_df['pred_loan_status'])
print("Confusion Matrix:\n", conf_matrix)


Accuracy: 0.8525395120454197
Confusion Matrix:
 [[4859  213]
 [ 748  697]]


# Print the model summary


In [12]:
print(model.summary())


                 Generalized Linear Model Regression Results                  
Dep. Variable:            loan_status   No. Observations:                22920
Model:                            GLM   Df Residuals:                    22897
Model Family:                Binomial   Df Model:                           22
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -7655.6
Date:                Thu, 25 May 2023   Deviance:                       15311.
Time:                        10:22:40   Pearson chi2:                 2.55e+04
No. Iterations:                     7   Pseudo R-squ. (CS):             0.3127
Covariance Type:            nonrobust                                         
                                  coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
Intercept         

* The model is fitted on 22920 observations and 22 predictors.
'loan_percent_income' has the highest positive coefficient, indicating a strong positive relationship with loan defaulting.

* 'person_home_ownership_OWN' has a significant negative effect, indicating people who own homes are less likely to default.

* The higher the loan grade (from B to G), the higher the chance of loan defaulting.

* 'loan_intent_VENTURE' and 'loan_intent_EDUCATION' have significant negative coefficients, suggesting loans taken for these purposes are less likely to default.

* Higher interest rates (loan_int_rate) slightly increase the likelihood of loan defaulting.

* Older age (person_age) and longer employment length (person_emp_length) slightly reduce the chance of defaulting, but their effects are not statistically significant at a 5% significance level.

* The likelihood of defaulting is not significantly affected by the length of credit history (cb_person_cred_hist_length) or a history of default (cb_person_default_on_file_Y).

* The model's Pseudo R-square is 0.3127, which suggests about 31% of the variability in loan defaulting is explained by the predictors in this model.

* The standard errors and z-scores indicate that most of the variables are statistically significant in predicting loan defaulting, but some variables (like cb_person_cred_hist_length and cb_person_default_on_file_Y) are not.