<a href="https://colab.research.google.com/github/DeepthiVReddy/Credit-Risk-modeling/blob/main/Modeling_and_Evaluating_Credit_Risk_using_Logistic_Regression_on_Loan_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import statsmodels.formula.api as smf

# Load the raw loan-level data; later cells add their derived columns to `data`.
data = pd.read_csv("CreditCardLoan.csv")

# Response variable: Noncurrent = 0 when Status == "Current", 1 for any other status.
# A missing Status also compares unequal to "Current", so it is coded 1 (same as before).
data['Noncurrent'] = data['Status'].ne("Current").astype("int64")

# Level order for CreditGrade, with "AA" first.
grade_levels = ["AA", "A", "B", "C", "HR", "NC"]

# pd.Categorical silently converts any value that is not in `categories` to NaN.
# Fail loudly instead, so an unexpected grade cannot quietly disappear from the data.
unexpected_grades = set(data['CreditGrade'].dropna().unique()) - set(grade_levels)
if unexpected_grades:
    raise ValueError(
        "CreditGrade contains levels missing from grade_levels: "
        f"{sorted(map(str, unexpected_grades))}"
    )

# Reorder the levels of CreditGrade to make "AA" the first level
data['CreditGrade'] = pd.Categorical(data['CreditGrade'], categories=grade_levels, ordered=True)

# Estimate a logistic regression for the event Noncurrent == 1 with BorrowerRate
# as the only independent variable.
logit_model = smf.logit("Noncurrent ~ BorrowerRate", data=data).fit()

# Output the summary of the model
print(logit_model.summary())


Optimization terminated successfully.
         Current function value: 0.245468
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:             Noncurrent   No. Observations:                 5500
Model:                          Logit   Df Residuals:                     5498
Method:                           MLE   Df Model:                            1
Date:                Tue, 12 Nov 2024   Pseudo R-squ.:                 0.09170
Time:                        20:31:33   Log-Likelihood:                -1350.1
converged:                       True   LL-Null:                       -1486.4
Covariance Type:            nonrobust   LLR p-value:                 3.078e-61
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       -5.3785      0.217    -24.749      0.000      -5.804      -4.953
BorrowerRate    13.2321

In [None]:
import numpy as np

# Calculate the predicted probability of Noncurrent=1 directly from the fitted
# coefficients:  p = 1 / (1 + exp(-(b0 + b1 * BorrowerRate))).
# Doing this by hand (instead of calling logit_model.predict) is what makes the
# comparison against the model's own predictions in the next cell a real check.
intercept = logit_model.params['Intercept']
borrower_rate_coef = logit_model.params['BorrowerRate']
linear_predictor = intercept + borrower_rate_coef * data['BorrowerRate']
data['ProbNoncurrent'] = 1 / (1 + np.exp(-linear_predictor))

# Log likelihood of each observation: log(p) when Noncurrent == 1, log(1 - p) otherwise.
# np.where evaluates both branches on whole columns, avoiding a Python-level loop over rows.
data['LogLikelihood'] = np.where(
    data['Noncurrent'] == 1,
    np.log(data['ProbNoncurrent']),
    np.log(1 - data['ProbNoncurrent']),
)

# Sum of the per-observation log likelihoods; this should agree with the
# "Log-Likelihood" figure reported in the model summary.
log_likelihood_sum = data['LogLikelihood'].sum()

# Display results
print("Sum of LogLikelihood:", log_likelihood_sum)


Sum of LogLikelihood: -1350.0750160502207


In [None]:
# Let the fitted model produce its own probability of Noncurrent=1 and keep it
# next to the earlier ProbNoncurrent column.
data['pred_noncurrent1'] = logit_model.predict(data)

# Side-by-side view of the two probability columns
comparison = data.loc[:, ['ProbNoncurrent', 'pred_noncurrent1']]

# Show the leading rows of the comparison
print(comparison.head())

# Row-wise gap between the two columns; it is stored on `data` as 'difference'
# and then summarised by its extremes and its mean.
data['difference'] = comparison['ProbNoncurrent'].sub(comparison['pred_noncurrent1'])
gap = data['difference']
for label, value in (("Max", gap.max()), ("Min", gap.min()), ("Mean", gap.mean())):
    print(f"{label} Difference:", value)


   ProbNoncurrent  pred_noncurrent1
0        0.032494          0.032494
1        0.133312          0.133312
2        0.032494          0.032494
3        0.176363          0.176363
4        0.013041          0.013041
Max Difference: 0.0
Min Difference: 0.0
Mean Difference: 0.0


In [None]:
# Second specification: Noncurrent explained by CreditGrade (entered as a categorical
# term), Amount, Age and BorrowerRate.
formula_2 = "Noncurrent ~ C(CreditGrade) + Amount + Age + BorrowerRate"
logit_model_2 = smf.logit(formula=formula_2, data=data).fit()

# Print the fitted model's summary table
model_2_summary = logit_model_2.summary()
print(model_2_summary)


Optimization terminated successfully.
         Current function value: 0.213318
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:             Noncurrent   No. Observations:                 5500
Model:                          Logit   Df Residuals:                     5491
Method:                           MLE   Df Model:                            8
Date:                Tue, 12 Nov 2024   Pseudo R-squ.:                  0.2107
Time:                        20:54:28   Log-Likelihood:                -1173.3
converged:                       True   LL-Null:                       -1486.4
Covariance Type:            nonrobust   LLR p-value:                5.313e-130
                           coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------
Intercept               -8.2407      0.525    -15.694      0.000      -9.270      -7.212

In [None]:
from sklearn.metrics import confusion_matrix

# Predict the probability of Noncurrent=1 using the second model and save it as pred_noncurrent
data['pred_noncurrent'] = logit_model_2.predict(data)

# Classify as Noncurrent (1) when the predicted probability reaches the cutoff, else 0.
# A missing probability compares False and is therefore classified 0.
cutoff_point3 = 0.3
data['pred_noncurrent_point3'] = (data['pred_noncurrent'] >= cutoff_point3).astype("int64")

# Confusion matrix: rows are actual classes, columns are predicted classes.
# labels=[0, 1] pins the matrix to 2x2 even if one class never occurs at this cutoff,
# so the four-way unpacking below cannot fail.
conf_matrix = confusion_matrix(
    data['Noncurrent'],
    data['pred_noncurrent_point3'],
    labels=[0, 1],
)
tn, fp, fn, tp = conf_matrix.ravel()  # true negatives, false positives, false negatives, true positives

# Calculate sensitivity and specificity
sensitivity = tp / (tp + fn)  # Sensitivity (Recall) = TP / (TP + FN)
specificity = tn / (tn + fp)  # Specificity = TN / (TN + FP)

# Display results
print("Confusion Matrix:")
print(conf_matrix)
print(f"Sensitivity: {sensitivity:.4f}")
print(f"Specificity: {specificity:.4f}")


Confusion Matrix:
[[4964  115]
 [ 338   83]]
Sensitivity: 0.1971
Specificity: 0.9774


In [None]:
# Re-classify the same predicted probabilities with a lower cutoff: 1 when
# pred_noncurrent reaches the cutoff, else 0. A missing probability compares
# False and is therefore classified 0.
cutoff_point2 = 0.2
data['pred_noncurrent_point2'] = (data['pred_noncurrent'] >= cutoff_point2).astype("int64")

# Confusion matrix: rows are actual classes, columns are predicted classes.
# labels=[0, 1] pins the matrix to 2x2 even if one class never occurs at this cutoff,
# so the four-way unpacking below cannot fail.
conf_matrix_point2 = confusion_matrix(
    data['Noncurrent'],
    data['pred_noncurrent_point2'],
    labels=[0, 1],
)
tn, fp, fn, tp = conf_matrix_point2.ravel()  # true negatives, false positives, false negatives, true positives

# Calculate sensitivity and specificity
sensitivity_point2 = tp / (tp + fn)  # Sensitivity (Recall) = TP / (TP + FN)
specificity_point2 = tn / (tn + fp)  # Specificity = TN / (TN + FP)

# Display results
print("Confusion Matrix (Cutoff = 0.2):")
print(conf_matrix_point2)
print(f"Sensitivity (Cutoff = 0.2): {sensitivity_point2:.4f}")
print(f"Specificity (Cutoff = 0.2): {specificity_point2:.4f}")


Confusion Matrix (Cutoff = 0.2):
[[4722  357]
 [ 255  166]]
Sensitivity (Cutoff = 0.2): 0.3943
Specificity (Cutoff = 0.2): 0.9297
