In [1]:
import numpy as np
import pandas as pd 
from matplotlib import pyplot as plt
import seaborn as sns
import os
import sklearn
from sklearn.model_selection import train_test_split
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/german-credit-risk/german_credit.csv


## Predicting Loan Default - When should credit be approved?

**Problem statement**:

The aim of this notebook is to create a simple program that can predict wether a credit should be approved or not, based on previous client data. The program should help the bank minimize risk with future clients.

The model used will predict wether a client should have their loan approved or not, based on his or her history, using logistic regression for classification.

**The data:**

The data consists of a csv file containing records of clients from a private german bank. Includes the client profile (account balance, number of credits,...) and a variable **Creditability** (1 : credit-worthy 0 : not credit-worthy).
A detailed description of the variables can be found [here](https://newonlinecourses.science.psu.edu/stat508/book/export/html/803).

In [2]:
# read and inspect dataset
data = pd.read_csv('../input/german-credit-risk/german_credit.csv')
data.head()

Unnamed: 0,Creditability,Account Balance,Duration of Credit (month),Payment Status of Previous Credit,Purpose,Credit Amount,Value Savings/Stocks,Length of current employment,Instalment per cent,Sex & Marital Status,...,Duration in Current address,Most valuable available asset,Age (years),Concurrent Credits,Type of apartment,No of Credits at this Bank,Occupation,No of dependents,Telephone,Foreign Worker
0,1,1,18,4,2,1049,1,2,4,2,...,4,2,21,3,1,1,3,1,1,1
1,1,1,9,4,0,2799,1,3,2,3,...,2,1,36,3,1,2,3,2,1,1
2,1,2,12,2,9,841,2,4,2,2,...,4,1,23,3,1,1,2,1,1,1
3,1,1,12,4,0,2122,1,3,3,3,...,2,1,39,3,1,2,2,2,1,2
4,1,1,12,4,0,2171,1,3,4,3,...,4,2,38,1,2,2,2,1,1,2


In [3]:
data.describe() # data summary

Unnamed: 0,Creditability,Account Balance,Duration of Credit (month),Payment Status of Previous Credit,Purpose,Credit Amount,Value Savings/Stocks,Length of current employment,Instalment per cent,Sex & Marital Status,...,Duration in Current address,Most valuable available asset,Age (years),Concurrent Credits,Type of apartment,No of Credits at this Bank,Occupation,No of dependents,Telephone,Foreign Worker
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.7,2.577,20.903,2.545,2.828,3271.248,2.105,3.384,2.973,2.682,...,2.845,2.358,35.542,2.675,1.928,1.407,2.904,1.155,1.404,1.037
std,0.458487,1.257638,12.058814,1.08312,2.744439,2822.75176,1.580023,1.208306,1.118715,0.70808,...,1.103718,1.050209,11.35267,0.705601,0.530186,0.577654,0.653614,0.362086,0.490943,0.188856
min,0.0,1.0,4.0,0.0,0.0,250.0,1.0,1.0,1.0,1.0,...,1.0,1.0,19.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,0.0,1.0,12.0,2.0,1.0,1365.5,1.0,3.0,2.0,2.0,...,2.0,1.0,27.0,3.0,2.0,1.0,3.0,1.0,1.0,1.0
50%,1.0,2.0,18.0,2.0,2.0,2319.5,1.0,3.0,3.0,3.0,...,3.0,2.0,33.0,3.0,2.0,1.0,3.0,1.0,1.0,1.0
75%,1.0,4.0,24.0,4.0,3.0,3972.25,3.0,5.0,4.0,3.0,...,4.0,3.0,42.0,3.0,2.0,2.0,3.0,1.0,2.0,1.0
max,1.0,4.0,72.0,4.0,10.0,18424.0,5.0,5.0,4.0,4.0,...,4.0,4.0,75.0,3.0,3.0,4.0,4.0,2.0,2.0,2.0


In [4]:
data.corr() # check for correlations with target variable

Unnamed: 0,Creditability,Account Balance,Duration of Credit (month),Payment Status of Previous Credit,Purpose,Credit Amount,Value Savings/Stocks,Length of current employment,Instalment per cent,Sex & Marital Status,...,Duration in Current address,Most valuable available asset,Age (years),Concurrent Credits,Type of apartment,No of Credits at this Bank,Occupation,No of dependents,Telephone,Foreign Worker
Creditability,1.0,0.350847,-0.214927,0.228785,-0.017979,-0.15474,0.178943,0.116002,-0.072404,0.088184,...,-0.002967,-0.142612,0.091272,0.109844,0.018119,0.045732,-0.032735,0.003015,0.036466,0.082079
Account Balance,0.350847,1.0,-0.072013,0.192191,0.028783,-0.042695,0.222867,0.106339,-0.00528,0.043261,...,-0.042234,-0.03226,0.058631,0.068274,0.023335,0.076005,0.040663,-0.014145,0.066296,-0.035187
Duration of Credit (month),-0.214927,-0.072013,1.0,-0.077186,0.147492,0.624988,0.047661,0.057381,0.074749,0.014789,...,0.034067,0.303971,-0.03755,-0.062884,0.153126,-0.011284,0.21091,-0.023834,0.164718,-0.13468
Payment Status of Previous Credit,0.228785,0.192191,-0.077186,1.0,-0.090336,-0.059915,0.039058,0.138225,0.044375,0.042171,...,0.063198,-0.053777,0.146337,0.159957,0.061428,0.437066,0.01035,0.01155,0.05237,0.028554
Purpose,-0.017979,0.028783,0.147492,-0.090336,1.0,0.06848,-0.018684,0.016013,0.048369,0.000157,...,-0.038221,0.010966,-0.000892,-0.10023,0.013495,0.054935,0.008085,-0.032577,0.078371,-0.113244
Credit Amount,-0.15474,-0.042695,0.624988,-0.059915,0.06848,1.0,0.064632,-0.008376,-0.271322,-0.016094,...,0.028917,0.311602,0.032273,-0.069392,0.133024,0.020785,0.285393,0.017144,0.277,-0.030662
Value Savings/Stocks,0.178943,0.222867,0.047661,0.039058,-0.018684,0.064632,1.0,0.12095,0.021993,0.017349,...,0.091424,0.018948,0.083434,0.001908,0.006644,-0.021644,0.011709,0.027514,0.087208,0.01045
Length of current employment,0.116002,0.106339,0.057381,0.138225,0.016013,-0.008376,0.12095,1.0,0.126161,0.111278,...,0.245081,0.087187,0.259116,-0.007279,0.115077,0.125791,0.101225,0.097192,0.060518,-0.022845
Instalment per cent,-0.072404,-0.00528,0.074749,0.044375,0.048369,-0.271322,0.021993,0.126161,1.0,0.119308,...,0.049302,0.053391,0.057271,0.007894,0.091229,0.021669,0.097755,-0.071207,0.014413,-0.094762
Sex & Marital Status,0.088184,0.043261,0.014789,0.042171,0.000157,-0.016094,0.017349,0.111278,0.119308,1.0,...,-0.027269,-0.00694,0.00515,-0.026747,0.098934,0.064672,-0.011956,0.122165,0.027275,0.073103


In [5]:
# new data will include the variables with highest correlation
# with dependent variable - creditability
x = data[['Account Balance', 'Duration of Credit (month)', 'Payment Status of Previous Credit']]

y = data['Creditability']

In [6]:
# splitting data
# test = 80% of data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.80, random_state=6)

In [7]:
# using statsmodels for model and metrics
import statsmodels.api as sm

# building the model and fitting the training data 
model = sm.Logit(y_train, x_train).fit()

Optimization terminated successfully.
         Current function value: 0.501809
         Iterations 6


In [8]:
# summary statistics of log. regression
model.summary()

0,1,2,3
Dep. Variable:,Creditability,No. Observations:,200.0
Model:,Logit,Df Residuals:,197.0
Method:,MLE,Df Model:,2.0
Date:,"Fri, 05 Mar 2021",Pseudo R-squ.:,0.1785
Time:,20:36:57,Log-Likelihood:,-100.36
converged:,True,LL-Null:,-122.17
Covariance Type:,nonrobust,LLR p-value:,3.37e-10

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Account Balance,0.6364,0.146,4.346,0.000,0.349,0.923
Duration of Credit (month),-0.0500,0.013,-3.986,0.000,-0.075,-0.025
Payment Status of Previous Credit,0.2374,0.146,1.631,0.103,-0.048,0.523


In [9]:
pred_y = model.predict(x_test) # values predicted by model
pred_y.head()

978    0.759170
22     0.487316
909    0.146613
399    0.759170
438    0.559062
dtype: float64

Our logistic regression function returns the likelihood of a credit being worthy, and so values can range from 0 to 1.
We'll transform the likelyhood into a binary variable for classification.
If a credit is more than 50% likely to be worthy, it will be labeled worthy.
The function below can apply this process.

In [10]:
# function for turning likelihood into labels
def binary_classify(x): # takes int x returns output int label
    x = round(x, 2)
    if x >= 0.50:
        return 1
    return 0

pred = list(map(binary_classify, pred_y)) # apply function to all predictions

### Classification performance

Now that we have used logistic regression to perform classification, we can check how accurate the predictions are:

In [11]:
# accuracy score
sklearn.metrics.accuracy_score(y_test, pred)

0.7375

The model makes 73.75% of predictions correct.

## Proposed solution

In order to classify future clients as credit worthy or not, we can build a simple program that will take as inputs their account balance, credit duration and payment status of current credit. The program will then apply the previous model and classify the client, providing information to the bank of wether the credit should be approved or not.
The formula used for classification was obtained previously with *statsmodels*.

In [12]:
# predictive function based on logistic model - returns likelyhood
def log_func(x):
    balance, credit, pay_status = x
    result = (0.6364 * balance) + (-0.05 * credit) + (0.2374 * pay_status)
    return result

def binary_classify(x): # takes int x = likelihood returns output int label
    x = round(x, 2)
    if x >= 0.50:
        return 1
    return 0

def predict(x): # makes credit predictions
    return binary_classify(log_func(x)) 

def print_result(x): # prints output to user
    if predict(x) == 1:
        return "Credit worthy"
    return "Not credit worthy"

Let's try to apply the program as an example.
We can check the data of a specific client as a test:

In [13]:
test = data.iloc[97] # data from client
test = test[['Creditability', 'Account Balance', 'Duration of Credit (month)',
      'Payment Status of Previous Credit']]
test

Creditability                         0
Account Balance                       2
Duration of Credit (month)           36
Payment Status of Previous Credit     3
Name: 97, dtype: int64

Client data:
* Creditability: 0 - was classified not credit worthy
* Account Balance: 2
* Duration of Credit (month): 36
* Payment Status of Previous Credit: 3

In [14]:
# testing

input_test = (2, 36, 3) # client data
print('Result: ' + print_result(input_test))

Result: Not credit worthy


The client was correctly classified as not credit worthy.