## Credit Preditor using Logistic Regression

### Setting up the environment

In [43]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
import numpy as np
import math

In [44]:
df = pd.read_csv("german_data.csv", sep=" ", header=None)
df.columns
df[20] = np.where(df[20] == 2, 0, 1)
df[20] = df[20].astype("category", copy=False)

In [45]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   0       1000 non-null   object  
 1   1       1000 non-null   int64   
 2   2       1000 non-null   object  
 3   3       1000 non-null   object  
 4   4       1000 non-null   int64   
 5   5       1000 non-null   object  
 6   6       1000 non-null   object  
 7   7       1000 non-null   int64   
 8   8       1000 non-null   object  
 9   9       1000 non-null   object  
 10  10      1000 non-null   int64   
 11  11      1000 non-null   object  
 12  12      1000 non-null   int64   
 13  13      1000 non-null   object  
 14  14      1000 non-null   object  
 15  15      1000 non-null   int64   
 16  16      1000 non-null   object  
 17  17      1000 non-null   int64   
 18  18      1000 non-null   object  
 19  19      1000 non-null   object  
 20  20      1000 non-null   category
dtypes: category(1),

In [46]:
df.columns = [
    "checking_account_status",      # Attribute 1
    "duration_months",              # Attribute 2
    "credit_history",               # Attribute 3
    "purpose",                      # Attribute 4
    "credit_amount",                # Attribute 5
    "savings_account",              # Attribute 6
    "employment_since",             # Attribute 7
    "installment_rate",             # Attribute 8
    "personal_status_sex",          # Attribute 9
    "other_debtors",                # Attribute 10
    "present_residence_since",      # Attribute 11
    "property",                     # Attribute 12
    "age",                          # Attribute 13
    "other_installment_plans",      # Attribute 14
    "housing",                      # Attribute 15
    "number_existing_credits",      # Attribute 16
    "job",                          # Attribute 17
    "people_liable",                # Attribute 18
    "telephone",                    # Attribute 19
    "foreign_worker",               # Attribute 20
    "Credit given?"
]

In [47]:
df.head()

Unnamed: 0,checking_account_status,duration_months,credit_history,purpose,credit_amount,savings_account,employment_since,installment_rate,personal_status_sex,other_debtors,...,property,age,other_installment_plans,housing,number_existing_credits,job,people_liable,telephone,foreign_worker,Credit given?
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,0
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,0


### Here it is important to NORMALIZE the values of the predictors

#### Normalizing (or scaling) the predictors helps when variables hugely vary in measurment (for example Age vs Salary),
#### allowing the model to give the same importance to every variable (independent of its units of measurment)
#### before the actual training process

In [48]:
X = df[[
    "duration_months",         # 2
    "credit_amount",           # 5
    "installment_rate",        # 8
    "present_residence_since", # 11
    "age",                     # 13
    "number_existing_credits", # 16
    "people_liable"            # 18
]]  ##Only the numerical data, because transforming the categories to floating values would requiere pipelines
y = df["Credit given?"]  ##Note that this column contains categories, not int64

In [49]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
## this is Z-score standarization, which basically substracts the mean to all the values to later devide each value by
## the standard deviation, usually making most values range from [-3;3] 
X_standarized = scaler.fit_transform(X)

### Preparing the data for training the model

In [50]:
from sklearn.model_selection import train_test_split

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X_standarized, y, test_size=0.3, random_state=990)

In [52]:
X_train

array([[-0.24085723, -0.48142554,  0.02414692, ...,  0.03993041,
        -0.704926  , -0.42828957],
       [ 0.25695309, -0.06353681,  0.02414692, ..., -1.19140394,
        -0.704926  , -0.42828957],
       [-0.73866754, -0.9436197 ,  0.91847717, ..., -0.83959412,
        -0.704926  , -0.42828957],
       ...,
       [-0.73866754, -0.70507931, -0.87018333, ..., -1.36730885,
        -0.704926  , -0.42828957],
       [ 0.25695309, -0.30030498,  0.91847717, ...,  1.53512213,
         1.02707891, -0.42828957],
       [-0.90460432, -0.84260334,  0.91847717, ...,  0.39174023,
        -0.704926  , -0.42828957]], shape=(700, 7))

### Introducing the model and training it

In [53]:
from sklearn.linear_model import LogisticRegression

In [54]:
md = LogisticRegression()
md.fit(X_train, y_train)
md.coef_

array([[-0.43688133, -0.07862818, -0.20658385, -0.03123329,  0.30904876,
         0.06984367, -0.06817791]])

### Trying for the predictions

In [55]:
predictions = md.predict(X_test)
predictions.view()

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [56]:
print(y_test)

707    0
725    1
91     1
19     1
31     1
      ..
698    1
307    0
776    1
257    0
297    1
Name: Credit given?, Length: 300, dtype: category
Categories (2, int64): [0, 1]


### Evaluation of model

In [89]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy: .2f}')

Accuracy:  0.73


#### An accuary of 73% is actually incredible when we consider we didn't take into account most vaiables

### Now let's try adding more variables, specifically categorical and numerical (non-float) varibles
- Categorical variables:
    - 3 (credit score)
    - 6 (saving ammount)
    - 7 (employment)
    - 10 (other debtors)
- non-float variables:
    - 2 (duration of bank account)
    - 5 (credit score)

In [58]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False).set_output(transform='pandas')

In [62]:
df_checking_status_ohe = ohe.fit_transform(df[['checking_account_status']])
df_checking_status_ohe

Unnamed: 0,checking_account_status_A11,checking_account_status_A12,checking_account_status_A13,checking_account_status_A14
0,1.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0
...,...,...,...,...
995,0.0,0.0,0.0,1.0
996,1.0,0.0,0.0,0.0
997,0.0,0.0,0.0,1.0
998,1.0,0.0,0.0,0.0


In [65]:
df_credit_history_ohe = ohe.fit_transform(df[['credit_history']])
df_credit_history_ohe

Unnamed: 0,credit_history_A30,credit_history_A31,credit_history_A32,credit_history_A33,credit_history_A34
0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...
995,0.0,0.0,1.0,0.0,0.0
996,0.0,0.0,1.0,0.0,0.0
997,0.0,0.0,1.0,0.0,0.0
998,0.0,0.0,1.0,0.0,0.0


In [68]:
df_other_debtors_ohe = ohe.fit_transform(df[['other_debtors']])
df_other_debtors_ohe

Unnamed: 0,other_debtors_A101,other_debtors_A102,other_debtors_A103
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,1.0,0.0,0.0
3,0.0,0.0,1.0
4,1.0,0.0,0.0
...,...,...,...
995,1.0,0.0,0.0
996,1.0,0.0,0.0
997,1.0,0.0,0.0
998,1.0,0.0,0.0


In [70]:
df_sex_ohe = ohe.fit_transform(df[['personal_status_sex']])
df_sex_ohe

Unnamed: 0,personal_status_sex_A91,personal_status_sex_A92,personal_status_sex_A93,personal_status_sex_A94
0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0
...,...,...,...,...
995,0.0,1.0,0.0,0.0
996,1.0,0.0,0.0,0.0
997,0.0,0.0,1.0,0.0
998,0.0,0.0,1.0,0.0


In [72]:
df_purpose_ohe = ohe.fit_transform(df[['purpose']])
df_purpose_ohe

Unnamed: 0,purpose_A40,purpose_A41,purpose_A410,purpose_A42,purpose_A43,purpose_A44,purpose_A45,purpose_A46,purpose_A48,purpose_A49
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
996,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
997,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
998,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [151]:
df_ohe = pd.concat([df, df_checking_status_ohe, df_credit_history_ohe,df_other_debtors_ohe, df_sex_ohe,df_purpose_ohe], axis=1).drop(columns=['checking_account_status', 'credit_history','other_debtors', 'purpose','personal_status_sex'])
df_ohe.head(10)

Unnamed: 0,duration_months,credit_amount,savings_account,employment_since,installment_rate,present_residence_since,property,age,other_installment_plans,housing,...,purpose_A40,purpose_A41,purpose_A410,purpose_A42,purpose_A43,purpose_A44,purpose_A45,purpose_A46,purpose_A48,purpose_A49
0,6,1169,A65,A75,4,4,A121,67,A143,A152,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,48,5951,A61,A73,2,2,A121,22,A143,A152,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,12,2096,A61,A74,2,3,A121,49,A143,A152,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,42,7882,A61,A74,2,4,A122,45,A143,A153,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,24,4870,A61,A73,3,4,A124,53,A143,A153,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,36,9055,A65,A73,2,4,A124,35,A143,A153,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,24,2835,A63,A75,3,4,A122,53,A143,A152,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7,36,6948,A61,A73,2,2,A123,35,A143,A151,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,12,3059,A64,A74,2,4,A121,61,A143,A152,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9,30,5234,A61,A71,4,2,A123,28,A143,A152,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [152]:
df_ohe.columns

Index(['duration_months', 'credit_amount', 'savings_account',
       'employment_since', 'installment_rate', 'present_residence_since',
       'property', 'age', 'other_installment_plans', 'housing',
       'number_existing_credits', 'job', 'people_liable', 'telephone',
       'foreign_worker', 'Credit given?', 'checking_account_status_A11',
       'checking_account_status_A12', 'checking_account_status_A13',
       'checking_account_status_A14', 'credit_history_A30',
       'credit_history_A31', 'credit_history_A32', 'credit_history_A33',
       'credit_history_A34', 'other_debtors_A101', 'other_debtors_A102',
       'other_debtors_A103', 'personal_status_sex_A91',
       'personal_status_sex_A92', 'personal_status_sex_A93',
       'personal_status_sex_A94', 'purpose_A40', 'purpose_A41', 'purpose_A410',
       'purpose_A42', 'purpose_A43', 'purpose_A44', 'purpose_A45',
       'purpose_A46', 'purpose_A48', 'purpose_A49'],
      dtype='object')

### This are all the variables I consider non_ordinal, meaning I couldn't quite put them in order of measurment. Let's see has progressed in comparison to the only numerical-trained model

In [153]:
numerical_categorical_model = LogisticRegression() ## 'ncm' for short
X_ncm = df_ohe[['duration_months', 'credit_amount', 'installment_rate', 'present_residence_since', 'age','number_existing_credits', 'people_liable','checking_account_status_A11',
       'checking_account_status_A12', 'checking_account_status_A13',
       'checking_account_status_A14', 'credit_history_A30',
       'credit_history_A31', 'credit_history_A32', 'credit_history_A33',
       'credit_history_A34', 'other_debtors_A101', 'other_debtors_A102',
       'other_debtors_A103', 'personal_status_sex_A91',
       'personal_status_sex_A92', 'personal_status_sex_A93',
       'personal_status_sex_A94', 'purpose_A40', 'purpose_A41', 'purpose_A410',
       'purpose_A42', 'purpose_A43', 'purpose_A44', 'purpose_A45',
       'purpose_A46', 'purpose_A48', 'purpose_A49']]
y_ncm = df_ohe[['Credit given?']]

## Standarized the X variables
X_ncm_standarized = scaler.fit_transform(X_ncm)

In [154]:
X_train_ncm, X_test_ncm, y_train_ncm, y_test_ncm = train_test_split(X_ncm_standarized, y, test_size=0.3, random_state=990)

### Fit the data to a new model

In [155]:
numerical_categorical_model.fit(X_train_ncm, y_train_ncm)
numerical_categorical_model.coef_

array([[-0.45810116, -0.13248339, -0.32273573,  0.01162936,  0.27187448,
        -0.13044452, -0.07053801, -0.39787614, -0.20591841,  0.07158226,
         0.51444744, -0.16038773, -0.2089365 , -0.08022234,  0.02287174,
         0.241886  , -0.02599041, -0.11616507,  0.13774383, -0.16171604,
        -0.05395236,  0.09431887,  0.04586499, -0.214272  ,  0.29856694,
         0.06857057, -0.032968  ,  0.16843857, -0.05616795, -0.09961051,
        -0.30655487,  0.1098653 ,  0.02272748]])

In [156]:
ncm_predictions = numerical_categorical_model.predict(X_test_ncm)
accuracy_ncm = accuracy_score(y_test_ncm, ncm_predictions)
print(f'Accuracy: {accuracy_ncm: .2f}')

Accuracy:  0.76


### As we can see the improvement was only a 3% increase in precision, which is honestly disappointing

#### Now I'll add the ordinal data in a quite crude and brute manner

In [149]:
df_ohe['savings_account'].unique()


array(['0.0', '1.0', '3.0', '4.0', '2.0'], dtype=object)

In [150]:
df_ohe.head(10)

Unnamed: 0,duration_months,credit_amount,savings_account,employment_since,installment_rate,present_residence_since,property,age,other_installment_plans,housing,...,purpose_A40,purpose_A41,purpose_A410,purpose_A42,purpose_A43,purpose_A44,purpose_A45,purpose_A46,purpose_A48,purpose_A49
0,6,1169,0.0,A75,4,4,A121,67,A143,A152,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,48,5951,1.0,A73,2,2,A121,22,A143,A152,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,12,2096,1.0,A74,2,3,A121,49,A143,A152,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,42,7882,1.0,A74,2,4,A122,45,A143,A153,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,24,4870,1.0,A73,3,4,A124,53,A143,A153,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,36,9055,0.0,A73,2,4,A124,35,A143,A153,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,24,2835,3.0,A75,3,4,A122,53,A143,A152,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7,36,6948,1.0,A73,2,2,A123,35,A143,A151,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,12,3059,4.0,A74,2,4,A121,61,A143,A152,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9,30,5234,1.0,A71,4,2,A123,28,A143,A152,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [145]:
df_ohe['employment_since'].unique()


array(['A75', 'A73', 'A74', 'A71', 'A72'], dtype=object)

In [146]:
saving_account_levels = [['A65', 'A61', 'A62', 'A63', 'A64']]
employment_level = [[ 'A71','A72', 'A73', 'A74', 'A75']]
## It is extreamly important the categories are organized in order from least to most

In [133]:
from sklearn.preprocessing import OrdinalEncoder


In [140]:
oe_employment = OrdinalEncoder(categories=employment_level)

ordinalized_employment = oe_employment.fit_transform(
    df_ohe[['employment_since']]
)

ordinalized_employment

array([[4.],
       [2.],
       [3.],
       [3.],
       [2.],
       [2.],
       [4.],
       [2.],
       [3.],
       [0.],
       [1.],
       [1.],
       [2.],
       [4.],
       [2.],
       [2.],
       [4.],
       [1.],
       [4.],
       [4.],
       [2.],
       [2.],
       [1.],
       [1.],
       [2.],
       [2.],
       [4.],
       [2.],
       [2.],
       [4.],
       [1.],
       [2.],
       [2.],
       [4.],
       [1.],
       [1.],
       [2.],
       [2.],
       [2.],
       [2.],
       [4.],
       [2.],
       [2.],
       [3.],
       [4.],
       [1.],
       [4.],
       [0.],
       [2.],
       [2.],
       [1.],
       [4.],
       [2.],
       [2.],
       [4.],
       [2.],
       [0.],
       [2.],
       [4.],
       [1.],
       [2.],
       [4.],
       [4.],
       [2.],
       [1.],
       [4.],
       [1.],
       [2.],
       [2.],
       [2.],
       [2.],
       [4.],
       [4.],
       [3.],
       [4.],
       [4.],
       [1.],

In [143]:
df_ohe[['savings_account']]

Unnamed: 0,savings_account
0,0.0
1,1.0
2,1.0
3,1.0
4,1.0
...,...
995,1.0
996,1.0
997,1.0
998,1.0


In [137]:
df_ohe['savings_account'].dtype

dtype('float64')