In [275]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [276]:
# Never truncate the column list when a frame is rendered, so every attribute stays visible.
pd.set_option('display.max_columns', None)

# Load the preprocessed South German Credit data (path is relative to this notebook)
# and display it for a first inspection.
df = pd.read_csv('../south_german_credit_data_preprocessed.csv')
df

Unnamed: 0,checking account,duration in month,credit history,credit purpose,credit amount,savings account,employment since..,installment rate,status : sex,other debtors / guarantors,residence since,property,relationship : age,other installment plans,housing,existing credits,job,people to provide maintenance for,telephone,foreign worker,goodness
0,laufkont,laufzeit,moral,verw,hoehe,sparkont,beszeit,rate,famges,buerge,wohnzeit,verm,alter,weitkred,wohn,bishkred,beruf,pers,telef,gastarb,kredit
1,1,18,4,2,1049,1,2,4,2,1,4,2,21,3,1,1,3,2,1,2,1
2,1,9,4,0,2799,1,3,2,3,1,2,1,36,3,1,2,3,1,1,2,1
3,2,12,2,9,841,2,4,2,2,1,4,1,23,3,1,1,2,2,1,2,1
4,1,12,4,0,2122,1,3,3,3,1,2,1,39,3,1,2,2,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996,1,24,2,3,1987,1,3,2,3,1,4,1,21,3,1,1,2,1,1,2,0
997,1,24,2,0,2303,1,5,4,3,2,1,1,45,3,2,1,3,2,1,2,0
998,4,21,4,0,12680,5,5,4,3,1,4,4,30,3,3,1,4,2,2,2,0
999,2,12,2,3,6468,5,1,2,3,1,1,4,52,3,2,1,4,2,2,2,0


In [277]:
# Dropping the first row from the data frame.
# That row holds the original German variable names (laufkont, laufzeit, ...)
# rather than data. It is only dropped while it is still present: a first row
# made up entirely of non-numeric values is the name row. This keeps the cell
# idempotent, so re-running it does not silently delete a real observation.
first_row_numeric = pd.to_numeric(df.iloc[0], errors='coerce')
if first_row_numeric.isna().all():
    # Re-bind instead of mutating in place so the cell has explicit lineage.
    df = df.drop(index=df.index[0])

df.head()

Unnamed: 0,checking account,duration in month,credit history,credit purpose,credit amount,savings account,employment since..,installment rate,status : sex,other debtors / guarantors,residence since,property,relationship : age,other installment plans,housing,existing credits,job,people to provide maintenance for,telephone,foreign worker,goodness
1,1,18,4,2,1049,1,2,4,2,1,4,2,21,3,1,1,3,2,1,2,1
2,1,9,4,0,2799,1,3,2,3,1,2,1,36,3,1,2,3,1,1,2,1
3,2,12,2,9,841,2,4,2,2,1,4,1,23,3,1,1,2,2,1,2,1
4,1,12,4,0,2122,1,3,3,3,1,2,1,39,3,1,2,2,1,1,1,1
5,1,12,4,0,2171,1,3,4,3,1,4,2,38,1,2,2,2,2,1,1,1


In [304]:
# Convert string to int for all the columns in the dataframe.
# NOTE(review): the columns are presumably strings because the first row of the
# file contained variable names instead of numbers — confirm against the CSV.
df = df.astype(int)
# Show the resulting dtypes to confirm every column is now integer-typed.
df.dtypes

checking account                     int64
duration in month                    int64
credit history                       int64
credit purpose                       int64
credit amount                        int64
savings account                      int64
employment since..                   int64
installment rate                     int64
status : sex                         int64
other debtors / guarantors           int64
residence since                      int64
property                             int64
relationship : age                   int64
other installment plans              int64
housing                              int64
existing credits                     int64
job                                  int64
people to provide maintenance for    int64
telephone                            int64
foreign worker                       int64
goodness                             int64
dtype: object

## Splitting the dataset into feature dataset and class labels

In [306]:
# The class labels live in the 'goodness' column; every other column is a feature.
y = df["goodness"]
X = df.drop(columns="goodness")

### Applying logistic regression on the dataset with all the features

In [307]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Hold out 10% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

# Fit a logistic regression on the training part (fit() returns the estimator itself).
# NOTE(review): the large iteration cap is presumably there so the solver
# converges on the unscaled features — confirm.
logisticRegr = LogisticRegression(max_iter=10000).fit(X_train, y_train)

# Class predictions and mean accuracy on the held-out rows.
predictions = logisticRegr.predict(X_test)
score = logisticRegr.score(X_test, y_test)


In [308]:
from sklearn import metrics

# Report the logistic-regression results on the held-out test set.
# (scikit-learn treats label 1 as the positive class by default.)
#   Accuracy  - share of all test samples that are classified correctly.
#   Precision - of the samples predicted positive, the share that truly is positive.
#   Recall    - of the truly positive samples, the share that is predicted positive.
for metric_label, metric_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(metric_label, metric_fn(y_test, predictions))

Accuracy: 0.76
Precision: 0.7790697674418605
Recall: 0.9305555555555556


### Applying SVM with all the features

In [309]:
from sklearn import svm

# Build a support vector classifier with a linear kernel and train it on the
# current training split (fit() returns the estimator itself).
clf = svm.SVC(kernel='linear').fit(X_train, y_train)

# Class predictions for the held-out test rows.
y_pred = clf.predict(X_test)

In [313]:
# Report the SVM results on the held-out test set.
# (scikit-learn treats label 1 as the positive class by default.)
#   Accuracy  - share of all test samples that are classified correctly.
#   Precision - of the samples predicted positive, the share that truly is positive.
#   Recall    - of the truly positive samples, the share that is predicted positive.
for metric_label, metric_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(metric_label, metric_fn(y_test, y_pred))

Accuracy: 0.79
Precision: 0.7931034482758621
Recall: 0.9583333333333334


#### SVM has better accuracy and recall than the other two classifiers (Naive Bayes has the higher precision), hence we will go ahead with it

### Applying Naive Bayes with all the features

In [311]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes trained on the current training split.
GNBclf = GaussianNB()
# fit() returns the estimator itself, so `model` refers to the same object as GNBclf.
model = GNBclf.fit(X_train, y_train)

# Class predictions for the held-out test rows, followed by their accuracy.
preds = model.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=preds))

Accuracy: 0.78


In [312]:
# Report the Naive Bayes results on the held-out test set.
# (scikit-learn treats label 1 as the positive class by default.)
#   Accuracy  - share of all test samples that are classified correctly.
#   Precision - of the samples predicted positive, the share that truly is positive.
#   Recall    - of the truly positive samples, the share that is predicted positive.
for metric_label, metric_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(metric_label, metric_fn(y_test, preds))

Accuracy: 0.78
Precision: 0.8571428571428571
Recall: 0.8333333333333334


### Dropping irrelevant features as deduced from Correlation Matrix and Chi-Square

In [286]:
# Remove the features judged irrelevant from the correlation matrix and chi-square test.
# drop() returns a new frame that is re-bound to X (no in-place mutation), and
# errors='ignore' skips columns that are already gone, so re-running this cell
# no longer raises a KeyError.
irrelevant_features = ['residence since', 'telephone', 'people to provide maintenance for', 'job']
X = X.drop(columns=irrelevant_features, errors='ignore')


In [287]:
# Re-split with the reduced feature set; same test share and seed as before.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

# Train a fresh linear-kernel SVM on the new training split
# (fit() returns the estimator itself).
clf = svm.SVC(kernel='linear').fit(X_train, y_train)

# Class predictions for the held-out test rows.
y_pred = clf.predict(X_test)

In [288]:
# Report the SVM results for the reduced feature set on the held-out test set.
# (scikit-learn treats label 1 as the positive class by default.)
#   Accuracy  - share of all test samples that are classified correctly.
#   Precision - of the samples predicted positive, the share that truly is positive.
#   Recall    - of the truly positive samples, the share that is predicted positive.
for metric_label, metric_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(metric_label, metric_fn(y_test, y_pred))

Accuracy: 0.8
Precision: 0.825
Recall: 0.9166666666666666


#### Since features like telephone, residence since, job and people to provide maintenance for are uncorrelated with the class labels, as we saw in our previous assignment, there is a marginal improvement in accuracy and precision (recall drops slightly) due to optimal feature selection

### Dropping Protected Attributes namely Sex, Marital Status and Age

In [314]:
# Remove the protected attributes from the feature set.
# drop() returns a new frame that is re-bound to X (no in-place mutation), and
# errors='ignore' skips columns that are already gone, so re-running this cell
# no longer raises a KeyError.
protected_attributes = ['relationship : age', 'status : sex']
X = X.drop(columns=protected_attributes, errors='ignore')

In [315]:
# Re-split after removing the protected attributes; same test share and seed as before.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

# Train a fresh linear-kernel SVM on the new training split
# (fit() returns the estimator itself).
clf = svm.SVC(kernel='linear').fit(X_train, y_train)

# Class predictions for the held-out test rows.
y_pred = clf.predict(X_test)

In [316]:
# Report the SVM results without the protected attributes on the held-out test set.
# (scikit-learn treats label 1 as the positive class by default.)
#   Accuracy  - share of all test samples that are classified correctly.
#   Precision - of the samples predicted positive, the share that truly is positive.
#   Recall    - of the truly positive samples, the share that is predicted positive.
for metric_label, metric_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(metric_label, metric_fn(y_test, y_pred))

Accuracy: 0.79
Precision: 0.8227848101265823
Recall: 0.9027777777777778


### We can see that these protected attributes do have an effect on evaluating the credit risk. This reflects how preconceived notions affect our decision-making process in an unfair manner. The same can also be said based on the correlation matrix and chi-square values (calculated in the second exercise), which show a non-negligible correlation with the class labels

### 2. Reproducibility

In [295]:
# load data from A2
german_credit = pd.read_csv("../german_processed.csv")
german_credit

Unnamed: 0,Status of existing checking account,Duration in month,Credit history,Purpose,Credit amount,Savings account/bonds,Present employment since,Installment rate of disposable income,Personal status and sex,Other debtors / guarantors,Present residence since,Property,Age,Other installment plans,Housing,Number of existing credits,Job,Number of people being liable,Telephone,foreign worker,score
0,... < 0 DM,6,critical account/ other credits existing (not ...,radio/television,1169,unknown/ no savings account,.. >= 7 years,4,male : single,none,4,real estate,67,none,own,2,skilled employee / official,1,"yes, registered under the customers name",yes,1
1,0 <= ... < 200 DM,48,existing credits paid back duly till now,radio/television,5951,... < 100 DM,1 <= ... < 4 years,2,female : divorced/separated/married,none,2,real estate,22,none,own,1,skilled employee / official,1,none,yes,2
2,no checking account,12,critical account/ other credits existing (not ...,education,2096,... < 100 DM,4 <= ... < 7 years,2,male : single,none,3,real estate,49,none,own,1,unskilled - resident,2,none,yes,1
3,... < 0 DM,42,existing credits paid back duly till now,furniture/equipment,7882,... < 100 DM,4 <= ... < 7 years,2,male : single,guarantor,4,building society savings agreement/ life insur...,45,none,for free,1,skilled employee / official,2,none,yes,1
4,... < 0 DM,24,delay in paying off in the past,car (new),4870,... < 100 DM,1 <= ... < 4 years,3,male : single,none,4,unknown / no property,53,none,for free,2,skilled employee / official,2,none,yes,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,no checking account,12,existing credits paid back duly till now,furniture/equipment,1736,... < 100 DM,4 <= ... < 7 years,3,female : divorced/separated/married,none,4,real estate,31,none,own,1,unskilled - resident,1,none,yes,1
996,... < 0 DM,30,existing credits paid back duly till now,car (used),3857,... < 100 DM,1 <= ... < 4 years,4,male : divorced/separated,none,4,building society savings agreement/ life insur...,40,none,own,1,management/ self-employed/ highly qualified em...,1,"yes, registered under the customers name",yes,1
997,no checking account,12,existing credits paid back duly till now,radio/television,804,... < 100 DM,.. >= 7 years,4,male : single,none,4,"car or other, not in attribute 6",38,none,own,1,skilled employee / official,1,none,yes,1
998,... < 0 DM,45,existing credits paid back duly till now,radio/television,1845,... < 100 DM,1 <= ... < 4 years,4,male : single,none,4,unknown / no property,23,none,for free,1,skilled employee / official,1,"yes, registered under the customers name",yes,2


In [296]:
# convert telephone and foreign worker into binary attributes
german_credit.replace({'Telephone':{'yes, registered under the customers name':1, 'none':0}}, inplace=True)
german_credit.replace({'foreign worker':{'yes':1, 'no':0}}, inplace=True)

In [297]:
# convert categorical attributes into binary features
dtype = german_credit.dtypes
cat_attributes = dtype[dtype == 'object'].index
bin_df = pd.get_dummies(german_credit, columns = cat_attributes)

In [298]:
# separate features and scores
features = bin_df.drop('score', axis = 1)
scores = bin_df.score

In [299]:
# Display the encoded feature matrix to inspect the generated columns.
features

Unnamed: 0,Duration in month,Credit amount,Installment rate of disposable income,Present residence since,Age,Number of existing credits,Number of people being liable,Telephone,foreign worker,Status of existing checking account_... < 0 DM,Status of existing checking account_... >= 200 DM / salary at least 1 year,Status of existing checking account_0 <= ... < 200 DM,Status of existing checking account_no checking account,Credit history_all credits at this bank paid back duly,Credit history_critical account/ other credits existing (not at this bank),Credit history_delay in paying off in the past,Credit history_existing credits paid back duly till now,Credit history_no credits taken/ all credits paid back duly,Purpose_business,Purpose_car (new),Purpose_car (used),Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_others,Purpose_radio/television,Purpose_repairs,Purpose_retraining,Savings account/bonds_.. >= 1000 DM,Savings account/bonds_... < 100 DM,Savings account/bonds_100 <= ... < 500 DM,Savings account/bonds_500 <= ... < 1000 DM,Savings account/bonds_unknown/ no savings account,Present employment since_.. >= 7 years,Present employment since_... < 1 year,Present employment since_1 <= ... < 4 years,Present employment since_4 <= ... 
< 7 years,Present employment since_unemployed,Personal status and sex_female : divorced/separated/married,Personal status and sex_male : divorced/separated,Personal status and sex_male : married/widowed,Personal status and sex_male : single,Other debtors / guarantors_co-applicant,Other debtors / guarantors_guarantor,Other debtors / guarantors_none,Property_building society savings agreement/ life insurance,"Property_car or other, not in attribute 6",Property_real estate,Property_unknown / no property,Other installment plans_bank,Other installment plans_none,Other installment plans_stores,Housing_for free,Housing_own,Housing_rent,Job_management/ self-employed/ highly qualified employee/ officer,Job_skilled employee / official,Job_unemployed/ unskilled - non-resident,Job_unskilled - resident
0,6,1169,4,4,67,2,1,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0
1,48,5951,2,2,22,1,1,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0
2,12,2096,2,3,49,1,2,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1
3,42,7882,2,4,45,1,2,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0
4,24,4870,3,4,53,2,2,0,1,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,12,1736,3,4,31,1,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1
996,30,3857,4,4,40,1,1,1,1,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0,0,0,1,0,0,1,0,1,0,0,0
997,12,804,4,4,38,1,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0
998,45,1845,4,4,23,1,1,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1,0,0,0,1,0,0


According to the paper, they have 48 features in total, but we now have 59.

In [300]:
# split the data for ten-fold cross validation
X_train, X_test, y_train, y_test = train_test_split(features, scores, test_size=0.1, random_state=0)

In [301]:
# use the off-the-shelf logistic regression classifier in Python
lr = LogisticRegression(max_iter=1e4).fit(X_train, y_train)

In [302]:
# Fitted coefficients of the logistic regression; indexed below as
# coef[0, <column position in X_train>].
coef = lr.coef_

In [303]:
# Print the fitted coefficient of every 'Personal status and sex' indicator column.
# Each entry pairs the text to print with the column whose coefficient follows it.
coefficient_labels = (
    ("coefficient of single male:", 'Personal status and sex_male : single'),
    ("\ncoefficient of married male:", 'Personal status and sex_male : married/widowed'),
    ("\ncoefficient of married/divorced female:", 'Personal status and sex_female : divorced/separated/married'),
    ("\ncoefficient of divorced male:", 'Personal status and sex_male : divorced/separated'),
)
report_parts = []
for label, column in coefficient_labels:
    # Look the column up by name so the right entry of the coefficient row is used.
    report_parts += [label, coef[0, X_train.columns.get_loc(column)]]
print(*report_parts)

coefficient of single male: -0.37274544372910584 
coefficient of married male: -0.18132141335080873 
coefficient of married/divorced female: -0.071673965555545 
coefficient of divorced male: 0.22209026921268024


We are not able to derive the same coefficients as depicted in Table 2 of the paper.
The approach from the paper has poor reproducibility. As there are different logistic regression functions in different Python libraries, each with different parameters, the authors should specify which logistic regression function they used and with which parameters. We also got 59 features rather than the 48 described in the paper, which may be another reason why we have different coefficients in the end.