In [7]:
# Import necessary libraries
import pandas as pd
import numpy as np

### Import dataset

In [8]:
# Read the raw CSV into a DataFrame, using the file's first column as the row index
raw_csv_path = '../02_data/01_raw_data/dataset.csv'
data = pd.read_csv(raw_csv_path, index_col=0)

# Preview the top five rows as a quick sanity check of the load
data.head(5)

Unnamed: 0,Status of existing checking account,Duration in months,Credit history,Purpose,Credit amount,Savings account/bonds,Present employment since,Installment rate in percentage of disposable income,Personal status and sex,Other debtors / guarantors,...,Age in years,Other installment plans,Housing,Number of existing credits at this bank,Job,Number of people being liable to provide maintenance for,Telephone,Foreign worker,Credit risk,Number of Accounts
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,67,A143,A152,2,A173,1,A192,A201,1,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,22,A143,A152,1,A173,1,A191,A201,2,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,49,A143,A152,1,A172,2,A191,A201,1,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,45,A143,A153,1,A173,2,A191,A201,1,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,53,A143,A153,2,A173,2,A191,A201,2,2


### Preprocess data

In [9]:
# Change the values of the "Credit risk" feature to (good = 0, bad = 1),
# i.e. raw code 1 -> 0 and raw code 2 -> 1.
#
# An explicit mapping with a validity check is used instead of
# `1 if x == 2 else 0`: that lambda turned every label into 0 when the cell
# was executed a second time (the column then holds 0/1, so nothing equals 2),
# and it silently labelled any unexpected or missing value as "good".
risk_recode = {1: 0, 2: 1}

unexpected_codes = set(data['Credit risk'].unique()) - set(risk_recode)
if unexpected_codes:
    raise ValueError(
        f"'Credit risk' contains values other than the raw codes 1 and 2: "
        f"{unexpected_codes}. If this cell was already executed, reload the "
        f"dataset before running it again."
    )

data['Credit risk'] = data['Credit risk'].map(risk_recode)

In [10]:
# Since many features have values that are not meaningful, I will change them to something more meaningful
# The values I will change them to can be found here: http://archive.ics.uci.edu/dataset/144/statlog+german+credit+data
#
# The lookup table is named `code_labels` rather than `map` so that Python's
# builtin `map` is not shadowed for the remainder of the notebook.
# Note that several codes intentionally share the label 'none' / 'yes'
# (e.g. A101, A143, A191); they live in different columns.
code_labels = {
    'A11' : 'less_than_0_dm',
    'A12' : '0_to_200_dm',
    'A13' : 'greater_than_200_dm',
    'A14' : 'no_checking_account',
    'A30' : 'no_credits_taken',
    'A31' : 'all_credits_paid_back',
    'A32' : 'existing_credits_paid_back_till_now',
    'A33' : 'delay_in_past',
    'A34' : 'critical_account',
    'A40' : 'car_new',
    'A41' : 'car_used',
    'A42' : 'furniture_equipment',
    'A43' : 'radio_television',
    'A44' : 'domestic_appliances',
    'A45' : 'repairs',
    'A46' : 'education',
    'A47' : 'vacation',
    'A48' : 'retraining',
    'A49' : 'business',
    'A410' : 'others',
    'A61' : 'less_than_100_dm',
    'A62' : '100_to_500_dm',
    'A63' : '500_to_1000_dm',
    'A64' : 'greater_than_1000_dm',
    'A65' : 'unknown_or_no_savings_account',
    'A71' : 'unemployed',
    'A72' : '0_1_year',
    'A73' : '1_4_years',
    'A74' : '4_7_years',
    'A75' : '7_more_years',
    'A91' : 'male_divorced_separated',
    'A92' : 'female_divorced_separated_married',
    'A93' : 'male_single',
    'A94' : 'male_married_widowed',
    'A95' : 'female_single',
    'A101' : 'none',
    'A102' : 'co_applicant',
    'A103' : 'guarantor',
    'A121' : 'real_estate',
    'A122' : 'savings_agreement_life_insurance',
    'A123' : 'car_or_other',
    'A124' : 'unknown_or_no_property',
    'A141' : 'bank',
    'A142' : 'stores',
    'A143' : 'none',
    'A151' : 'rent',
    'A152' : 'own',
    'A153' : 'for_free',
    'A171' : 'unemployed_unskilled_non_resident',
    'A172' : 'unskilled_resident',
    'A173' : 'skilled_employee',
    'A174' : 'management_self_employed_highly_qualified',
    'A191' : 'none',
    'A192' : 'yes',
    'A201' : 'yes',
    'A202' : 'no'
}

# Replace every cell that exactly matches one of the codes above, in all columns.
# The result is reassigned instead of using `inplace=True`, so the frame is not
# mutated behind the back of anything else holding a reference to it.
data = data.replace(code_labels)

In [11]:
# Count the missing entries per column and show the counts as the cell output
print('Number of missing values in each column is:')
missing_per_column = data.isna().sum()
missing_per_column

Number of missing values in each column is:


Status of existing checking account                         0
Duration in months                                          0
Credit history                                              0
Purpose                                                     0
Credit amount                                               0
Savings account/bonds                                       0
Present employment since                                    0
Installment rate in percentage of disposable income         0
Personal status and sex                                     0
Other debtors / guarantors                                  0
Present residence since                                     0
Property                                                    0
Age in years                                                0
Other installment plans                                     0
Housing                                                     0
Number of existing credits at this bank                     0
Job     

### Save data

In [12]:
# Write the preprocessed frame (including its index) to the treated-data folder
treated_csv_path = '../02_data/02_treated_data/preprocessed_data.csv'
data.to_csv(treated_csv_path)