In [11]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle

In [12]:
# Load the credit data. The file is space-separated and its first row is a
# header; that header is replaced with the column names assigned below.
df_credit = pd.read_csv("data/credit_dataset/SouthGermanCredit.asc",  sep=' ', header=0)
df_credit.columns=["status", "duration", "credit_history", "purpose", "amount", 
                "savings", "employment_duration", "installment_rate",
                "personal_status_sex", "other_debtors",
                "present_residence", "property",
                "age", "other_installment_plans",
                "housing", "number_credits",
                "job", "people_liable", "telephone", "foreign_worker",
                "credit_risk"]

# Drop incomplete rows, then remove the two columns that are not used as features.
# Chained (instead of inplace=True) so each step yields a fresh frame.
df_credit = df_credit.dropna().drop(columns=["personal_status_sex", "property"])

# Integer code -> label for the two columns that get one-hot encoded below.
mappings={"purpose": {0: "others", 
                      1: "car_new",
                      2: "car_used",
                      3: "furnitures",
                      4: "radio_tv",
                      5: "domestic_appliances",
                      6: "repairs",
                      7: "education",
                      8: "vacation",
                      9: "retraining",
                      10: "business"},
        "housing": {1: "free",
                    2: "rent",
                    3: "own"}}

for col, mapping in mappings.items():
    # Series.map turns every code that is missing from the dict into NaN, and
    # get_dummies would then encode such rows as all zeros without any warning.
    # Fail loudly instead of silently corrupting the features.
    unmapped_codes = df_credit.loc[~df_credit[col].isin(list(mapping)), col].unique().tolist()
    if unmapped_codes:
        raise ValueError(f"Column '{col}' contains codes without a mapping: {unmapped_codes}")
    df_credit[col] = df_credit[col].map(mapping)


# One-hot encode the (now string-valued) columns as 0/1 integers.
df_credit=pd.get_dummies(df_credit, dtype="int")


# get_dummies appends the one-hot columns after the existing ones, which leaves
# the target in the middle of the frame; move it back to the last position.
target_col = df_credit.pop("credit_risk")
df_credit["credit_risk"] = target_col


In [13]:
# Class balance of the target column.
df_credit.loc[:, "credit_risk"].value_counts()

credit_risk
1    700
0    300
Name: count, dtype: int64

In [14]:
# Show the preprocessed frame: one-hot columns follow the original features
# and "credit_risk" is the last column.
df_credit

Unnamed: 0,status,duration,credit_history,amount,savings,employment_duration,installment_rate,other_debtors,present_residence,age,...,purpose_furnitures,purpose_others,purpose_radio_tv,purpose_repairs,purpose_retraining,purpose_vacation,housing_free,housing_own,housing_rent,credit_risk
0,1,18,4,1049,1,2,4,1,4,21,...,0,0,0,0,0,0,1,0,0,1
1,1,9,4,2799,1,3,2,1,2,36,...,0,1,0,0,0,0,1,0,0,1
2,2,12,2,841,2,4,2,1,4,23,...,0,0,0,0,1,0,1,0,0,1
3,1,12,4,2122,1,3,3,1,2,39,...,0,1,0,0,0,0,1,0,0,1
4,1,12,4,2171,1,3,4,1,4,38,...,0,1,0,0,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1,24,2,1987,1,3,2,1,4,21,...,1,0,0,0,0,0,1,0,0,0
996,1,24,2,2303,1,5,4,2,1,45,...,0,1,0,0,0,0,0,0,1,0
997,4,21,4,12680,5,5,4,1,4,30,...,0,1,0,0,0,0,0,1,0,0
998,2,12,2,6468,5,1,2,1,1,52,...,1,0,0,0,0,0,0,0,1,0


In [15]:
# Single seed shared by the split and the forest so the run is repeatable.
random_state = 1234

# Hold out 20% of the rows for evaluation.
credit_train, credit_test = train_test_split(
    df_credit,
    test_size=0.2,
    random_state=random_state,
)

# Persist both splits (target column included) before any modelling.
credit_train.to_parquet("data/credit_dataset/train_cleaned.parquet")
credit_test.to_parquet("data/credit_dataset/test_cleaned.parquet")

# Separate the predictors from the "credit_risk" target in each split.
x_train, y_train = credit_train.drop(columns=["credit_risk"]), credit_train["credit_risk"]
x_test, y_test = credit_test.drop(columns=["credit_risk"]), credit_test["credit_risk"]

# fit() returns the estimator itself, so construction and training can be chained.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=random_state,
).fit(x_train, y_train)

# Score the held-out split.
target_pred = model.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=target_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 76.50%


In [16]:
# Serialize the fitted random forest next to the dataset files.
with open('data/credit_dataset/RF.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [17]:
# Human-readable description of each model feature, keyed by column name so the
# text stays attached to the right column even if the column order changes.
# "purpose_education" is listed for completeness (code 7 exists in the purpose
# mapping); it is only used if that one-hot column is actually present.
# NOTE(review): installment_rate, present_residence and number_credits look like
# coded categories rather than raw values -- confirm against the dataset codetable.
feature_description_by_name = {
    "status": "Status of the checking account: (1: no checking account, 2: negative checking account, 3: between 0 and 200 DM, 4: Above 200 DM",
    "duration": "Duration of the credit request in months",
    "credit_history": "Credit history of the customer (increasing rating from 0: history of delayed payments on previous credits to 4: all credits always paid back duly)",
    "amount": "Amount of credit requested in DM",
    "savings": "Status of the savings account (1: no savings account, 2: less than 100 DM, 3: Between 100 and 500 DM, 4: Between 500 and 1000 DM, 5: More than 1000 DM)",
    "employment_duration": "Duration of current employment (1: Unemmployed, 2: less than 1 year, 3: between 1 and 4 years, 4: between 4 and 7 years, 5: more than 7 years)",
    "installment_rate": "Installment rate is the amount to be paid at fixed intervals (here given as a percentage of debtor’s disposable income)",
    # Codes start at 1 in the data (the displayed frame only shows 1s and 2s);
    # the previous text documented them as 0/1/2.
    "other_debtors": "Are there any other people involved in the credit? (1: No, 2: There are co-applicants, 3: There are guarantors)",
    "present_residence": "Length of time (in years) the debtor lives in the present residence",
    "age": "Age of debtor in years",
    "other_installment_plans": "Other running installments that have to be made by the debtor (1: yes, installments to bank, 2: yes, installments to stores , 3: no installments)",
    "number_credits": "Number of credits including the current one the debtor has (or had) at this bank",
    "job": "Type of job (from 1: unskilled to 4: highly skilled)",
    "people_liable": "Number of people that are financially dependent on the debtor (1: 3 or more people, 2: less than 3 people)",
    "telephone": "Does the debtor have a telephone? (1: no, 2: yes)",
    "foreign_worker": "Is the debtor a foreign worker? (1: yes, 2: no)",
    "purpose_business": "One-hot variable for the credit purpose -- business",
    "purpose_car_new": "One-hot variable for the credit purpose -- new car",
    "purpose_car_used": "One-hot variable for the credit purpose -- used car",
    "purpose_domestic_appliances": "One-hot variable for the credit purpose -- domestic appliances",
    "purpose_education": "One-hot variable for the credit purpose -- education",
    "purpose_furnitures": "One-hot variable for the credit purpose -- furniture",
    "purpose_others": "One-hot variable for the credit purpose -- others",
    "purpose_radio_tv": "One-hot variable for the credit purpose -- radio or tv",
    "purpose_repairs": "One-hot variable for the credit purpose -- repairs",
    "purpose_retraining": "One-hot variable for the credit purpose -- retraining",
    "purpose_vacation": "One-hot variable for the credit purpose -- vacation",
    "housing_free": "One hot variable for housing situation -- debtor lives for free in someone elses property",
    "housing_own": "One hot variable for housing situation -- debtor lives in their own property",
    "housing_rent": "One hot variable for housing situation -- debtor lives in rented property",
}

# Every feature the model was trained on must be documented.
missing_descriptions = [name for name in x_train.columns if name not in feature_description_by_name]
if missing_descriptions:
    raise KeyError(f"No description available for feature(s): {missing_descriptions}")

# Descriptions in the same order as the training columns.
feature_descriptions = [feature_description_by_name[name] for name in x_train.columns]

# One row per feature: name, mean over the training split, description.
feature_desc_df = pd.DataFrame({
    "feature_name": list(x_train.columns),
    "feature_average": x_train.mean().to_list() ,
    "feature_desc": feature_descriptions,
})

dataset_description="The dataset contains information from the 1970s in Germany on a series of debtors that took a loan from the bank. It includes many detailed categorical and one-hot variables about their financial situation. Keep in mind that at the time Germany used Deutsche Marks (DM) with an average yearly salary of 10 000 to 20 000 DM"
target_description="The target variable is whether the customer paid back all their debts on time and turned out to be a good credit (1) or there were some issues and they were a bad credit (0) for the bank."
task_description="Predict whether a new customer will be a credit risk or not"

dataset_info={
 "dataset_description": dataset_description,
 "target_description": target_description,
 "task_description": task_description,
 "feature_description": feature_desc_df
 }


# The payload embeds a pandas DataFrame, so it is written with pickle.
with open('data/credit_dataset/dataset_info', 'wb') as f:
    pickle.dump(dataset_info, f)