# PD prediction with Machine Learning Over Time

In [1]:
# import the necessary packages
import numpy as np
import os
import sys
import time
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier

from sklearn import preprocessing as pp
from sklearn.pipeline import Pipeline
from sklearn.model_selection import TimeSeriesSplit

from sklearn.model_selection import GridSearchCV

Here we load the data. The data is retrieved from:
https://www.kaggle.com/datasets/sid321axn/bondora-peer-to-peer-lending-loan-data

In [2]:
# load the data
df = pd.read_csv('../Data/LoanData_Bondora.csv')
df.info()

  df = pd.read_csv('../Data/LoanData_Bondora.csv')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 179235 entries, 0 to 179234
Columns: 112 entries, ReportAsOfEOD to ActiveLateLastPaymentCategory
dtypes: bool(3), float64(62), int64(12), object(35)
memory usage: 149.6+ MB


In [3]:
# display the first 10 columns by percentage of missing data
missing_percentages = df.isnull().sum() * 100 / len(df)
missing_percentages.sort_values(ascending=False).head(10)

City                        100.000000
County                      100.000000
DateOfBirth                 100.000000
EmploymentPosition          100.000000
Rating_V0                    97.450833
EL_V0                        97.450833
CreditScoreEsEquifaxRisk     93.182693
EL_V1                        92.789355
Rating_V1                    92.789355
Rating_V2                    85.972048
dtype: float64

Based on previous studies, only the most important features are retained.

In [4]:
# drop the columns with more than 50% missing values
columns_to_drop = missing_percentages[missing_percentages > 50].index
df = df.drop(columns=columns_to_drop)
df.shape

(179235, 76)

In [5]:
# treat categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns
df[categorical_columns].head()

Unnamed: 0,ReportAsOfEOD,LoanId,ListedOnUTC,BiddingStartedOn,UserName,LoanApplicationStartedDate,LoanDate,ContractEndDate,FirstPaymentDate,MaturityDate_Original,MaturityDate_Last,Country,EmploymentDurationCurrentEmployer,LastPaymentOn,StageActiveSince,Rating,Status,WorseLateCategory,CreditScoreEsMicroL
0,2021-07-20,66AE108B-532B-4BB3-BAB7-0019A46412C1,2016-03-23 16:07:19,2016-03-23 16:07:19,BO965519,2016-03-17 12:39:22,2016-03-23,2020-06-26,2016-05-12,2021-04-12,2020-06-26,EE,MoreThan5Years,2021-06-16,2020-03-03 09:27:48.493000000,C,Late,180+,
1,2021-07-20,D152382E-A50D-46ED-8FF2-0053E0C86A70,2015-06-25 11:02:28,2015-06-25 11:02:28,BOA9K172A,2015-06-24 12:36:16,2015-06-25,,2015-08-17,2020-07-17,2020-07-17,EE,MoreThan5Years,2019-06-19,2019-08-01 14:18:33,B,Late,180+,
2,2021-07-20,87342E13-66CB-483F-833A-007953E50C78,2016-01-14 10:00:21,2016-01-14 10:00:21,BO7971663,2016-01-07 15:37:16,2016-01-19,2019-10-24,2016-02-22,2021-01-20,2021-01-20,EE,UpTo3Years,2019-10-23,2018-02-28 14:43:37.670000000,A,Repaid,180+,
3,2021-07-20,87227056-6BF9-410C-98D1-008F788E122A,2015-03-24 15:55:44,2015-03-24 15:55:44,BO76151K3,2015-03-20 15:20:48,2015-03-27,,2015-05-04,2020-04-01,2020-04-01,ES,UpTo5Years,2020-08-07,2020-11-27 00:00:00,F,Late,180+,M3
4,2021-07-20,2DDE6336-E466-4624-A337-00A0ED1A1468,2015-12-17 10:12:00,2015-12-17 10:12:00,BOK423A63,2015-12-13 00:30:08,2015-12-22,,2016-02-01,2020-01-02,2020-01-02,ES,UpTo1Year,2016-02-01,2020-11-27 00:00:00,HR,Late,180+,M5


In [6]:
# check variance of categorical columns
df[categorical_columns].nunique().sort_values()

ReportAsOfEOD                             1
Status                                    3
Country                                   4
Rating                                    8
EmploymentDurationCurrentEmployer         9
WorseLateCategory                         9
CreditScoreEsMicroL                      11
FirstPaymentDate                       2477
LastPaymentOn                          3422
MaturityDate_Original                  3442
MaturityDate_Last                      3937
LoanDate                               4041
ContractEndDate                        4670
StageActiveSince                      64708
UserName                              85087
ListedOnUTC                          178456
BiddingStartedOn                     178462
LoanApplicationStartedDate           178690
LoanId                               179235
dtype: int64

To mimic a real business scenario where only information at application is available, we retain only Country, EmploymentDurationCurrentEmployer, Rating, MaturityDate_Original, LoanDate

In [7]:
# retain only categorical variables that are available at the time of loan application
# keep only the columns that do not appear in the given list
cat_columns_to_keep = ['Country', 'EmploymentDurationCurrrentEmployer', 'LoanDate', 'MaturityDate_Original', 'Rating']
cat_columns_to_drop = categorical_columns.difference(cat_columns_to_keep)
df.drop(columns=cat_columns_to_drop, inplace=True)
df.shape

(179235, 61)

In [8]:
# treat bool columns
bool_columns = df.select_dtypes(include=['bool']).columns
df[bool_columns].head()

Unnamed: 0,NewCreditCustomer,ActiveScheduleFirstPaymentReached,Restructured
0,False,True,False
1,False,True,False
2,True,True,False
3,True,True,False
4,True,True,False


We assume that new credit customer means that the person or entity applying for the loan is new to the credit market. Therefore, this is information available at the moment of application. On the other hand, information regarding the active schedule of the first payment and restructuring of the loan is future information that would be improper to use. Therefore, we know only whether the borrower is new to the credit market or not. This aspect may be useful as experienced borrowers may carry less probability of default. 

In [9]:
# we keep only NewCreditCustomer
bool_columns_to_keep = ['NewCreditCustomer']
bool_columns_to_drop = bool_columns.difference(bool_columns_to_keep)
df.drop(columns=bool_columns_to_drop, inplace=True)
df.shape

(179235, 59)

In [10]:
# treat numeric columns
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns

# check variance of numeric columns
df[numeric_columns].nunique().sort_values()

RecoveryStage                                  2
Gender                                         3
VerificationType                               5
CreditScoreEeMini                              7
MaritalStatus                                  7
ApplicationSignedWeekday                       7
EmploymentStatus                               7
Education                                      7
ModelVersion                                   8
PreviousEarlyRepaymentsCountBeforeLoan        11
HomeOwnershipType                             12
LanguageCode                                  13
UseOfLoan                                     17
RefinanceLiabilities                          21
OccupationArea                                21
ApplicationSignedHour                         24
NoOfPreviousLoansBeforeLoan                   28
MonthlyPaymentDay                             29
LoanDuration                                  32
ExistingLiabilities                           39
NextPaymentNr       

The hour and day of the week on which someone signs an application does not carry useful information to predict the probability of default. Gender may be improper to use as it may introduce biased predictions towards a specific sex, which would produce undue discrimination.

In [11]:
# drop sensitive data (gender)
df.drop('Gender', axis=1, inplace=True)

In [12]:
# drop application hour and day of week
numeric_columns_to_drop = ['ApplicationSignedHour', 'ApplicationSignedWeekday']
df.drop(columns=numeric_columns_to_drop, inplace=True)
df.shape

(179235, 56)

In [13]:
# look at numeric predictors relatd to income (regex: 'Income')
income_columns = df.filter(regex='Income').columns
income_columns

Index(['IncomeFromPrincipalEmployer', 'IncomeFromPension',
       'IncomeFromFamilyAllowance', 'IncomeFromSocialWelfare',
       'IncomeFromLeavePay', 'IncomeFromChildSupport', 'IncomeOther',
       'IncomeTotal', 'DebtToIncome'],
      dtype='object')

The total income is assumed to contain all income components. To avoid multicollinearity and redundancy of information, we keep only total income and DebtToIncome

In [14]:
income_columns_to_keep = ['IncomeTotal', 'DebtToIncome']
income_columns_to_drop = income_columns.difference(income_columns_to_keep)
df.drop(columns=income_columns_to_drop, inplace=True)
df.shape

(179235, 49)

In [15]:
df.columns.sort_values()

Index(['Age', 'Amount', 'AmountOfPreviousLoansBeforeLoan', 'AppliedAmount',
       'BidsApi', 'BidsManual', 'BidsPortfolioManager', 'Country',
       'CreditScoreEeMini', 'DebtToIncome', 'Education', 'EmploymentStatus',
       'ExistingLiabilities', 'ExpectedLoss', 'ExpectedReturn', 'FreeCash',
       'HomeOwnershipType', 'IncomeTotal', 'Interest',
       'InterestAndPenaltyBalance', 'InterestAndPenaltyPaymentsMade',
       'LanguageCode', 'LiabilitiesTotal', 'LoanDate', 'LoanDuration',
       'LoanNumber', 'LossGivenDefault', 'MaritalStatus',
       'MaturityDate_Original', 'ModelVersion', 'MonthlyPayment',
       'MonthlyPaymentDay', 'NewCreditCustomer', 'NextPaymentNr',
       'NoOfPreviousLoansBeforeLoan', 'NrOfScheduledPayments',
       'OccupationArea', 'PlannedInterestTillDate',
       'PreviousEarlyRepaymentsCountBeforeLoan',
       'PreviousRepaymentsBeforeLoan', 'PrincipalBalance',
       'PrincipalOverdueBySchedule', 'PrincipalPaymentsMade',
       'ProbabilityOfDefault', 'R

Loss Given Default, Expected Loss, Principal Balance, and Recovery Stage are unknown. Therefore, they are dropped. 

In [16]:
# Loss Given Default is unkown. We will drop it. We will also drop ExpectedLoss. Recovery Stage are unkown.
df.drop(columns=['LossGivenDefault', 'ExpectedLoss', 'ExpectedReturn', 'RecoveryStage'], inplace=True)
df.columns.sort_values()

Index(['Age', 'Amount', 'AmountOfPreviousLoansBeforeLoan', 'AppliedAmount',
       'BidsApi', 'BidsManual', 'BidsPortfolioManager', 'Country',
       'CreditScoreEeMini', 'DebtToIncome', 'Education', 'EmploymentStatus',
       'ExistingLiabilities', 'FreeCash', 'HomeOwnershipType', 'IncomeTotal',
       'Interest', 'InterestAndPenaltyBalance',
       'InterestAndPenaltyPaymentsMade', 'LanguageCode', 'LiabilitiesTotal',
       'LoanDate', 'LoanDuration', 'LoanNumber', 'MaritalStatus',
       'MaturityDate_Original', 'ModelVersion', 'MonthlyPayment',
       'MonthlyPaymentDay', 'NewCreditCustomer', 'NextPaymentNr',
       'NoOfPreviousLoansBeforeLoan', 'NrOfScheduledPayments',
       'OccupationArea', 'PlannedInterestTillDate',
       'PreviousEarlyRepaymentsCountBeforeLoan',
       'PreviousRepaymentsBeforeLoan', 'PrincipalBalance',
       'PrincipalOverdueBySchedule', 'PrincipalPaymentsMade',
       'ProbabilityOfDefault', 'Rating', 'RefinanceLiabilities', 'UseOfLoan',
       'Verifica

Also any information regarding principal is unknown.

In [17]:
principal_columns = df.filter(regex='Principal').columns
df.drop(columns=principal_columns, inplace=True)
df.columns.sort_values()

Index(['Age', 'Amount', 'AmountOfPreviousLoansBeforeLoan', 'AppliedAmount',
       'BidsApi', 'BidsManual', 'BidsPortfolioManager', 'Country',
       'CreditScoreEeMini', 'DebtToIncome', 'Education', 'EmploymentStatus',
       'ExistingLiabilities', 'FreeCash', 'HomeOwnershipType', 'IncomeTotal',
       'Interest', 'InterestAndPenaltyBalance',
       'InterestAndPenaltyPaymentsMade', 'LanguageCode', 'LiabilitiesTotal',
       'LoanDate', 'LoanDuration', 'LoanNumber', 'MaritalStatus',
       'MaturityDate_Original', 'ModelVersion', 'MonthlyPayment',
       'MonthlyPaymentDay', 'NewCreditCustomer', 'NextPaymentNr',
       'NoOfPreviousLoansBeforeLoan', 'NrOfScheduledPayments',
       'OccupationArea', 'PlannedInterestTillDate',
       'PreviousEarlyRepaymentsCountBeforeLoan',
       'PreviousRepaymentsBeforeLoan', 'ProbabilityOfDefault', 'Rating',
       'RefinanceLiabilities', 'UseOfLoan', 'VerificationType'],
      dtype='object')

Also information about bids is unavailable

In [18]:
bids_columns = df.filter(regex='Bids').columns
df.drop(columns=bids_columns, inplace=True)
df.columns.sort_values()

Index(['Age', 'Amount', 'AmountOfPreviousLoansBeforeLoan', 'AppliedAmount',
       'Country', 'CreditScoreEeMini', 'DebtToIncome', 'Education',
       'EmploymentStatus', 'ExistingLiabilities', 'FreeCash',
       'HomeOwnershipType', 'IncomeTotal', 'Interest',
       'InterestAndPenaltyBalance', 'InterestAndPenaltyPaymentsMade',
       'LanguageCode', 'LiabilitiesTotal', 'LoanDate', 'LoanDuration',
       'LoanNumber', 'MaritalStatus', 'MaturityDate_Original', 'ModelVersion',
       'MonthlyPayment', 'MonthlyPaymentDay', 'NewCreditCustomer',
       'NextPaymentNr', 'NoOfPreviousLoansBeforeLoan', 'NrOfScheduledPayments',
       'OccupationArea', 'PlannedInterestTillDate',
       'PreviousEarlyRepaymentsCountBeforeLoan',
       'PreviousRepaymentsBeforeLoan', 'ProbabilityOfDefault', 'Rating',
       'RefinanceLiabilities', 'UseOfLoan', 'VerificationType'],
      dtype='object')

Amount and applied amount convey the same information. Let's see which has less missing data. We keep Amount and the sum of loan amounts for loans before the current loan

In [19]:
amount_columns = df.filter(regex='Amount').columns
amount_columns_to_keep = ['Amount', 'AmountOfPreviousLoansBeforeLoan']
amount_columns_to_drop = amount_columns.difference(amount_columns_to_keep)
df.drop(columns=amount_columns_to_drop, inplace=True)
df.columns.sort_values()

Index(['Age', 'Amount', 'AmountOfPreviousLoansBeforeLoan', 'Country',
       'CreditScoreEeMini', 'DebtToIncome', 'Education', 'EmploymentStatus',
       'ExistingLiabilities', 'FreeCash', 'HomeOwnershipType', 'IncomeTotal',
       'Interest', 'InterestAndPenaltyBalance',
       'InterestAndPenaltyPaymentsMade', 'LanguageCode', 'LiabilitiesTotal',
       'LoanDate', 'LoanDuration', 'LoanNumber', 'MaritalStatus',
       'MaturityDate_Original', 'ModelVersion', 'MonthlyPayment',
       'MonthlyPaymentDay', 'NewCreditCustomer', 'NextPaymentNr',
       'NoOfPreviousLoansBeforeLoan', 'NrOfScheduledPayments',
       'OccupationArea', 'PlannedInterestTillDate',
       'PreviousEarlyRepaymentsCountBeforeLoan',
       'PreviousRepaymentsBeforeLoan', 'ProbabilityOfDefault', 'Rating',
       'RefinanceLiabilities', 'UseOfLoan', 'VerificationType'],
      dtype='object')

In [20]:
# interest columns
interest_columns = df.filter(regex='Interest').columns
interest_columns 

Index(['Interest', 'PlannedInterestTillDate', 'InterestAndPenaltyPaymentsMade',
       'InterestAndPenaltyBalance'],
      dtype='object')

In [21]:
df[interest_columns].isnull().sum()

Interest                             0
PlannedInterestTillDate           1974
InterestAndPenaltyPaymentsMade       0
InterestAndPenaltyBalance         5232
dtype: int64

In [22]:
# check whether the non-null values are the same between Interest and PlannedInterestTillDate
df['Interest'].notnull().equals(df['PlannedInterestTillDate'].notnull())

False

The column Interest is assumed to be the contractually agreed interest rate. The planned interest till date is assumed to convey the same information as Interest. Therefore, Planned Interest Till Date is dropped. The other interest-related variables are leaky. Therefore, we drop them as well. 

In [23]:
interest_columns_to_keep = ['Interest']
interest_columns_to_drop = interest_columns.difference(interest_columns_to_keep)
df.drop(columns=interest_columns_to_drop, inplace=True)
df.columns.sort_values()

Index(['Age', 'Amount', 'AmountOfPreviousLoansBeforeLoan', 'Country',
       'CreditScoreEeMini', 'DebtToIncome', 'Education', 'EmploymentStatus',
       'ExistingLiabilities', 'FreeCash', 'HomeOwnershipType', 'IncomeTotal',
       'Interest', 'LanguageCode', 'LiabilitiesTotal', 'LoanDate',
       'LoanDuration', 'LoanNumber', 'MaritalStatus', 'MaturityDate_Original',
       'ModelVersion', 'MonthlyPayment', 'MonthlyPaymentDay',
       'NewCreditCustomer', 'NextPaymentNr', 'NoOfPreviousLoansBeforeLoan',
       'NrOfScheduledPayments', 'OccupationArea',
       'PreviousEarlyRepaymentsCountBeforeLoan',
       'PreviousRepaymentsBeforeLoan', 'ProbabilityOfDefault', 'Rating',
       'RefinanceLiabilities', 'UseOfLoan', 'VerificationType'],
      dtype='object')

In [24]:
# check liabilities columns
liabilities_columns = df.filter(regex='Liabilities').columns
liabilities_columns

Index(['ExistingLiabilities', 'LiabilitiesTotal', 'RefinanceLiabilities'], dtype='object')

To avoid information redundancy and multicollineariy, we keep only the total amount of liabilities.

In [25]:
# keep only LiabilitiesTotal
liabilities_columns_to_keep = ['LiabilitiesTotal']
liabilities_columns_to_drop = liabilities_columns.difference(liabilities_columns_to_keep)
df.drop(columns=liabilities_columns_to_drop, inplace=True)
df.columns.sort_values()

Index(['Age', 'Amount', 'AmountOfPreviousLoansBeforeLoan', 'Country',
       'CreditScoreEeMini', 'DebtToIncome', 'Education', 'EmploymentStatus',
       'FreeCash', 'HomeOwnershipType', 'IncomeTotal', 'Interest',
       'LanguageCode', 'LiabilitiesTotal', 'LoanDate', 'LoanDuration',
       'LoanNumber', 'MaritalStatus', 'MaturityDate_Original', 'ModelVersion',
       'MonthlyPayment', 'MonthlyPaymentDay', 'NewCreditCustomer',
       'NextPaymentNr', 'NoOfPreviousLoansBeforeLoan', 'NrOfScheduledPayments',
       'OccupationArea', 'PreviousEarlyRepaymentsCountBeforeLoan',
       'PreviousRepaymentsBeforeLoan', 'ProbabilityOfDefault', 'Rating',
       'UseOfLoan', 'VerificationType'],
      dtype='object')

The number of the next payment is not clear in meaning. Therefore, we drop it.

In [26]:
# drop NextPaymentNr
df.drop(columns='NextPaymentNr', inplace=True)
df.columns.sort_values()

Index(['Age', 'Amount', 'AmountOfPreviousLoansBeforeLoan', 'Country',
       'CreditScoreEeMini', 'DebtToIncome', 'Education', 'EmploymentStatus',
       'FreeCash', 'HomeOwnershipType', 'IncomeTotal', 'Interest',
       'LanguageCode', 'LiabilitiesTotal', 'LoanDate', 'LoanDuration',
       'LoanNumber', 'MaritalStatus', 'MaturityDate_Original', 'ModelVersion',
       'MonthlyPayment', 'MonthlyPaymentDay', 'NewCreditCustomer',
       'NoOfPreviousLoansBeforeLoan', 'NrOfScheduledPayments',
       'OccupationArea', 'PreviousEarlyRepaymentsCountBeforeLoan',
       'PreviousRepaymentsBeforeLoan', 'ProbabilityOfDefault', 'Rating',
       'UseOfLoan', 'VerificationType'],
      dtype='object')

We also do not include the monthly payment date since it is not deemed informative

In [27]:
# drop monthly payment date
df.drop(columns='MonthlyPaymentDay', inplace=True)
df.columns.sort_values()

Index(['Age', 'Amount', 'AmountOfPreviousLoansBeforeLoan', 'Country',
       'CreditScoreEeMini', 'DebtToIncome', 'Education', 'EmploymentStatus',
       'FreeCash', 'HomeOwnershipType', 'IncomeTotal', 'Interest',
       'LanguageCode', 'LiabilitiesTotal', 'LoanDate', 'LoanDuration',
       'LoanNumber', 'MaritalStatus', 'MaturityDate_Original', 'ModelVersion',
       'MonthlyPayment', 'NewCreditCustomer', 'NoOfPreviousLoansBeforeLoan',
       'NrOfScheduledPayments', 'OccupationArea',
       'PreviousEarlyRepaymentsCountBeforeLoan',
       'PreviousRepaymentsBeforeLoan', 'ProbabilityOfDefault', 'Rating',
       'UseOfLoan', 'VerificationType'],
      dtype='object')

We exclude also the date when the loan ends as it is irrelevant for prediction and the contractually agreed duration of a loan is already available as a variable. We do keep the loan date as it is assumed to be the date on which the loan enters the secondary market. 

In [28]:
# take the difference between the two dates: LoanDate and MaturityDate_Original in months
df['LoanDate'] = pd.to_datetime(df['LoanDate'])
df['MaturityDate_Original'] = pd.to_datetime(df['MaturityDate_Original'])
df['Duration'] = (df['MaturityDate_Original'] - df['LoanDate'])
df['Duration'] = df['Duration'].dt.days / 30 # convert to months
df.drop(columns=['LoanDate', 'MaturityDate_Original'], inplace=True)
df.columns.sort_values()

Index(['Age', 'Amount', 'AmountOfPreviousLoansBeforeLoan', 'Country',
       'CreditScoreEeMini', 'DebtToIncome', 'Duration', 'Education',
       'EmploymentStatus', 'FreeCash', 'HomeOwnershipType', 'IncomeTotal',
       'Interest', 'LanguageCode', 'LiabilitiesTotal', 'LoanDuration',
       'LoanNumber', 'MaritalStatus', 'ModelVersion', 'MonthlyPayment',
       'NewCreditCustomer', 'NoOfPreviousLoansBeforeLoan',
       'NrOfScheduledPayments', 'OccupationArea',
       'PreviousEarlyRepaymentsCountBeforeLoan',
       'PreviousRepaymentsBeforeLoan', 'ProbabilityOfDefault', 'Rating',
       'UseOfLoan', 'VerificationType'],
      dtype='object')

Convert variables that are supposed to be categorical to categorical.

In [29]:
# check categorical predictors
categorical_columns = df.select_dtypes(include=['object']).columns
categorical_columns

Index(['Country', 'Rating'], dtype='object')

In [30]:
df['VerificationType'].value_counts()

VerificationType
4.0    115157
1.0     52764
3.0      9428
2.0      1828
0.0         8
Name: count, dtype: int64

Also the type of home ownership, the language code, Model Version, Marital Status, Occupation Area, Use of Loan and Verification Type should be categorical.
Let's change the datatype to category (but keeping existing missing data).

In [31]:
# transfom HomeOwnershipType, LanguageCode, MaritalStatus, ModelVersion, OccupationArea, UseOfLoan, VerificationType into categorical variables
# by keeping the missing data
df['HomeOwnershipType'] = df['HomeOwnershipType'].astype('category')
df['LanguageCode'] = df['LanguageCode'].astype('category')
df['MaritalStatus'] = df['MaritalStatus'].astype('category')
df['ModelVersion'] = df['ModelVersion'].astype('category')
df['OccupationArea'] = df['OccupationArea'].astype('category')
df['UseOfLoan'] = df['UseOfLoan'].astype('category')
df['VerificationType'] = df['VerificationType'].astype('category')
df.dtypes

LoanNumber                                   int64
NewCreditCustomer                             bool
VerificationType                          category
LanguageCode                              category
Age                                          int64
Country                                     object
Amount                                     float64
Interest                                   float64
LoanDuration                                 int64
MonthlyPayment                             float64
UseOfLoan                                 category
Education                                  float64
MaritalStatus                             category
EmploymentStatus                           float64
OccupationArea                            category
HomeOwnershipType                         category
IncomeTotal                                float64
LiabilitiesTotal                           float64
DebtToIncome                               float64
FreeCash                       