### **Import the libraries**

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [2]:
#https://www.kaggle.com/datasets/zhijinzhai/loandata
loanData = pd.read_csv('Loan_payments_data.csv')

In [3]:
loanData.head()

Unnamed: 0,Loan_ID,loan_status,Principal,terms,effective_date,due_date,paid_off_time,past_due_days,age,education,Gender
0,xqd20166231,PAIDOFF,1000,30,9/8/2016,10/7/2016,9/14/2016 19:31,,45,High School or Below,male
1,xqd20168902,PAIDOFF,1000,30,9/8/2016,10/7/2016,10/7/2016 9:00,,50,Bechalor,female
2,xqd20160003,PAIDOFF,1000,30,9/8/2016,10/7/2016,9/25/2016 16:58,,33,Bechalor,female
3,xqd20160004,PAIDOFF,1000,15,9/8/2016,9/22/2016,9/22/2016 20:00,,27,college,male
4,xqd20160005,PAIDOFF,1000,30,9/9/2016,10/8/2016,9/23/2016 21:36,,28,college,female


In [4]:
loanData.shape

(500, 11)

In [5]:
loanData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Loan_ID         500 non-null    object 
 1   loan_status     500 non-null    object 
 2   Principal       500 non-null    int64  
 3   terms           500 non-null    int64  
 4   effective_date  500 non-null    object 
 5   due_date        500 non-null    object 
 6   paid_off_time   400 non-null    object 
 7   past_due_days   200 non-null    float64
 8   age             500 non-null    int64  
 9   education       500 non-null    object 
 10  Gender          500 non-null    object 
dtypes: float64(1), int64(3), object(7)
memory usage: 43.1+ KB


In [6]:
loanData.isnull().sum()

Loan_ID             0
loan_status         0
Principal           0
terms               0
effective_date      0
due_date            0
paid_off_time     100
past_due_days     300
age                 0
education           0
Gender              0
dtype: int64

### **Preprocessing**

In [7]:
loanData['loan_status'].unique()

array(['PAIDOFF', 'COLLECTION', 'COLLECTION_PAIDOFF'], dtype=object)

In [8]:
{column: len(loanData[column].unique()) for column in loanData.columns}

{'Loan_ID': 500,
 'loan_status': 3,
 'Principal': 6,
 'terms': 3,
 'effective_date': 7,
 'due_date': 25,
 'paid_off_time': 321,
 'past_due_days': 34,
 'age': 33,
 'education': 4,
 'Gender': 2}

In [9]:
loanData = loanData.drop('Loan_ID', axis=1)

In [10]:
# Create date/time columns
for column in ['effective_date', 'due_date', 'paid_off_time']:
    loanData[column] = pd.to_datetime(loanData[column])

#get the year, month and day
loanData['efective_year'] = loanData['effective_date'].apply(lambda x:x.year)
loanData['efective_month'] = loanData['effective_date'].apply(lambda x:x.month)
loanData['efective_day'] = loanData['effective_date'].apply(lambda x:x.day)

loanData['due_year'] = loanData['due_date'].apply(lambda x:x.year)
loanData['due_month'] = loanData['due_date'].apply(lambda x:x.month)
loanData['due_day'] = loanData['due_date'].apply(lambda x:x.day)

loanData['paid_off_year'] = loanData['paid_off_time'].apply(lambda x:x.year)
loanData['paid_off_month'] = loanData['paid_off_time'].apply(lambda x:x.month)
loanData['paid_off_day'] = loanData['paid_off_time'].apply(lambda x:x.day)
loanData['paid_off_hour'] = loanData['paid_off_time'].apply(lambda x:x.hour)

In [11]:
loanData = loanData.drop(['effective_date', 'due_date', 'paid_off_time'], axis=1)

In [12]:
#Handling the missing values

for column in ['past_due_days', 'paid_off_year', 'paid_off_month', 'paid_off_day', 'paid_off_hour']:
    loanData[column] = loanData[column].fillna(loanData[column].mean())

In [13]:
loanData.isnull().sum()

loan_status       0
Principal         0
terms             0
past_due_days     0
age               0
education         0
Gender            0
efective_year     0
efective_month    0
efective_day      0
due_year          0
due_month         0
due_day           0
paid_off_year     0
paid_off_month    0
paid_off_day      0
paid_off_hour     0
dtype: int64

In [14]:
# changing the catergorical features(Gender and education) to numeric

def binary_encode(data, column, positive_value):
    loanData[column] = loanData[column].apply(lambda x: 1 if x == positive_value else 0)
    return loanData


def ordinal_encode(data, column, ordering):
    loanData[column] = loanData[column].apply(lambda x: ordering.index(x))
    return loanData

In [15]:
#Binary encode the Gender feature

loanData = binary_encode(loanData, 'Gender', positive_value='male')


#Ordinal encode the education feature
education_ordering = [
    'High School or Below',
    'college',
    'Bechalor',
    'Master or Above'
]
loanData = ordinal_encode(loanData, 'education', ordering=education_ordering)

In [16]:
loanData['loan_status'].unique()

array(['PAIDOFF', 'COLLECTION', 'COLLECTION_PAIDOFF'], dtype=object)

In [17]:
#Encode the label(loan_status)
label_mapping = {'COLLECTION': 0, 'PAIDOFF': 1, 'COLLECTION_PAIDOFF' : 2}

loanData['loan_status'] = loanData['loan_status'].replace(label_mapping)

In [18]:
loanData.head()

Unnamed: 0,loan_status,Principal,terms,past_due_days,age,education,Gender,efective_year,efective_month,efective_day,due_year,due_month,due_day,paid_off_year,paid_off_month,paid_off_day,paid_off_hour
0,1,1000,30,36.01,45,0,1,2016,9,8,2016,10,7,2016.0,9.0,14.0,19.0
1,1,1000,30,36.01,50,2,0,2016,9,8,2016,10,7,2016.0,10.0,7.0,9.0
2,1,1000,30,36.01,33,2,0,2016,9,8,2016,10,7,2016.0,9.0,25.0,16.0
3,1,1000,15,36.01,27,1,1,2016,9,8,2016,9,22,2016.0,9.0,22.0,20.0
4,1,1000,30,36.01,28,1,0,2016,9,9,2016,10,8,2016.0,9.0,23.0,21.0


In [19]:
#separating the label from the features
X = loanData.drop('loan_status', axis=1)
y = loanData['loan_status']

In [20]:
# Feature Scaling
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [21]:
X

Unnamed: 0,Principal,terms,past_due_days,age,education,Gender,efective_year,efective_month,efective_day,due_year,due_month,due_day,paid_off_year,paid_off_month,paid_off_day,paid_off_hour
0,0.493377,0.897891,0.000000,2.284043,-1.022825,0.426653,0.0,0.0,-3.126073,0.0,0.664986,-1.303142,0.0,-1.035098,-0.463997,1.339835
1,0.493377,0.897891,0.000000,3.106587,1.771779,-2.343823,0.0,0.0,-3.126073,0.0,0.664986,-1.303142,0.0,0.690066,-1.475829,-1.072109
2,0.493377,0.897891,0.000000,0.309935,1.771779,-2.343823,0.0,0.0,-3.126073,0.0,0.664986,-1.303142,0.0,-1.035098,1.126025,0.616252
3,0.493377,-0.978972,0.000000,-0.677119,0.374477,0.426653,0.0,0.0,-3.126073,0.0,-1.094236,0.724148,0.0,-1.035098,0.692382,1.581030
4,0.493377,0.897891,0.000000,-0.512610,0.374477,-2.343823,0.0,0.0,-2.209336,0.0,0.664986,-1.167989,0.0,-1.035098,0.836930,1.822224
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0.493377,0.897891,-1.780899,-0.512610,-1.022825,0.426653,0.0,0.0,0.540875,0.0,0.664986,-0.762531,0.0,0.690066,-0.463997,1.339835
496,0.493377,-0.978972,-1.187446,-0.841628,-1.022825,0.426653,0.0,0.0,0.540875,0.0,-1.094236,1.264758,0.0,0.690066,-1.042187,1.581030
497,-1.243866,-0.978972,-1.780899,-0.183592,0.374477,0.426653,0.0,0.0,0.540875,0.0,-1.094236,1.264758,0.0,-1.035098,1.704214,-0.589721
498,0.493377,0.897891,-1.888799,1.132480,0.374477,-2.343823,0.0,0.0,0.540875,0.0,2.424209,-0.897684,0.0,2.415229,-0.897640,2.063419


In [22]:
y

0      1
1      1
2      1
3      1
4      1
      ..
495    2
496    2
497    2
498    2
499    2
Name: loan_status, Length: 500, dtype: int64

In [23]:
y.value_counts()

loan_status
1    300
0    100
2    100
Name: count, dtype: int64

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123, stratify=y )

### **Training**

In [25]:
models = [
LogisticRegression(),
SVC(),
DecisionTreeClassifier(),
MLPClassifier(),
RandomForestClassifier(),
XGBClassifier()
]

for model in models:
    model.fit(X_train, y_train)



In [26]:
model_names = [
    'LogisticRegression',
    'SVC',
    'DecisionTreeClassifier',
    'MLPClassifier',
    'RandomForestClassifier',
    'XGBClassifier'
]

for model, name in zip(models, model_names):
    print(name + ": {:.2f}%".format(model.score(X_test, y_test) *100))

LogisticRegression: 98.40%
SVC: 98.40%
DecisionTreeClassifier: 97.60%
MLPClassifier: 100.00%
RandomForestClassifier: 99.20%
XGBClassifier: 100.00%


All the models performed excellently well after they were evaluated using the accuracy score especially the Neural Network(MLPClassifier) and Xgboost models with accuracy of 100%.