# Importing Necessary Libraries

In [1]:
import numpy as np
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

## Standardizing and Encoding Dataset

In [2]:
# Load the cleaned loan dataset, using each application's Loan_ID as the index.
df_cleaned = pd.read_excel(
    '../data/dataset_v1.xlsx',
    index_col='Loan_ID',
)
df_cleaned.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,Male,No,0,Graduate,No,5849,0.0,146.369492,360,1,Urban,Y
LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360,1,Rural,N
LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360,1,Urban,Y
LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360,1,Urban,Y
LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360,1,Urban,Y


In [3]:
# Separate the target column (Loan_Status) from the predictor columns.
y = df_cleaned['Loan_Status'].copy()
x = df_cleaned.drop(columns=['Loan_Status'])

In [4]:
# Partition the predictors by dtype: numeric columns vs. everything else.
cat_values = x.select_dtypes(exclude=[np.number])
num_values = x.select_dtypes(include=[np.number])

In [5]:
cat_values.shape, num_values.shape, x.shape

((611, 6), (611, 5), (611, 11))

In [6]:
num_values.columns, cat_values.columns

(Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
        'Loan_Amount_Term', 'Credit_History'],
       dtype='object'),
 Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
        'Property_Area'],
       dtype='object'))

In [7]:
x.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area'],
      dtype='object')

## Handling The Categorical Columns

In [8]:
cat_values.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,Property_Area
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
LP001002,Male,No,0,Graduate,No,Urban
LP001003,Male,Yes,1,Graduate,No,Rural
LP001005,Male,Yes,0,Graduate,Yes,Urban
LP001006,Male,Yes,0,Not Graduate,No,Urban
LP001008,Male,No,0,Graduate,No,Urban


In [9]:
def convert_to_0_1(row):
    """Encode a binary categorical value as an integer flag.

    Despite the parameter name, this is applied element-wise via
    ``Series.apply``, so ``row`` is a single cell value.

    Args:
        row: The value to encode.

    Returns:
        1 if the value equals ``'Male'`` or ``'Yes'``; 0 for anything else.
    """
    if row == 'Male' or row == 'Yes':
        return 1
    return 0

In [10]:
# Replace the Yes/No and Male/Female labels with 1/0 flags, one column at a time.
# NOTE: re-running this cell on already-encoded columns maps every value to 0.
for binary_col in ("Married", "Self_Employed", "Gender"):
    cat_values[binary_col] = cat_values[binary_col].apply(convert_to_0_1)


In [11]:
cat_values.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,Property_Area
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
LP001002,1,0,0,Graduate,0,Urban
LP001003,1,1,1,Graduate,0,Rural
LP001005,1,1,0,Graduate,1,Urban
LP001006,1,1,0,Not Graduate,0,Urban
LP001008,1,0,0,Graduate,0,Urban


In [12]:
# The three columns already encoded as 0/1 flags are kept as they are;
# the remaining categorical columns are set aside for one-hot encoding.
cat_values_to_one_hot = cat_values.drop(columns=["Gender", "Married", "Self_Employed"])
cat_values_ordinal = cat_values[["Gender", "Married", "Self_Employed"]]

In [13]:
cat_values_to_one_hot.head(1)

Unnamed: 0_level_0,Dependents,Education,Property_Area
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LP001002,0,Graduate,Urban


In [14]:
cat_values_to_one_hot.astype('str').info()

<class 'pandas.core.frame.DataFrame'>
Index: 611 entries, LP001002 to LP002990
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Dependents     611 non-null    object
 1   Education      611 non-null    object
 2   Property_Area  611 non-null    object
dtypes: object(3)
memory usage: 19.1+ KB


In [15]:
def custom_combiner(feature, category):
    """Build the output column name for one one-hot encoded category.

    Args:
        feature: Name of the source column.
        category: The category value within that column.

    Returns:
        The string ``"<feature>_<category>"``.
    """
    return f"{feature}_{category}"

In [16]:
# Fit the one-hot encoder on the remaining categorical columns. Values are cast
# to str first so every category within a column has the same type.
one_hot = OneHotEncoder(feature_name_combiner=custom_combiner)
one_hot.fit(cat_values_to_one_hot.astype("str"))

# Persist the fitted encoder. The context manager guarantees the file is
# flushed and closed, instead of leaving the handle open until garbage collection.
# NOTE(review): pickle stores `custom_combiner` by reference, so any code that
# loads this file must have a function with that name importable from the same
# module path — confirm the consumer provides it.
with open("models_and_encoders/one_hot_encoder.pkl", "wb") as encoder_file:
    pickle.dump(one_hot, encoder_file)

# Overwrites the input frame with the encoded (sparse) result; re-running this
# cell after later cells would fit on the already-encoded data.
cat_values_to_one_hot = one_hot.transform(cat_values_to_one_hot.astype("str"))

In [17]:
cat_values_to_one_hot.todense()

matrix([[1., 0., 0., ..., 0., 0., 1.],
        [0., 1., 0., ..., 1., 0., 0.],
        [1., 0., 0., ..., 0., 0., 1.],
        ...,
        [0., 1., 0., ..., 0., 0., 1.],
        [0., 0., 1., ..., 0., 0., 1.],
        [1., 0., 0., ..., 0., 1., 0.]])

In [18]:
# Wrap the encoded sparse result in a DataFrame labelled with the generated
# feature names and the original Loan_ID index. `toarray()` yields a plain
# ndarray rather than the discouraged `np.matrix` that `todense()` returns.
cat_values_to_one_hot = pd.DataFrame(
    cat_values_to_one_hot.toarray(),
    columns=one_hot.get_feature_names_out(),
    index=cat_values.index,
)

In [19]:
one_hot.get_feature_names_out()

array(['Dependents_0', 'Dependents_1', 'Dependents_2', 'Dependents_3+',
       'Education_Graduate', 'Education_Not Graduate',
       'Property_Area_Rural', 'Property_Area_Semiurban',
       'Property_Area_Urban'], dtype=object)

In [20]:
cat_values_to_one_hot.head()

Unnamed: 0_level_0,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
LP001002,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
LP001003,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
LP001005,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
LP001006,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
LP001008,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [21]:
cat_values_ordinal.head()

Unnamed: 0_level_0,Gender,Married,Self_Employed
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LP001002,1,0,0
LP001003,1,1,0
LP001005,1,1,1
LP001006,1,1,0
LP001008,1,0,0


In [22]:
# Put the 0/1 flag columns and the one-hot columns side by side (aligned on the index).
perfect_cat_values = pd.concat(
    [cat_values_ordinal, cat_values_to_one_hot],
    axis="columns",
)

In [23]:
perfect_cat_values.head()

Unnamed: 0_level_0,Gender,Married,Self_Employed,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,1,0,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
LP001003,1,1,0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
LP001005,1,1,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
LP001006,1,1,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
LP001008,1,0,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


## Handling The Numerical Columns

In [24]:
num_values.sample(10)

Unnamed: 0_level_0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LP001350,13650,0.0,146.369492,360,1
LP001541,6000,0.0,160.0,360,1
LP002082,5818,2160.0,184.0,360,1
LP001811,3406,4417.0,123.0,360,1
LP002170,5000,3667.0,236.0,360,1
LP001106,2275,2067.0,146.369492,360,1
LP001702,3418,0.0,127.0,360,1
LP002119,4554,1229.0,158.0,360,1
LP001883,3418,0.0,135.0,360,1
LP002833,4467,0.0,120.0,360,0


In [25]:
num_values.Credit_History.value_counts()

Credit_History
1    494
0    117
Name: count, dtype: int64

In [26]:
df_cleaned.shape

(611, 12)

In [27]:
# Credit_History only takes the values 0 and 1, so it joins the categorical
# features and is removed from the columns that will be standardized.
# NOTE: this cell is not re-runnable; a second run fails because the column is gone.
perfect_cat_values = pd.concat(
    [perfect_cat_values, num_values["Credit_History"]],
    axis="columns",
)
num_values = num_values.drop(columns="Credit_History")

In [28]:
num_values.sample()

Unnamed: 0_level_0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LP001770,3189,2598.0,120.0,360


In [29]:
perfect_cat_values.head(2)

Unnamed: 0_level_0,Gender,Married,Self_Employed,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Credit_History
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
LP001002,1,0,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1
LP001003,1,1,0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1


In [30]:
num_values.head(2)

Unnamed: 0_level_0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LP001002,5849,0.0,146.369492,360
LP001003,4583,1508.0,128.0,360


In [31]:
# Remember the numeric column names so they can be re-attached after scaling.
num_values_cols = num_values.columns
num_values_cols.tolist()

['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']

In [32]:
# Fit the scaler on the continuous numeric columns.
# NOTE(review): the scaler is fitted on every row of the dataset; if a
# train/test split happens downstream, the held-out rows have already
# influenced the scaling statistics — confirm this is acceptable.
std = StandardScaler()
std.fit(num_values)

# Persist the fitted scaler. The context manager guarantees the file is
# flushed and closed, instead of leaving the handle open until garbage collection.
with open("models_and_encoders/credit_risk_scaler.pkl", "wb") as scaler_file:
    pickle.dump(std, scaler_file)

num_values_arrays = std.transform(num_values)

In [33]:
std.feature_names_in_

array(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term'], dtype=object)

### `std.fit` calculates the mean and standard deviation of each column in the dataset
### `std.transform` applies each column's mean and standard deviation to every value in that column using the formula -> `(x - mean) / standard deviation`

In [34]:
num_values.head()

Unnamed: 0_level_0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LP001002,5849,0.0,146.369492,360
LP001003,4583,1508.0,128.0,360
LP001005,3000,0.0,66.0,360
LP001006,2583,2358.0,120.0,360
LP001008,6000,0.0,141.0,360


In [35]:
num_values_arrays

array([[ 7.34964378e-02, -5.55745555e-01, -3.37663141e-16,
         2.75515640e-01],
       [-1.33509947e-01, -4.09552064e-02, -2.18238112e-01,
         2.75515640e-01],
       [-3.92349684e-01, -5.55745555e-01, -9.54826979e-01,
         2.75515640e-01],
       ...,
       [ 4.36983952e-01, -4.73816056e-01,  1.26682009e+00,
         2.75515640e-01],
       [ 3.57026510e-01, -5.55745555e-01,  4.82709358e-01,
         2.75515640e-01],
       [-1.33509947e-01, -5.55745555e-01, -1.58835784e-01,
         2.75515640e-01]])

In [36]:
# Re-attach the column names and Loan_ID index to the scaled array.
standardized_num_values = pd.DataFrame(
    data=num_values_arrays,
    index=num_values.index,
    columns=list(num_values_cols),
)
standardized_num_values.head()

Unnamed: 0_level_0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LP001002,0.073496,-0.555746,-3.376631e-16,0.275516
LP001003,-0.13351,-0.040955,-0.2182381,0.275516
LP001005,-0.39235,-0.555746,-0.954827,0.275516
LP001006,-0.460534,0.249212,-0.3132818,0.275516
LP001008,0.098187,-0.555746,-0.06379206,0.275516


In [37]:
# Full feature matrix: standardized numeric columns followed by the encoded
# categorical columns.
restructured_x = pd.concat(
    [standardized_num_values, perfect_cat_values],
    axis="columns",
)
restructured_x.head()

Unnamed: 0_level_0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Gender,Married,Self_Employed,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Credit_History
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
LP001002,0.073496,-0.555746,-3.376631e-16,0.275516,1,0,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1
LP001003,-0.13351,-0.040955,-0.2182381,0.275516,1,1,0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1
LP001005,-0.39235,-0.555746,-0.954827,0.275516,1,1,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1
LP001006,-0.460534,0.249212,-0.3132818,0.275516,1,1,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1
LP001008,0.098187,-0.555746,-0.06379206,0.275516,1,0,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1


In [38]:
# Append the target column to the processed features.
full_df = pd.concat(
    [restructured_x, y],
    axis="columns",
)
full_df.head()

Unnamed: 0_level_0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Gender,Married,Self_Employed,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Credit_History,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
LP001002,0.073496,-0.555746,-3.376631e-16,0.275516,1,0,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1,Y
LP001003,-0.13351,-0.040955,-0.2182381,0.275516,1,1,0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1,N
LP001005,-0.39235,-0.555746,-0.954827,0.275516,1,1,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1,Y
LP001006,-0.460534,0.249212,-0.3132818,0.275516,1,1,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1,Y
LP001008,0.098187,-0.555746,-0.06379206,0.275516,1,0,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1,Y


In [39]:
len(full_df.columns)

18

In [40]:
# Encode the target as 1 for "Y" and 0 for anything else. The vectorized
# comparison replaces a per-element lambda whose parameter shadowed the
# notebook-level feature frame `x`.
# NOTE: re-running this cell on the already-encoded column maps every value to 0.
full_df["Loan_Status"] = full_df["Loan_Status"].eq("Y").astype("int64")

In [41]:
full_df.head(10)

Unnamed: 0_level_0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Gender,Married,Self_Employed,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Credit_History,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
LP001002,0.073496,-0.555746,-3.376631e-16,0.275516,1,0,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1,1
LP001003,-0.13351,-0.040955,-0.2182381,0.275516,1,1,0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1,0
LP001005,-0.39235,-0.555746,-0.954827,0.275516,1,1,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1,1
LP001006,-0.460534,0.249212,-0.3132818,0.275516,1,1,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1,1
LP001008,0.098187,-0.555746,-0.06379206,0.275516,1,0,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1,1
LP001011,0.002859,0.876655,1.433147,0.275516,1,1,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1,1
LP001013,-0.501412,-0.038224,-0.6102935,0.275516,1,1,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1,1
LP001014,-0.386463,0.299052,0.1381759,0.275516,1,1,0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0,0
LP001018,-0.227856,-0.03481,0.2569805,0.275516,1,1,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1,1
LP001020,1.216773,3.188433,2.407345,0.275516,1,1,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1,0


In [42]:
# Write the processed dataset. `index=False` (the documented boolean form of
# the previous `index=None`) means the Loan_ID index is NOT written to the file.
full_df.to_excel("../data/dataset_v2.xlsx", index=False)