# Importing Necessary Libraries

In [64]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# Remove pandas' display limits so that cell outputs show every row and every column.
# NOTE(review): with max_rows=None, displaying a whole frame prints all of its rows;
# keep previews bounded with .head() / .sample().
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

## Standardizing and Encoding Dataset

In [65]:
# Read version 1 of the loan dataset, using Loan_ID as the row index, then preview it.
dataset_v1_path = '../data/dataset_v1.xlsx'
df_cleaned = pd.read_excel(dataset_v1_path, index_col='Loan_ID')
df_cleaned.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,Male,No,0,Graduate,No,5849,0.0,146.369492,360,1,Urban,Y
LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360,1,Rural,N
LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360,1,Urban,Y
LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360,1,Urban,Y
LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360,1,Urban,Y


#### Converting the Credit_History column from int64 to float64

In [66]:
# Show the dtype currently stored for the Credit_History column.
print(df_cleaned.dtypes['Credit_History'])

int64


In [67]:
# Cast Credit_History to float; every other column keeps its dtype.
df_cleaned = df_cleaned.astype({'Credit_History': float})


In [68]:
# Show the dtype of Credit_History again to verify the cast.
print(df_cleaned.dtypes['Credit_History'])

float64


In [69]:
# Separate the target labels (y) from the predictor columns (x).
y = df_cleaned['Loan_Status'].copy()
x = df_cleaned.drop(columns=['Loan_Status'])

In [70]:
# Partition the predictors by dtype: non-numeric columns vs. numeric columns.
cat_values = x.select_dtypes(exclude='number')
num_values = x.select_dtypes(include='number')

In [71]:
# Shapes of the non-numeric part, the numeric part and the full predictor frame;
# the two column counts should add up to the number of columns in x.
cat_values.shape, num_values.shape, x.shape

((611, 6), (611, 5), (611, 11))

## Handling The Categorical Columns

In [72]:
# Preview the non-numeric columns before any encoding is applied.
cat_values.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,Property_Area
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
LP001002,Male,No,0,Graduate,No,Urban
LP001003,Male,Yes,1,Graduate,No,Rural
LP001005,Male,Yes,0,Graduate,Yes,Urban
LP001006,Male,Yes,0,Not Graduate,No,Urban
LP001008,Male,No,0,Graduate,No,Urban


In [73]:
def convert_to_0_1(row):
    """Encode a two-level label as an integer flag.

    Despite the parameter name, ``row`` is a single cell value rather than a
    DataFrame row.

    Returns:
        1 when the value equals 'Male' or 'Yes', 0 for any other value.
    """
    return int(row in ('Male', 'Yes'))

In [74]:
# Encode the three two-level columns as 0/1 flags ('Male' / 'Yes' -> 1, anything else -> 0).
# The raw labels are read from `x`, which is never modified, and a new frame is built
# with .assign(). Re-running this cell therefore yields the same result instead of
# re-encoding the already-encoded 0/1 values into all zeros.
cat_values = cat_values.assign(
    **{
        column: x[column].apply(convert_to_0_1)
        for column in ("Married", "Self_Employed", "Gender")
    }
)


In [75]:
# Check that Gender, Married and Self_Employed now hold 0/1 flags.
cat_values.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,Property_Area
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
LP001002,1,0,0,Graduate,0,Urban
LP001003,1,1,1,Graduate,0,Rural
LP001005,1,1,0,Graduate,1,Urban
LP001006,1,1,0,Not Graduate,0,Urban
LP001008,1,0,0,Graduate,0,Urban


In [76]:
# Split the categorical frame into the columns that are already 0/1 flags
# and the remaining columns that still need one-hot encoding.
already_encoded = ["Gender", "Married", "Self_Employed"]
cat_values_to_one_hot = cat_values.drop(columns=already_encoded)
cat_values_ordinal = cat_values[already_encoded].copy()

In [77]:
# One-hot encode the remaining categorical columns (one indicator column per level).
# NOTE(review): the indicators come out as True/False while the hand-encoded flags are 0/1,
# and every level is kept, so each group of indicators always sums to 1 — consider
# dtype=int and/or drop_first=True if the downstream model is sensitive to that.
cat_values_to_one_hot = pd.get_dummies(cat_values_to_one_hot)

In [78]:
# Preview the one-hot encoded columns.
cat_values_to_one_hot.head()

Unnamed: 0_level_0,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
LP001002,True,False,False,False,True,False,False,False,True
LP001003,False,True,False,False,True,False,True,False,False
LP001005,True,False,False,False,True,False,False,False,True
LP001006,True,False,False,False,False,True,False,False,True
LP001008,True,False,False,False,True,False,False,False,True


In [79]:
# Preview the 0/1 flag columns (Gender, Married, Self_Employed).
cat_values_ordinal.head()

Unnamed: 0_level_0,Gender,Married,Self_Employed
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LP001002,1,0,0
LP001003,1,1,0
LP001005,1,1,1
LP001006,1,1,0
LP001008,1,0,0


In [80]:
# Place the 0/1 flag columns and the one-hot columns side by side, aligned on the index.
perfect_cat_values = pd.concat(
    [cat_values_ordinal, cat_values_to_one_hot],
    axis="columns",
)

In [81]:
# Preview the fully encoded categorical frame.
perfect_cat_values.head()

Unnamed: 0_level_0,Gender,Married,Self_Employed,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,1,0,0,True,False,False,False,True,False,False,False,True
LP001003,1,1,0,False,True,False,False,True,False,True,False,False
LP001005,1,1,1,True,False,False,False,True,False,False,False,True
LP001006,1,1,0,True,False,False,False,False,True,False,False,True
LP001008,1,0,0,True,False,False,False,True,False,False,False,True


## Handling The Numerical Columns

In [82]:
# Preview the numeric columns before scaling.
num_values.head()

Unnamed: 0_level_0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LP001002,5849,0.0,146.369492,360,1.0
LP001003,4583,1508.0,128.0,360,1.0
LP001005,3000,0.0,66.0,360,1.0
LP001006,2583,2358.0,120.0,360,1.0
LP001008,6000,0.0,141.0,360,1.0


In [83]:
# Count how often each distinct Credit_History value occurs.
num_values["Credit_History"].value_counts()

Credit_History
1.0    494
0.0    117
Name: count, dtype: int64

In [84]:
# (rows, columns) of the full dataset, target column included.
df_cleaned.shape

(611, 12)

In [85]:
# Credit_History only takes the values 0.0 and 1.0, so it is kept with the encoded
# categorical columns and left out of the columns that will be standardized.
# The column is taken from the unmodified `x`, and any copy added by a previous run is
# dropped first, so re-running this cell neither duplicates the column nor fails on the
# column that has already been removed from num_values.
perfect_cat_values = pd.concat(
    [
        perfect_cat_values.drop(columns="Credit_History", errors="ignore"),
        x["Credit_History"],
    ],
    axis=1,
)
num_values = num_values.drop(columns="Credit_History", errors="ignore")

In [86]:
# Check that Credit_History was appended as the last categorical column.
perfect_cat_values.head(2)

Unnamed: 0_level_0,Gender,Married,Self_Employed,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Credit_History
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
LP001002,1,0,0,True,False,False,False,True,False,False,False,True,1.0
LP001003,1,1,0,False,True,False,False,True,False,True,False,False,1.0


In [87]:
# Check that Credit_History is no longer among the numeric columns.
num_values.head(2)

Unnamed: 0_level_0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LP001002,5849,0.0,146.369492,360
LP001003,4583,1508.0,128.0,360


In [88]:
# Keep the numeric column labels and display them as a plain list.
num_values_cols = num_values.columns
num_values_cols.tolist()

['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']

In [89]:
# Standardize the numeric columns: learn each column's mean and standard deviation,
# then rescale the same data with them. The result is a NumPy array.
# NOTE(review): the scaler is fitted on every row. If a train/test split is made later,
# the held-out rows will already have influenced the scaling — consider fitting on the
# training split only.
std = StandardScaler()
std.fit(num_values)
num_values_arrays = std.transform(num_values)

### `std.fit` calculates the mean and standard deviation of each column in the dataset
### `std.transform` applies that mean and standard deviation to every value of the column using the formula -> `(x - mean) / standard deviation`

In [90]:
# Inspect the scaled values; this is a bare array, so the column labels and the index are not shown.
num_values_arrays

array([[ 7.34964378e-02, -5.55745555e-01, -3.37663141e-16,
         2.75515640e-01],
       [-1.33509947e-01, -4.09552064e-02, -2.18238112e-01,
         2.75515640e-01],
       [-3.92349684e-01, -5.55745555e-01, -9.54826979e-01,
         2.75515640e-01],
       ...,
       [ 4.36983952e-01, -4.73816056e-01,  1.26682009e+00,
         2.75515640e-01],
       [ 3.57026510e-01, -5.55745555e-01,  4.82709358e-01,
         2.75515640e-01],
       [-1.33509947e-01, -5.55745555e-01, -1.58835784e-01,
         2.75515640e-01]])

In [91]:
# Wrap the scaled array back into a DataFrame, restoring the column labels
# and the index of the unscaled numeric frame.
standardized_num_values = pd.DataFrame(
    data=num_values_arrays,
    index=num_values.index,
    columns=num_values_cols.tolist(),
)
standardized_num_values.head()

Unnamed: 0_level_0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LP001002,0.073496,-0.555746,-3.376631e-16,0.275516
LP001003,-0.13351,-0.040955,-0.2182381,0.275516
LP001005,-0.39235,-0.555746,-0.954827,0.275516
LP001006,-0.460534,0.249212,-0.3132818,0.275516
LP001008,0.098187,-0.555746,-0.06379206,0.275516


In [92]:
# Reassemble the predictors: scaled numeric columns first, encoded categorical columns after.
restructured_x = pd.concat(
    [standardized_num_values, perfect_cat_values],
    axis="columns",
)
restructured_x.head()

Unnamed: 0_level_0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Gender,Married,Self_Employed,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Credit_History
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
LP001002,0.073496,-0.555746,-3.376631e-16,0.275516,1,0,0,True,False,False,False,True,False,False,False,True,1.0
LP001003,-0.13351,-0.040955,-0.2182381,0.275516,1,1,0,False,True,False,False,True,False,True,False,False,1.0
LP001005,-0.39235,-0.555746,-0.954827,0.275516,1,1,1,True,False,False,False,True,False,False,False,True,1.0
LP001006,-0.460534,0.249212,-0.3132818,0.275516,1,1,0,True,False,False,False,False,True,False,False,True,1.0
LP001008,0.098187,-0.555746,-0.06379206,0.275516,1,0,0,True,False,False,False,True,False,False,False,True,1.0


In [93]:
# Append the target labels as the last column of the processed predictors.
full_df = pd.concat(
    [restructured_x, y],
    axis="columns",
)
full_df.head()

Unnamed: 0_level_0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Gender,Married,Self_Employed,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Credit_History,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
LP001002,0.073496,-0.555746,-3.376631e-16,0.275516,1,0,0,True,False,False,False,True,False,False,False,True,1.0,Y
LP001003,-0.13351,-0.040955,-0.2182381,0.275516,1,1,0,False,True,False,False,True,False,True,False,False,1.0,N
LP001005,-0.39235,-0.555746,-0.954827,0.275516,1,1,1,True,False,False,False,True,False,False,False,True,1.0,Y
LP001006,-0.460534,0.249212,-0.3132818,0.275516,1,1,0,True,False,False,False,False,True,False,False,True,1.0,Y
LP001008,0.098187,-0.555746,-0.06379206,0.275516,1,0,0,True,False,False,False,True,False,False,False,True,1.0,Y


In [94]:
# Number of columns in the assembled frame (predictors plus target).
full_df.shape[1]

18

In [95]:
# Encode the target as 1 for "Y" and 0 for anything else.
# The labels are read from the untouched `y` series (aligned on the index), so re-running
# this cell does not turn the already-encoded column into all zeros.
full_df = full_df.assign(Loan_Status=(y == "Y").astype("int64"))

In [96]:
# Spot-check ten random rows; the fixed seed makes the displayed sample reproducible
# across kernel restarts.
full_df.sample(10, random_state=42)

Unnamed: 0_level_0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Gender,Married,Self_Employed,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Credit_History,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
LP001671,-0.324329,0.405561,-0.396445,0.275516,0,1,0,True,False,False,False,True,False,False,True,False,1.0,1
LP001924,-0.366515,0.486466,-0.681576,0.275516,1,0,0,True,False,False,False,True,False,True,False,False,1.0,1
LP001404,-0.365043,0.223609,0.090654,0.275516,0,1,0,True,False,False,False,True,False,False,True,False,1.0,1
LP002813,2.302985,-0.555746,5.389342,0.275516,0,1,1,False,True,False,False,True,False,False,True,False,1.0,1
LP002841,-0.365207,0.148848,-0.503369,0.275516,1,1,0,True,False,False,False,True,False,False,False,True,0.0,0
LP002522,-0.474106,-0.555746,-0.634054,0.275516,0,0,1,True,False,False,False,True,False,False,False,True,0.0,1
LP001146,-0.450396,0.618577,-0.313282,0.275516,0,1,0,True,False,False,False,True,False,False,False,True,0.0,0
LP002863,0.164573,-0.555746,0.043132,0.275516,1,1,0,False,False,False,True,True,False,False,True,False,1.0,0
LP002619,-0.259251,-0.04949,-0.26576,-0.659707,1,1,0,True,False,False,False,False,True,False,True,False,1.0,1
LP001841,-0.460534,0.18401,-0.503369,0.275516,1,0,1,True,False,False,False,False,True,True,False,False,1.0,1


In [97]:
# Save the processed dataset as version 2.
# The index is not written, so the Loan_ID labels are not part of the saved file.
full_df.to_excel("../data/dataset_v2.xlsx", index=False)