## Importing Necessary Libraries

In [7]:
import pandas as pd

## Checking out the dataset

In [8]:
# Load the raw loan dataset from the Excel workbook (path is relative to the working directory).
raw_dataset_path = "../data/dataset_v0.xlsx"
df = pd.read_excel(raw_dataset_path)

In [9]:
# Preview the first 10 rows to see the available columns and typical values.
df.head(10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
6,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y
7,LP001014,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,N
8,LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,Y
9,LP001020,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,N


In [10]:
# Count the missing values in each column.
df.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [11]:
# Dataset size (rows, columns) before any cleaning.
df.shape

(614, 13)

## Cleaning The Dataset

In [12]:
# Drop the rows that have no Married value. Reassign the result instead of
# using inplace=True: in-place mutation has no performance benefit and
# prevents method chaining.
df = df.dropna(subset=["Married"])

In [13]:
# Size after dropping the rows with a missing Married value.
df.shape

(611, 13)

In [14]:
# Distinct Gender values (NaN shows up here when entries are missing).
df["Gender"].unique()

array(['Male', 'Female', nan], dtype=object)

In [15]:
# Frequency of each Gender category (missing values are not counted by default).
df["Gender"].value_counts()

Gender
Male      487
Female    111
Name: count, dtype: int64

In [16]:
# Impute missing Gender with "Male", the most frequent category in the counts above.
df["Gender"] = df["Gender"].fillna("Male")

In [17]:
# Distinct Dependents values (NaN shows up here when entries are missing).
df["Dependents"].unique()

array([0, 1, 2, '3+', nan], dtype=object)

In [18]:
# Frequency of each Dependents value (missing values are not counted by default).
df["Dependents"].value_counts()

Dependents
0     345
1     102
2     101
3+     51
Name: count, dtype: int64

In [19]:
# Impute missing Dependents with 0, the most frequent value in the counts above.
# Note: the column mixes integers (0, 1, 2) with the string '3+', so it remains object dtype.
df["Dependents"] = df["Dependents"].fillna(0)

In [20]:
# Distinct Self_Employed values (NaN shows up here when entries are missing).
df["Self_Employed"].unique()

array(['No', 'Yes', nan], dtype=object)

In [21]:
# Frequency of each Self_Employed category (missing values are not counted by default).
df["Self_Employed"].value_counts()

Self_Employed
No     497
Yes     82
Name: count, dtype: int64

In [22]:
# Show the rows whose Self_Employed value is missing.
df[df["Self_Employed"].isna()]

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
11,LP001027,Male,Yes,2,Graduate,,2500,1840.0,109.0,360.0,1.0,Urban,Y
19,LP001041,Male,Yes,0,Graduate,,2600,3500.0,115.0,,1.0,Urban,Y
24,LP001052,Male,Yes,1,Graduate,,3717,2925.0,151.0,360.0,,Semiurban,N
29,LP001087,Female,No,2,Graduate,,3750,2083.0,120.0,360.0,1.0,Semiurban,Y
30,LP001091,Male,Yes,1,Graduate,,4166,3369.0,201.0,360.0,,Urban,N
95,LP001326,Male,No,0,Graduate,,6782,0.0,,360.0,,Urban,N
107,LP001370,Male,No,0,Not Graduate,,7333,0.0,120.0,360.0,1.0,Rural,N
111,LP001387,Female,Yes,0,Graduate,,2929,2333.0,139.0,360.0,1.0,Semiurban,Y
114,LP001398,Male,No,0,Graduate,,5050,0.0,118.0,360.0,1.0,Semiurban,Y
158,LP001546,Male,No,0,Graduate,,2980,2083.0,120.0,360.0,1.0,Rural,Y


In [23]:
# Partition the rows by index label (< 600 vs >= 600) so that each part can be
# imputed with a different Self_Employed value.
# Take explicit copies: the later `.loc` assignments then write to independent
# frames rather than to slices of `df`, which avoids SettingWithCopyWarning.
df_top = df[df.index < 600].copy()
df_bottom = df[df.index >= 600].copy()

In [24]:
# Sizes of the two partitions.
df_top.shape, df_bottom.shape

((597, 13), (14, 13))

In [25]:
# Position-based imputation: missing Self_Employed becomes "No" in the first
# partition and "Yes" in the second.
# NOTE(review): presumably meant to roughly keep the observed No/Yes ratio — confirm.
df_top.loc[:, "Self_Employed"] = df_top["Self_Employed"].fillna("No")
df_bottom.loc[:, "Self_Employed"] = df_bottom["Self_Employed"].fillna("Yes")

In [26]:
# Stack the two imputed partitions back into one frame (first partition on top,
# original index labels kept).
df = pd.concat([df_top, df_bottom], axis=0)

In [27]:
# Confirm that recombining the partitions kept every row.
df.shape

(611, 13)

In [28]:
# Self_Employed distribution after imputation.
df["Self_Employed"].value_counts()

Self_Employed
No     527
Yes     84
Name: count, dtype: int64

Replace null values of `LoanAmount` with the column mean.

In [29]:
# Mean of the observed loan amounts (NaN entries are skipped by default).
loan_amt_mean = df["LoanAmount"].mean()

In [30]:
# Fill the missing loan amounts with the mean computed in the previous cell.
df["LoanAmount"] = df["LoanAmount"].fillna(loan_amt_mean)

In [31]:
# Distinct loan terms present in the data (NaN shows up here when entries are missing).
df["Loan_Amount_Term"].unique()

array([360., 120., 240.,  nan, 180.,  60., 300., 480.,  36.,  84.,  12.])

In [32]:
# Frequency of each loan term (missing values are not counted by default).
df["Loan_Amount_Term"].value_counts()

Loan_Amount_Term
360.0    511
180.0     44
480.0     14
300.0     13
84.0       4
120.0      3
240.0      3
60.0       2
36.0       2
12.0       1
Name: count, dtype: int64

In [33]:
# Impute missing loan terms with 360.0, the most common term in the counts above.
df["Loan_Amount_Term"] = df["Loan_Amount_Term"].fillna(360.0)

In [34]:
# Summarise dtypes and non-null counts to see which columns still contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 611 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            611 non-null    object 
 1   Gender             611 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         611 non-null    object 
 4   Education          611 non-null    object 
 5   Self_Employed      611 non-null    object 
 6   ApplicantIncome    611 non-null    int64  
 7   CoapplicantIncome  611 non-null    float64
 8   LoanAmount         611 non-null    float64
 9   Loan_Amount_Term   611 non-null    float64
 10  Credit_History     561 non-null    float64
 11  Property_Area      611 non-null    object 
 12  Loan_Status        611 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 66.8+ KB


In [35]:
# Frequency of each Credit_History value (missing values are not counted by default).
df["Credit_History"].value_counts()

Credit_History
1.0    472
0.0     89
Name: count, dtype: int64

In [36]:
# Current size, used to choose where to split the rows for the next imputation.
df.shape

(611, 13)

In [37]:
# Partition the rows by index label (< 306 vs >= 306) so that each part can be
# imputed with a different Credit_History value.
# Take explicit copies: the later `.loc` assignments then write to independent
# frames rather than to slices of `df`, which avoids SettingWithCopyWarning.
df_top = df[df.index < 306].copy()
df_bottom = df[df.index >= 306].copy()

In [38]:
# Sizes of the two partitions.
df_top.shape, df_bottom.shape

((304, 13), (307, 13))

In [39]:
# First partition: treat a missing Credit_History as 1.0.
df_top.loc[:, "Credit_History"] = df_top["Credit_History"].fillna(1.0)

In [40]:
# Second partition: treat a missing Credit_History as 0.0.
# NOTE(review): this fill depends only on row position, not on the observed
# 1.0/0.0 frequencies shown above — confirm that this is intended.
df_bottom.loc[:, "Credit_History"] = df_bottom["Credit_History"].fillna(0.0)

In [41]:
# Stack the two imputed partitions back into one frame (first partition on top),
# then display the shape to confirm that no rows were lost.
df = pd.concat([df_top, df_bottom], axis=0)
df.shape

(611, 13)

In [42]:
# Spot-check the first 5 rows of the cleaned frame.
df.head(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,146.369492,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


## Saving Cleaned Data

In [43]:
# Write the cleaned frame to a new workbook. `index=False` is the documented
# boolean form and keeps the row index out of the file.
df.to_excel("../data/dataset_v1.xlsx", index=False)