### Feature Engineering

In [25]:
import numpy as np
import pandas as pd

In [26]:
# Load the raw loan dataset from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='data/loan.csv')
df.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


### Deleting unnecessary features

In [27]:
# Remove the Loan_ID column, which is not used as a feature.
#
# `df` is rebound to the result instead of being mutated with inplace=True,
# and errors='ignore' makes the drop a no-op when the column is already gone.
# Together these keep the cell re-runnable: executing it a second time no
# longer raises KeyError.
df = df.drop(columns=['Loan_ID'], errors='ignore')
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


### Separating categorical and numerical features for preprocessing

In [28]:
# Partition the columns by dtype: object-typed columns are treated as
# categorical, every other column as numerical.
cate_features = [name for name, kind in df.dtypes.items() if kind == 'O']
num_features = [name for name, kind in df.dtypes.items() if kind != 'O']

# Credit_History is moved from the numerical list to the categorical list.
num_features.remove('Credit_History')
cate_features += ['Credit_History']

dict(cate_features=cate_features, num_features=num_features)

{'cate_features': ['Gender',
  'Married',
  'Dependents',
  'Education',
  'Self_Employed',
  'Property_Area',
  'Loan_Status',
  'Credit_History'],
 'num_features': ['ApplicantIncome',
  'CoapplicantIncome',
  'LoanAmount',
  'Loan_Amount_Term']}

### Handling null values

In [29]:
# Handling numerical features: impute missing values with the column mean.
#
# The filled Series is assigned back to the column instead of calling
# `df[feature].fillna(..., inplace=True)`. That chained in-place form acts on
# an intermediate Series: recent pandas 2.x releases warn about it, and with
# Copy-on-Write enabled it leaves `df` unchanged.
for feature in num_features:
    if df[feature].isna().any():
        df[feature] = df[feature].fillna(df[feature].mean())

# Verify the imputation. As a bare expression in the middle of the cell the
# null count was computed but never displayed.
assert not df[num_features].isna().any().any(), 'numerical features still contain nulls'
df[num_features]

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term
0,5849,0.0,146.412162,360.0
1,4583,1508.0,128.000000,360.0
2,3000,0.0,66.000000,360.0
3,2583,2358.0,120.000000,360.0
4,6000,0.0,141.000000,360.0
...,...,...,...,...
609,2900,0.0,71.000000,360.0
610,4106,0.0,40.000000,180.0
611,8072,240.0,253.000000,360.0
612,7583,0.0,187.000000,360.0


In [30]:
# Handling categorical features: impute missing values with the column mode
# (its most frequent value). Credit_History is deliberately skipped in this
# cell, exactly as before.
#
# The filled Series is assigned back to the column instead of calling
# `df[feature].fillna(..., inplace=True)`. That chained in-place form acts on
# an intermediate Series: recent pandas 2.x releases warn about it, and with
# Copy-on-Write enabled it leaves `df` unchanged.
filled_features = [feature for feature in cate_features if feature != 'Credit_History']

for feature in filled_features:
    df[feature] = df[feature].fillna(df[feature].mode()[0])

# Verify the imputation for the columns handled here. As a bare expression in
# the middle of the cell the null count was computed but never displayed.
assert not df[filled_features].isna().any().any(), 'categorical features still contain nulls'
df[cate_features]

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Property_Area,Loan_Status,Credit_History
0,Male,No,0,Graduate,No,Urban,Y,1.0
1,Male,Yes,1,Graduate,No,Rural,N,1.0
2,Male,Yes,0,Graduate,Yes,Urban,Y,1.0
3,Male,Yes,0,Not Graduate,No,Urban,Y,1.0
4,Male,No,0,Graduate,No,Urban,Y,1.0
...,...,...,...,...,...,...,...,...
609,Female,No,0,Graduate,No,Rural,Y,1.0
610,Male,Yes,3+,Graduate,No,Rural,Y,1.0
611,Male,Yes,1,Graduate,No,Urban,Y,1.0
612,Male,Yes,2,Graduate,No,Urban,Y,1.0


### Encoding categorical features

In [31]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every categorical feature except Credit_History, which is
# already numeric.
#
# A separate LabelEncoder is fitted for each column and kept in `encoders`
# (keyed by column name), so each column's label mapping stays available via
# `encoders[name].classes_` / `.inverse_transform`. Refitting one shared
# encoder discarded every mapping except the last. After the loop `encoder`
# is still bound to the last encoder fitted, as before.
encoders = {}

for feature in cate_features:
    if feature != 'Credit_History':
        encoder = LabelEncoder()
        df[feature] = encoder.fit_transform(df[feature])
        encoders[feature] = encoder

# Impute Credit_History with its mode. The result is assigned back to the
# column: the chained `df[col].fillna(..., inplace=True)` form acts on an
# intermediate Series, warns on recent pandas 2.x releases, and leaves `df`
# unchanged with Copy-on-Write enabled.
df['Credit_History'] = df['Credit_History'].fillna(df['Credit_History'].mode()[0])

df[cate_features].head()


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Property_Area,Loan_Status,Credit_History
0,1,0,0,0,0,2,1,1.0
1,1,1,1,0,0,0,0,1.0
2,1,1,0,0,1,2,1,1.0
3,1,1,0,1,0,2,1,1.0
4,1,0,0,0,0,2,1,1.0


### Now that the data is cleaned, let's save it to a new file

In [32]:
# Persist the cleaned frame as CSV; index=False leaves the row index out.
df.to_csv(path_or_buf='data/loan-cleaned.csv', index=False)