### Feature Engineering

In [80]:
import numpy as np
import pandas as pd

In [81]:
# Read the raw loan data from disk and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='data/loan.csv')
df.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


### Deleting unnecessary features

In [82]:
# Drop the Loan_ID column, which this notebook treats as unnecessary.
# Reassign instead of using inplace=True, and tolerate the column already
# being absent, so re-running this cell does not raise a KeyError.
df = df.drop(columns=['Loan_ID'], errors='ignore')
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


### Separating categorical and numerical features for preprocessing

In [83]:
# Partition the columns by dtype in a single pass: object-dtype columns are
# treated as categorical, every other column as numerical.
cate_features = []
num_features = []
for column in df.columns:
    if df[column].dtype == 'O':
        cate_features.append(column)
    else:
        num_features.append(column)

# Credit_History has a numeric dtype but is handled as a categorical
# feature, so move it from the numerical list to the categorical one.
num_features.remove('Credit_History')
cate_features.append('Credit_History')

dict(cate_features=cate_features, num_features=num_features)

{'cate_features': ['Gender',
  'Married',
  'Dependents',
  'Education',
  'Self_Employed',
  'Property_Area',
  'Loan_Status',
  'Credit_History'],
 'num_features': ['ApplicantIncome',
  'CoapplicantIncome',
  'LoanAmount',
  'Loan_Amount_Term']}

### Handling null values

In [84]:
# handling numerical features: fill missing values with the column mean

for feature in num_features:
    if df[feature].isna().any():
        # Assign the filled column back rather than calling
        # fillna(inplace=True) on df[feature]: the in-place call acts on an
        # intermediate object, which pandas warns about and which leaves df
        # unchanged when Copy-on-Write is enabled.
        df[feature] = df[feature].fillna(df[feature].mean())

# Remaining null count per numerical column (expected to be all zeros).
df[num_features].isna().sum()

ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
dtype: int64

In [85]:
# handling categorical features: fill missing values with the most frequent value

for feature in cate_features:
    # Credit_History is skipped here; its nulls are filled in the encoding cell.
    if feature != 'Credit_History':
        # Assign the filled column back rather than calling
        # fillna(inplace=True) on df[feature]: the in-place call acts on an
        # intermediate object, which pandas warns about and which leaves df
        # unchanged when Copy-on-Write is enabled.
        df[feature] = df[feature].fillna(df[feature].mode()[0])

# Remaining null count per categorical column.
df[cate_features].isna().sum()

Gender             0
Married            0
Dependents         0
Education          0
Self_Employed      0
Property_Area      0
Loan_Status        0
Credit_History    50
dtype: int64

### Encoding categorical features

In [86]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

# Replace each categorical column (except Credit_History) with integer labels.
for feature in cate_features:
    if feature != 'Credit_History':
        # fit_transform refits the shared encoder on every column, so after
        # the loop `encoder` only holds the classes of the last encoded column.
        df[feature] = encoder.fit_transform(df[feature])

# Credit_History is not label-encoded; only its missing values are filled,
# using the most frequent value. Assign the result back instead of using
# fillna(inplace=True) on the selected column, which pandas warns about and
# which leaves df unchanged when Copy-on-Write is enabled.
df['Credit_History'] = df['Credit_History'].fillna(df['Credit_History'].mode()[0])

df[cate_features].head()


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Property_Area,Loan_Status,Credit_History
0,1,0,0,0,0,2,1,1.0
1,1,1,1,0,0,0,0,1.0
2,1,1,0,0,1,2,1,1.0
3,1,1,0,1,0,2,1,1.0
4,1,0,0,0,0,2,1,1.0


### Now that the data is cleaned, let's save it to a new file

In [88]:
# Write the cleaned frame to CSV without the index column.
df.to_csv(path_or_buf='data/loan-cleaned.csv', index=False);