## LOAN PREDICTION PROBLEM

### Load packages

In [1]:
import pandas as pd
import numpy as np                     # For mathematical calculations
from xverse.ensemble import VotingSelector
from sklearn.preprocessing import StandardScaler, MinMaxScaler  
import warnings                        # Used below to silence warnings
# NOTE(review): this suppresses every warning category for the rest of the session,
# including pandas deprecation notices raised by the preprocessing code further down.
warnings.filterwarnings("ignore")

#### Reading the Data 

In [2]:
# Load the raw loan records from the CSV file shipped in the data folder
data = pd.read_csv(filepath_or_buffer="data/loans_data.csv")

#### Understanding the Data 

In [3]:
# List every column name present in the loaded dataset, one per line
for name in list(data):
    print(name)

Loan_ID
Gender
Married
Dependents
Education
Self_Employed
ApplicantIncome
CoapplicantIncome
LoanAmount
Loan_Amount_Term
Credit_History
Property_Area
Loan_Status


<img src="images/Data column Description.PNG" /> 

The dataset has 13 columns: the `Loan_ID` identifier, 11 candidate predictor variables, and 1 target variable, `Loan_Status`.

In [4]:
# Preview the top five records of the dataset
data.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [5]:
# Preprocessing for the loan dataset.

def preprocessing(data):
    """Return a cleaned, scaled copy of the raw loan DataFrame.

    Steps:
      1. Drop the ``Loan_ID`` column.
      2. Recode ``Dependents`` ``'3+'`` as ``3`` and ``Loan_Status``
         ``'N'``/``'Y'`` as ``0``/``1`` (other values are left as they are).
      3. Fill missing values with the most frequent value for ``Gender``,
         ``Married``, ``Dependents``, ``Self_Employed``, ``Credit_History``
         and ``Loan_Amount_Term``, and with the median for ``LoanAmount``.
      4. Min-max scale ``ApplicantIncome``, ``LoanAmount``,
         ``CoapplicantIncome`` and ``Loan_Amount_Term``.

    Every step assigns its result back to the column instead of calling
    ``data[col].method(..., inplace=True)``. That chained in-place form
    operates on a temporary Series under pandas Copy-on-Write and would
    silently leave the frame unchanged; on older pandas it instead mutated
    the caller's DataFrame. Here the caller's frame is never modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw loan records containing the columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``Loan_ID``, with recoded, imputed and scaled
        columns.

    Raises
    ------
    KeyError
        If ``Loan_ID`` or another column referenced here is missing.
    """
    # drop() hands back a new frame, so all assignments below stay local to it.
    data = data.drop(columns='Loan_ID')

    # replace with numerical values
    # NOTE: 'Dependents' keeps its other labels untouched, so the column may
    # hold a mix of the original labels and the integer 3 (same as before).
    data['Dependents'] = data['Dependents'].map(lambda v: 3 if v == '3+' else v)
    status_codes = {'N': 0, 'Y': 1}
    data['Loan_Status'] = data['Loan_Status'].map(lambda v: status_codes.get(v, v))

    # handle missing data: most frequent value for these columns ...
    mode_filled = (
        'Gender',
        'Married',
        'Dependents',
        'Self_Employed',
        'Credit_History',
        'Loan_Amount_Term',
    )
    for col in mode_filled:
        data[col] = data[col].fillna(data[col].mode()[0])
    # ... and the median for the loan amount
    data['LoanAmount'] = data['LoanAmount'].fillna(data['LoanAmount'].median())

    # scale the data: MinMaxScaler rescales each column independently, so one
    # call over all four columns matches fitting a separate scaler per column.
    scaled_columns = [
        'ApplicantIncome',
        'LoanAmount',
        'CoapplicantIncome',
        'Loan_Amount_Term',
    ]
    data[scaled_columns] = MinMaxScaler().fit_transform(data[scaled_columns])

    return data

In [6]:
# Run the loaded frame through the cleaning and scaling routine
data = data.pipe(preprocessing)

In [7]:
# Separate the Loan_Status target (y) from the remaining predictor columns (X)
y = data['Loan_Status']
X = data.drop(columns='Loan_Status')

In [8]:
# Create a VotingSelector with minimum_votes=2 and fit it on the predictors and target.
# The fit call is the cell's last expression, so the fitted selector's repr is displayed.
clf = VotingSelector(minimum_votes=2)
clf.fit(X, y)


VotingSelector(exclude_features=None, feature_names='all',
               handle_category='woe', minimum_votes=2, no_of_features=5,
               numerical_missing_values='median',
               selection_techniques=['WOE', 'RF', 'RFE', 'ETC', 'CS', 'L_ONE'])

In [9]:
# Print the names of the selection techniques exposed by the fitted selector
print(clf.available_techniques)

['WOE', 'RF', 'RFE', 'ETC', 'CS', 'L_ONE']


In [10]:
# Show the importance score of every feature, one column per selection technique
clf.feature_importances_

Unnamed: 0,Variable_Name,Information_Value,Random_Forest,Recursive_Feature_Elimination,Extra_Trees,Chi_Square,L_One
0,Credit_History,1.555207,0.276299,3.374063,0.279651,26.005877,0.485714
1,Property_Area,0.096228,0.046397,1.069711,0.043202,4.905879,0.0
2,Loan_Amount_Term,0.040059,0.042608,0.0,0.047626,0.008367,0.0
3,Married,0.038244,0.026325,0.956183,0.020787,1.782425,0.0
4,Education,0.033044,0.022892,0.0,0.021542,0.98839,0.0
5,Dependents,0.02466,0.038656,0.78206,0.048033,0.851731,0.0
6,LoanAmount,0.016229,0.193779,0.0,0.188621,0.050625,0.0
7,CoapplicantIncome,0.004391,0.112,-0.83446,0.118877,0.272207,0.0
8,Gender,0.001488,0.018244,0.0,0.021787,0.036234,0.0
9,ApplicantIncome,0.000772,0.204138,0.0,0.188461,0.001195,0.0


In [11]:
# Show which techniques voted for each feature (1 = vote, 0 = no vote) plus the vote total
clf.feature_votes_

Unnamed: 0,Variable_Name,Information_Value,Random_Forest,Recursive_Feature_Elimination,Extra_Trees,Chi_Square,L_One,Votes
0,Credit_History,1,1,1,1,1,1,6
1,Property_Area,1,1,1,0,1,1,5
3,Married,1,0,1,0,1,1,4
4,Education,1,0,0,0,1,1,3
5,Dependents,0,0,1,1,1,0,3
7,CoapplicantIncome,0,1,1,1,0,0,3
2,Loan_Amount_Term,1,0,0,0,0,1,2
6,LoanAmount,0,1,0,1,0,0,2
9,ApplicantIncome,0,1,0,1,0,0,2
8,Gender,0,0,0,0,0,0,0


In [12]:
# Reduce X to the features retained by the selector and preview the first five rows
clf.transform(X).head(n=5)

Unnamed: 0,Credit_History,Property_Area,Married,Education,Dependents,CoapplicantIncome,Loan_Amount_Term,LoanAmount,ApplicantIncome
0,0.540288,-0.131267,-0.259118,0.099793,-0.005509,0.0,0.74359,0.172214,0.070489
1,0.540288,-0.321136,0.148063,0.099793,-0.181374,0.036192,0.74359,0.172214,0.05483
2,0.540288,-0.131267,0.148063,0.099793,-0.005509,0.0,0.74359,0.082489,0.03525
3,0.540288,-0.131267,0.148063,-0.332034,-0.005509,0.056592,0.74359,0.160637,0.030093
4,0.540288,-0.131267,-0.259118,0.099793,-0.005509,0.0,0.74359,0.191027,0.072356
