<a href="https://colab.research.google.com/github/7rohit/loan-prediction-random-forest/blob/main/loan_prediction_random_forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Load the raw training and test tables from CSV files in the working directory.
train_data = pd.read_csv('train_ctrUa4K.csv')
test_data = pd.read_csv('test_lAUu6dG.csv')

In [None]:
def mix_data(train_path='train_ctrUa4K.csv', test_path='test_lAUu6dG.csv'):
    """Combine the training and test feature rows into one table.

    Reads both CSV files, removes the ``Loan_Status`` target column from the
    training rows, stacks the test rows underneath them and drops the
    ``Loan_ID`` identifier column.

    Parameters
    ----------
    train_path : str, optional
        CSV file holding the training rows (must contain ``Loan_Status``).
    test_path : str, optional
        CSV file holding the test rows.

    Returns
    -------
    pandas.DataFrame
        Training rows followed by test rows, with a fresh 0..n-1 index.
    """
    train_features = pd.read_csv(train_path).drop(columns='Loan_Status')
    test_features = pd.read_csv(test_path)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
    # ignore_index=True renumbers the rows so no helper 'index' column is needed.
    mixture = pd.concat([train_features, test_features], ignore_index=True)
    return mixture.drop(columns='Loan_ID')

In [None]:
# Build the combined train+test feature table, then display summary
# statistics for it.
mixture = mix_data()
mixture.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,981.0,981.0,954.0,961.0,902.0
mean,5179.795107,1601.91633,142.51153,342.201873,0.83592
std,5695.104533,2718.772806,77.421743,65.100602,0.370553
min,0.0,0.0,9.0,6.0,0.0
25%,2875.0,0.0,100.0,360.0,1.0
50%,3800.0,1110.0,126.0,360.0,1.0
75%,5516.0,2365.0,162.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [None]:
def add_gender():
    """Fill missing ``Gender`` values in the global ``mixture`` with 'Male'."""
    global mixture
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the selected column: the chained in-place form does not update the frame
    # when pandas Copy-on-Write is active.
    mixture['Gender'] = mixture['Gender'].fillna('Male')

def add_martial_status():
    """Fill missing ``Married`` values in the global ``mixture`` with 'Yes'.

    The function name keeps its original spelling ('martial') so existing
    calls keep working.
    """
    global mixture
    # Assign the filled column back; chained fillna(inplace=True) does not
    # update the frame when pandas Copy-on-Write is active.
    mixture['Married'] = mixture['Married'].fillna('Yes')

def add_employment():
    """Fill missing ``Self_Employed`` values in the global ``mixture`` with 'No'."""
    global mixture
    # Assign the filled column back; chained fillna(inplace=True) does not
    # update the frame when pandas Copy-on-Write is active.
    mixture['Self_Employed'] = mixture['Self_Employed'].fillna('No')

def add_loan_amount():
    """Fill missing ``LoanAmount`` values in the global ``mixture`` with the column median.

    The median is taken over every row currently in ``mixture``.
    """
    global mixture
    median_amount = mixture['LoanAmount'].median()
    # Assign the filled column back; chained fillna(inplace=True) does not
    # update the frame when pandas Copy-on-Write is active.
    mixture['LoanAmount'] = mixture['LoanAmount'].fillna(median_amount)

def add_credit_history():
    """Fill missing ``Credit_History`` values in the global ``mixture`` with 2.

    The value 2 acts as a separate "unknown" code next to the 0 and 1 values;
    ``num_credit_history`` later turns it into ``Credit_History_Unknown``.
    """
    global mixture
    # Assign the filled column back; chained fillna(inplace=True) does not
    # update the frame when pandas Copy-on-Write is active.
    mixture['Credit_History'] = mixture['Credit_History'].fillna(2)

In [None]:
# Run every imputation step once, in the order they were defined.
for impute_step in (
    add_gender,
    add_martial_status,
    add_employment,
    add_loan_amount,
    add_credit_history,
):
    impute_step()

In [None]:
# Count the values still missing in each column after imputation.
mixture.isna().sum()

Gender                0
Married               0
Dependents           25
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term     20
Credit_History        0
Property_Area         0
dtype: int64

In [None]:
def num_gender():
    """Encode ``Gender`` in the global ``mixture`` as 1 for 'Male' and 0 for 'Female'."""
    global mixture
    gender_codes = {'Male': 1, 'Female': 0}
    mixture['Gender'] = mixture['Gender'].map(gender_codes)
 
def num_martial_status():
    """Encode ``Married`` in the global ``mixture`` as 1 for 'Yes' and 0 for 'No'."""
    global mixture
    married_codes = {'Yes': 1, 'No': 0}
    mixture['Married'] = mixture['Married'].map(married_codes)
 
def num_dependents():
    """Replace ``Dependents`` in the global ``mixture`` with three 0/1 flags.

    ``Singleton``, ``Small_Family`` and ``Large_Family`` mark the values '1',
    '2' and '3+' respectively. Any other value, including a missing one,
    leaves all three flags at 0. The ``Dependents`` column is then removed.
    """
    global mixture
    dependents = mixture['Dependents']
    category_for_flag = {
        'Singleton': '1',
        'Small_Family': '2',
        'Large_Family': '3+',
    }
    for flag, category in category_for_flag.items():
        mixture[flag] = (dependents == category).astype('int64')
    mixture.drop(columns=['Dependents'], inplace=True)
 
def num_education():
    """Encode ``Education`` in the global ``mixture`` as 1 for 'Graduate' and 0 for 'Not Graduate'."""
    global mixture
    education_codes = {'Graduate': 1, 'Not Graduate': 0}
    mixture['Education'] = mixture['Education'].map(education_codes)
 
def num_employment():
    """Encode ``Self_Employed`` in the global ``mixture`` as 1 for 'Yes' and 0 for 'No'."""
    global mixture
    employment_codes = {'Yes': 1, 'No': 0}
    mixture['Self_Employed'] = mixture['Self_Employed'].map(employment_codes)
 
def num_income():
    """Merge the two income columns of the global ``mixture`` into ``Total_Income``.

    ``Total_Income`` is the sum of ``ApplicantIncome`` and
    ``CoapplicantIncome``; both source columns are removed afterwards.
    """
    global mixture
    applicant = mixture['ApplicantIncome']
    coapplicant = mixture['CoapplicantIncome']
    mixture['Total_Income'] = applicant + coapplicant
    mixture.drop(columns=['ApplicantIncome', 'CoapplicantIncome'], inplace=True)
 
def num_loan_amount():
    """Add ``Debt_Income_Ratio`` to the global ``mixture``.

    Despite the column name, the stored value is ``Total_Income`` divided by
    ``LoanAmount`` (income per unit of loan), not loan divided by income.
    """
    global mixture
    income = mixture['Total_Income']
    loan = mixture['LoanAmount']
    mixture['Debt_Income_Ratio'] = income / loan

def num_loan_term():
    """Replace ``Loan_Amount_Term`` in the global ``mixture`` with four 0/1 bucket flags.

    Buckets: ``Very_Short_Term`` (<= 60), ``Short_Term`` (> 60 and < 180),
    ``Long_Term`` (180 to 300 inclusive) and ``Very_Long_Term`` (> 300).
    A missing term fails every comparison, so all four flags are 0 for it.
    The ``Loan_Amount_Term`` column is then removed.
    """
    global mixture
    term = mixture['Loan_Amount_Term']
    bucket_masks = {
        'Very_Short_Term': term <= 60,
        'Short_Term': (term > 60) & (term < 180),
        'Long_Term': (term >= 180) & (term <= 300),
        'Very_Long_Term': term > 300,
    }
    for bucket, mask in bucket_masks.items():
        mixture[bucket] = mask.astype('int64')
    mixture.drop(columns='Loan_Amount_Term', inplace=True)

def num_credit_history():
    """Replace ``Credit_History`` in the global ``mixture`` with three 0/1 flags.

    ``Credit_History_Bad``, ``Credit_History_Good`` and
    ``Credit_History_Unknown`` mark the codes 0, 1 and 2 respectively.
    The ``Credit_History`` column is then removed.
    """
    global mixture
    history = mixture['Credit_History']
    code_for_flag = {
        'Credit_History_Bad': 0,
        'Credit_History_Good': 1,
        'Credit_History_Unknown': 2,
    }
    for flag, code in code_for_flag.items():
        mixture[flag] = (history == code).astype('int64')
    mixture.drop(columns='Credit_History', inplace=True)

def num_property():
    """One-hot encode ``Property_Area`` in the global ``mixture``.

    Adds one ``Property_<value>`` indicator column per distinct area value and
    removes the original ``Property_Area`` column. The global name
    ``mixture`` is rebound to the resulting new frame.
    """
    global mixture
    area_flags = pd.get_dummies(mixture['Property_Area'], prefix='Property')
    without_area = mixture.drop(columns='Property_Area')
    mixture = pd.concat([without_area, area_flags], axis=1)

In [None]:
# Apply every encoding step once. The order matters: num_income must run
# before num_loan_amount, which reads the Total_Income column it creates.
for encode_step in (
    num_gender,
    num_martial_status,
    num_dependents,
    num_education,
    num_employment,
    num_income,
    num_loan_amount,
    num_loan_term,
    num_credit_history,
    num_property,
):
    encode_step()

In [None]:
def scale(df):
    """Min-max scale values into the range [0, 1].

    Parameters
    ----------
    df : pandas.Series or pandas.DataFrame
        Numeric data; a DataFrame is scaled column by column.

    Returns
    -------
    Same type as ``df``
        ``(df - min) / (max - min)``. Constant data (max == min) maps to 0
        instead of the NaN that a 0/0 division would produce.
    """
    shifted = df - df.min()
    span = shifted.max()
    # A zero span means every value is identical; divide by 1 so the shifted
    # zeros stay 0 rather than becoming NaN.
    if isinstance(span, pd.Series):
        span = span.replace(0, 1)
    elif span == 0:
        span = 1
    return shifted / span

In [None]:
# Rescale each continuous column of the combined table with scale().
for numeric_column in ('LoanAmount', 'Total_Income', 'Debt_Income_Ratio'):
    mixture[numeric_column] = scale(mixture[numeric_column])

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

In [None]:
# (rows, columns) of the raw training table loaded from CSV.
train_data.shape

(614, 13)

In [None]:
def score(clf, X, y, scoring='accuracy', cv=5):
    """Return the mean cross-validated score of an estimator.

    Parameters
    ----------
    clf : estimator
        Estimator passed to ``cross_val_score``.
    X, y :
        Feature table and target values.
    scoring : str, optional
        Scoring metric forwarded to ``cross_val_score``.
    cv : optional
        Cross-validation setting forwarded to ``cross_val_score``; the default
        of 5 matches the previously hard-coded value.

    Returns
    -------
    float
        Mean of the per-fold scores.
    """
    fold_scores = cross_val_score(clf, X, y, cv=cv, scoring=scoring)
    return np.mean(fold_scores)

def recover_train_test_target():
    """Split the global ``mixture`` back into training and test feature rows.

    The first ``len(train_data)`` rows of ``mixture`` are treated as training
    rows and the remainder as test rows. The split point is derived from the
    global ``train_data`` instead of a hard-coded row count, so it stays
    correct if the training file changes.

    Returns
    -------
    tuple
        ``(train, test, targets)`` where ``targets`` is
        ``train_data['Loan_Status']`` mapped to 1 for 'Y' and 0 for 'N'.
    """
    global mixture, train_data
    n_train = len(train_data)
    targets = train_data['Loan_Status'].map({'Y': 1, 'N': 0})
    train = mixture.iloc[:n_train]
    test = mixture.iloc[n_train:]
    return train, test, targets

In [None]:
# Split the processed table back into training/test features and get the
# 0/1 encoded training targets.
train, test, targets = recover_train_test_target()

In [None]:
# Baseline forest with default hyperparameters, used for feature importances.
# A fixed random_state makes the fitted trees (and everything derived from
# them) reproducible across runs.
clf = RandomForestClassifier(random_state=42)
clf = clf.fit(train, targets)

In [None]:
# Feature-importance table of the fitted baseline forest, most important
# feature first, indexed by feature name.
features = (
    pd.DataFrame({
        'Feature': train.columns,
        'Importance': clf.feature_importances_,
    })
    .sort_values(by=['Importance'], ascending=False)
    .set_index('Feature')
)

In [None]:
# Keep only the features the already-fitted baseline forest selects.
# The selector gets its own name so that `model` is not used for two
# different kinds of object (a selector here, a classifier later on).
selector = SelectFromModel(clf, prefit=True)
train_reduced = selector.transform(train)
train_reduced.shape

(614, 5)

In [None]:
# Hyperparameters of the final forest. random_state is fixed so that the
# fitted model, its cross-validation score and its predictions are
# reproducible across runs.
parameters = {
    'bootstrap': False,
    'min_samples_leaf': 49,
    'n_estimators': 150,
    'min_samples_split': 15,
    'max_depth': 5,
    'random_state': 42,
}

model = RandomForestClassifier(**parameters)
model.fit(train, targets)

RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=49, min_samples_split=15,
                       min_weight_fraction_leaf=0.0, n_estimators=150,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
# Mean cross-validated accuracy of the tuned forest on the training rows
# (see score()).
score(model, train, targets, scoring='accuracy')

0.809462881514061

In [None]:
# Predict the test rows and write the submission file: one row per test
# Loan_ID with the prediction rendered as 'Y' (1) or 'N' (anything else).
aux = pd.read_csv('test_lAUu6dG.csv')  # re-read to recover the Loan_ID column
output = model.predict(test).astype(int)
df_output = pd.DataFrame({
    'Loan_ID': aux['Loan_ID'],
    'Loan_Status': ['Y' if status == 1 else 'N' for status in output],
})
df_output.to_csv('solution.csv', columns=['Loan_ID', 'Loan_Status'], index=False)