In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the training CSV (relative path, resolved against the working directory).
data = pd.read_csv(
    'Train_Loan_Home.csv'
)

In [3]:
# Preview the first five rows of the training frame (5 is head()'s default).
data.head(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
# Pick the target and the feature columns by name instead of by position, so
# the split does not silently change if the CSV's column order changes.
# Index.drop raises KeyError if either label is missing, which surfaces a
# schema mismatch immediately.
columnsY = 'Loan_Status'                             # target label column
columnsX = data.columns.drop(['Loan_ID', columnsY])  # every column except the row id and the target

In [5]:
# Number of missing values per column (isnull is an alias of isna).
data.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [6]:
# Column groups; each list names the columns handled by one preprocessing branch.

# Categorical columns.
catCol = [
    'Gender',
    'Married',
    'Education',
    'Self_Employed',
    'Property_Area',
    'Dependents',
]

# Numeric columns that are scaled.
numWithScalingCol = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
]

# Numeric columns that are not scaled.
numerCol = [
    'Credit_History',
]

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn import ensemble

In [8]:
# Scaled numeric branch: fill missing values with the column median, then
# standardise.
numeric_transformer_scaling = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Unscaled numeric branch: missing values are replaced by the constant 99.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=99)),
])

# Categorical branch: missing values become the literal category 'missing',
# then the column is one-hot encoded. Categories not seen during fit are
# ignored at transform time instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])


# Route each column group to its branch. Columns that appear in none of the
# groups are dropped (ColumnTransformer's default `remainder`).
preprocessor = ColumnTransformer([
    ('num', numeric_transformer_scaling, numWithScalingCol),
    ('num2', numeric_transformer, numerCol),
    ('cat', categorical_transformer, catCol),
])

In [9]:
# Full model: the preprocessing step followed by a random-forest classifier.
# random_state is pinned so that re-running the notebook on a fresh kernel
# rebuilds the same forest and therefore the same scores; without a seed the
# bootstrap samples and feature sub-sampling differ on every fit.
modelPipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', ensemble.RandomForestClassifier(random_state=42)),
])

In [10]:
# Fit the preprocessing step and the classifier together on the training frame.
modelPipeline.fit(
    X=data[columnsX],
    y=data[columnsY],
)

In [11]:
# Predicted probability of class 'Y' for every training row.
# The probability column is looked up by class label rather than assumed to
# be the last one: predict_proba orders its columns like
# modelPipeline.classes_, and .index('Y') raises ValueError if that label was
# never seen during fit.
# NOTE: these are in-sample scores (same rows the model was fitted on), so
# they are optimistic and not an estimate of out-of-sample performance.
scores = modelPipeline.predict_proba(data[columnsX])[
    :, list(modelPipeline.classes_).index('Y')
]

In [12]:
# Display the scores computed for the training rows.
scores

array([1.  , 0.25, 0.91, 0.91, 0.99, 0.91, 0.96, 0.06, 0.92, 0.13, 0.93,
       0.92, 0.87, 0.31, 0.91, 1.  , 0.88, 0.03, 0.29, 0.91, 0.07, 0.93,
       0.07, 0.08, 0.31, 0.94, 0.98, 0.93, 0.26, 0.95, 0.18, 0.23, 0.3 ,
       1.  , 0.15, 0.98, 0.05, 1.  , 0.78, 0.96, 0.22, 0.84, 0.89, 0.99,
       0.85, 0.94, 0.92, 0.95, 0.05, 0.91, 0.98, 0.95, 0.4 , 0.31, 0.04,
       0.98, 0.99, 0.31, 0.94, 0.97, 0.96, 0.92, 0.06, 0.06, 0.03, 0.32,
       0.05, 0.91, 0.85, 0.07, 0.88, 0.89, 0.93, 0.12, 0.96, 0.16, 0.31,
       0.25, 0.06, 0.97, 0.9 , 0.95, 0.23, 0.27, 0.88, 0.95, 0.98, 0.98,
       0.79, 0.98, 1.  , 0.96, 0.97, 0.89, 0.81, 0.35, 0.99, 0.95, 0.86,
       0.99, 0.88, 0.88, 0.92, 0.98, 0.95, 0.93, 0.96, 0.25, 0.03, 0.89,
       0.99, 0.99, 0.02, 0.94, 0.97, 0.88, 0.98, 0.93, 0.3 , 0.83, 0.87,
       0.96, 0.67, 0.99, 0.87, 0.92, 0.92, 0.97, 0.07, 0.24, 0.78, 0.85,
       0.94, 0.95, 0.94, 0.31, 0.12, 0.96, 0.13, 0.33, 0.32, 0.96, 0.9 ,
       0.99, 1.  , 0.99, 0.87, 0.9 , 0.19, 0.97, 0.

In [13]:
# Read the second CSV (relative path); its rows are scored with the fitted pipeline.
data2 = pd.read_csv(
    'Test_Loan_Home.csv'
)

In [14]:
# Predicted probability of class 'Y' for every row of the test file.
# The probability column is looked up by class label rather than assumed to
# be the last one: predict_proba orders its columns like
# modelPipeline.classes_, and .index('Y') raises ValueError if that label was
# never seen during fit.
# NOTE: this rebinds `scores`; any earlier value held under that name is replaced.
scores = modelPipeline.predict_proba(data2[columnsX])[
    :, list(modelPipeline.classes_).index('Y')
]

In [15]:
# Map each Loan_ID in the test file to its score; if an id repeats, the last
# occurrence wins.
scoreOutput = dict(zip(data2['Loan_ID'], scores))

In [16]:
import joblib

In [17]:
# Serialise the fitted pipeline (preprocessing + classifier) to disk.
# The file is a pickle: only load it back from a trusted location.
joblib.dump(
    value=modelPipeline,
    filename='modelPipeline.pkl',
)

['modelPipeline.pkl']