# LOADING MODULES AND DATASETS

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import math

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler,OrdinalEncoder
from sklearn.compose import ColumnTransformer,make_column_transformer
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV,RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score,roc_curve,precision_recall_curve,classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from scipy.sparse import csr_matrix
from eli5.sklearn import PermutationImportance

import eli5
import xgboost as xgbs
import seaborn as sns
import matplotlib.pyplot as plt
from yellowbrick import classifier, features, regressor
import pickle
import re

In [2]:
data_train = r'/Users/Humza Ali/OneDrive/Documents/Data/Practise/loan_train.csv'
loan_train = pd.read_csv(data_train)

In [3]:
loan_train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
data_test = r'/Users/Humza Ali/OneDrive/Documents/Data/Practise/loan_test.csv'
loan_test = pd.read_csv(data_test)

In [5]:
loan_test.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


# BIVARIATE ANALYSIS AND FEATURE ENGINEERING

In [6]:
loan_train.shape

(614, 13)

In [7]:
loan_test.shape

(367, 12)

In [8]:
loan_train.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [9]:
loan_train.nunique()

Loan_ID              614
Gender                 2
Married                2
Dependents             4
Education              2
Self_Employed          2
ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           203
Loan_Amount_Term      10
Credit_History         2
Property_Area          3
Loan_Status            2
dtype: int64

In [10]:
loan_train.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [11]:
loan_train["Loan_Status"].value_counts()

Y    422
N    192
Name: Loan_Status, dtype: int64

In [12]:
loan_train["Loan_Status"]=np.where(loan_train['Loan_Status']=="Y",1,0)

In [13]:
loan_train["Loan_Status"].value_counts()

1    422
0    192
Name: Loan_Status, dtype: int64

In [14]:
loan_train["Gender"].value_counts()

Male      489
Female    112
Name: Gender, dtype: int64

In [15]:
loan_train["Education"].value_counts()

Graduate        480
Not Graduate    134
Name: Education, dtype: int64

In [16]:
loan_train["Dependents"].value_counts()

0     345
1     102
2     101
3+     51
Name: Dependents, dtype: int64

In [17]:
loan_train = loan_train.assign(
    Dependents=loan_train["Dependents"]
    .str.replace("3+", "3", regex=False)
    .astype("float")
)

In [18]:
loan_train["Dependents"].value_counts()

0.0    345
1.0    102
2.0    101
3.0     51
Name: Dependents, dtype: int64

In [19]:
loan_test["Dependents"].value_counts()

0     200
2      59
1      58
3+     40
Name: Dependents, dtype: int64

In [20]:
loan_test = loan_test.assign(
    Dependents=loan_test["Dependents"]
    .str.replace("3+", "3", regex=False)
    .astype("float")
)

In [21]:
loan_test["Dependents"].value_counts()

0.0    200
2.0     59
1.0     58
3.0     40
Name: Dependents, dtype: int64

In [22]:
loan_train['Property_Area'].value_counts()

Semiurban    233
Urban        202
Rural        179
Name: Property_Area, dtype: int64

In [23]:
loan_test.isnull().sum()

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

In [24]:
loan_train.drop(['Loan_ID'],1,inplace=True)

# TRAIN TEST SPLIT

In [25]:
x_train,x_test = train_test_split(loan_train,test_size=0.2,random_state=1)

In [26]:
x_train1 = x_train.drop(["Loan_Status"],1)
y_train1 = x_train["Loan_Status"]

In [27]:
x_test1 = x_test.drop(["Loan_Status",],1)
y_test1 = x_test["Loan_Status"]

# CREATING MODEL TRANSFORMER AND PIPELINE

In [28]:
nums_cols = x_train1.select_dtypes(np.number).columns

In [29]:
char_cols = x_train1.select_dtypes(object).columns

In [30]:
nums_cols,char_cols

(Index(['Dependents', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
        'Loan_Amount_Term', 'Credit_History'],
       dtype='object'),
 Index(['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area'], dtype='object'))

In [31]:
pipe_num = make_pipeline(SimpleImputer(strategy ="median"),StandardScaler())
pipe_char = make_pipeline(SimpleImputer(strategy ='constant',fill_value='Missing'),
                         OneHotEncoder(handle_unknown = "ignore"))

In [32]:
ctrans=make_column_transformer((pipe_num,nums_cols),(pipe_char,char_cols))

In [33]:
ctrans.fit_transform(x_train1)

array([[ 1.2229778 , -0.17504615, -0.52115617, ...,  0.        ,
         1.        ,  0.        ],
       [-0.75881131, -0.30146139, -0.52115617, ...,  0.        ,
         0.        ,  1.        ],
       [-0.75881131, -0.18541313,  0.2852085 , ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.75881131, -0.31430407, -0.52115617, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.23208325, -0.00484204, -0.10662353, ...,  1.        ,
         0.        ,  0.        ],
       [-0.75881131, -0.28846399, -0.0411537 , ...,  0.        ,
         1.        ,  0.        ]])

In [34]:
ctrans.transform(loan_test)

array([[-0.75881131,  0.02919878, -0.52115617, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.23208325, -0.37991002, -0.02766494, ...,  0.        ,
         0.        ,  1.        ],
       [ 1.2229778 , -0.08220755,  0.07103331, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.75881131, -0.35298682,  0.13452918, ...,  0.        ,
         1.        ,  0.        ],
       [-0.75881131, -0.08220755,  0.26612684, ...,  1.        ,
         0.        ,  0.        ],
       [-0.75881131,  0.56766271, -0.52115617, ...,  1.        ,
         0.        ,  0.        ]])

# LOGISTIC REGRESSION

In [35]:
logreg=LogisticRegression(
   solver="liblinear",
    penalty="l1",
    class_weight="balanced",
    random_state=1,
    max_iter=800,)

In [36]:
logreg

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=800, multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=1, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [37]:
pipe = make_pipeline(ctrans,logreg)

In [38]:
pipe

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('pipeline-1',
                                                  Pipeline(memory=None,
                                                           steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                           

In [39]:
pipe.fit(x_train1,y_train1)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('pipeline-1',
                                                  Pipeline(memory=None,
                                                           steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                           

In [40]:
pipe.predict(x_train1)

array([0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,

In [41]:
pipe.predict(x_test1)

array([1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1])

# ROC AUC SCORE

In [42]:
roc_auc_score(y_train1,pipe.predict_proba(x_train1)[:,1])

0.7953745600804425

In [43]:
roc_auc_score(y_test1,pipe.predict_proba(x_test1)[:,1])

0.7380952380952381

In [44]:
# AUC ROC ON WHOLE DATA

In [45]:
roc_auc_score(y_train1,pipe.predict_proba(x_train)[:,1])

0.7953745600804425

In [46]:
roc_auc_score(y_test1,pipe.predict_proba(x_test)[:,1])

0.7380952380952381

In [47]:
pipe.predict(loan_test)

array([1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1,

In [87]:
submission = grid.predict(loan_test)

In [88]:
submission=pd.DataFrame(data=submission)

In [89]:
submission.columns=["Loan_Status"]

In [90]:
submission["Loan_Status"]=np.where(submission["Loan_Status"]==1,"Y","N")

In [91]:
submission["Loan_ID"]=loan_test["Loan_ID"]

In [92]:
submission=submission[["Loan_ID","Loan_Status"]]

In [93]:
submission.head()

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,Y
1,LP001022,Y
2,LP001031,Y
3,LP001035,Y
4,LP001051,Y


In [94]:
submission.to_csv("Loan_Prediction_2.csv",index=False)

# RANDOM FOREST

In [57]:
strings='randomforestclassifier__'

param_dict={'n_estimators':[int(x) for x in np.linspace(200,2000,num=10)],
           'max_features':['auto','sqrt'],
            'max_depth':[int(x) for x in np.linspace(10,110,num=11)],
            'min_samples_split':[2,5,10],
            'min_samples_leaf':[1,2,4],
            'bootstrap':[True,False]
            }

In [58]:
param_dict.items()

dict_items([('n_estimators', [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]), ('max_features', ['auto', 'sqrt']), ('max_depth', [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110]), ('min_samples_split', [2, 5, 10]), ('min_samples_leaf', [1, 2, 4]), ('bootstrap', [True, False])])

In [59]:
param_dict = {strings+k:v for k,v in param_dict.items()}

In [60]:
param_dict

{'randomforestclassifier__n_estimators': [200,
  400,
  600,
  800,
  1000,
  1200,
  1400,
  1600,
  1800,
  2000],
 'randomforestclassifier__max_features': ['auto', 'sqrt'],
 'randomforestclassifier__max_depth': [10,
  20,
  30,
  40,
  50,
  60,
  70,
  80,
  90,
  100,
  110],
 'randomforestclassifier__min_samples_split': [2, 5, 10],
 'randomforestclassifier__min_samples_leaf': [1, 2, 4],
 'randomforestclassifier__bootstrap': [True, False]}

In [61]:
ctrans.fit_transform(x_train1)

array([[ 1.2229778 , -0.17504615, -0.52115617, ...,  0.        ,
         1.        ,  0.        ],
       [-0.75881131, -0.30146139, -0.52115617, ...,  0.        ,
         0.        ,  1.        ],
       [-0.75881131, -0.18541313,  0.2852085 , ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.75881131, -0.31430407, -0.52115617, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.23208325, -0.00484204, -0.10662353, ...,  1.        ,
         0.        ,  0.        ],
       [-0.75881131, -0.28846399, -0.0411537 , ...,  0.        ,
         1.        ,  0.        ]])

In [62]:
rf = RandomForestClassifier(random_state=1)

In [63]:
rf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators='warn',
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [64]:
model_pipeline = make_pipeline(ctrans,rf)

In [65]:
model_pipeline

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('pipeline-1',
                                                  Pipeline(memory=None,
                                                           steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                           

In [66]:
grid=RandomizedSearchCV(model_pipeline,param_dict,cv=5,scoring = "accuracy")

In [67]:
grid

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('columntransformer',
                                              ColumnTransformer(n_jobs=None,
                                                                remainder='drop',
                                                                sparse_threshold=0.3,
                                                                transformer_weights=None,
                                                                transformers=[('pipeline-1',
                                                                               Pipeline(memory=None,
                                                                                        steps=[('simpleimputer',
                                                                                                SimpleImputer(add_indicator=False,
                                                   

In [68]:
grid.fit(x_train1,y_train1)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('columntransformer',
                                              ColumnTransformer(n_jobs=None,
                                                                remainder='drop',
                                                                sparse_threshold=0.3,
                                                                transformer_weights=None,
                                                                transformers=[('pipeline-1',
                                                                               Pipeline(memory=None,
                                                                                        steps=[('simpleimputer',
                                                                                                SimpleImputer(add_indicator=False,
                                                   

In [69]:
grid.predict(x_train1)

array([0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

In [70]:
grid.predict(x_test1)

array([1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1])

In [72]:
roc_auc_score(y_train1,grid.predict_proba(x_train1)[:,1])

0.9861352825153731

In [73]:
roc_auc_score(y_test1,grid.predict_proba(x_test1)[:,1])

0.7783882783882784

# XGBOOST

In [74]:
strings="xgb__"
parameters = {
    'max_depth':range(2,10,1),
    'n_estimators': [40,60,100,150],
    'learning_rate': [0.1, 0.01,0.05], #shrinkage.. reduces overfitting
    'reg_lambda':[0.1,0.01,1,10],
    'reg_alpha':[0.1,0.01,1,10],
}

In [75]:
parameters.items()

dict_items([('max_depth', range(2, 10)), ('n_estimators', [40, 60, 100, 150]), ('learning_rate', [0.1, 0.01, 0.05]), ('reg_lambda', [0.1, 0.01, 1, 10]), ('reg_alpha', [0.1, 0.01, 1, 10])])

In [76]:
xgb_params={strings+k:v for k,v in parameters.items()}
xgb_params

{'xgb__max_depth': range(2, 10),
 'xgb__n_estimators': [40, 60, 100, 150],
 'xgb__learning_rate': [0.1, 0.01, 0.05],
 'xgb__reg_lambda': [0.1, 0.01, 1, 10],
 'xgb__reg_alpha': [0.1, 0.01, 1, 10]}

In [78]:
xgb=xgbs.XGBClassifier()
pipe=Pipeline([("columntransfer",ctrans),("xgb",xgb)])

In [79]:
grid = RandomizedSearchCV(pipe,xgb_params,cv = 5,scoring = 'accuracy')

In [80]:
grid

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('columntransfer',
                                              ColumnTransformer(n_jobs=None,
                                                                remainder='drop',
                                                                sparse_threshold=0.3,
                                                                transformer_weights=None,
                                                                transformers=[('pipeline-1',
                                                                               Pipeline(memory=None,
                                                                                        steps=[('simpleimputer',
                                                                                                SimpleImputer(add_indicator=False,
                                                      

In [81]:
grid.fit(x_train1,y_train1)



RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('columntransfer',
                                              ColumnTransformer(n_jobs=None,
                                                                remainder='drop',
                                                                sparse_threshold=0.3,
                                                                transformer_weights=None,
                                                                transformers=[('pipeline-1',
                                                                               Pipeline(memory=None,
                                                                                        steps=[('simpleimputer',
                                                                                                SimpleImputer(add_indicator=False,
                                                      

In [82]:
grid.predict(x_train1)

array([0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [83]:
grid.predict(x_test1)

array([1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1])

In [84]:
roc_auc_score(y_train1,grid.predict_proba(x_train1)[:,1])

0.8544398035348262

In [85]:
roc_auc_score(y_test1,grid.predict_proba(x_test1)[:,1])

0.7721306471306472

In [86]:
grid.predict(loan_test)

array([1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,