In [10]:
import os

import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [11]:
# Load the personal-loan dataset from a CSV in the current working directory
# and preview its first three rows.
data = pd.read_csv('./Bank_Personal_Loan_Modelling.csv')
data.head(3)

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0


In [12]:
# Summarise column dtypes, non-null counts and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  5000 non-null   int64  
 1   Age                 5000 non-null   int64  
 2   Experience          5000 non-null   int64  
 3   Income              5000 non-null   int64  
 4   ZIP Code            5000 non-null   int64  
 5   Family              5000 non-null   int64  
 6   CCAvg               5000 non-null   float64
 7   Education           5000 non-null   int64  
 8   Mortgage            5000 non-null   int64  
 9   Personal Loan       5000 non-null   int64  
 10  Securities Account  5000 non-null   int64  
 11  CD Account          5000 non-null   int64  
 12  Online              5000 non-null   int64  
 13  CreditCard          5000 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 547.0 KB


In [5]:
# List the column labels of the loaded frame.
data.columns

Index(['ID', 'Age', 'Experience', 'Income', 'ZIP Code', 'Family', 'CCAvg',
       'Education', 'Mortgage', 'Personal Loan', 'Securities Account',
       'CD Account', 'Online', 'CreditCard'],
      dtype='object')

In [17]:
# Inspect the dimensions of the data as a (rows, columns) tuple.
data.shape

(5000, 14)

In [18]:
# Flag, per column, whether any value is missing.
data.isna().any()

ID                    False
Age                   False
Experience            False
Income                False
ZIP Code              False
Family                False
CCAvg                 False
Education             False
Mortgage              False
Personal Loan         False
Securities Account    False
CD Account            False
Online                False
CreditCard            False
dtype: bool

In [19]:
# Columns removed from the feature matrix by the `ct3` column transformer.
drop_cols=['ID','ZIP Code']

In [20]:
# Split the columns into numeric and categorical groups: every column that is
# not explicitly listed as numeric is treated as categorical (this includes
# ID, ZIP Code and the 'Personal Loan' target).
cols_numeric = {'Age', 'Experience', 'Income', 'CCAvg', 'Mortgage'}
cols = set(data.columns)
cols_categorical = list(cols.difference(cols_numeric))
cols_categorical

['ID',
 'Education',
 'Personal Loan',
 'CD Account',
 'CreditCard',
 'Family',
 'Online',
 'Securities Account',
 'ZIP Code']

In [21]:
# Cast every categorical column to pandas' 'category' dtype.
# NOTE: this modifies `data` in place, so every cell that runs after this one
# sees the converted dtypes instead of the original integer columns.
for x in cols_categorical:
    data[x] = data[x].astype('category')

# Re-check the dtypes to confirm the conversion.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   ID                  5000 non-null   category
 1   Age                 5000 non-null   int64   
 2   Experience          5000 non-null   int64   
 3   Income              5000 non-null   int64   
 4   ZIP Code            5000 non-null   category
 5   Family              5000 non-null   category
 6   CCAvg               5000 non-null   float64 
 7   Education           5000 non-null   category
 8   Mortgage            5000 non-null   int64   
 9   Personal Loan       5000 non-null   category
 10  Securities Account  5000 non-null   category
 11  CD Account          5000 non-null   category
 12  Online              5000 non-null   category
 13  CreditCard          5000 non-null   category
dtypes: category(9), float64(1), int64(4)
memory usage: 472.6 KB


In [22]:
# Separate the predictors from the target column.
# Y is kept as a one-column DataFrame rather than a Series.
X = data.drop(columns=['Personal Loan'])
Y = data.loc[:, ['Personal Loan']]

In [23]:
# Categorical feature columns that CustomLabelEncode label-encodes.
# ID, ZIP Code and the 'Personal Loan' target are intentionally not listed.
cat_cols = ['Family','Education','Securities Account','CD Account','Online','CreditCard']

In [24]:
from sklearn.preprocessing import LabelEncoder


In [25]:
from sklearn.preprocessing import StandardScaler
# define standard scaler
#scaler = StandardScaler()
# transform data
#data_encoded_scaled = pd.DataFrame(scaler.fit_transform(X))
#data_encoded_scaled.columns = X.columns

In [26]:
from sklearn.compose import ColumnTransformer,make_column_transformer
from sklearn.pipeline import Pipeline,make_pipeline

In [27]:
# Column transformer that drops the columns named in drop_cols and passes
# every other column through unchanged.
ct3=make_column_transformer(('drop',drop_cols),remainder='passthrough')

In [28]:
# Apply the drop transformer; the result carries no column names, so the
# wrapping DataFrame shows positional integer labels.
pd.DataFrame(ct3.fit_transform(X))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,25,1,49,4,1.6,1,0,1,0,0,0
1,45,19,34,3,1.5,1,0,1,0,0,0
2,39,15,11,1,1,1,0,0,0,0,0
3,35,9,100,1,2.7,2,0,0,0,0,0
4,35,8,45,4,1,2,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
4995,29,3,40,1,1.9,3,0,0,0,1,0
4996,30,4,15,4,0.4,1,85,0,0,1,0
4997,63,39,24,2,0.3,3,0,0,0,0,0
4998,65,40,49,3,0.5,2,0,0,0,1,0


In [29]:
from sklearn.base import BaseEstimator,TransformerMixin

In [30]:
class ArrayToDf(BaseEstimator,TransformerMixin):
    """Wrap a 2-D array in a DataFrame with named columns.

    Intended to follow ``ct3`` in a pipeline so that later steps can select
    columns by name again.

    Parameters
    ----------
    columns : sequence of str, optional
        Column labels to assign, in positional order. When ``None`` (the
        default) the feature columns that remain after dropping ``ID`` and
        ``ZIP Code`` are used, in their original order.
    """

    # Default labels: the feature columns left after `ct3` drops ID / ZIP Code.
    _DEFAULT_COLUMNS = (
        'Age', 'Experience', 'Income',
        'Family', 'CCAvg',
        'Education', 'Mortgage', 'Securities Account', 'CD Account',
        'Online', 'CreditCard',
    )

    def __init__(self, columns=None):
        # Store the argument untouched (scikit-learn estimator convention).
        # The previous implementation captured the notebook-level `X` here,
        # which tied the transformer to hidden global state and was never used.
        self.columns = columns

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` as a DataFrame with a 0..n-1 index and named columns.

        Parameters
        ----------
        X : array-like exposing ``.shape``
            Data whose columns are labelled positionally.
        y : ignored

        Returns
        -------
        pandas.DataFrame
        """
        names = list(self._DEFAULT_COLUMNS if self.columns is None else self.columns)
        return pd.DataFrame(X, index=range(X.shape[0]), columns=names)
    

In [31]:
# Confirm that make_column_transformer returned a ColumnTransformer.
type(ct3)

sklearn.compose._column_transformer.ColumnTransformer

In [32]:
# Chain the drop step with ArrayToDf so the output is a DataFrame with named
# columns instead of a bare array.
pipchk=make_pipeline(ct3,ArrayToDf())

In [33]:
# Confirm that make_pipeline returned a Pipeline.
type(pipchk)

sklearn.pipeline.Pipeline

In [34]:
# Run the two-step check pipeline on the feature frame.
pipchk.fit_transform(X)

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Securities Account,CD Account,Online,CreditCard
0,25,1,49,4,1.6,1,0,1,0,0,0
1,45,19,34,3,1.5,1,0,1,0,0,0
2,39,15,11,1,1,1,0,0,0,0,0
3,35,9,100,1,2.7,2,0,0,0,0,0
4,35,8,45,4,1,2,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
4995,29,3,40,1,1.9,3,0,0,0,1,0
4996,30,4,15,4,0.4,1,85,0,0,1,0
4997,63,39,24,2,0.3,3,0,0,0,0,0
4998,65,40,49,3,0.5,2,0,0,0,1,0


In [35]:
class CustomLabelEncode(BaseEstimator, TransformerMixin):
    """Label-encode categorical DataFrame columns, one LabelEncoder per column.

    The encoders are learned in ``fit`` and reused in ``transform`` so that
    the integer codes assigned to new data match the ones seen at fit time.
    Previously a single encoder was re-fitted inside ``transform`` and the
    result was written back into the caller's DataFrame.

    Parameters
    ----------
    columns : sequence of str, optional
        Columns to encode. When ``None`` (the default) the notebook-level
        ``cat_cols`` list is used, matching the original behaviour.
    """

    def __init__(self, columns=None):
        # Store the argument untouched (scikit-learn estimator convention).
        self.columns = columns

    def _target_columns(self):
        """Resolve the list of columns this transformer encodes."""
        return list(cat_cols if self.columns is None else self.columns)

    def fit(self, X, y=None):
        """Learn one LabelEncoder per target column of ``X``; returns ``self``."""
        self.encoders_ = {
            name: LabelEncoder().fit(X[name]) for name in self._target_columns()
        }
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the target columns replaced by integer codes.

        Raises
        ------
        ValueError
            Raised by ``LabelEncoder.transform`` when a fitted column contains
            a label that was not present during ``fit``.
        """
        # Work on a copy so the caller's frame is left untouched.
        encoded = X.copy()
        fitted = getattr(self, "encoders_", {})
        for name in self._target_columns():
            encoder = fitted.get(name)
            if encoder is None:
                # Legacy path: transform() called without a prior fit() still
                # works by fitting on the data at hand, as the original did.
                encoded[name] = LabelEncoder().fit_transform(encoded[name])
            else:
                encoded[name] = encoder.transform(encoded[name])
        return encoded
            

In [36]:
# Label-encode the cat_cols columns; all remaining columns pass through and
# are placed after the encoded ones in the output.
le_ct=make_column_transformer((CustomLabelEncode(),cat_cols),remainder='passthrough')

In [37]:
# Apply the encoder to the full feature frame. ID and ZIP Code are still
# present in this output because the drop step (ct3) is not part of le_ct.
pd.DataFrame(le_ct.fit_transform(X))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,3,0,1,0,0,0,1,25,1,49,91107,1.6,0
1,2,0,1,0,0,0,2,45,19,34,90089,1.5,0
2,0,0,0,0,0,0,3,39,15,11,94720,1,0
3,0,1,0,0,0,0,4,35,9,100,94112,2.7,0
4,3,1,0,0,0,1,5,35,8,45,91330,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0,2,0,0,1,0,4996,29,3,40,92697,1.9,0
4996,3,0,0,0,1,0,4997,30,4,15,92037,0.4,85
4997,1,2,0,0,0,0,4998,63,39,24,93023,0.3,0
4998,2,1,0,0,1,0,4999,65,40,49,90034,0.5,0


In [38]:
class ArrayToDfUpdated(BaseEstimator,TransformerMixin):
    """Wrap a 2-D array in a DataFrame using the column order produced by ``le_ct``.

    ``le_ct`` emits the label-encoded categorical columns first and the
    passed-through numeric columns after them; this transformer restores the
    matching names.

    Parameters
    ----------
    columns : sequence of str, optional
        Column labels to assign, in positional order. When ``None`` (the
        default) the encoded categorical columns followed by the numeric
        columns are used.
    """

    # Default labels: categorical columns first, then the numeric columns.
    _DEFAULT_COLUMNS = (
        'Family', 'Education', 'Securities Account', 'CD Account', 'Online',
        'CreditCard', 'Age', 'Experience',
        'Income', 'CCAvg', 'Mortgage',
    )

    def __init__(self, columns=None):
        # Store the argument untouched (scikit-learn estimator convention).
        # The previous implementation captured the notebook-level `X` here,
        # which tied the transformer to hidden global state and was never used.
        self.columns = columns

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` as a DataFrame with a 0..n-1 index and named columns.

        Parameters
        ----------
        X : array-like exposing ``.shape``
            Data whose columns are labelled positionally.
        y : ignored

        Returns
        -------
        pandas.DataFrame
        """
        names = list(self._DEFAULT_COLUMNS if self.columns is None else self.columns)
        return pd.DataFrame(X, index=range(X.shape[0]), columns=names)
    

In [39]:
# Column order after le_ct: encoded categorical columns first, then the
# passed-through numeric columns.
# NOTE(review): no other cell shown here reads this variable — confirm it is still needed.
columns=['Family','Education','Securities Account','CD Account','Online','CreditCard','Age','Experience',
                                'Income', 'CCAvg','Mortgage']

In [44]:
# End-to-end preprocessing: drop ID/ZIP Code -> restore column names ->
# label-encode the categorical columns -> relabel to the reordered columns ->
# standardise every column -> restore column names once more.
Final_pipeline=make_pipeline(ct3,ArrayToDf(),le_ct,ArrayToDfUpdated(),StandardScaler(),ArrayToDfUpdated())

In [45]:
# Fit every step on X and display the standardised feature frame.
Final_pipeline.fit_transform(X)

Unnamed: 0,Family,Education,Securities Account,CD Account,Online,CreditCard,Age,Experience,Income,CCAvg,Mortgage
0,1.397414,-1.049078,2.928915,-0.25354,-1.216618,-0.645314,-1.774417,-1.666078,-0.538229,-0.193385,-0.555524
1,0.525991,-1.049078,2.928915,-0.25354,-1.216618,-0.645314,-0.029524,-0.096330,-0.864109,-0.250611,-0.555524
2,-1.216855,-1.049078,-0.341423,-0.25354,-1.216618,-0.645314,-0.552992,-0.445163,-1.363793,-0.536736,-0.555524
3,-1.216855,0.141703,-0.341423,-0.25354,-1.216618,-0.645314,-0.901970,-0.968413,0.569765,0.436091,-0.555524
4,1.397414,0.141703,-0.341423,-0.25354,-1.216618,1.549632,-0.901970,-1.055621,-0.625130,-0.536736,-0.555524
...,...,...,...,...,...,...,...,...,...,...,...
4995,-1.216855,1.332484,-0.341423,-0.25354,0.821951,-0.645314,-1.425438,-1.491662,-0.733757,-0.021710,-0.555524
4996,1.397414,-1.049078,-0.341423,-0.25354,0.821951,-0.645314,-1.338194,-1.404454,-1.276892,-0.880087,0.280238
4997,-0.345432,1.332484,-0.341423,-0.25354,-1.216618,-0.645314,1.540880,1.647835,-1.081363,-0.937312,-0.555524
4998,0.525991,0.141703,-0.341423,-0.25354,0.821951,-0.645314,1.715370,1.735043,-0.538229,-0.822862,-0.555524


In [46]:
# Confirm that the combined object is a Pipeline.
type(Final_pipeline)

sklearn.pipeline.Pipeline

In [1]:
import pandera as pa
from pandera import check,Column,DataFrameSchema

In [13]:
# Expected layout of the raw dataset: ID is a required, non-null integer;
# every other column is nullable and integer-typed, except CCAvg which is float.
schema = DataFrameSchema({
    'ID': Column(pa.Int),
    **{
        name: Column(pa.Float if name == 'CCAvg' else pa.Int, nullable=True)
        for name in (
            'Age', 'Experience', 'Income', 'ZIP Code', 'Family', 'CCAvg',
            'Education', 'Mortgage', 'Personal Loan', 'Securities Account',
            'CD Account', 'Online', 'CreditCard',
        )
    },
})

In [14]:
# Validate the raw CSV contents rather than `data`: an earlier cell casts nine
# columns of `data` to 'category', so on a top-to-bottom run those columns no
# longer satisfy the integer dtypes declared in `schema`.
schema.validate(pd.read_csv('./Bank_Personal_Loan_Modelling.csv'))

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,29,3,40,92697,1,1.9,3,0,0,0,0,1,0
4996,4997,30,4,15,92037,4,0.4,1,85,0,0,0,1,0
4997,4998,63,39,24,93023,2,0.3,3,0,0,0,0,0,0
4998,4999,65,40,49,90034,3,0.5,2,0,0,0,0,1,0
