In [20]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder,StandardScaler
from sklearn.compose import ColumnTransformer

In [21]:
# Load the dataset from bank.csv; the path is relative to the notebook's working directory.
df = pd.read_csv("bank.csv")

In [22]:
# Preview the first five rows to see which columns were loaded.
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


In [23]:
# Keep only the columns used in this demo. The target `deposit` is listed last
# because the train/test split below takes the final column as the label.
cols = [
    'age', 'job', 'marital', 'education',
    'balance', 'day', 'duration',
    'deposit',
]
df = df.loc[:, cols]

In [24]:
# Confirm that only the selected columns remain.
df.head()

Unnamed: 0,age,job,marital,education,balance,day,duration,deposit
0,59,admin.,married,secondary,2343,5,1042,yes
1,56,admin.,married,secondary,45,5,1467,yes
2,41,technician,married,secondary,1270,5,1389,yes
3,55,services,married,secondary,2476,5,579,yes
4,54,admin.,married,tertiary,184,5,673,yes


                          
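- job: Nominal (categorical) - Represents the occupation or job category of the person. It has no natural order, so it is one-hot encoded below.
- marital: Nominal (categorical) - Represents the marital status of the person. It has no natural order, so it is one-hot encoded below.
- education: Ordinal (categorical) - Represents the education level of the person. Its levels have a natural order, so it is ordinal encoded below.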

In [6]:
# Show a single row of the reduced frame.
df.head(1)

Unnamed: 0,age,job,marital,education,balance,day,duration,deposit
0,59,admin.,married,secondary,2343,5,1042,yes


In [25]:
from sklearn.model_selection import train_test_split

In [28]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible. Features are every column except the target `deposit`.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['deposit']),
    df['deposit'],
    test_size=0.2,
    random_state=42,
)

In [30]:
# Peek at two training rows; the index shows the original (shuffled) row labels.
X_train.head(2)

Unnamed: 0,age,job,marital,education,balance,day,duration
3955,28,student,single,tertiary,5741,10,1042
11150,34,management,married,secondary,355,21,314


# Without ColumnTransformer

In [38]:
# Standardize the numeric columns.
# The scaler is fitted on the training split only; the test split is then
# transformed with the statistics learned from training data, so no
# information from the test set leaks into preprocessing.
numeric_cols = ['age', 'balance', 'day', 'duration']
scaler = StandardScaler()
scalled_x_train = scaler.fit_transform(X_train[numeric_cols])
# Fixed: this previously transformed X_train again, so the "test" matrix was
# just a copy of the training matrix.
scalled_x_test = scaler.transform(X_test[numeric_cols])

In [39]:
# Inspect the standardized training features (column order: age, balance, day, duration).
scalled_x_train

array([[-1.11404081,  1.34627101, -0.66666896,  1.94194241],
       [-0.60927832, -0.37177816,  0.6358884 , -0.16253175],
       [ 0.56850085, -0.42090174, -0.66666896,  0.64688139],
       ...,
       [-0.52515123, -0.12009959,  0.5174741 , -0.50075081],
       [ 0.8208821 , -0.24705532,  1.10954563,  0.0051324 ],
       [-0.94578665, -0.4843796 ,  0.87271702, -0.292616  ]])

In [37]:
# One-hot encoder for the nominal columns.
# drop='first' omits the first category of each feature so the dummy columns
# are not redundant; sparse_output=False returns a dense ndarray.
# `sparse_output` replaces the `sparse` keyword, which was deprecated in
# scikit-learn 1.2 and removed in 1.4 (where `sparse=False` raises TypeError).
ohe = OneHotEncoder(drop='first', sparse_output=False)

In [40]:
# Learn the category set from the training split, then encode both splits
# with that same fitted encoder.
nominal_cols = ['job', 'marital']
ohe_xtrain = ohe.fit_transform(X_train[nominal_cols])
# Fixed: this previously encoded X_train again instead of the held-out X_test.
ohe_xtest = ohe.transform(X_test[nominal_cols])



In [41]:
# Inspect the dense one-hot matrix produced for the job and marital columns.
ohe_xtrain

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [43]:
# List the distinct education levels and their frequencies before fixing an ordinal order.
X_train['education'].value_counts()

education
secondary    4362
tertiary     2980
primary      1202
unknown       385
Name: count, dtype: int64

In [44]:
# Ordinal encoder with an explicit low-to-high category order; each level is
# mapped to its position in this list, so 'unknown' -> 0 and 'tertiary' -> 3.
education_order = ['unknown', 'primary', 'secondary', 'tertiary']
oc = OrdinalEncoder(categories=[education_order])

In [46]:
# Fit the ordinal encoder on the training split and reuse it, without
# refitting, on the test split.
edu_xtrain = oc.fit_transform(X_train[['education']])
# Fixed: this previously called fit_transform on X_train a second time, which
# refitted the encoder and never encoded the held-out X_test.
edu_xtest = oc.transform(X_test[['education']])

In [49]:
# Inspect the ordinal codes assigned to the training rows' education values.
edu_xtrain

array([[3.],
       [2.],
       [2.],
       ...,
       [2.],
       [3.],
       [3.]])

In [50]:
# Show one raw training row for comparison with the encoded arrays above.
X_train.head(1)

Unnamed: 0,age,job,marital,education,balance,day,duration
3955,28,student,single,tertiary,5741,10,1042


# Using ColumnTransformer

In [51]:
# Bundle the three per-column preprocessing steps into one transformer so that
# a single fit/transform call handles every column consistently.
# The output columns follow the order of the `transformers` list: scaled
# numeric columns, then one-hot columns, then the ordinal education code.
preprocessor = ColumnTransformer(
    transformers=[
        # Standardize the numeric features.
        ("scalling", StandardScaler(), ['age', 'day', 'balance', 'duration']),
        # One-hot encode the nominal features. `sparse_output` replaces the
        # `sparse` keyword, which was deprecated in scikit-learn 1.2 and
        # removed in 1.4.
        ('OheEncodinng', OneHotEncoder(drop='first', sparse_output=False), ['job', 'marital']),
        # Ordinal encode education using an explicit low-to-high order.
        ('ordinalEncoding', OrdinalEncoder(categories=[['unknown', 'primary', 'secondary', 'tertiary']]), ['education']),
    ],
    # Any column not listed above is passed through unchanged.
    remainder='passthrough'
)

In [52]:
# Display the transformer to review its configuration.
preprocessor

In [53]:
# Fit every sub-transformer on the training split and transform it in one call.
new_xtrain = preprocessor.fit_transform(X_train)
# Apply the already-fitted transformer to the held-out split (no refitting).
new_xtest = preprocessor.transform(X_test)



In [54]:
# Inspect the combined matrix: scaled numeric columns first, then the one-hot
# columns, with the ordinal education code in the last column.
new_xtrain

array([[-1.11404081, -0.66666896,  1.34627101, ...,  0.        ,
         1.        ,  3.        ],
       [-0.60927832,  0.6358884 , -0.37177816, ...,  1.        ,
         0.        ,  2.        ],
       [ 0.56850085, -0.66666896, -0.42090174, ...,  0.        ,
         0.        ,  2.        ],
       ...,
       [-0.52515123,  0.5174741 , -0.12009959, ...,  1.        ,
         0.        ,  2.        ],
       [ 0.8208821 ,  1.10954563, -0.24705532, ...,  1.        ,
         0.        ,  3.        ],
       [-0.94578665,  0.87271702, -0.4843796 , ...,  0.        ,
         1.        ,  3.        ]])