In [2]:
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [3]:
dataset = pd.read_csv('bank-full.csv', delimiter=';')
dataset.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [4]:
dataset.nunique()

age            77
job            12
marital         3
education       4
default         2
balance      7168
housing         2
loan            2
contact         3
day            31
month          12
duration     1573
campaign       48
pdays         559
previous       41
poutcome        4
y               2
dtype: int64

In [5]:
dataset.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [6]:
X = dataset.iloc[:, [0, 1, 2, 3, 4, 5, 6, 7, 8 , 11, 12, 13, 14, 15]].values
y = dataset.iloc[:, -1].values

In [7]:
np.unique(y, return_counts=True)

(array(['no', 'yes'], dtype=object), array([39922,  5289], dtype=int64))

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0, stratify = y)

In [9]:
lb = LabelEncoder()
for col in [4, 6, 7]:
    X_train[:, col] = lb.fit_transform(X_train[:, col])
    X_test[:, col] = lb.transform(X_test[:, col])

y_train = lb.fit_transform(y_train)
y_test = lb.transform(y_test)

In [10]:
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1,2,3,8,13])], remainder='passthrough')
X_train = ct.fit_transform(X_train)
X_test = ct.transform(X_test)

In [11]:
st = SMOTE(sampling_strategy='minority')
X_train, y_train = st.fit_resample(X_train, y_train)

In [12]:
pd.DataFrame(X_train[:, :]).nunique()

0      5756
1      6155
2      1184
3      1193
4     10349
5      4083
6      1903
7      3656
8      2624
9      7405
10     2074
11      348
12     5699
13    13078
14    11881
15     5218
16    13508
17    12873
18     2517
19     7072
20     3504
21     4272
22     4085
23     2639
24     4753
25     1977
26    26838
27      499
28    32701
29    11708
30     4258
31    28902
32    18701
33    10860
34     9103
dtype: int64

In [13]:
np.unique(y_train, return_counts=True)

(array([0, 1]), array([31937, 31937], dtype=int64))

In [14]:
sc = StandardScaler()
X_train[:, :] = sc.fit_transform(X_train[:, :])
X_test[:, :] = sc.transform(X_test[:, :])

In [15]:
X_train

array([[-0.392378  , -0.49373563, -0.18047974, ..., -0.58377554,
         2.60989894,  0.07949978],
       [ 2.98295434, -0.49373563, -0.18047974, ..., -0.19074074,
         2.20729982,  5.37061143],
       [-0.392378  , -0.49373563, -0.18047974, ..., -0.19074074,
        -0.48917801, -0.36142619],
       ...,
       [-0.392378  , -0.49373563, -0.18047974, ..., -0.34960784,
        -0.48917801, -0.36142619],
       [-0.392378  , -0.49373563, -0.18047974, ...,  1.42249077,
        -0.48917801, -0.36142619],
       [-0.392378  , -0.49373563, -0.18047974, ..., -0.34907149,
        -0.48917801, -0.36142619]])

In [16]:
X_test

array([[-0.3923780039157944, 2.271523617747956, -0.1804797394516539, ...,
        -0.19074073979396355, 2.46945738528567, 4.488759484029288],
       [-0.3923780039157944, 2.271523617747956, -0.1804797394516539, ...,
        -0.5837755421605579, -0.48917801239064435, -0.36142619259896464],
       [-0.3923780039157944, -0.49373563206988064, 6.246835567479312,
        ..., 0.9883636673058197, -0.48917801239064435,
        -0.36142619259896464],
       ...,
       [-0.3923780039157944, 2.271523617747956, -0.1804797394516539, ...,
        0.20229406257263086, 3.798970760190849, 0.07949977800360375],
       [-0.3923780039157944, 2.271523617747956, -0.1804797394516539, ...,
        -0.19074073979396355, -0.48917801239064435, -0.36142619259896464],
       [-0.3923780039157944, -0.49373563206988064, -0.1804797394516539,
        ..., 0.9883636673058197, -0.48917801239064435,
        -0.36142619259896464]], dtype=object)