# Bank Marketing Data

### Attribute Desription

1 - age (numeric)

2 - job : type of job (categorical: 
"admin.","unknown","unemployed","management","housemaid","entrepreneur","student","blue-collar","self-employed","retired","technician","services") 

3 - marital : marital status (categorical: "married","divorced","single"; note: "divorced" means divorced or widowed)

4 - education (categorical: "unknown","secondary","primary","tertiary")

5 - default: has credit in default? (binary: "yes","no")

6 - balance: average yearly balance, in euros (numeric) 

7 - housing: has housing loan? (binary: "yes","no")

8 - loan: has personal loan? (binary: "yes","no")

9 - contact: contact communication type (categorical: "unknown","telephone","cellular") 

10 - day: last contact day of the month (numeric)

11 - month: last contact month of year (categorical: "jan", "feb", "mar", ..., "nov", "dec")

12 - duration: last contact duration, in seconds (numeric)

13 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)

14 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric, -1 means client was not previously contacted)

15 - previous: number of contacts performed before this campaign and for this client (numeric)

16 - poutcome: outcome of the previous marketing campaign (categorical: "unknown","other","failure","success")

17 - y : has the client subscribed a term deposit? (binary: "yes","no")


### Data Import

In [1]:
# importing pandas library

import pandas as pd

In [2]:
# passing filepath only to read_csv() function

raw_data = pd.read_csv('Desktop/Data/bank_marketing.csv')

In [3]:
# viewing first five rows of dataset using head() function

raw_data.head()

Unnamed: 0,"age;""job"";""marital"";""education"";""default"";""balance"";""housing"";""loan"";""contact"";""day"";""month"";""duration"";""campaign"";""pdays"";""previous"";""poutcome"";""y"""
0,"58;""management"";""married"";""tertiary"";""no"";2143..."
1,"44;""technician"";""single"";""secondary"";""no"";29;""..."
2,"33;""entrepreneur"";""married"";""secondary"";""no"";2..."
3,"47;""blue-collar"";""married"";""unknown"";""no"";1506..."
4,"33;""unknown"";""single"";""unknown"";""no"";1;""no"";""n..."


In [4]:
# using 'sep' parameter of read_csv() function

raw_data = pd.read_csv('Desktop/Data/bank_marketing.csv', sep=';')
raw_data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [5]:
# a quick glance at the number of rows, number of attributes and their datatypes

raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
age          45211 non-null int64
job          45211 non-null object
marital      45211 non-null object
education    45211 non-null object
default      45211 non-null object
balance      45211 non-null int64
housing      45211 non-null object
loan         45211 non-null object
contact      45211 non-null object
day          45211 non-null int64
month        45211 non-null object
duration     45211 non-null int64
campaign     45211 non-null int64
pdays        45211 non-null int64
previous     45211 non-null int64
poutcome     45211 non-null object
y            45211 non-null object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [6]:
# a short statistical summary of the dataset

raw_data.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


In [7]:
# a short statistical summary of the dataset, including categorical variables

raw_data.describe(include='all')

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
count,45211.0,45211,45211,45211,45211,45211.0,45211,45211,45211,45211.0,45211,45211.0,45211.0,45211.0,45211.0,45211,45211
unique,,12,3,4,2,,2,2,3,,12,,,,,4,2
top,,blue-collar,married,secondary,no,,yes,no,cellular,,may,,,,,unknown,no
freq,,9732,27214,23202,44396,,25130,37967,29285,,13766,,,,,36959,39922
mean,40.93621,,,,,1362.272058,,,,15.806419,,258.16308,2.763841,40.197828,0.580323,,
std,10.618762,,,,,3044.765829,,,,8.322476,,257.527812,3.098021,100.128746,2.303441,,
min,18.0,,,,,-8019.0,,,,1.0,,0.0,1.0,-1.0,0.0,,
25%,33.0,,,,,72.0,,,,8.0,,103.0,1.0,-1.0,0.0,,
50%,39.0,,,,,448.0,,,,16.0,,180.0,2.0,-1.0,0.0,,
75%,48.0,,,,,1428.0,,,,21.0,,319.0,3.0,-1.0,0.0,,


### Dealing with missing values

In [8]:
# checking for missing values

raw_data.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

### Dealing with Categorical Variables

In [9]:
# creating a savepoint

data = raw_data.copy()

In [10]:
# checking unique values in the 'default' column

data.default.unique()

array(['no', 'yes'], dtype=object)

In [11]:
data.housing.unique()

array(['yes', 'no'], dtype=object)

In [12]:
data.loan.unique()

array(['no', 'yes'], dtype=object)

In [13]:
# importing LabelEncoder class from the Scikit-Learn library

from sklearn.preprocessing import LabelEncoder

In [14]:
# creating an object of the Label Encoder class

lab_enc = LabelEncoder()

In [15]:
# label encoding the 'default' column using the Label Encoder class object

data['default'] = lab_enc.fit_transform(data['default'])

In [16]:
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,0,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,0,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,0,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,0,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,0,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [17]:
# applying inverse transform to get back original values from the encoded values

lab_enc.inverse_transform(data['default'][:5])

  if diff:


array(['no', 'no', 'no', 'no', 'no'], dtype=object)

In [18]:
# label encoding the 'housing' column using replace() function of pandas

data.housing = data.housing.replace({'yes':1, 'no':0})

In [19]:
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,0,2143,1,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,0,29,1,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,0,2,1,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,0,1506,1,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,0,1,0,no,unknown,5,may,198,1,-1,0,unknown,no


In [20]:
# label encoding the 'loan' column using replace() function of pandas

data.loan = data.loan.replace({'yes':1, 'no':0})

In [21]:
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,0,2143,1,0,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,0,29,1,0,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,0,2,1,1,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,0,1506,1,0,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,0,1,0,0,unknown,5,may,198,1,-1,0,unknown,no


In [22]:
data.y.unique()

array(['no', 'yes'], dtype=object)

In [23]:
# label encoding the 'y' column using replace() function of pandas

data.y = data.y.replace({'yes':1, 'no':0})

In [24]:
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,0,2143,1,0,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,0,29,1,0,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,0,2,1,1,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,0,1506,1,0,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,0,1,0,0,unknown,5,may,198,1,-1,0,unknown,0


In [25]:
# creating a savepoint

data_lab_encd = data.copy()

In [26]:
data.education.value_counts()

secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: education, dtype: int64

In [27]:
# one-hot encoding the 'education' column by using the get_dummies() function

pd.get_dummies(data_lab_encd[['education']])

Unnamed: 0,education_primary,education_secondary,education_tertiary,education_unknown
0,0,0,1,0
1,0,1,0,0
2,0,1,0,0
3,0,0,0,1
4,0,0,0,1
5,0,0,1,0
6,0,0,1,0
7,0,0,1,0
8,1,0,0,0
9,0,1,0,0


In [28]:
# one-hot encoding the 'education' column by using the get_dummies() function and prefix 'edu' and separator '-'

pd.get_dummies(data_lab_encd[['education']], prefix='edu', prefix_sep='-')

Unnamed: 0,edu-primary,edu-secondary,edu-tertiary,edu-unknown
0,0,0,1,0
1,0,1,0,0
2,0,1,0,0
3,0,0,0,1
4,0,0,0,1
5,0,0,1,0
6,0,0,1,0
7,0,0,1,0
8,1,0,0,0
9,0,1,0,0


In [29]:
# one-hot encoding two columns 'education' and 'job'

pd.get_dummies(data_lab_encd[['education','job']])

Unnamed: 0,education_primary,education_secondary,education_tertiary,education_unknown,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown
0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
5,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0
6,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0
7,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0
8,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
9,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [30]:
# one-hot encoding the categorical variables and storing the dummies in a separate dataframe

data_dummies = pd.get_dummies(data_lab_encd[['education','job','marital','contact','month','poutcome']])

In [31]:
# viewing the last five rows of the dataset using tail() function

data_dummies.tail()

Unnamed: 0,education_primary,education_secondary,education_tertiary,education_unknown,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
45206,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
45207,1,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,1
45208,0,1,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,1,0
45209,0,1,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
45210,0,1,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,1,0,0


In [32]:
# checking all the dummy variable names

data_dummies.columns

Index(['education_primary', 'education_secondary', 'education_tertiary',
       'education_unknown', 'job_admin.', 'job_blue-collar',
       'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired',
       'job_self-employed', 'job_services', 'job_student', 'job_technician',
       'job_unemployed', 'job_unknown', 'marital_divorced', 'marital_married',
       'marital_single', 'contact_cellular', 'contact_telephone',
       'contact_unknown', 'month_apr', 'month_aug', 'month_dec', 'month_feb',
       'month_jan', 'month_jul', 'month_jun', 'month_mar', 'month_may',
       'month_nov', 'month_oct', 'month_sep', 'poutcome_failure',
       'poutcome_other', 'poutcome_success', 'poutcome_unknown'],
      dtype='object')

In [33]:
# one-hot encoding the categorical variables, dropping the first dummy for each one and storing the dummies in a separate dataframe

data_dummies = pd.get_dummies(data_lab_encd[['education','job','marital','contact','month','poutcome']], drop_first=True)

In [34]:
data_dummies.tail()

Unnamed: 0,education_secondary,education_tertiary,education_unknown,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,...,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_other,poutcome_success,poutcome_unknown
45206,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
45207,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,1
45208,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
45209,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
45210,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0


In [35]:
data_dummies.columns

Index(['education_secondary', 'education_tertiary', 'education_unknown',
       'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_married', 'marital_single', 'contact_telephone',
       'contact_unknown', 'month_aug', 'month_dec', 'month_feb', 'month_jan',
       'month_jul', 'month_jun', 'month_mar', 'month_may', 'month_nov',
       'month_oct', 'month_sep', 'poutcome_other', 'poutcome_success',
       'poutcome_unknown'],
      dtype='object')

### Feature Scaling

In [36]:
data_lab_encd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
age          45211 non-null int64
job          45211 non-null object
marital      45211 non-null object
education    45211 non-null object
default      45211 non-null int64
balance      45211 non-null int64
housing      45211 non-null int64
loan         45211 non-null int64
contact      45211 non-null object
day          45211 non-null int64
month        45211 non-null object
duration     45211 non-null int64
campaign     45211 non-null int64
pdays        45211 non-null int64
previous     45211 non-null int64
poutcome     45211 non-null object
y            45211 non-null int64
dtypes: int64(11), object(6)
memory usage: 5.9+ MB


In [37]:
# dropping categorical variables for which dummies have been created

data_lab_encd.drop(['education','job','marital','contact','month','poutcome'], axis=1, inplace=True)

In [38]:
data_lab_encd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 11 columns):
age         45211 non-null int64
default     45211 non-null int64
balance     45211 non-null int64
housing     45211 non-null int64
loan        45211 non-null int64
day         45211 non-null int64
duration    45211 non-null int64
campaign    45211 non-null int64
pdays       45211 non-null int64
previous    45211 non-null int64
y           45211 non-null int64
dtypes: int64(11)
memory usage: 3.8 MB


In [39]:
# checking for correlation between numerical attributes

data_lab_encd[['age','balance','day','duration','campaign','pdays','previous']].corr()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


In [54]:
data_uncorr = data_lab_encd.copy()

In [41]:
# importing StandardScaler class from Scikit-Learn

from sklearn.preprocessing import StandardScaler

In [42]:
# creating an object of the StandardScaler class

std_scaler = StandardScaler()

In [43]:
# standardizing 'age' column

import numpy as np
std_scaler.fit_transform(np.array(data_uncorr['age']).reshape(len(data_uncorr['age']),1))



array([[ 1.60696496],
       [ 0.28852927],
       [-0.74738448],
       ...,
       [ 2.92540065],
       [ 1.51279098],
       [-0.37068857]])

In [44]:
data_uncorr.head()

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,y
0,58,0,2143,1,0,5,261,1,-1,0,0
1,44,0,29,1,0,5,151,1,-1,0,0
2,33,0,2,1,1,5,76,1,-1,0,0
3,47,0,1506,1,0,5,92,1,-1,0,0
4,33,0,1,0,0,5,198,1,-1,0,0


In [55]:
# standardizing all numerical columns

data_uncorr[['age','balance','day','duration','campaign','pdays','previous']] = std_scaler.fit_transform(data_uncorr[['age','balance','day','duration','campaign','pdays','previous']])

In [46]:
data_uncorr.head()

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,y
0,1.606965,0,0.256419,1,0,-1.298476,0.011016,-0.569351,-0.411453,-0.25194,0
1,0.288529,0,-0.437895,1,0,-1.298476,-0.416127,-0.569351,-0.411453,-0.25194,0
2,-0.747384,0,-0.446762,1,1,-1.298476,-0.707361,-0.569351,-0.411453,-0.25194,0
3,0.571051,0,0.047205,1,0,-1.298476,-0.645231,-0.569351,-0.411453,-0.25194,0
4,-0.747384,0,-0.447091,0,0,-1.298476,-0.23362,-0.569351,-0.411453,-0.25194,0


In [47]:
# using inverse_transform function to view the original values from the scaled values

std_scaler.inverse_transform(data_uncorr[['age','balance','day','duration','campaign','pdays','previous']].iloc[:5, :])

array([[ 5.800e+01,  2.143e+03,  5.000e+00,  2.610e+02,  1.000e+00,
        -1.000e+00,  0.000e+00],
       [ 4.400e+01,  2.900e+01,  5.000e+00,  1.510e+02,  1.000e+00,
        -1.000e+00,  0.000e+00],
       [ 3.300e+01,  2.000e+00,  5.000e+00,  7.600e+01,  1.000e+00,
        -1.000e+00,  0.000e+00],
       [ 4.700e+01,  1.506e+03,  5.000e+00,  9.200e+01,  1.000e+00,
        -1.000e+00,  0.000e+00],
       [ 3.300e+01,  1.000e+00,  5.000e+00,  1.980e+02,  1.000e+00,
        -1.000e+00,  0.000e+00]])

In [49]:
# importing Normalizer class from Scikit-Learn

from sklearn.preprocessing import Normalizer

In [50]:
# creating an object of the Normalizer class

norm = Normalizer()

In [51]:
data_uncorr.head()

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,y
0,58,0,2143,1,0,5,261,1,-1,0,0
1,44,0,29,1,0,5,151,1,-1,0,0
2,33,0,2,1,1,5,76,1,-1,0,0
3,47,0,1506,1,0,5,92,1,-1,0,0
4,33,0,1,0,0,5,198,1,-1,0,0


In [52]:
# normalizing all numerical columns

data_uncorr[['age','balance','day','duration','campaign','pdays','previous']] = norm.fit_transform(data_uncorr[['age','balance','day','duration','campaign','pdays','previous']])

In [53]:
data_uncorr.head()

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,y
0,0.026857,0,0.992304,1,0,0.002315,0.120855,0.000463,-0.000463,0.0,0
1,0.274973,0,0.181232,1,0,0.031247,0.943658,0.006249,-0.006249,0.0,0
2,0.397389,0,0.024084,1,1,0.06021,0.915198,0.012042,-0.012042,0.0,0
3,0.031135,0,0.997649,1,0,0.003312,0.060945,0.000662,-0.000662,0.0,0
4,0.164342,0,0.00498,0,0,0.0249,0.986051,0.00498,-0.00498,0.0,0


In [None]:
# checking the norm of each obervation (row)

s = 0

for i in range(7):
    s += data_uncorr[['age','balance','day','duration','campaign','pdays','previous']].iloc[0, i]**2

s

In [56]:
# merging numerical and categorical data

data_final = pd.concat([data_uncorr, data_dummies], axis=1)
data_final.head()

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,...,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_other,poutcome_success,poutcome_unknown
0,1.606965,0,0.256419,1,0,-1.298476,0.011016,-0.569351,-0.411453,-0.25194,...,0,0,0,1,0,0,0,0,0,1
1,0.288529,0,-0.437895,1,0,-1.298476,-0.416127,-0.569351,-0.411453,-0.25194,...,0,0,0,1,0,0,0,0,0,1
2,-0.747384,0,-0.446762,1,1,-1.298476,-0.707361,-0.569351,-0.411453,-0.25194,...,0,0,0,1,0,0,0,0,0,1
3,0.571051,0,0.047205,1,0,-1.298476,-0.645231,-0.569351,-0.411453,-0.25194,...,0,0,0,1,0,0,0,0,0,1
4,-0.747384,0,-0.447091,0,0,-1.298476,-0.23362,-0.569351,-0.411453,-0.25194,...,0,0,0,1,0,0,0,0,0,1


In [57]:
data_final.columns

Index(['age', 'default', 'balance', 'housing', 'loan', 'day', 'duration',
       'campaign', 'pdays', 'previous', 'y', 'education_secondary',
       'education_tertiary', 'education_unknown', 'job_blue-collar',
       'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired',
       'job_self-employed', 'job_services', 'job_student', 'job_technician',
       'job_unemployed', 'job_unknown', 'marital_married', 'marital_single',
       'contact_telephone', 'contact_unknown', 'month_aug', 'month_dec',
       'month_feb', 'month_jan', 'month_jul', 'month_jun', 'month_mar',
       'month_may', 'month_nov', 'month_oct', 'month_sep', 'poutcome_other',
       'poutcome_success', 'poutcome_unknown'],
      dtype='object')

In [65]:
# exporting processed data as csv

data_final.to_csv('Desktop/Data/bank_marketing_processed.csv', index=False)

### Train-Test Split

In [58]:
# splitting input and target variables

X = data_final.drop('y', axis=1)
y = data_final['y']

In [59]:
y.value_counts()

0    39922
1     5289
Name: y, dtype: int64

In [60]:
# checking target variable distribution

y.value_counts()/y.value_counts().sum()

0    0.883015
1    0.116985
Name: y, dtype: float64

In [61]:
# import train_test_split class from Scikit-Learn

from sklearn.model_selection import train_test_split

In [62]:
# splitting data into train and test sets with 70:30 ratio

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [63]:
# checking target variable distribution in train set

y_train.value_counts()/y_train.value_counts().sum()

0    0.882706
1    0.117294
Name: y, dtype: float64

In [64]:
# checking target variable distribution in test set

y_test.value_counts()/y_test.value_counts().sum()

0    0.883736
1    0.116264
Name: y, dtype: float64