In [1]:
import pandas as pd

# Reading in the CSV file

In [2]:
# Load the semicolon-separated bank marketing dataset into a DataFrame.
bank_data = pd.read_csv("bank-full.csv", sep=";")

# Analyzing the data

In [3]:
# Preview the first five rows to get a feel for the columns.
bank_data.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
# Average of the "age" column over all rows.
bank_data.loc[:, "age"].mean()

40.936210214328369

Now, let's look at the data types of the columns in the DataFrame.

In [5]:
# Show the dtype pandas inferred for each column when reading the CSV.
bank_data.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

Example of changing the data type of a DataFrame column.

In [6]:
# Cast only the "balance" column to 64-bit floats; every other column is untouched.
balance_as_float = bank_data["balance"].astype("float64")
bank_data["balance"] = balance_as_float

In [7]:
# Re-check the dtypes to confirm that "balance" is now float64.
bank_data.dtypes

age            int64
job           object
marital       object
education     object
default       object
balance      float64
housing       object
loan          object
contact       object
day            int64
month         object
duration       int64
campaign       int64
pdays          int64
previous       int64
poutcome      object
y             object
dtype: object

Normalizing the "balance" series. 

In [8]:
# Standardize "balance": subtract the column mean, then divide by the column
# standard deviation (z-score).
balance_mean = bank_data["balance"].mean()
balance_std = bank_data["balance"].std()
bank_data["balance"] = (bank_data["balance"] - balance_mean) / balance_std

In [9]:
# First five standardized balances.
bank_data["balance"].head(n=5)

0    0.256416
1   -0.437890
2   -0.446758
3    0.047205
4   -0.447086
Name: balance, dtype: float64

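The z-score method of normalization (standardization) is used: $z = (x - \mu) / \sigma$, where $\mu$ is the mean and $\sigma$ is the standard deviation of the series.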

Now, let's look at "duration" series.

In [10]:
# Summary statistics (count, mean, std, quartiles, min/max) for "duration".
bank_data.loc[:, "duration"].describe()

count    45211.000000
mean       258.163080
std        257.527812
min          0.000000
25%        103.000000
50%        180.000000
75%        319.000000
max       4918.000000
Name: duration, dtype: float64

Let's try binning the "duration" series.

In [11]:
# "duration" is already int64 when read from the CSV; the cast is kept only as a
# guard so the edge arithmetic below always starts from integers.
bank_data["duration"] = bank_data["duration"].astype("int64")

# Build 5 equally spaced edges (-> 4 bins) that span exactly [min, max].
# A plain range(min, max, int(width)) stops short of the maximum, so the largest
# durations would fall outside every bin and pd.cut would return NaN for them;
# it also yields only 3 bins whenever (max - min) is an exact multiple of 4.
duration_min = int(bank_data["duration"].min())
duration_max = int(bank_data["duration"].max())
binwidth = (duration_max - duration_min) / 4
# The last edge is set to the maximum explicitly rather than computed, so it is
# exact regardless of floating-point rounding.
bins = [duration_min + i * binwidth for i in range(4)] + [duration_max]

In [12]:
# Labels for the four duration bins, ordered from the shortest to the longest calls.
names = [
    "Low",
    "Medium",
    "Above average",
    "High",
]

In [13]:
# Assign each call duration to one of the labelled bins.
# include_lowest=True closes the first interval on the left; by default pd.cut
# uses (left, right] intervals, so rows equal to the lowest edge (the minimum
# duration) would match no bin and become NaN.
bank_data["duration-binned"] = pd.cut(
    bank_data["duration"],
    bins=bins,
    labels=names,
    include_lowest=True,
)

In [14]:
# First five binned durations.
bank_data["duration-binned"].head(n=5)

0    Low
1    Low
2    Low
3    Low
4    Low
Name: duration-binned, dtype: category
Categories (4, object): [Low < Medium < Above average < High]

In [15]:
# Last five binned durations.
bank_data["duration-binned"].tail(n=5)

45206    Low
45207    Low
45208    Low
45209    Low
45210    Low
Name: duration-binned, dtype: category
Categories (4, object): [Low < Medium < Above average < High]

One-hot encoding is necessary for categorical variables, as many machine learning models don't accept categorical variables as inputs.
Let's one-hot encode "marital" series.

In [16]:
# One indicator column per distinct value of "marital".
# The dtype is pinned so the indicators are 0/1 integers on every pandas version:
# since pandas 2.0 the default is bool, which would produce True/False columns.
one_hot_encoded = pd.get_dummies(bank_data["marital"], dtype="uint8")

In [17]:
# Frames to place side by side: the full dataset first, then the indicator columns.
joined_frames = [
    bank_data,
    one_hot_encoded,
]

In [18]:
# Concatenate column-wise so the indicator columns are appended to the right.
bank_data = pd.concat(joined_frames, axis="columns")

In [19]:
# Preview the first five rows, now including the binned and one-hot columns.
bank_data.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,duration,campaign,pdays,previous,poutcome,y,duration-binned,divorced,married,single
0,58,management,married,tertiary,no,0.256416,yes,no,unknown,5,...,261,1,-1,0,unknown,no,Low,0,1,0
1,44,technician,single,secondary,no,-0.43789,yes,no,unknown,5,...,151,1,-1,0,unknown,no,Low,0,0,1
2,33,entrepreneur,married,secondary,no,-0.446758,yes,yes,unknown,5,...,76,1,-1,0,unknown,no,Low,0,1,0
3,47,blue-collar,married,unknown,no,0.047205,yes,no,unknown,5,...,92,1,-1,0,unknown,no,Low,0,1,0
4,33,unknown,single,unknown,no,-0.447086,no,no,unknown,5,...,198,1,-1,0,unknown,no,Low,0,0,1


In the same way, all other categorical variables can be converted to integer indicator columns, enabling them to be passed to various ML models.

Now, there are some "unknown" values in the "education" series, so it needs to be cleaned.
As this is an object (string) column, one technique is to replace every "unknown" value with the mode of the series.

In [20]:
# First five education values before cleaning.
bank_data["education"].head(n=5)

0     tertiary
1    secondary
2    secondary
3      unknown
4      unknown
Name: education, dtype: object

In [28]:
# Most frequent education value(s); Series.mode() returns a Series, not a scalar.
mode = bank_data.loc[:, "education"].mode()
mode

0    secondary
dtype: object

In [33]:
# Replace the "unknown" placeholder with the most frequent known education level.
# The mode is computed from the data instead of being hard-coded, and the
# placeholder is excluded first so it can never be picked as its own replacement.
known_education = bank_data.loc[bank_data["education"] != "unknown", "education"]
# Series.mode() returns a Series (ties are possible); take the first entry.
education_mode = known_education.mode().iloc[0]
bank_data["education"] = bank_data["education"].replace("unknown", education_mode)

In [34]:
# First five education values after replacing "unknown".
bank_data["education"].head(n=5)

0     tertiary
1    secondary
2    secondary
3    secondary
4    secondary
Name: education, dtype: object

In this way, we can clean a dataset containing missing values. Another method to clean the dataset would be to delete every row that contains a NaN value (here, the "unknown" placeholders would first have to be converted to NaN).