In [26]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [27]:
data = pd.read_csv("bank.csv")
data

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [28]:
df = data

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [30]:
df.shape

(45211, 17)

In [31]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [32]:
df.duplicated().sum()

0

In [33]:
categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
numerical_columns = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

# Handling missing values i.e. unknown in Every Categorical Columns

In [34]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
def handle_missing_values(df, categorical_columns):
    for col in categorical_columns:
        most_frequent = df[col].mode()[0]
        df[col].replace('unknown', most_frequent, inplace=True)
    return df
data = handle_missing_values(df, categorical_columns)
data

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,cellular,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,cellular,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,cellular,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,secondary,no,1506,yes,no,cellular,5,may,92,1,-1,0,unknown,no
4,33,blue-collar,single,secondary,no,1,no,no,cellular,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


# Encode Categorical Variables using LabelEncoder

In [35]:
def encode_categorical_variables(df, categorical_columns):
    label_encoders = {}
    for col in categorical_columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le
    return df, label_encoders
data, label_encoders = encode_categorical_variables(df, categorical_columns)
data

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,4,1,2,0,2143,1,0,0,5,8,261,1,-1,0,3,no
1,44,9,2,1,0,29,1,0,0,5,8,151,1,-1,0,3,no
2,33,2,1,1,0,2,1,1,0,5,8,76,1,-1,0,3,no
3,47,1,1,1,0,1506,1,0,0,5,8,92,1,-1,0,3,no
4,33,1,2,1,0,1,0,0,0,5,8,198,1,-1,0,3,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,9,1,2,0,825,0,0,0,17,9,977,3,-1,0,3,yes
45207,71,5,0,0,0,1729,0,0,0,17,9,456,2,-1,0,3,yes
45208,72,5,1,1,0,5715,0,0,0,17,9,1127,5,184,3,2,yes
45209,57,1,1,1,0,668,0,0,1,17,9,508,4,-1,0,3,no


# Feature Scaling for Numerical Features

In [36]:
def scale_numerical_features(df, numerical_columns):
    scaler = StandardScaler()
    df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
    return df, scaler
data, scaler = scale_numerical_features(df, numerical_columns)
data

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,1.606965,4,1,2,0,0.256419,1,0,0,-1.298476,8,0.011016,-0.569351,-0.411453,-0.251940,3,no
1,0.288529,9,2,1,0,-0.437895,1,0,0,-1.298476,8,-0.416127,-0.569351,-0.411453,-0.251940,3,no
2,-0.747384,2,1,1,0,-0.446762,1,1,0,-1.298476,8,-0.707361,-0.569351,-0.411453,-0.251940,3,no
3,0.571051,1,1,1,0,0.047205,1,0,0,-1.298476,8,-0.645231,-0.569351,-0.411453,-0.251940,3,no
4,-0.747384,1,2,1,0,-0.447091,0,0,0,-1.298476,8,-0.233620,-0.569351,-0.411453,-0.251940,3,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,0.947747,9,1,2,0,-0.176460,0,0,0,0.143418,9,2.791329,0.076230,-0.411453,-0.251940,3,yes
45207,2.831227,5,0,0,0,0.120447,0,0,0,0.143418,9,0.768224,-0.246560,-0.411453,-0.251940,3,yes
45208,2.925401,5,1,1,0,1.429593,0,0,0,0.143418,9,3.373797,0.721811,1.436189,1.050473,2,yes
45209,1.512791,1,1,1,0,-0.228024,0,0,1,0.143418,9,0.970146,0.399020,-0.411453,-0.251940,3,no


# Removing Outliers

In [37]:
def remove_outliers(df, numerical_columns):
    for col in numerical_columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
    return df
cleaned_data = remove_outliers(df, numerical_columns)
cleaned_data

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,1.606965,4,1,2,0,0.256419,1,0,0,-1.298476,8,0.011016,-0.569351,-0.411453,-0.25194,3,no
1,0.288529,9,2,1,0,-0.437895,1,0,0,-1.298476,8,-0.416127,-0.569351,-0.411453,-0.25194,3,no
2,-0.747384,2,1,1,0,-0.446762,1,1,0,-1.298476,8,-0.707361,-0.569351,-0.411453,-0.25194,3,no
3,0.571051,1,1,1,0,0.047205,1,0,0,-1.298476,8,-0.645231,-0.569351,-0.411453,-0.25194,3,no
4,-0.747384,1,2,1,0,-0.447091,0,0,0,-1.298476,8,-0.233620,-0.569351,-0.411453,-0.25194,3,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45198,-0.370689,4,1,2,0,0.021587,0,0,0,0.023260,9,0.290601,-0.246560,-0.411453,-0.25194,3,no
45202,-0.653211,0,2,1,0,-0.264480,0,0,0,0.143418,9,-0.132659,-0.569351,-0.411453,-0.25194,3,yes
45203,-1.689124,8,2,2,0,-0.410306,0,0,0,0.143418,9,0.030432,-0.569351,-0.411453,-0.25194,3,yes
45205,-1.500776,9,2,1,0,-0.281559,0,1,0,0.143418,9,0.496406,-0.246560,-0.411453,-0.25194,3,yes


# Encoding of Target Variable 'y'

In [38]:
target_encoder = LabelEncoder()
cleaned_data['y'] = target_encoder.fit_transform(cleaned_data['y'])

# Printing the shape of cleaned data

In [39]:
cleaned_data.shape

(28069, 17)