### Data preprocessing

In [176]:
import pandas as pd
import matplotlib.pyplot as plt


In [177]:
df = pd.read_csv('./Dataset/bank-full.csv', delimiter=';')
df.head(20)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
5,35,management,married,tertiary,no,231,yes,no,unknown,5,may,139,1,-1,0,unknown,no
6,28,management,single,tertiary,no,447,yes,yes,unknown,5,may,217,1,-1,0,unknown,no
7,42,entrepreneur,divorced,tertiary,yes,2,yes,no,unknown,5,may,380,1,-1,0,unknown,no
8,58,retired,married,primary,no,121,yes,no,unknown,5,may,50,1,-1,0,unknown,no
9,43,technician,single,secondary,no,593,yes,no,unknown,5,may,55,1,-1,0,unknown,no


#### finding missing values

In [178]:
print(df.isnull().sum())

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


In [179]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [180]:
df.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


In [181]:
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
print("Numeric Columns:", numeric_columns)

Numeric Columns: Index(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous'], dtype='object')


In [182]:
categorical_columns = df.select_dtypes(include=['object']).columns
print("Categorical Columns:", categorical_columns)

Categorical Columns: Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'poutcome', 'y'],
      dtype='object')


In [183]:
boolean_columns = df.select_dtypes(include=['bool']).columns
print("Boolean Columns:", boolean_columns)

Boolean Columns: Index([], dtype='object')


In [184]:
df['job'].unique()
# df['marital'].unique()
# df['education'].unique()
# df['default'].unique()
# df['housing'].unique()
# df['contact'].unique()
# df['month'].unique()
# df['poutcome'].unique()
# df['y'].unique()
df['campaign'].unique()


array([ 1,  2,  3,  5,  4,  6,  7,  8,  9, 10, 11, 12, 13, 19, 14, 24, 16,
       32, 18, 22, 15, 17, 25, 21, 43, 51, 63, 41, 26, 28, 55, 50, 38, 23,
       20, 29, 31, 37, 30, 46, 27, 58, 33, 35, 34, 36, 39, 44],
      dtype=int64)

In [185]:
pd.crosstab(df['contact'], df['y'], normalize='index')

y,no,yes
contact,Unnamed: 1_level_1,Unnamed: 2_level_1
cellular,0.850811,0.149189
telephone,0.865795,0.134205
unknown,0.959293,0.040707


In [186]:
from scipy.stats import chi2_contingency

# If p value is < 0.05, the feature is influencing the targeted variable

crosstab = pd.crosstab(df['contact'], df['y'])
chi2, p, dof, expected = chi2_contingency(crosstab)
print("Chi-square statistic:", chi2)
print("p-value:", p)


Chi-square statistic: 1035.714225356292
p-value: 1.251738325340638e-225


In [187]:
df = df.drop(columns=['contact'])

In [188]:
crosstab = pd.crosstab(df['poutcome'], df['y'])
chi2, p, dof, expected = chi2_contingency(crosstab)
print("Chi-square statistic:", chi2)
print("p-value:", p)

Chi-square statistic: 4391.5065887686615
p-value: 0.0


In [189]:
crosstab = pd.crosstab(df['pdays'], df['y'])
chi2, p, dof, expected = chi2_contingency(crosstab)
print("Chi-square statistic:", chi2)
print("p-value:", p)

Chi-square statistic: 4527.588105960234
p-value: 0.0


In [190]:
crosstab = pd.crosstab(df['previous'], df['y'])
chi2, p, dof, expected = chi2_contingency(crosstab)
print("Chi-square statistic:", chi2)
print("p-value:", p)

Chi-square statistic: 1376.7948821882237
p-value: 7.564459382673111e-263


In [191]:
print(df.duplicated().sum())  # Count the number of duplicate rows

0


#### Encoding targeted variable y

In [192]:
df['y'] = df['y'].map({'yes': 1, 'no': 0})


In [193]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,no,29,yes,no,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,no,1506,yes,no,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,no,1,no,no,5,may,198,1,-1,0,unknown,0


In [194]:
# List of columns to apply Label Encoding to (yes/no columns)
yes_no_columns = ['default', 'housing', 'loan']

# Apply Label Encoding to each of the columns in the list
df[yes_no_columns] = df[yes_no_columns].replace({'yes': 1, 'no': 0})

df.head()

  df[yes_no_columns] = df[yes_no_columns].replace({'yes': 1, 'no': 0})


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,0,2143,1,0,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,0,29,1,0,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,0,2,1,1,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,0,1506,1,0,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,0,1,0,0,5,may,198,1,-1,0,unknown,0


In [195]:
df['job'].unique()

array(['management', 'technician', 'entrepreneur', 'blue-collar',
       'unknown', 'retired', 'admin.', 'services', 'self-employed',
       'unemployed', 'housemaid', 'student'], dtype=object)

In [196]:
# Label Encoding
education_mapping = {'primary': 0, 'secondary': 1, 'tertiary': 2, 'unknown': 3}

df['education_encoded'] = df['education'].map(education_mapping)
df = df.drop(columns=['education'])
df.head()

Unnamed: 0,age,job,marital,default,balance,housing,loan,day,month,duration,campaign,pdays,previous,poutcome,y,education_encoded
0,58,management,married,0,2143,1,0,5,may,261,1,-1,0,unknown,0,2
1,44,technician,single,0,29,1,0,5,may,151,1,-1,0,unknown,0,1
2,33,entrepreneur,married,0,2,1,1,5,may,76,1,-1,0,unknown,0,1
3,47,blue-collar,married,0,1506,1,0,5,may,92,1,-1,0,unknown,0,3
4,33,unknown,single,0,1,0,0,5,may,198,1,-1,0,unknown,0,3


In [197]:
# Apply One-Hot Encoding to the 'job' column
df_encoded = pd.get_dummies(df['job'], prefix='job')

# Convert True/False to 1/0
df_encoded = df_encoded.astype(int)

# Optionally, concatenate the encoded columns with the original dataframe
df = pd.concat([df, df_encoded], axis=1)

# Drop the original 'job' column (optional)
df.drop('job', axis=1, inplace=True)

df.head()

Unnamed: 0,age,marital,default,balance,housing,loan,day,month,duration,campaign,...,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown
0,58,married,0,2143,1,0,5,may,261,1,...,0,0,1,0,0,0,0,0,0,0
1,44,single,0,29,1,0,5,may,151,1,...,0,0,0,0,0,0,0,1,0,0
2,33,married,0,2,1,1,5,may,76,1,...,1,0,0,0,0,0,0,0,0,0
3,47,married,0,1506,1,0,5,may,92,1,...,0,0,0,0,0,0,0,0,0,0
4,33,single,0,1,0,0,5,may,198,1,...,0,0,0,0,0,0,0,0,0,1


In [198]:
print(df)

       age   marital  default  balance  housing  loan  day month  duration  \
0       58   married        0     2143        1     0    5   may       261   
1       44    single        0       29        1     0    5   may       151   
2       33   married        0        2        1     1    5   may        76   
3       47   married        0     1506        1     0    5   may        92   
4       33    single        0        1        0     0    5   may       198   
...    ...       ...      ...      ...      ...   ...  ...   ...       ...   
45206   51   married        0      825        0     0   17   nov       977   
45207   71  divorced        0     1729        0     0   17   nov       456   
45208   72   married        0     5715        0     0   17   nov      1127   
45209   57   married        0      668        0     0   17   nov       508   
45210   37   married        0     2971        0     0   17   nov       361   

       campaign  ...  job_entrepreneur  job_housemaid job_manag

In [199]:
df = df.drop(columns=['month','day'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 25 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   age                45211 non-null  int64 
 1   marital            45211 non-null  object
 2   default            45211 non-null  int64 
 3   balance            45211 non-null  int64 
 4   housing            45211 non-null  int64 
 5   loan               45211 non-null  int64 
 6   duration           45211 non-null  int64 
 7   campaign           45211 non-null  int64 
 8   pdays              45211 non-null  int64 
 9   previous           45211 non-null  int64 
 10  poutcome           45211 non-null  object
 11  y                  45211 non-null  int64 
 12  education_encoded  45211 non-null  int64 
 13  job_admin.         45211 non-null  int32 
 14  job_blue-collar    45211 non-null  int32 
 15  job_entrepreneur   45211 non-null  int32 
 16  job_housemaid      45211 non-null  int32

In [203]:
from sklearn.preprocessing import LabelEncoder
# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Apply Label Encoding to the 'poutcome' column
df['poutcome_encoded'] = label_encoder.fit_transform(df['poutcome'])

df.drop('poutcome', axis=1, inplace=True)


# Display the resulting DataFrame
print(df)

       age  default  balance  housing  loan  duration  campaign  pdays  \
0       58        0     2143        1     0       261         1     -1   
1       44        0       29        1     0       151         1     -1   
2       33        0        2        1     1        76         1     -1   
3       47        0     1506        1     0        92         1     -1   
4       33        0        1        0     0       198         1     -1   
...    ...      ...      ...      ...   ...       ...       ...    ...   
45206   51        0      825        0     0       977         3     -1   
45207   71        0     1729        0     0       456         2     -1   
45208   72        0     5715        0     0      1127         5    184   
45209   57        0      668        0     0       508         4     -1   
45210   37        0     2971        0     0       361         2    188   

       previous  y  ...  job_self-employed  job_services  job_student  \
0             0  0  ...               

In [201]:
# Apply One-Hot Encoding to the 'marital' column
df_encoded = pd.get_dummies(df['marital'], prefix='marital')

# Convert boolean columns to integers (1 for True, 0 for False)
df_encoded = df_encoded.astype(int)

# Concatenate the encoded columns with the original dataframe
df = pd.concat([df, df_encoded], axis=1)

# Drop the original 'marital' column
df.drop('marital', axis=1, inplace=True)

# Display the resulting DataFrame
print(df)

       age  default  balance  housing  loan  duration  campaign  pdays  \
0       58        0     2143        1     0       261         1     -1   
1       44        0       29        1     0       151         1     -1   
2       33        0        2        1     1        76         1     -1   
3       47        0     1506        1     0        92         1     -1   
4       33        0        1        0     0       198         1     -1   
...    ...      ...      ...      ...   ...       ...       ...    ...   
45206   51        0      825        0     0       977         3     -1   
45207   71        0     1729        0     0       456         2     -1   
45208   72        0     5715        0     0      1127         5    184   
45209   57        0      668        0     0       508         4     -1   
45210   37        0     2971        0     0       361         2    188   

       previous poutcome  ...  job_self-employed  job_services  job_student  \
0             0  unknown  ...   

In [204]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   age                45211 non-null  int64
 1   default            45211 non-null  int64
 2   balance            45211 non-null  int64
 3   housing            45211 non-null  int64
 4   loan               45211 non-null  int64
 5   duration           45211 non-null  int64
 6   campaign           45211 non-null  int64
 7   pdays              45211 non-null  int64
 8   previous           45211 non-null  int64
 9   y                  45211 non-null  int64
 10  education_encoded  45211 non-null  int64
 11  job_admin.         45211 non-null  int32
 12  job_blue-collar    45211 non-null  int32
 13  job_entrepreneur   45211 non-null  int32
 14  job_housemaid      45211 non-null  int32
 15  job_management     45211 non-null  int32
 16  job_retired        45211 non-null  int32
 17  job_self-emp

In [205]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier  # Neural Network model
from sklearn.metrics import accuracy_score, classification_report  # Evaluation metrics


# Features (all columns except 'y')
X = df.drop('y', axis=1)

# Target variable
y = df['y']

# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Neural Network model (MLPClassifier)
model = MLPClassifier(hidden_layer_sizes=(10,), max_iter=1000, random_state=42)

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model using accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Optionally, get a detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Model Accuracy: 88.62%

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.97      0.94      7952
           1       0.55      0.30      0.38      1091

    accuracy                           0.89      9043
   macro avg       0.73      0.63      0.66      9043
weighted avg       0.87      0.89      0.87      9043

