In [355]:
import os
# Directory holding the Graphviz executables; presumably needed so that the
# graphviz / export_graphviz rendering imported below can find `dot`.
# NOTE(review): this is a machine-specific Windows install location — adjust it
# when running the notebook on another machine.
GRAPHVIZ_BIN = 'D:/Program Files (x86)/Graphviz2.38/bin/'

# Extend PATH only when the directory really exists and is not already listed.
# An unconditional `+=` would append a duplicate entry every time this cell is
# re-run and would add a dead entry on machines without this install.
if os.path.isdir(GRAPHVIZ_BIN) and GRAPHVIZ_BIN not in os.environ["PATH"].split(os.pathsep):
    os.environ["PATH"] += os.pathsep + GRAPHVIZ_BIN
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint 
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

In [356]:
# Read the cleaned bank dataset and discard two columns that are not used
# below (presumably a binary copy of `response` and a saved row index —
# confirm against the CSV).
df = pd.read_csv("bank_cleaned.csv").drop(
    columns=["response_binary", "Unnamed: 0"]
)

In [357]:
# Preview the first rows to check the columns that survived the drop.
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,day,month,duration,campaign,pdays,previous,poutcome,response
0,58,management,married,tertiary,no,2143,yes,no,5,may,4.35,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,5,may,2.52,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,5,may,1.27,1,-1,0,unknown,no
3,35,management,married,tertiary,no,231,yes,no,5,may,2.32,1,-1,0,unknown,no
4,28,management,single,tertiary,no,447,yes,yes,5,may,3.62,1,-1,0,unknown,no


In [358]:
# Integer-encode the binary-style columns.
# For `poutcome`, both 'unknown' and 'failure' collapse to 0, so the column
# becomes a "previous campaign succeeded" flag.
# NOTE(review): any value not listed in a mapping (e.g. another `poutcome`
# category, if the data has one) becomes NaN — confirm the categories.
binary_maps = {
    "poutcome": {'unknown': 0, 'failure': 0, 'success': 1},
    "loan": {'no': 0, 'yes': 1},
    "housing": {'yes': 1, 'no': 0},
    "default": {'yes': 1, 'no': 0},
}

for col, mapping in binary_maps.items():
    # Series.map turns every value missing from `mapping` into NaN, so mapping
    # an already-encoded (numeric) column again would wipe it out. Skipping
    # numeric columns makes this cell safe to re-run.
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].map(mapping)

df.sample(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,day,month,duration,campaign,pdays,previous,poutcome,response
31032,38,technician,divorced,tertiary,0,304,1,0,20,apr,0.13,2,-1,0,0,no
19411,33,management,married,tertiary,0,1071,0,0,12,aug,1.38,4,-1,0,0,no
18077,45,admin.,married,secondary,0,925,0,0,6,aug,6.77,2,-1,0,0,yes
38753,39,technician,single,tertiary,0,25,1,0,28,dec,5.25,1,210,2,1,yes
23476,31,blue-collar,married,secondary,0,225,1,0,18,nov,3.93,1,-1,0,0,no


In [359]:
# Separate the target column from the feature columns.
y = df["response"]
X = df.drop(columns=["response"])

In [360]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
# NOTE(review): the split is not stratified on `y` — consider `stratify=y`
# if the response classes turn out to be imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=40,
)

<h1>Encoding</h1>

<h4>Ordinal Encoding</h4>

In [361]:
from sklearn.preprocessing import OrdinalEncoder

In [362]:
# List the distinct education levels so their order can be spelled out
# explicitly for the ordinal encoder.
df["education"].unique()

array(['tertiary', 'secondary', 'primary'], dtype=object)

In [363]:
# Education levels listed from lowest to highest; the position in this list
# is the integer code the encoder assigns (primary=0, secondary=1, tertiary=2).
education_order = ["primary", "secondary", "tertiary"]
oe = OrdinalEncoder(categories=[education_order])

In [364]:
# Fit the encoder on the training split only and preview the encoded values.
# The result is only displayed here, not stored.
oe.fit_transform(X_train[["education"]])

array([[1.],
       [1.],
       [1.],
       ...,
       [2.],
       [1.],
       [1.]])

In [365]:
# Overwrite the training `education` column with its ordinal codes.
# NOTE: not re-runnable — a second run feeds already-encoded numbers to the
# encoder, which only knows the original string categories.
X_train[["education"]] = oe.transform(X_train[["education"]])

In [366]:
# Encode the test `education` column with the encoder fitted on the training
# split (transform only, no refit).
X_test[["education"]] = oe.transform(X_test[["education"]])

<h4>Label Encoding</h4>

In [368]:
from sklearn.preprocessing import LabelEncoder

In [369]:
# Encoder for the target labels.
le = LabelEncoder()

In [370]:
# Learn the label classes from the training targets only.
# Presumably 'no' -> 0 and 'yes' -> 1 given the response values shown in the
# sample output above — confirm via `le.classes_`.
le.fit(y_train)

In [371]:
# Replace the training labels with their integer codes.
# NOTE: overwrites `y_train`, so this cell cannot be re-run without
# re-creating the split first.
y_train = le.transform(y_train)

In [372]:
# Encode the test labels with the encoder fitted on `y_train`.
y_test = le.transform(y_test)

<h4>Nominal Encoding</h4>

In [374]:
from sklearn.preprocessing import OneHotEncoder

In [375]:
# One-hot encoder for the nominal columns. The first category of each feature
# is dropped and the indicator columns are emitted as 32-bit integers.
ohe = OneHotEncoder(
    drop="first",
    dtype=np.int32,
)

In [376]:
# Learn the categories of the nominal columns from the training split only.
ohe.fit(X_train[["job", "marital", "month"]])

In [377]:
# Apply the fitted one-hot encoder to both splits and densify the result so
# it can be stacked next to the remaining columns.
nominal_cols = ["job", "marital", "month"]
X_train_encoded = ohe.transform(X_train[nominal_cols]).toarray()
X_test_encoded = ohe.transform(X_test[nominal_cols]).toarray()

In [378]:
# Preview the columns that remain once the nominal ones are removed, for both
# splits. A notebook cell only renders its last bare expression, so the
# training array would be computed and silently thrown away; `display`
# (available as a builtin in Jupyter/IPython kernels) shows both.
display(
    X_train.drop(['job', 'marital', 'month'], axis=1).values,
    X_test.drop(['job', 'marital', 'month'], axis=1).values,
)

array([[ 42.,   1.,   0., ...,  -1.,   0.,   0.],
       [ 39.,   1.,   0., ..., 175.,   5.,   0.],
       [ 57.,   2.,   0., ...,  -1.,   0.,   0.],
       ...,
       [ 41.,   2.,   0., ..., 315.,   4.,   0.],
       [ 73.,   1.,   0., ...,  -1.,   0.,   0.],
       [ 35.,   2.,   0., ...,  -1.,   0.,   0.]])

In [379]:
# Build the final feature matrices: remaining columns first, one-hot
# indicator columns appended to the right.
# NOTE: going through raw arrays drops the column labels, and overwriting
# X_train / X_test means this cell cannot be re-run on its own.
train_remaining = X_train.drop(columns=["job", "marital", "month"]).values
test_remaining = X_test.drop(columns=["job", "marital", "month"]).values

X_train = np.concatenate((train_remaining, X_train_encoded), axis=1)
X_test = np.concatenate((test_remaining, X_test_encoded), axis=1)

In [380]:
# Wrap both feature matrices in DataFrames. No column names are passed, so
# pandas assigns its default labels.
X_train, X_test = pd.DataFrame(X_train), pd.DataFrame(X_test)

In [381]:
# Train a random forest with default hyper-parameters on the encoded training
# data. The seed is fixed (same value as the train/test split) because the
# forest's bootstrapping and feature sampling are random: without it every
# Restart & Run All yields a slightly different model and accuracy.
rf = RandomForestClassifier(random_state=40)
rf.fit(X_train, y_train)

In [382]:
# Predict the encoded response label for every row of the held-out test set.
y_pred = rf.predict(X_test)

In [383]:
# Share of test rows whose predicted label matches the true label.
# NOTE(review): if the response classes are imbalanced, accuracy alone can be
# misleading — check precision/recall as well (both are imported above).
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9089239809034153
