In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import tensorflow as tf
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Read the data from the csv on GitHub
bank_marketing_df = pd.read_csv("../Data/bank_marketing.csv")
bank_marketing_df.head()


Unnamed: 0,age,job,marital,education,default credit,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


**Random Forest Model**

In [2]:
# Convert categorical data to numeric with `pd.get_dummies`
bank_marketing_dummies_df = pd.get_dummies(bank_marketing_df, columns=['job', 'marital', 'education', 'default credit', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome', 'y'], drop_first=True)
bank_marketing_dummies_df = bank_marketing_dummies_df.astype(int)
bank_marketing_dummies_df.head()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_nonexistent,poutcome_success,y_yes
0,56,261,1,999,0,1,93,-36,4,5191,...,0,0,0,1,0,0,0,1,0,0
1,57,149,1,999,0,1,93,-36,4,5191,...,0,0,0,1,0,0,0,1,0,0
2,37,226,1,999,0,1,93,-36,4,5191,...,0,0,0,1,0,0,0,1,0,0
3,40,151,1,999,0,1,93,-36,4,5191,...,0,0,0,1,0,0,0,1,0,0
4,56,307,1,999,0,1,93,-36,4,5191,...,0,0,0,1,0,0,0,1,0,0


In [3]:
#shorten DF name
bank_df = bank_marketing_dummies_df.copy()
bank_df.head()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_nonexistent,poutcome_success,y_yes
0,56,261,1,999,0,1,93,-36,4,5191,...,0,0,0,1,0,0,0,1,0,0
1,57,149,1,999,0,1,93,-36,4,5191,...,0,0,0,1,0,0,0,1,0,0
2,37,226,1,999,0,1,93,-36,4,5191,...,0,0,0,1,0,0,0,1,0,0
3,40,151,1,999,0,1,93,-36,4,5191,...,0,0,0,1,0,0,0,1,0,0
4,56,307,1,999,0,1,93,-36,4,5191,...,0,0,0,1,0,0,0,1,0,0


In [4]:
# Define features set
X = bank_df.copy()
X.drop("y_yes", axis=1, inplace=True)
X.head()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_may,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_nonexistent,poutcome_success
0,56,261,1,999,0,1,93,-36,4,5191,...,1,0,0,0,1,0,0,0,1,0
1,57,149,1,999,0,1,93,-36,4,5191,...,1,0,0,0,1,0,0,0,1,0
2,37,226,1,999,0,1,93,-36,4,5191,...,1,0,0,0,1,0,0,0,1,0
3,40,151,1,999,0,1,93,-36,4,5191,...,1,0,0,0,1,0,0,0,1,0
4,56,307,1,999,0,1,93,-36,4,5191,...,1,0,0,0,1,0,0,0,1,0


In [5]:
# Define target vector
y = bank_df["y_yes"].ravel()
y

array([0, 0, 0, ..., 0, 1, 0])

In [6]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [7]:
# Creating StandardScaler instance
scaler = StandardScaler()

In [8]:
# Fitting Standard Scaler
X_scaler = scaler.fit(X_train)

In [9]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [10]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [11]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [12]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [13]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [14]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,8826,298
Actual 1,640,533


Accuracy Score : 0.9089055064581917
Classification Report
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      9124
           1       0.64      0.45      0.53      1173

    accuracy                           0.91     10297
   macro avg       0.79      0.71      0.74     10297
weighted avg       0.90      0.91      0.90     10297



In [15]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf_model.feature_importances_
# We can sort the features by their importance
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.30981460241608705, 'duration'),
 (0.09716649875483727, 'age'),
 (0.05719357173225984, 'nr.employed'),
 (0.04597934445508848, 'campaign'),
 (0.03712655703034047, 'euribor3m'),
 (0.03305703896305684, 'cons.conf.idx'),
 (0.030588677368851645, 'pdays'),
 (0.02300293463730807, 'poutcome_success'),
 (0.02289707521817387, 'emp.var.rate'),
 (0.02149552343461142, 'housing_yes'),
 (0.015101514812117182, 'day_of_week_thu'),
 (0.014393861666987579, 'previous'),
 (0.014235932452778571, 'loan_yes'),
 (0.014153541877316237, 'marital_married'),
 (0.013911214557114132, 'day_of_week_wed'),
 (0.013833728296509947, 'day_of_week_mon'),
 (0.013705265111121778, 'day_of_week_tue'),
 (0.013365089578447589, 'education_university.degree'),
 (0.012422167813963058, 'education_high.school'),
 (0.012181201417138379, 'marital_single'),
 (0.012180571416957704, 'job_technician'),
 (0.011665710759794319, 'contact_telephone'),
 (0.009993180936387704, 'default credit_unknown'),
 (0.009978099817654535, 'education_profe