In [7]:
#Imports
import pandas as pd
import altair as alt
import numpy as np
from sklearn import tree
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
import seaborn as sns
sns.set_theme(style="ticks")
import matplotlib.pyplot as plt

In [8]:
# Load the bank data and balance the two target classes: rows with y == 'yes'
# are resampled with replacement until there are as many of them as rows with
# y == 'no'.
# NOTE(review): balancing happens before the train/test splits further down,
# so copies of the same 'yes' row can land on both sides of a split and
# inflate the reported scores. Consider splitting first and oversampling only
# the training portion.
clean = pd.read_csv('https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/bank.csv')
bank_majority = clean[clean['y'] == 'no']
bank_minority = clean[clean['y'] == 'yes']
bank_minority_oversampled = resample(bank_minority, replace=True, n_samples=len(bank_majority), random_state=42)
# ignore_index=True: sampling with replacement repeats the original row
# labels, so give the combined frame a fresh, unique 0..n-1 index instead of
# carrying duplicate labels into every later step.
clean_balanced = pd.concat([bank_majority, bank_minority_oversampled], ignore_index=True)


In [9]:
#New features
# Three 0/1 flags derived from the contact-history columns, computed with
# vectorised comparisons rather than row-wise lambdas.

# last_contact: 0 when pdays equals 999, 1 otherwise.
# (999 is handled as a special value in this notebook — presumably "never
# contacted before"; confirm against the data dictionary.)
clean_balanced['last_contact'] = (clean_balanced['pdays'] != 999).astype('int64')

# recent_contact: 1 when pdays is below 30, 0 otherwise. The flag uses the
# same polarity as last_contact (1 = contact happened), so the 999 value maps
# to 0 rather than being reported as a "recent" contact.
clean_balanced['recent_contact'] = (clean_balanced['pdays'] < 30).astype('int64')

# previous_contact: 0 when `previous` is greater than 1, 1 otherwise.
# NOTE(review): given the column name, "1 when previous > 0" may have been the
# intent — confirm before relying on this feature.
clean_balanced['previous_contact'] = (~(clean_balanced['previous'] > 1)).astype('int64')

In [10]:
# Give the 'unknown' placeholder a column-specific label in education and job.
for column_name, unknown_label in (('education', "unknown_education"), ('job', "unknown_job")):
    clean_balanced[column_name] = clean_balanced[column_name].replace("unknown", unknown_label)

# poutcome_success: 1 where poutcome is 'success', 0 for every other value.
clean_balanced['poutcome_success'] = clean_balanced['poutcome'].eq('success').astype('int64')

In [30]:
# Replace placeholder values in the categorical columns, then integer-encode
# them so the estimators below receive numeric input.
# NOTE(review): an earlier cell already renames 'unknown' in job and education
# to 'unknown_job' / 'unknown_education', so after that cell the job and
# education lines here find nothing to replace. Decide on one strategy
# (separate "unknown" category vs. imputing a fixed value).
clean_balanced["job"] = clean_balanced["job"].replace({"unknown": "unknown_job"})
clean_balanced["marital"] = clean_balanced["marital"].replace(['unknown'], "married")
clean_balanced["education"] = clean_balanced["education"].replace(['unknown'], "university.degree")
clean_balanced["default"] = clean_balanced["default"].replace(['unknown'], "no")
clean_balanced["housing"] = clean_balanced["housing"].replace(['unknown'], "yes")
clean_balanced["loan"] = clean_balanced["loan"].replace(['unknown'], "no")
# 'nonexistent' outcomes are folded into 'failure'.
clean_balanced['poutcome'] = clean_balanced['poutcome'].replace(['nonexistent'], "failure")
# The 999 value in pdays is rewritten to 0.
# NOTE(review): if 0 is also a genuine pdays value the two become
# indistinguishable in this column — confirm; last_contact keeps the flag.
clean_balanced['pdays'] = clean_balanced['pdays'].replace(999, 0)

# Feature columns: integer codes assigned in order of first appearance.
categorical_cols = ['job', 'marital', 'education', 'default', 'housing',
                    'contact', 'month', 'day_of_week', 'poutcome', 'loan']
clean_balanced[categorical_cols] = clean_balanced[categorical_cols].apply(lambda col: pd.factorize(col)[0])

# Target: sort=True assigns codes in sorted label order, so 'no' -> 0 and
# 'yes' -> 1 regardless of which class happens to come first in the frame.
clean_balanced['y'] = pd.factorize(clean_balanced['y'], sort=True)[0]

# Keep only rows with a non-missing pdays value.
testClean = clean_balanced[clean_balanced['pdays'].notna()]

In [32]:
# Features and target are both taken from testClean so they always cover
# exactly the same rows, even when the pdays filter that builds testClean
# removes some.
X = testClean.drop(columns='y')
y = testClean['y']

In [33]:
# Mutual information between every feature column and the target, labelled by
# column name. The estimator is randomised, so random_state is fixed to make
# the ranking reproducible from run to run.
mutual_info = pd.Series(
    mutual_info_classif(X, y, random_state=42),
    index=X.columns,
)
# Displayed highest score first; `mutual_info` itself keeps the column order.
mutual_info.sort_values(ascending=False)

euribor3m           0.167292
cons.conf.idx       0.154050
cons.price.idx      0.153473
nr.employed         0.141926
emp.var.rate        0.125974
recent_contact      0.063882
month               0.061552
last_contact        0.056370
pdays               0.055476
poutcome_success    0.052995
poutcome            0.051380
previous            0.044803
previous_contact    0.036927
contact             0.031881
age                 0.031489
job                 0.023772
education           0.010759
campaign            0.010303
day_of_week         0.003281
housing             0.003249
marital             0.002210
loan                0.000307
default             0.000213
dtype: float64

In [34]:
def _seeded_mutual_info(features, target):
    """Score features with mutual_info_classif using a fixed random seed.

    Fixing the seed keeps the selected feature set stable between runs.
    """
    return mutual_info_classif(features, target, random_state=42)


# Keep the nine highest-scoring features (the variable name says "three", but
# k is 9; the name is kept so later references still work).
# NOTE(review): the selector is fitted on all rows, including those later
# held out for validation and testing — consider fitting on the training
# split only.
sel_three_feat = SelectKBest(_seeded_mutual_info, k=9).fit(X, y)
# Boolean mask over X's columns marking the selected features.
sel_bool = sel_three_feat.get_support()
X_sel = X[X.columns[sel_bool]]

In [37]:
# Two-stage hold-out: first set aside 15% of the rows as the test set, then
# set aside 15% of what remains as the validation set. Both stages use seed 42.
X_main, X_test, y_main, y_test = train_test_split(
    X_sel,
    y,
    test_size=0.15,
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_main,
    y_main,
    test_size=0.15,
    random_state=42,
)

In [38]:
# Random-forest hyper-parameter grid: 3 values for each of 4 settings, i.e.
# 81 combinations, each evaluated with 5-fold cross-validation.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

clf = RandomForestClassifier(n_jobs=-1, random_state=25)
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, n_jobs=-1)
# The search is fitted on the training split only.
grid_search.fit(X_train, y_train)


In [22]:
# Report the winning parameter combination and its cross-validated score.
for label, value in (
    ("Best hyperparameters: ", grid_search.best_params_),
    ("Accuracy score: ", grid_search.best_score_),
):
    print(label, value)

Best hyperparameters:  {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Accuracy score:  0.753671855299043


In [39]:
# Rebuild the classifier from the parameter combination the grid search
# picked (n_estimators, max_depth, min_samples_split, min_samples_leaf),
# keeping the same seed and parallelism as before.
clf = RandomForestClassifier(
    **grid_search.best_params_,
    random_state=25,
    n_jobs=-1,
)

In [40]:
# Train the rebuilt classifier on the training split.
clf.fit(X=X_train, y=y_train)

In [44]:
# Predicted labels for the validation split.
y_pred = clf.predict(X=X_val)


In [45]:
# Per-class precision / recall / F1 of the validation predictions.
report = classification_report(y_true=y_val, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.71      0.87      0.78      4189
           1       0.84      0.64      0.73      4191

    accuracy                           0.76      8380
   macro avg       0.77      0.76      0.75      8380
weighted avg       0.77      0.76      0.75      8380



In [43]:
# Confusion matrix on the held-out test split (rows: true label, columns:
# predicted label). Predictions are made from X_test here rather than reusing
# y_pred, which holds predictions for X_val and therefore has a different
# number of rows than y_test.
y_test_pred = clf.predict(X_test)
conmat = confusion_matrix(y_test, y_test_pred)
print(conmat)

[[4245  664]
 [1789 3161]]
