In [28]:
# Zavier Morales
# Edouard Mason
from pathlib import Path
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import randint

# Define global variables & import data

RANDOM_STATE_ID = 100577770

TRAINING_DATA_PATH = Path('dataset', 'bank_06.pkl')
COMPETITION_DATA_PATH = Path('dataset', 'bank_competition.pkl')

training_times = {}
tuning_times = {}
prediction_times = {}

raw_data = pd.read_pickle(TRAINING_DATA_PATH)


## **EDA**

In [29]:
print("Shape of the dataset:\n", raw_data.shape)
print("\nFirst few rows:\n",raw_data.head())
print("\nList of columns:\n",raw_data.columns)
print("\n\n\nTarget variable:",raw_data['deposit'].head())



Shape of the dataset:
 (11000, 17)

First few rows:
    age         job  marital  education default  balance housing loan  contact  \
0   59      admin.  married  secondary      no     2343     yes   no  unknown   
1   56      admin.  married  secondary      no       45      no   no  unknown   
2   41  technician  married  secondary      no     1270     yes   no  unknown   
3   55    services  married  secondary      no     2476     yes   no  unknown   
4   54      admin.  married   tertiary      no      184      no   no  unknown   

   day month  duration  campaign  pdays  previous poutcome deposit  
0    5   may      1042         1     -1         0  unknown     yes  
1    5   may      1467         1     -1         0  unknown     yes  
2    5   may      1389         1     -1         0  unknown     yes  
3    5   may       579         1     -1         0  unknown     yes  
4    5   may       673         2     -1         0  unknown     yes  

List of columns:
 Index(['age', 'job', 'marit

In [30]:
#Cardinality of variables
unique_counts = raw_data.nunique()
print(unique_counts)


age            76
job            12
marital         3
education       4
default         2
balance      3783
housing         2
loan            2
contact         3
day            31
month          12
duration     1423
campaign       36
pdays         472
previous       34
poutcome        4
deposit         2
dtype: int64


In [31]:
# Encoding object-type variables
df_encoded = pd.get_dummies(raw_data, columns=['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome'])

#Converting all boolean columns to integers (1 for True, 0 for False)
df_encoded[df_encoded.select_dtypes(include=['bool']).columns] = df_encoded.select_dtypes(include=['bool']).astype(int)
print("\nColumns in the DataFrame:\n\n",df_encoded.columns)
print("\n\nFirst few rows of the DataFrame:\n\n",df_encoded.head())

#Checking the number of missing values per column
missing_values = df_encoded.isnull().sum()
print("\nMissing values per column:\n\n", missing_values)   #We get 0 for each column so we have no missing values





Columns in the DataFrame:

 Index(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous',
       'deposit', 'job_admin.', 'job_blue-collar', 'job_entrepreneur',
       'job_housemaid', 'job_management', 'job_retired', 'job_self-employed',
       'job_services', 'job_student', 'job_technician', 'job_unemployed',
       'job_unknown', 'marital_divorced', 'marital_married', 'marital_single',
       'education_primary', 'education_secondary', 'education_tertiary',
       'education_unknown', 'default_no', 'default_yes', 'housing_no',
       'housing_yes', 'loan_no', 'loan_yes', 'contact_cellular',
       'contact_telephone', 'contact_unknown', 'month_apr', 'month_aug',
       'month_dec', 'month_feb', 'month_jan', 'month_jul', 'month_jun',
       'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_sep',
       'poutcome_failure', 'poutcome_other', 'poutcome_success',
       'poutcome_unknown'],
      dtype='object')


First few rows of the DataFrame:

    age  balanc

In [32]:
# Filter rows where pdays == -1
count_minus1 = (raw_data['pdays'] == -1).sum()

print(count_minus1)

# We see that there are 8203 where pdays = -1, that is where we have no info or contact. 


8203


In [33]:
# We create a new binary column with value = 1 if previously contacted and 0 if pdays = -1 (no previous contact)
df_encoded["prev_contacted"] = (df_encoded["pdays"] != -1).astype(int)
# We only calculate the median on values greater than -1
known_pdays = df_encoded.loc[df_encoded['pdays'] != -1, "pdays"]
median_pdays = known_pdays.median()
print(median_pdays)

# We replace all -1 values with the median of the know_pdays
df_encoded['pdays_duration'] = df_encoded['pdays'].replace(-1, median_pdays)

# We drop the original pdays variable
df_encoded.drop('pdays', axis=1, inplace=True)


182.0


In [34]:
print(df_encoded['pdays_duration'].tail())
print(df_encoded.tail())

11157    182
11158    182
11159    182
11160    172
11161    182
Name: pdays_duration, dtype: int64
       age  balance  day  duration  campaign  previous deposit  job_admin.  \
11157   33        1   20       257         1         0      no           0   
11158   39      733   16        83         4         0      no           0   
11159   32       29   19       156         2         0      no           0   
11160   43        0    8         9         2         5      no           0   
11161   34        0    9       628         1         0      no           0   

       job_blue-collar  job_entrepreneur  ...  month_may  month_nov  \
11157                1                 0  ...          0          0   
11158                0                 0  ...          0          0   
11159                0                 0  ...          0          0   
11160                0                 0  ...          1          0   
11161                0                 0  ...          0          0   

    

In [35]:
print(raw_data.dtypes)

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
deposit      object
dtype: object


In [36]:
#We convert yes to 1 and no to 0 for the target 'deposit'
df_encoded["deposit"] = df_encoded["deposit"].map({"yes": 1, "no": 0}) #Used AI to convert yes to 1 and no to 0 in the target column 'deposit'

y = df_encoded["deposit"]
X = df_encoded.drop("deposit", axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE_ID)

num_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'previous', 'pdays_duration']
cat_cols = [col for col in df_encoded.columns if col not in num_cols and col != 'deposit']  # Used AI to create the loop selecting all columns that are not in num_cols and are not 'deposit'

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols), 
        ('cat', 'passthrough', cat_cols) 
    ]
)

X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)



### **Logistic Regression**

In [37]:
logreg = LogisticRegression(random_state=RANDOM_STATE_ID)

start_time = time.time()
logreg.fit(X_train_processed, y_train)
training_times["lr"] = time.time() - start_time

y_pred_log = logreg.predict(X_test_processed)

accuracy_log = accuracy_score(y_test, y_pred_log)

print(accuracy_log)



0.8181818181818182


### **KNN**

In [38]:
knn = KNeighborsClassifier(n_neighbors=3)

start_time = time.time()
knn.fit(X_train_processed, y_train)
training_times["knn"] = time.time() - start_time

y_pred_knn = knn.predict(X_test_processed)

accuracy_knn = accuracy_score(y_test, y_pred_knn)

print(accuracy_knn)


0.7995454545454546


### **Tree** 

In [39]:
tree = DecisionTreeClassifier()

start_time = time.time()
tree = tree.fit(X_train_processed, y_train)
training_times["tree"] = time.time() - start_time

y_pred_tree = tree.predict(X_test_processed)

accuracy_tree = accuracy_score(y_test, y_pred_tree)

print(accuracy_tree)

0.785


## **Hyperparameter Tuning**

### **Logistic Regression**

In [40]:

param_grid_lr = {
    'C' : np.logspace(-4, 4, 20), 
    'penalty': ['l2'],             
    'solver': ['lbfgs', 'liblinear'] 
}

logistic = LogisticRegression(random_state=RANDOM_STATE_ID, max_iter=1000)

# Find best hyperparameters using RandomizedSearchCV
grid_search = GridSearchCV(logistic, param_grid=param_grid_lr, 
                                     cv=5,
                                   scoring='f1',
                                    n_jobs=1
)

# Train the model with best hyperparameters
start_time = time.time()
grid_search.fit(X_train_processed, y_train)
tuning_times["lr"] = time.time() - start_time
best_params= grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best Parameters (Random Search): {best_params}")
print(f"Best Score (Random Search): {best_score}")

Best Parameters (Random Search): {'C': np.float64(4.281332398719396), 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score (Random Search): 0.8185949201235179


In [41]:
# Find accuracy over entire training set
tuned_lr = grid_search.best_estimator_
y_pred_tuned = tuned_lr.predict(X_test_processed)
accuracy_tuned_lr = accuracy_score(y_pred_tuned,y_test )

print(accuracy_tuned_lr)
print(tuning_times)

0.8154545454545454
{'lr': 5.3054118156433105}


### **KNN**

In [42]:
param_grid_knn = { 'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}

knn = KNeighborsClassifier()

#Tuning using RandomizedSearchCV
grid_search_knn = GridSearchCV(knn, param_grid=param_grid_knn, 
                                cv=5, scoring="accuracy", 
                                n_jobs=1
)

start_time = time.time()
grid_search_knn.fit(X_train_processed, y_train)
tuning_times["knn"] = time.time() - start_time
best_params= grid_search_knn.best_params_
best_score = grid_search_knn.best_score_

print(f"Best Parameters (Random Search): {best_params}")
print(f"Best Score (Random Search): {best_score}")


Best Parameters (Random Search): {'metric': 'minkowski', 'n_neighbors': 15, 'weights': 'distance'}
Best Score (Random Search): 0.8103409090909091


In [43]:
tuned_knn = grid_search_knn.best_estimator_
y_pred_tuned = tuned_knn.predict(X_test_processed)
accuracy_tuned_knn = accuracy_score(y_pred_tuned, y_test)

print(accuracy_tuned_knn)
print(tuning_times)

0.8163636363636364
{'lr': 5.3054118156433105, 'knn': 16.83409810066223}


### **Tree**

In [44]:
param_dist_tree = {
     'criterion': ['gini', 'entropy'],
    'max_depth': randint(1, 20),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20)
}
tree = DecisionTreeClassifier(random_state=RANDOM_STATE_ID)

#Tuning using RandomizedSearchCV
random_search_tree = RandomizedSearchCV(tree, param_distributions=param_dist_tree, 
                                   cv=3, scoring='accuracy',
                                   n_iter=50, random_state=RANDOM_STATE_ID,
                                   n_jobs=1
)

start_time = time.time()
random_search_tree.fit(X_train_processed, y_train)
tuning_times["tree"] = time.time() - start_time
best_params= random_search_tree.best_params_
best_score = random_search_tree.best_score_

print(f"Best Parameters (Random Search): {best_params}")
print(f"Best Score (Random Search): {best_score}")

Best Parameters (Random Search): {'criterion': 'gini', 'max_depth': 13, 'min_samples_leaf': 18, 'min_samples_split': 3}
Best Score (Random Search): 0.8294310648952874


In [45]:
tuned_tree = random_search_tree.best_estimator_
y_pred_tuned = tuned_tree.predict(X_test_processed)

accuracy_tree_tuned = accuracy_score(y_pred_tuned, y_test)

print(accuracy_tree_tuned)
print(tuning_times)

0.8368181818181818
{'lr': 5.3054118156433105, 'knn': 16.83409810066223, 'tree': 4.990444898605347}


## **Advanced Models**

### **SVM**

In [46]:
from sklearn.svm import SVC

svm_model = SVC(random_state=RANDOM_STATE_ID)

svm_model.fit(X_train_processed, y_train)

y_pred_svm = svm_model.predict(X_test_processed)

accuracy_svm = accuracy_score(y_pred_svm, y_test)

print("SVM Accuracy Score:", accuracy_svm)
                  

SVM Accuracy Score: 0.8563636363636363


### **Random Forest Classifier**


In [47]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=RANDOM_STATE_ID)

rf.fit(X_train_processed, y_train)

y_pred_rf= rf.predict(X_test_processed)

accuracy_rf = accuracy_score(y_pred_rf, y_test)

print("Random Forest Accuracy score:", accuracy_rf)

Random Forest Accuracy score: 0.8522727272727273


#### **XGBoost**

In [48]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier(
    n_estimators=100,        
    max_depth=6,             
    learning_rate=0.1,      
    objective='binary:logistic', 
    random_state=RANDOM_STATE_ID
)

xgb_model.fit(X_train_processed, y_train)

y_pred_xgb = xgb_model.predict(X_test_processed)

accuracy_xgb = accuracy_score(y_pred_xgb, y_test)

print("XGBoost accuracy score:", accuracy_xgb)



XGBoost accuracy score: 0.8563636363636363


## **Results**

In [50]:
%pip install tabulate
import tabulate

## Results to report (in a table):

# by model (KNN, LR, DT)
models = ['Logistic Regression', 'KNN', 'Decision Tree']

# confidence intervals (RMSE)
#  before hyperparameter tuning
accuracies = [accuracy_log, accuracy_knn, accuracy_tree]
#  after hyperparameter tuning
tuned_accuracies = [accuracy_tuned_lr, accuracy_tuned_knn, accuracy_tree_tuned]
# accuracy improvement
accuracy_improvement = [(tuned_accuracies[idx] - acc) / acc for idx, acc in enumerate(accuracies)]

# training time
training_times
# hyperparam tuning time
tuning_times


headings = [
    'Model',
    'Accuracy without hyperparameter tuning (%)',
    'Accuracy after hyperparameters tuning (%)',
    'Accuracy improvement (%)',
    'Training time (s)',
    'Hyperparameter tuning time (s)',
]

data = [
    models,
    accuracies,
    tuned_accuracies,
    accuracy_improvement,
    [training_times[key] for key in ['lr', 'knn', 'tree']],
    [tuning_times[key] for key in ['lr', 'knn', 'tree']],
]

# round all numbered data
data = [[round(e, 4) for e in col] if type(col[0]) is not str else col for col in data]

data = np.array(data).transpose()

print(tabulate.tabulate(data, headers=headings))

# draw conclusions about different models (which ones benefited the most from hyper-tuning, speed, etc.)

Note: you may need to restart the kernel to use updated packages.
Model                  Accuracy without hyperparameter tuning (%)    Accuracy after hyperparameters tuning (%)    Accuracy improvement (%)    Training time (s)    Hyperparameter tuning time (s)
-------------------  --------------------------------------------  -------------------------------------------  --------------------------  -------------------  --------------------------------
Logistic Regression                                        0.8182                                       0.8155                     -0.0033               0.0529                            5.3054
KNN                                                        0.7995                                       0.8164                      0.021                0.0039                           16.8341
Decision Tree                                              0.785                                        0.8368                      0.066                0.080

## **Final Model**

In [51]:
#Load test dataset

test_set = pd.read_pickle(COMPETITION_DATA_PATH)

print(test_set.columns)

# make predictions about this using the best method

# save final model (in notes)
# save predictions in a file (pkl)

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome'],
      dtype='object')
