# Imports

In [None]:
import numpy as np 
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.utils import resample
from sklearn.model_selection import GridSearchCV
import h5py
import pickle

# Data

In [None]:
# Load the three cleaned variants of the small bank dataset; all of them are
# semicolon-delimited CSV files.
bank_c_df, bank_c16_df, bank_c15_df = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in (
        '/content/bank_clean.csv',
        '/content/bank_clean16_without_default.csv',
        '/content/bank_clean15_without_duration.csv',
    )
)

## Bank Dataset

In [None]:
# Preview the first rows of the base dataset (bare expression so the notebook renders it).
bank_c_df.head()

Unnamed: 0,job,marital,education,default,balance,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,y,log_age,log_duration
0,0,0,0,0,1787,0,0,0,19,0,1,-1,0,0,0,3.401197,4.369448
1,1,0,1,0,4789,1,1,0,11,1,1,339,4,1,0,3.496508,5.393628
2,2,1,2,0,1350,1,0,0,16,2,1,330,1,1,0,3.555348,5.220356
3,2,0,2,0,1476,1,1,1,3,3,4,-1,0,0,0,3.401197,5.293305
4,3,0,1,0,0,1,0,1,5,1,1,-1,0,0,0,4.077537,5.420535


In [None]:
# Partition the rows by target label; y == 0 is treated as the majority class
# and y == 1 as the minority class.
majority_class = bank_c_df.loc[bank_c_df["y"] == 0]
minority_class = bank_c_df.loc[bank_c_df["y"] == 1]

# Every minority row is kept as-is.
sampled_minority = minority_class

# Draw, without replacement, as many majority rows as there are minority rows.
sampled_majority = resample(
    majority_class,
    replace=False,
    n_samples=len(minority_class),
    random_state=42,
)

# Stack both halves and shuffle the row order with a fixed seed.
# (The balanced frame could be persisted with balanced_data.to_csv if needed.)
balanced_data = pd.concat([sampled_majority, sampled_minority]).sample(frac=1, random_state=42)

In [None]:
# Build the feature matrix: drop the target and, when present, "Unnamed: 0".
# "Unnamed: 0" holds 0, 1, 2, ... in the preview, i.e. it looks like the row
# index that was serialized into the CSV rather than a client attribute, so it
# must not be handed to the model as a predictor.
features = balanced_data.drop(columns="y").drop(columns="Unnamed: 0", errors="ignore")
target = balanced_data["y"]

# Hold out 20% of the balanced rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Random forest classifier: 100 trees, depth capped at 5, fixed seed.
rfc = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)



In [None]:
# Fit the forest on the training split.
rfc.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the held-out test split.
y_pred = rfc.predict(X=X_test)

In [None]:
# Score the test-set predictions with the four classification metrics.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
# Report the metrics rounded to two decimals, one per line.
report_lines = [
    "Evaluation metrics:",
    f"Accuracy: {accuracy:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1-score: {f1:.2f}",
]
print("\n".join(report_lines))

Evaluation metrics:
Accuracy: 0.79
Precision: 0.75
Recall: 0.82
F1-score: 0.79


## Bank Dataset Without Default

In [None]:
# (rows, columns) of the dataset variant without the "default" column.
bank_c16_df.shape

(4248, 16)

In [None]:
# Preview the first rows of the variant without the "default" column.
bank_c16_df.head()

Unnamed: 0,job,marital,education,balance,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,y,log_age,log_duration
0,0,0,0,1787,0,0,0,19,0,1,-1,0,0,0,3.401197,4.369448
1,1,0,1,4789,1,1,0,11,1,1,339,4,1,0,3.496508,5.393628
2,2,1,2,1350,1,0,0,16,2,1,330,1,1,0,3.555348,5.220356
3,2,0,2,1476,1,1,1,3,3,4,-1,0,0,0,3.401197,5.293305
4,3,0,1,0,1,0,1,5,1,1,-1,0,0,0,4.077537,5.420535


In [None]:
# Partition the rows by target label; y == 0 is treated as the majority class
# and y == 1 as the minority class.
majority_class = bank_c16_df.loc[bank_c16_df["y"] == 0]
minority_class = bank_c16_df.loc[bank_c16_df["y"] == 1]

# Every minority row is kept as-is.
sampled_minority = minority_class

# Draw, without replacement, as many majority rows as there are minority rows.
sampled_majority = resample(
    majority_class,
    replace=False,
    n_samples=len(minority_class),
    random_state=42,
)

# Stack both halves and shuffle the row order with a fixed seed.
# (The balanced frame could be persisted with balanced_data.to_csv if needed.)
balanced_data = pd.concat([sampled_majority, sampled_minority]).sample(frac=1, random_state=42)

In [None]:
# Build the feature matrix: drop the target and, when present, "Unnamed: 0".
# "Unnamed: 0" holds 0, 1, 2, ... in the preview, i.e. it looks like the row
# index that was serialized into the CSV rather than a client attribute, so it
# must not be handed to the model as a predictor.
features = balanced_data.drop(columns="y").drop(columns="Unnamed: 0", errors="ignore")
target = balanced_data["y"]

# Hold out 20% of the balanced rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Random forest classifier: 100 trees, depth capped at 5, fixed seed.
rfc = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)

# Train the model on the training split.
rfc.fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = rfc.predict(X_test)

In [None]:
# Score the test-set predictions with the four classification metrics.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
# Report the metrics rounded to two decimals, one per line.
report_lines = [
    "Evaluation metrics:",
    f"Accuracy: {accuracy:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1-score: {f1:.2f}",
]
print("\n".join(report_lines))

Evaluation metrics:
Accuracy: 0.78
Precision: 0.75
Recall: 0.80
F1-score: 0.78


## Bank Dataset Without Duration (C15)

In [None]:
# (rows, columns) of the dataset variant without the "log_duration" column.
bank_c15_df.shape

(4248, 15)

In [None]:
# Preview the first rows of the variant without the "log_duration" column.
bank_c15_df.head()

Unnamed: 0,job,marital,education,balance,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,y,log_age
0,0,0,0,1787,0,0,0,19,0,1,-1,0,0,0,3.401197
1,1,0,1,4789,1,1,0,11,1,1,339,4,1,0,3.496508
2,2,1,2,1350,1,0,0,16,2,1,330,1,1,0,3.555348
3,2,0,2,1476,1,1,1,3,3,4,-1,0,0,0,3.401197
4,3,0,1,0,1,0,1,5,1,1,-1,0,0,0,4.077537


In [None]:
# Partition the rows by target label; y == 0 is treated as the majority class
# and y == 1 as the minority class.
majority_class = bank_c15_df.loc[bank_c15_df["y"] == 0]
minority_class = bank_c15_df.loc[bank_c15_df["y"] == 1]

# Every minority row is kept as-is.
sampled_minority = minority_class

# Draw, without replacement, as many majority rows as there are minority rows.
sampled_majority = resample(
    majority_class,
    replace=False,
    n_samples=len(minority_class),
    random_state=42,
)

# Stack both halves and shuffle the row order with a fixed seed.
# (The balanced frame could be persisted with balanced_data.to_csv if needed.)
balanced_data = pd.concat([sampled_majority, sampled_minority]).sample(frac=1, random_state=42)

In [None]:
# Build the feature matrix: drop the target and, when present, "Unnamed: 0".
# "Unnamed: 0" holds 0, 1, 2, ... in the preview, i.e. it looks like the row
# index that was serialized into the CSV rather than a client attribute, so it
# must not be handed to the model as a predictor.
features = balanced_data.drop(columns="y").drop(columns="Unnamed: 0", errors="ignore")
target = balanced_data["y"]

# Hold out 20% of the balanced rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Random forest classifier: 100 trees, depth capped at 5, fixed seed.
rfc = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)

# Train the model on the training split.
rfc.fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = rfc.predict(X_test)

In [None]:
# Score the test-set predictions with the four classification metrics.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
# Report the metrics rounded to two decimals, one per line.
report_lines = [
    "Evaluation metrics:",
    f"Accuracy: {accuracy:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1-score: {f1:.2f}",
]
print("\n".join(report_lines))

Evaluation metrics:
Accuracy: 0.62
Precision: 0.59
Recall: 0.56
F1-score: 0.58


## Bank Full Dataset

In [None]:
# Load the cleaned full bank dataset (semicolon-delimited CSV).
bank_full_df = pd.read_csv(filepath_or_buffer='/content/bank-full-clean.csv', sep=";")

In [None]:
# (rows, columns) of the full dataset.
bank_full_df.shape

(43010, 17)

In [None]:
# Preview the first rows of the full dataset.
bank_full_df.head()

Unnamed: 0,job,marital,education,default,balance,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,y,log_age,log_duration
0,0,0,0,0,2143,0,0,0,5,0,1,-1,0,0,0,4.060443,5.56452
1,1,1,1,0,29,0,0,0,5,0,1,-1,0,0,0,3.78419,5.01728
2,2,0,1,0,2,0,1,0,5,0,1,-1,0,0,0,3.496508,4.330733
3,3,0,2,0,1506,0,0,0,5,0,1,-1,0,0,0,3.850148,4.521789
4,4,1,2,0,1,1,0,0,5,0,1,-1,0,0,0,3.496508,5.288267


In [None]:
# Partition the rows by target label; y == 0 is treated as the majority class
# and y == 1 as the minority class.
majority_class = bank_full_df.loc[bank_full_df["y"] == 0]
minority_class = bank_full_df.loc[bank_full_df["y"] == 1]

# Every minority row is kept as-is.
sampled_minority = minority_class

# Draw, without replacement, as many majority rows as there are minority rows.
sampled_majority = resample(
    majority_class,
    replace=False,
    n_samples=len(minority_class),
    random_state=42,
)

# Stack both halves and shuffle the row order with a fixed seed.
# (The balanced frame could be persisted with balanced_data.to_csv if needed.)
balanced_data = pd.concat([sampled_majority, sampled_minority]).sample(frac=1, random_state=42)

In [None]:
# Build the feature matrix: drop the target and, when present, "Unnamed: 0".
# "Unnamed: 0" holds 0, 1, 2, ... in the preview, i.e. it looks like the row
# index that was serialized into the CSV rather than a client attribute, so it
# must not be handed to the model as a predictor.
features = balanced_data.drop(columns="y").drop(columns="Unnamed: 0", errors="ignore")
target = balanced_data["y"]

# Hold out 20% of the balanced rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Carve a validation split (20% of the remaining training rows) out of the training set.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Random forest classifier: 200 fully grown trees (no depth cap), fixed seed.
rfc = RandomForestClassifier(n_estimators=200, max_depth=None, random_state=42, min_samples_leaf=1)

# Train the model on the reduced training split.
rfc.fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = rfc.predict(X_test)

In [None]:
# Score the test-set predictions with the four classification metrics.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
# Report the metrics rounded to two decimals, one per line.
report_lines = [
    "Evaluation metrics:",
    f"Accuracy: {accuracy:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1-score: {f1:.2f}",
]
print("\n".join(report_lines))

Evaluation metrics:
Accuracy: 0.86
Precision: 0.84
Recall: 0.88
F1-score: 0.86


In [None]:
# Score the same fitted model on the validation split. Note that this
# overwrites the accuracy/precision/recall/f1 variables from the test-set cell.
y_val_pred = rfc.predict(X_val)
accuracy, precision, recall, f1 = (
    metric(y_val, y_val_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the validation metrics rounded to two decimals, one per line.
summary_lines = [
    "Validation set performance:",
    "Accuracy: {:.2f}".format(accuracy),
    "Precision: {:.2f}".format(precision),
    "Recall: {:.2f}".format(recall),
    "F1-score: {:.2f}".format(f1),
]
print("\n".join(summary_lines))

Validation set performance:
Accuracy: 0.85
Precision: 0.84
Recall: 0.88
F1-score: 0.86


In [None]:
# Exhaustive hyperparameter search for the random forest.
# Candidate values per hyperparameter (3 x 3 x 3 x 3 = 81 combinations).
param_grid = dict(
    n_estimators=[100, 200, 500],
    max_depth=[5, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# 5-fold cross-validated search over the grid, using all available cores.
grid_search = GridSearchCV(estimator=rfc, param_grid=param_grid, n_jobs=-1, cv=5)

# Run the search on the training split only.
grid_search.fit(X_train, y_train)

# Show the winning combination.
print("Best hyperparameters:", grid_search.best_params_)

Best hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


## Save The Model

In [None]:

# Persist the fitted classifier to disk in pickle's binary format.
with open('random_forest_model.pkl', mode='wb') as f:
    pickle.dump(rfc, f)

## Load The Model

In [None]:
# Load the saved model from the binary file using pickle.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load a .pkl file that this notebook wrote itself, never one from an
# untrusted source.
with open('random_forest_model.pkl', 'rb') as f:
    model = pickle.load(f)

In [None]:
# Sanity-check the reloaded model by predicting on the test split again.
y_pred = model.predict(X=X_test)

<hr>

In [None]:
# Load the full dataset variant without the "default" column (semicolon-delimited CSV).
c16_df = pd.read_csv(filepath_or_buffer='/content/bank-full-clean16.csv', sep=';')

In [None]:
# (rows, columns) of the full dataset variant without "default".
c16_df.shape

(43010, 16)

In [None]:
# Preview the first rows of the full dataset variant without "default".
c16_df.head()

Unnamed: 0,job,marital,education,balance,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,y,log_age,log_duration
0,0,0,0,2143,0,0,0,5,0,1,-1,0,0,0,4.060443,5.56452
1,1,1,1,29,0,0,0,5,0,1,-1,0,0,0,3.78419,5.01728
2,2,0,1,2,0,1,0,5,0,1,-1,0,0,0,3.496508,4.330733
3,3,0,2,1506,0,0,0,5,0,1,-1,0,0,0,3.850148,4.521789
4,4,1,2,1,1,0,0,5,0,1,-1,0,0,0,3.496508,5.288267


In [None]:
# Partition the rows by target label; y == 0 is treated as the majority class
# and y == 1 as the minority class.
majority_class = c16_df.loc[c16_df["y"] == 0]
minority_class = c16_df.loc[c16_df["y"] == 1]

# Every minority row is kept as-is.
sampled_minority = minority_class

# Draw, without replacement, as many majority rows as there are minority rows.
sampled_majority = resample(
    majority_class,
    replace=False,
    n_samples=len(minority_class),
    random_state=42,
)

# Stack both halves and shuffle the row order with a fixed seed.
# (The balanced frame could be persisted with balanced_data.to_csv if needed.)
balanced_data = pd.concat([sampled_majority, sampled_minority]).sample(frac=1, random_state=42)

In [None]:
# Build the feature matrix: drop the target and, when present, "Unnamed: 0".
# "Unnamed: 0" holds 0, 1, 2, ... in the preview, i.e. it looks like the row
# index that was serialized into the CSV rather than a client attribute, so it
# must not be handed to the model as a predictor.
features = balanced_data.drop(columns="y").drop(columns="Unnamed: 0", errors="ignore")
target = balanced_data["y"]

# Hold out 20% of the balanced rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Carve a validation split (20% of the remaining training rows) out of the training set.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Random forest classifier: 200 fully grown trees (no depth cap), fixed seed.
rfc = RandomForestClassifier(n_estimators=200, max_depth=None, random_state=42, min_samples_leaf=1)

# Train the model on the reduced training split.
rfc.fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = rfc.predict(X_test)

In [None]:
# Score the test-set predictions with the four classification metrics.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
# Report the metrics rounded to two decimals, one per line.
report_lines = [
    "Evaluation metrics:",
    f"Accuracy: {accuracy:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1-score: {f1:.2f}",
]
print("\n".join(report_lines))

Evaluation metrics:
Accuracy: 0.86
Precision: 0.85
Recall: 0.89
F1-score: 0.87


In [None]:
# Score the same fitted model on the validation split. Note that this
# overwrites the accuracy/precision/recall/f1 variables from the test-set cell.
y_val_pred = rfc.predict(X_val)
accuracy, precision, recall, f1 = (
    metric(y_val, y_val_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the validation metrics rounded to two decimals, one per line.
summary_lines = [
    "Validation set performance:",
    "Accuracy: {:.2f}".format(accuracy),
    "Precision: {:.2f}".format(precision),
    "Recall: {:.2f}".format(recall),
    "F1-score: {:.2f}".format(f1),
]
print("\n".join(summary_lines))

Validation set performance:
Accuracy: 0.85
Precision: 0.84
Recall: 0.88
F1-score: 0.86


<hr>

In [None]:
# Load the full dataset variant without the "log_duration" column (semicolon-delimited CSV).
c15_df = pd.read_csv(filepath_or_buffer='/content/bank-full-clean15.csv', sep=';')

In [None]:
# (rows, columns) of the full dataset variant without "log_duration".
c15_df.shape

(43010, 15)

In [None]:
# Preview the first rows of the full dataset variant without "log_duration".
c15_df.head()

Unnamed: 0,job,marital,education,balance,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,y,log_age
0,0,0,0,2143,0,0,0,5,0,1,-1,0,0,0,4.060443
1,1,1,1,29,0,0,0,5,0,1,-1,0,0,0,3.78419
2,2,0,1,2,0,1,0,5,0,1,-1,0,0,0,3.496508
3,3,0,2,1506,0,0,0,5,0,1,-1,0,0,0,3.850148
4,4,1,2,1,1,0,0,5,0,1,-1,0,0,0,3.496508


In [None]:
# Partition the rows by target label; y == 0 is treated as the majority class
# and y == 1 as the minority class.
majority_class = c15_df.loc[c15_df["y"] == 0]
minority_class = c15_df.loc[c15_df["y"] == 1]

# Every minority row is kept as-is.
sampled_minority = minority_class

# Draw, without replacement, as many majority rows as there are minority rows.
sampled_majority = resample(
    majority_class,
    replace=False,
    n_samples=len(minority_class),
    random_state=42,
)

# Stack both halves and shuffle the row order with a fixed seed.
# (The balanced frame could be persisted with balanced_data.to_csv if needed.)
balanced_data = pd.concat([sampled_majority, sampled_minority]).sample(frac=1, random_state=42)

In [None]:
# Build the feature matrix: drop the target and, when present, "Unnamed: 0".
# "Unnamed: 0" holds 0, 1, 2, ... in the preview, i.e. it looks like the row
# index that was serialized into the CSV rather than a client attribute, so it
# must not be handed to the model as a predictor.
features = balanced_data.drop(columns="y").drop(columns="Unnamed: 0", errors="ignore")
target = balanced_data["y"]

# Hold out 20% of the balanced rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Carve a validation split (20% of the remaining training rows) out of the training set.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Random forest classifier: 200 fully grown trees (no depth cap), fixed seed.
rfc = RandomForestClassifier(n_estimators=200, max_depth=None, random_state=42, min_samples_leaf=1)

# Train the model on the reduced training split.
rfc.fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = rfc.predict(X_test)

In [None]:
# Score the test-set predictions with the four classification metrics.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
# Report the metrics rounded to two decimals, one per line.
report_lines = [
    "Evaluation metrics:",
    f"Accuracy: {accuracy:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1-score: {f1:.2f}",
]
print("\n".join(report_lines))

Evaluation metrics:
Accuracy: 0.72
Precision: 0.76
Recall: 0.66
F1-score: 0.70


In [None]:
# Score the same fitted model on the validation split. Note that this
# overwrites the accuracy/precision/recall/f1 variables from the test-set cell.
y_val_pred = rfc.predict(X_val)
accuracy, precision, recall, f1 = (
    metric(y_val, y_val_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the validation metrics rounded to two decimals, one per line.
summary_lines = [
    "Validation set performance:",
    "Accuracy: {:.2f}".format(accuracy),
    "Precision: {:.2f}".format(precision),
    "Recall: {:.2f}".format(recall),
    "F1-score: {:.2f}".format(f1),
]
print("\n".join(summary_lines))

Validation set performance:
Accuracy: 0.73
Precision: 0.76
Recall: 0.70
F1-score: 0.73


# Conclusions


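When we drop the "default" column, the model performs about the same (slightly better on the full dataset: F1 0.87 vs. 0.86; marginally worse on the small one: accuracy 0.78 vs. 0.79). When we drop the "log_duration" column, the model performs noticeably worse (test accuracy falls from 0.86 to 0.72 on the full dataset and from 0.79 to 0.62 on the small one).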
