# Imports

In [21]:
import numpy as np 
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.utils import resample
import pickle
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


from sklearn.utils import class_weight
from xgboost import XGBClassifier

# Data

In [22]:
data = pd.read_csv('/content/bank-full-clean16.csv', sep=';')

In [23]:
# Separate the majority and minority classes
majority_class = data[data.y == 0]
minority_class = data[data.y == 1]

# Keep all samples from the minority class
sampled_minority = minority_class

# Choose a random sample of the same size from the majority class
sampled_majority = resample(majority_class, n_samples=len(minority_class), replace=False, random_state=42)

# Combine the balanced samples from both classes
balanced_data = pd.concat([sampled_majority, sampled_minority])

# Shuffle the rows of the balanced dataset
balanced_data = balanced_data.sample(frac=1, random_state=42)

# Save the balanced dataset to a new file
#balanced_data.to_csv('balanced_dataset.csv', index=False)

In [24]:
X = balanced_data.drop("y", axis=1)
y = balanced_data["y"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000, penalty='l2'))

# Train a logistic regression model
pipe.fit(X_train, y_train)

# Use the trained model to make predictions on the test set
y_pred = pipe.predict(X_test)

# Evaluate the performance of the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Evaluation metrics:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")

Evaluation metrics:
Accuracy: 0.82
Precision: 0.82
Recall: 0.84
F1-score: 0.83


In [25]:
results = pd.DataFrame([["Logistic Regression",accuracy,precision,recall, f1]],
                       columns = ['Model','Accuracy','Precision','Recall','f1 score'])
results 

Unnamed: 0,Model,Accuracy,Precision,Recall,f1 score
0,Logistic Regression,0.823293,0.816327,0.837488,0.826772


# XGBoost

In [26]:
model = XGBClassifier(random_state=42, max_depth=10, learning_rate=0.01, n_estimators=100)
model.fit(X_train, y_train)
y_pred = model.predict (X_test)

In [27]:
print("Evaluation metrics:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")

Evaluation metrics:
Accuracy: 0.82
Precision: 0.82
Recall: 0.84
F1-score: 0.83


In [28]:
model_result = pd.DataFrame([["XGBoost",accuracy,precision,recall, f1]],
                       columns = ['Model','Accuracy','Precision','Recall','f1 score'])
results = results.append(model_result, ignore_index=True)
results 

  results = results.append(model_result, ignore_index=True)


Unnamed: 0,Model,Accuracy,Precision,Recall,f1 score
0,Logistic Regression,0.823293,0.816327,0.837488,0.826772
1,XGBoost,0.823293,0.816327,0.837488,0.826772


# Conclusion

In [29]:
# create data for the table
result = {
    'Model': ['Logistic Regression', 'XGBoost', 'Random Forest', 'SVMs'],
    'Accuracy': [0.823293, 0.823293, 0.85, 0.83],
    'Precision': [0.816327, 0.816327, 0.84, 0.82],
    'Recall': [0.837488, 0.837488, 0.88, 0.86],
    'F1-score': [0.826772, 0.826772, 0.86, 0.84]
}

# create the pandas DataFrame
df = pd.DataFrame(result)
df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-score
0,Logistic Regression,0.823293,0.816327,0.837488,0.826772
1,XGBoost,0.823293,0.816327,0.837488,0.826772
2,Random Forest,0.85,0.84,0.88,0.86
3,SVMs,0.83,0.82,0.86,0.84
