In [1]:
import pandas as pd
import numpy as np
import researchpy as rp
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold


In [3]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [4]:
from sklearn.linear_model import LogisticRegression

In [5]:
dataset = pd.read_csv('src/bank-additional-full.csv', delimiter=';', na_values=['unknown', 'NaN']).dropna()

In [6]:
# changed to g1, g2, g3 instead of teen, adult, elder
# to increase the number of younster for more accurate
# prediction

In [7]:
age_group = pd.cut(dataset['age'], bins=[0, 30, 55, 100], labels=['g1', 'g2', 'g3'])
dataset.insert(5, 'age_group', age_group)

In [8]:
dataset['job']= dataset['job'].map({
    'admin.': 1, 
    'blue-collar': 2, 
    'technician': 3, 
    'services': 4, 
    'management': 5, 
    'retired': 6, 
    'self-employed': 7, 
    'entrepreneur': 8, 
    'unemployed': 9,
    'housemaid': 10,
    'student': 11
    }).astype({'job': 'category'})

dataset['education']= dataset['education'].map({
    'university.degree': 1, 
    'high.school': 2, 
    'professional.course': 3, 
    'basic.9y': 4, 
    'basic.4y': 5, 
    'basic.6y': 6, 
    'illiterate': 7
    }).astype({'education': 'category'})

dataset['marital'] = dataset['marital'].map({
    'single': 1, 
    'married': 2, 
    'divorced': 3
    }).astype({'marital': 'category'})

dataset['age_group'] = dataset['age_group'].map({
    'g1': 1, 'g2': 2, 'g3': 3
    }).astype({'age_group': 'category'})

dataset['contact'] = dataset['contact'].map({
    'cellular': 1, 'telephone': 2
    }).astype({'contact': 'category'})

dataset['month'] = dataset['month'].map({
    'jan': 1, 
    'feb': 2, 
    'mar': 3, 
    'apr': 4, 
    'may': 5, 
    'jun': 6, 
    'jul': 7, 
    'aug': 8, 
    'sep': 9, 
    'oct': 10, 
    'nov': 11, 
    'dec': 12}).astype({'month': 'category'})

dataset['poutcome'] = dataset['poutcome'].map({
    'failure': 1, 'nonexistent': 2, 'success': 3
    }).astype({'poutcome': 'category'})

In [9]:
dataset['housing'] = dataset['housing'].map({'yes': 1, 'no': 0})
dataset['default'] = dataset['default'].map({'yes': 1, 'no': 0})
dataset['loan'] = dataset['loan'].map({'yes': 1, 'no': 0})

In [10]:
x = dataset.drop(['age', 'day_of_week', 'duration', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'nr.employed', 'y'], axis=1)
y = dataset['y']
y = y.map({'no': 0, 'yes': 1}).astype({'y': 'category'})

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=35)

In [12]:
log_reg = LogisticRegression()
log_reg.fit(x_train, y_train)
y_predict = log_reg.predict(x_test)
print(classification_report(y_predict,y_test))

precision    recall  f1-score   support

           0       0.99      0.90      0.94      5880
           1       0.21      0.72      0.32       218

    accuracy                           0.89      6098
   macro avg       0.60      0.81      0.63      6098
weighted avg       0.96      0.89      0.92      6098



In [13]:
result = cross_val_score(log_reg, x_train, y_train, cv=5, scoring='precision')
result.mean()

0.6500838252147652

In [16]:
result = cross_val_score(log_reg, x_train, y_train, cv=5, scoring='accuracy')
result.mean()

0.8848298482984831

In [14]:
tn, fp, fn, tp = confusion_matrix(y_test, y_predict).ravel()
print(f'True negative: {tn}')
print(f'True positive: {tp}')
print(f'False negative: {fn}')
print(f'False positive: {fp}')

True negative: 5289
True positive: 156
False negative: 591
False positive: 62


In [15]:
precision = precision_score(y_test, y_predict)
recall = recall_score(y_test, y_predict)
f1 = f1_score(y_test, y_predict)
accuracy = accuracy_score(y_test, y_predict)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 score: {f1}')
print(f'Accuracy: {accuracy}')

Precision: 0.7155963302752294
Recall: 0.20883534136546184
F1 score: 0.32331606217616576
Accuracy: 0.892915710068875
