In [232]:
import pandas as pd
import numpy as np
import researchpy as rp
import matplotlib.pyplot as plt

In [233]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold


In [234]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [235]:
from sklearn.linear_model import LogisticRegression

In [236]:
dataset = pd.read_csv('bank-additional-full.csv', delimiter=';', na_values=['unknown', 'NaN']).dropna()

In [211]:
# changed to g1, g2, g3 instead of teen, adult, elder
# to increase the number of younster for more accurate
# prediction

In [237]:
age_group = pd.cut(dataset['age'], bins=[0, 30, 55, 100], labels=['g1', 'g2', 'g3'])
dataset.insert(5, 'age_group', age_group)

In [238]:
dataset['job']= dataset['job'].map({
    'admin.': 1, 
    'blue-collar': 2, 
    'technician': 3, 
    'services': 4, 
    'management': 5, 
    'retired': 6, 
    'self-employed': 7, 
    'entrepreneur': 8, 
    'unemployed': 9,
    'housemaid': 10,
    'student': 11
    }).astype({'job': 'category'})

dataset['education']= dataset['education'].map({
    'university.degree': 1, 
    'high.school': 2, 
    'professional.course': 3, 
    'basic.9y': 4, 
    'basic.4y': 5, 
    'basic.6y': 6, 
    'illiterate': 7
    }).astype({'education': 'category'})

dataset['marital'] = dataset['marital'].map({
    'single': 1, 
    'married': 2, 
    'divorced': 3
    }).astype({'marital': 'category'})

dataset['age_group'] = dataset['age_group'].map({
    'g1': 1, 'g2': 2, 'g3': 3
    }).astype({'age_group': 'category'})

dataset['contact'] = dataset['contact'].map({
    'cellular': 1, 'telephone': 2
    }).astype({'contact': 'category'})

dataset['month'] = dataset['month'].map({
    'jan': 1, 
    'feb': 2, 
    'mar': 3, 
    'apr': 4, 
    'may': 5, 
    'jun': 6, 
    'jul': 7, 
    'aug': 8, 
    'sep': 9, 
    'oct': 10, 
    'nov': 11, 
    'dec': 12}).astype({'month': 'category'})

dataset['poutcome'] = dataset['poutcome'].map({
    'failure': 1, 'nonexistent': 2, 'success': 3
    }).astype({'poutcome': 'category'})

In [239]:
dataset['job']

0        10
2         4
3         1
4         4
6         1
         ..
41183     6
41184     2
41185     6
41186     3
41187     6
Name: job, Length: 30488, dtype: category
Categories (11, int64): [1, 2, 3, 4, ..., 8, 9, 10, 11]

In [240]:
dataset['housing'] = dataset['housing'].map({'yes': 1, 'no': 0})
dataset['default'] = dataset['default'].map({'yes': 1, 'no': 0})
dataset['loan'] = dataset['loan'].map({'yes': 1, 'no': 0})
dataset['y'] = dataset['y'].map({'no': 0, 'yes': 1}).astype({'y': 'category'})

In [241]:
dataset = dataset.drop(['age', 'day_of_week', 'duration', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'nr.employed'], axis=1)
yy = dataset['y']
yy = yy.map({'no': 0, 'yes': 1}).astype({'y': 'category'})

In [242]:
dataset.head()

Unnamed: 0,job,marital,education,default,age_group,housing,loan,contact,month,campaign,pdays,previous,poutcome,euribor3m,y
0,10,2,5,0,3,0,0,2,5,1,999,0,2,4.857,0
2,4,2,2,0,2,1,0,2,5,1,999,0,2,4.857,0
3,1,2,6,0,2,0,0,2,5,1,999,0,2,4.857,0
4,4,2,2,0,3,0,1,2,5,1,999,0,2,4.857,0
6,1,2,3,0,3,0,0,2,5,1,999,0,2,4.857,0


In [243]:
dataset.isnull().any()

job          False
marital      False
education    False
default      False
age_group    False
housing      False
loan         False
contact      False
month        False
campaign     False
pdays        False
previous     False
poutcome     False
euribor3m    False
y            False
dtype: bool

In [244]:
X = dataset.loc[:, dataset.columns != 'y']
y = dataset.loc[:, dataset.columns == 'y']
from imblearn.over_sampling import SMOTE
os = SMOTE(random_state=None)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None)

columns = X_train.columns
os_data_X,os_data_y=os.fit_sample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X,columns=columns )
os_data_y= pd.DataFrame(data=os_data_y,columns=['y'])
# we can Check the numbers of our data
print("length of oversampled data is ",len(os_data_X))
print("Number of no subscription in oversampled data",len(os_data_y[os_data_y['y']==0]))
print("Number of subscription",len(os_data_y[os_data_y['y']==1]))
print("Proportion of no subscription data in oversampled data is ",len(os_data_y[os_data_y['y']==0])/len(os_data_X))
print("Proportion of subscription data in oversampled data is ",len(os_data_y[os_data_y['y']==1])/len(os_data_X))

length of oversampled data is  42546
Number of no subscription in oversampled data 21273
Number of subscription 21273
Proportion of no subscription data in oversampled data is  0.5
Proportion of subscription data in oversampled data is  0.5


In [248]:
os_data_X.isnull().sum()

job          3902
marital      5214
education    5369
default         0
age_group    4304
housing         0
loan            0
contact      1625
month        5716
campaign        0
pdays           0
previous        0
poutcome     1050
euribor3m       0
dtype: int64

In [206]:
X_train, X_test, y_train, y_test = train_test_split(os_data_X, os_data_y, test_size=0.2, random_state=None)

In [207]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34124 entries, 22463 to 24761
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   job        31004 non-null  category
 1   marital    30132 non-null  category
 2   education  29700 non-null  category
 3   default    34124 non-null  int64   
 4   age_group  30676 non-null  category
 5   housing    34124 non-null  int64   
 6   loan       34124 non-null  int64   
 7   contact    32877 non-null  category
 8   month      29543 non-null  category
 9   campaign   34124 non-null  int64   
 10  pdays      34124 non-null  int64   
 11  previous   34124 non-null  int64   
 12  poutcome   33330 non-null  category
 13  euribor3m  34124 non-null  float64 
dtypes: category(7), float64(1), int64(6)
memory usage: 2.3 MB


In [247]:
X_train['job']

21983    1
38908    1
16292    2
27929    1
17108    3
        ..
25692    1
26506    5
21050    7
27670    7
8891     2
Name: job, Length: 24390, dtype: category
Categories (11, int64): [1, 2, 3, 4, ..., 8, 9, 10, 11]

In [209]:
log_reg = LogisticRegression()
log_reg.fit(X_train,y_train)

#log_reg.score(X_train,y_train)
#y_pred = log_reg.predict(X_test)
#print(classification_report(y_test, y_pred))

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
result = cross_val_score(log_reg, x_train, y_train, cv=5, scoring='precision')
result.mean()

In [None]:
result = cross_val_score(log_reg, x_train, y_train, cv=5, scoring='accuracy')
result.mean()

In [None]:
tn, fp, fn, tp = confusion_matrix(y_test, y_predict).ravel()
print(f'True negative: {tn}')
print(f'True positive: {tp}')
print(f'False negative: {fn}')
print(f'False positive: {fp}')

In [None]:
precision = precision_score(y_test, y_predict)
recall = recall_score(y_test, y_predict)
f1 = f1_score(y_test, y_predict)
accuracy = accuracy_score(y_test, y_predict)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 score: {f1}')
print(f'Accuracy: {accuracy}')