In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
np.random.seed(42)

In [20]:

df = pd.read_csv('bank-full.csv', sep=';')
print(df.head())

   age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   
3   47   blue-collar  married    unknown      no     1506     yes   no   
4   33       unknown   single    unknown      no        1      no   no   

   contact  day month  duration  campaign  pdays  previous poutcome   y  
0  unknown    5   may       261         1     -1         0  unknown  no  
1  unknown    5   may       151         1     -1         0  unknown  no  
2  unknown    5   may        76         1     -1         0  unknown  no  
3  unknown    5   may        92         1     -1         0  unknown  no  
4  unknown    5   may       198         1     -1         0  unknown  no  


In [21]:
missing_values = df.isnull().sum()
print('Missing values: \n', missing_values)

Missing values: 
 age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


In [22]:
colomns_to_use = ['age', 'job',  'education', 'balance', 'housing', 'day', 'month', 'duration', 'pdays', 'previous', 'poutcome', 'y']
df = df[colomns_to_use]
print(df.head())

   age           job  education  balance housing  day month  duration  pdays  \
0   58    management   tertiary     2143     yes    5   may       261     -1   
1   44    technician  secondary       29     yes    5   may       151     -1   
2   33  entrepreneur  secondary        2     yes    5   may        76     -1   
3   47   blue-collar    unknown     1506     yes    5   may        92     -1   
4   33       unknown    unknown        1      no    5   may       198     -1   

   previous poutcome   y  
0         0  unknown  no  
1         0  unknown  no  
2         0  unknown  no  
3         0  unknown  no  
4         0  unknown  no  


In [23]:
df['y'] = df['y'].map({'yes': 1, 'no': 0})
print(df['y'].value_counts())

y
0    39922
1     5289
Name: count, dtype: int64


In [24]:
train_data, temp_data = train_test_split(df, test_size=0.4, random_state=42)
valid_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

print(f'Train data: {train_data.shape}')
print(f'Validation data: {valid_data.shape}')
print(f'Test data: {test_data.shape}')

Train data: (27126, 12)
Validation data: (9042, 12)
Test data: (9043, 12)


In [25]:
categorical_colomns = ['job', 'education', 'housing', 'poutcome']
X_train = pd.get_dummies(train_data.drop('y', axis = 1), columns=categorical_colomns)
y_train = train_data['y']

X_valid = pd.get_dummies(valid_data.drop('y', axis = 1), columns=categorical_colomns)
y_valid = valid_data['y']

print(X_train.head())

       age  balance  day month  duration  pdays  previous  job_admin.  \
6377    45     -100   27   may       240     -1         0       False   
17236   29      166   28   jul       108     -1         0       False   
4490    31      121   20   may       187     -1         0        True   
24231   40     1693   17   nov       353     -1         0       False   
3978    28      317   16   may        21     -1         0       False   

       job_blue-collar  job_entrepreneur  ...  education_primary  \
6377             False              True  ...               True   
17236            False             False  ...              False   
4490             False             False  ...              False   
24231            False             False  ...              False   
3978             False             False  ...              False   

       education_secondary  education_tertiary  education_unknown  housing_no  \
6377                 False               False              False      

In [31]:
cat_cols = X.select_dtypes(include=['object']).columns
encoder = OneHotEncoder(sparse_output=False, drop='first')
X_encoded = encoder.fit_transform(X[cat_cols])
X_numeric = df[['job', 'education', 'housing', 'poutcome']]
print(X_numeric)
scaler = StandardScaler()
"""X_numeric_scaler = scaler.fit_transform(X_numeric)"""
X_final_scaled = np.hstack((X_numeric_scaler, X_encoded))

X_train, X_test, y_train, y_test = train_test_split(X_final_scaled, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=5000)
model.fit(X_train, y_train)

y_predict = model.predict(X_test)
accur = accuracy_score(y_test, y_predict)
print(f'Value of accuracy: {accur:.3f}')

                job  education housing poutcome
0        management   tertiary     yes  unknown
1        technician  secondary     yes  unknown
2      entrepreneur  secondary     yes  unknown
3       blue-collar    unknown     yes  unknown
4           unknown    unknown      no  unknown
...             ...        ...     ...      ...
45206    technician   tertiary      no  unknown
45207       retired    primary      no  unknown
45208       retired  secondary      no  success
45209   blue-collar  secondary      no  unknown
45210  entrepreneur  secondary      no    other

[45211 rows x 4 columns]


NameError: name 'X_numeric_scaler' is not defined

In [None]:
X = df[colomns_to_use]
y = df['y']
cat_cols = X.select_dtypes(include=['object']).columns
encoder = OneHotEncoder(sparse_output=False, drop='first')
X_encoded = encoder.fit_transform(X[cat_cols])
feature_names = encoder.get_feature_names_out(cat_cols)

X_train_df = pd.DataFrame(X_encoded, columns=feature_names)
numeric_cols = X.select_dtypes(include=['number']).columns
X_train_df = pd.concat([X_train_df,X[numeric_cols].reset_index(drop=True)], axis = 1)

X_train, X_test, y_train, y_test = train_test_split(X_train_df, y, test_size=0.2, random_state=42)

mutual_info = mutual_info_classif(X_train, y_train)
mutual_info_df = pd.DataFrame(mutual_info, index = X_train.columns, columns=['Mutual info']).sort_values(by = 'Mutual info', ascending = False)

print(mutual_info_df.head())

                  Mutual info
y                    0.359282
duration             0.070835
pdays                0.026445
poutcome_success     0.026112
poutcome_unknown     0.021006
