In [2]:
# This notebook loads the Bank Marketing dataset from the UC Irvine Machine Learning Repository.
# See https://archive.ics.uci.edu/dataset/222/bank+marketing
# Run 'pip install ucimlrepo' from the command line first to install the package that provides the dataset.

import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo 
from sklearn.linear_model import LogisticRegression

# Download the Bank Marketing dataset (id 222 in the UCI repository)
bank_marketing = fetch_ucirepo(id=222)

# Features and targets, both provided as pandas DataFrames
data_x, data_y = bank_marketing.data.features, bank_marketing.data.targets



In [14]:
# Encode the target as a 0/1 integer variable ('no' -> 0, 'yes' -> 1).
# Series.map is used rather than Series.replace: replacing strings with ints on an
# object column triggers pandas' "silent downcasting" FutureWarning (pandas 2.2+),
# while map builds the new values directly and needs no option to be set.
# Any label other than 'no'/'yes' maps to NaN, which makes the integer cast fail
# loudly instead of slipping through.
y = data_y['y'].map({'no': 0, 'yes': 1}).astype('int')

In [5]:
# One-hot encode the categorical features listed below; all other columns are carried over unchanged
data_encoded = pd.get_dummies(
    data_x,
    columns=[
        'job', 'marital', 'education', 'default',
        'housing', 'loan', 'month',
    ],
)

In [6]:
# Print the column labels of the encoded frame to see which dummy columns were created
print(data_encoded.columns)

Index(['age', 'balance', 'contact', 'day_of_week', 'duration', 'campaign',
       'pdays', 'previous', 'poutcome', 'job_admin.', 'job_blue-collar',
       'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired',
       'job_self-employed', 'job_services', 'job_student', 'job_technician',
       'job_unemployed', 'marital_divorced', 'marital_married',
       'marital_single', 'education_primary', 'education_secondary',
       'education_tertiary', 'default_no', 'default_yes', 'housing_no',
       'housing_yes', 'loan_no', 'loan_yes', 'month_apr', 'month_aug',
       'month_dec', 'month_feb', 'month_jan', 'month_jul', 'month_jun',
       'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_sep'],
      dtype='object')


In [24]:
# Display the encoded frame as the cell output for a visual check
data_encoded

Unnamed: 0,age,balance,contact,day_of_week,duration,campaign,pdays,previous,poutcome,job_admin.,...,month_dec,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep
0,58,2143,,5,261,1,-1,0,,0,...,0,0,0,0,0,0,1,0,0,0
1,44,29,,5,151,1,-1,0,,0,...,0,0,0,0,0,0,1,0,0,0
2,33,2,,5,76,1,-1,0,,0,...,0,0,0,0,0,0,1,0,0,0
3,47,1506,,5,92,1,-1,0,,0,...,0,0,0,0,0,0,1,0,0,0
4,33,1,,5,198,1,-1,0,,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,825,cellular,17,977,3,-1,0,,0,...,0,0,0,0,0,0,0,1,0,0
45207,71,1729,cellular,17,456,2,-1,0,,0,...,0,0,0,0,0,0,0,1,0,0
45208,72,5715,cellular,17,1127,5,184,3,success,0,...,0,0,0,0,0,0,0,1,0,0
45209,57,668,telephone,17,508,4,-1,0,,0,...,0,0,0,0,0,0,0,1,0,0


In [36]:
# Feature matrix for the model: the numeric columns plus every dummy column,
# listed explicitly and grouped by the variable they came from
X = data_encoded.loc[:, [
    # numeric columns
    'age', 'balance', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous',
    # job
    'job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
    'job_management', 'job_retired', 'job_self-employed', 'job_services',
    'job_student', 'job_technician', 'job_unemployed',
    # marital
    'marital_divorced', 'marital_married', 'marital_single',
    # education
    'education_primary', 'education_secondary', 'education_tertiary',
    # default / housing / loan
    'default_no', 'default_yes',
    'housing_no', 'housing_yes',
    'loan_no', 'loan_yes',
    # month
    'month_apr', 'month_aug', 'month_dec', 'month_feb', 'month_jan', 'month_jul',
    'month_jun', 'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_sep',
]]

In [37]:
# Fit a logistic regression of y on X, with the solver's iteration cap set to 4000
model = LogisticRegression(max_iter=4000)
model.fit(X, y.to_numpy().ravel())

LogisticRegression(max_iter=4000)

In [38]:
# Collect the fitted coefficients into a table, one row per feature plus the intercept.
# scikit-learn does not report p-values; the statsmodels package does if they are needed.
coefficients = model.coef_[0]
intercept = model.intercept_[0]
coef_df = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': coefficients
})
# Append the intercept as the last row. ignore_index=True renumbers the rows so the
# intercept does not reuse index label 0, which already belongs to the first feature.
coef_df = pd.concat(
    [coef_df, pd.DataFrame({'Feature': ['Intercept'], 'Coefficient': [intercept]})],
    ignore_index=True,
)
coef_df

Unnamed: 0,Feature,Coefficient
0,age,-0.004554
1,balance,1.5e-05
2,day_of_week,-0.00191
3,duration,0.003976
4,campaign,-0.120606
5,pdays,0.002368
6,previous,0.055296
7,job_admin.,0.063411
8,job_blue-collar,-0.317848
9,job_entrepreneur,-0.308275


In [39]:
# Predicted probability for each row of X, taken from column 1 of predict_proba
# (presumably the probability of label 1, i.e. 'yes' — confirm against model.classes_)
y_pred = model.predict_proba(X)[:,1]

In [40]:
# Compare the predicted probabilities between the two actual outcome groups
print(y_pred)

# Split the predictions by the true label
prob_true_actuals = y_pred[y == 1]
prob_false_actuals = y_pred[y == 0]

# Mean predicted probability within each group (label = 1, then label = 0)
average_prob_true = prob_true_actuals.mean()
average_prob_false = prob_false_actuals.mean()

print(f"Average predicted probability for true actuals (label = 1): {average_prob_true:.4f}")
print(f"Average predicted probability for false actuals (label = 0): {average_prob_false:.4f}")

[0.04636896 0.02871936 0.00873816 ... 0.87221124 0.13722156 0.27072071]
Average predicted probability for true actuals (label = 1): 0.3458
Average predicted probability for false actuals (label = 0): 0.0876
