In [1]:
from io import StringIO

import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.api as sm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import (confusion_matrix, accuracy_score)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the data set from the Excel workbook 'insurance.xlsx'; the relative
# path is resolved against the notebook's working directory.
insurance_df = pd.read_excel('insurance.xlsx')

In [3]:
# Drop the 'Seq nr' column.
# errors='ignore' keeps this cell idempotent: the result is assigned back to
# insurance_df, so re-running the cell would otherwise raise a KeyError once
# the column is already gone.
insurance_df = insurance_df.drop(columns=['Seq nr'], errors='ignore')

In [4]:
# Display the frame for a visual check (the rendered output is truncated to
# the first and last rows).
insurance_df

Unnamed: 0,Group,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,Training,30,management,married,tertiary,no,1119,no,no,cellular,6,aug,199,7,-1,0,unknown,no
1,Training,54,management,married,tertiary,no,-1415,yes,yes,cellular,17,nov,135,1,-1,0,unknown,no
2,Training,46,admin.,single,unknown,yes,0,no,no,unknown,23,may,378,2,-1,0,unknown,no
3,Training,33,management,married,tertiary,no,2213,no,no,cellular,18,feb,240,1,385,9,failure,no
4,Training,52,admin.,married,secondary,no,484,yes,no,unknown,6,may,128,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,Test,56,self-employed,married,tertiary,yes,0,no,no,cellular,19,nov,122,1,-1,0,unknown,no
39996,Test,27,blue-collar,single,secondary,no,87,no,no,telephone,10,jul,245,1,-1,0,unknown,no
39997,Test,30,retired,single,secondary,no,48,no,no,cellular,18,may,142,2,300,1,other,no
39998,Test,36,blue-collar,married,secondary,no,1797,yes,yes,cellular,17,nov,166,1,181,2,failure,no


In [5]:
# Count the rows per class of the target column 'y' and show the result as a
# plain dict.
insurance_df['y'].value_counts().to_dict()

{'no': 35330, 'yes': 4670}

In [6]:
# One-hot encode the categorical predictors. The target 'y' and the
# train/test marker 'Group' are left out of the encoding; drop_first=True
# omits the first level of every categorical column.
# dtype=int pins the dummy columns to 0/1 integers: pandas >= 2.0 returns bool
# dummies by default, and mixing them with the integer predictors yields an
# object-dtype design matrix that statsmodels refuses to fit.
X = insurance_df.drop(['y', 'Group'], axis=1)
X_encoded = pd.get_dummies(X, drop_first=True, dtype=int)
# Re-attach the target and the train/test marker so the frame can be split by
# 'Group' afterwards. insurance_df_encoded is the same object as X_encoded,
# so both names see the two added columns.
insurance_df_encoded = X_encoded
insurance_df_encoded['y'] = insurance_df['y']
insurance_df_encoded['Group'] = insurance_df['Group']

In [7]:
# Partition the encoded frame by its 'Group' marker; the marker column itself
# is discarded from both partitions.
training_rows = insurance_df_encoded['Group'] == 'Training'
test_rows = insurance_df_encoded['Group'] == 'Test'
train_df = insurance_df_encoded[training_rows].drop(columns=['Group'])
test_df = insurance_df_encoded[test_rows].drop(columns=['Group'])

# Predictors are every remaining column except the target 'y'.
X_train = train_df.drop(columns=['y'])
X_test = test_df.drop(columns=['y'])
y_train = train_df['y']
y_test = test_df['y']

# 0/1 copies of the targets as plain lists: 'yes' -> 1, anything else -> 0.
y_train_encoded = [int(label == 'yes') for label in y_train]
y_test_encoded = [int(label == 'yes') for label in y_test]

In [27]:
# Count the rows per class of the target 'y' within the test partition only,
# shown as a plain dict.
test_df['y'].value_counts().to_dict()

{'no': 3540, 'yes': 460}

# Linear prediction model

In [29]:
# Add an intercept column to the training predictors, then fit ordinary
# least squares with the 0/1-encoded target as the response.
X_train_with_constant = sm.add_constant(X_train)
mls_reg = sm.OLS(endog=y_train_encoded, exog=X_train_with_constant)
mls_res = mls_reg.fit()

In [30]:
# Print the statsmodels summary of the OLS fit (fit statistics followed by
# the coefficient table).
print(mls_res.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.305
Model:                            OLS   Adj. R-squared:                  0.304
Method:                 Least Squares   F-statistic:                     375.3
Date:                Fri, 10 May 2024   Prob (F-statistic):               0.00
Time:                        11:38:13   Log-Likelihood:                -3670.5
No. Observations:               36000   AIC:                             7427.
Df Residuals:                   35957   BIC:                             7792.
Df Model:                          42                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   0.0715    

In [31]:
# Convert the coefficient table of the OLS summary into a pandas DataFrame.
# tables[1] of the summary is rendered to HTML and parsed back with
# read_html, taking the first row as header and the first column (the term
# names) as index.
# The HTML is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated since pandas 2.1.
# Note: the values are parsed from the rendered summary text, so they carry
# the rounding of that table.
mls_results_as_html = mls_res.summary().tables[1].as_html()
mls_results_as_df = pd.read_html(StringIO(mls_results_as_html), header=0, index_col=0)[0]
mls_results_as_df

Unnamed: 0,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0715,0.016,4.552,0.0,0.041,0.102
age,0.0002,0.0,0.867,0.386,-0.0,0.0
balance,8.901e-07,4.73e-07,1.881,0.06,-3.72e-08,2e-06
day,0.0009,0.0,4.408,0.0,0.0,0.001
duration,0.0005,5.54e-06,85.198,0.0,0.0,0.0
campaign,-0.0015,0.0,-3.057,0.002,-0.002,-0.001
pdays,-4.931e-05,3.05e-05,-1.616,0.106,-0.0,1e-05
previous,0.0007,0.001,0.985,0.325,-0.001,0.002
job_blue-collar,-0.0195,0.005,-3.581,0.0,-0.03,-0.009
job_entrepreneur,-0.0238,0.009,-2.622,0.009,-0.042,-0.006


In [32]:
# Keep only the coefficient rows of the OLS table whose p-value is above 0.05.
insignificant_features_mls = mls_results_as_df.loc[lambda table: table['P>|t|'] > 0.05]
insignificant_features_mls

Unnamed: 0,coef,std err,t,P>|t|,[0.025,0.975]
age,0.0002,0.0,0.867,0.386,-0.0,0.0
balance,8.901e-07,4.73e-07,1.881,0.06,-3.72e-08,2e-06
pdays,-4.931e-05,3.05e-05,-1.616,0.106,-0.0,1e-05
previous,0.0007,0.001,0.985,0.325,-0.001,0.002
job_management,-0.0093,0.006,-1.542,0.123,-0.021,0.003
job_services,-0.0114,0.006,-1.808,0.071,-0.024,0.001
job_unemployed,-0.0097,0.009,-1.032,0.302,-0.028,0.009
job_unknown,-0.0167,0.019,-0.882,0.378,-0.054,0.02
marital_single,0.0063,0.005,1.188,0.235,-0.004,0.017
education_secondary,0.0052,0.005,1.14,0.254,-0.004,0.014


In [33]:
# get latex syntax for implementation of table
#print(mls_res.summary().as_latex())

In [34]:
# Classification view of the OLS model: score the test rows and count a
# fitted value of 0.5 or more as a predicted 1.
X_test_with_constant = sm.add_constant(X_test)
y_pred = mls_res.predict(X_test_with_constant)
y_pred_encoded = [int(score >= 0.5) for score in y_pred]

# Unpack the 2x2 confusion matrix row by row.
(tn, fp), (fn, tp) = confusion_matrix(y_test_encoded, y_pred_encoded)

specificity_mls = tn / (tn + fp)
sensitivity_ml = tp / (tp + fn)
accuracy_ml = (tp + tn) / (tp + tn + fp + fn)

print(f'specificity: {specificity_mls}')
print(f'sensitivity: {sensitivity_ml}')
print(f'accuracy: {accuracy_ml}')

specificity: 0.976271186440678
sensitivity: 0.3065217391304348
accuracy: 0.89925


# Linear discriminant model (LDA)

In [14]:
# Create a linear discriminant classifier with default settings and fit it on
# the training predictors against the original 'yes'/'no' labels.
lda_clf = LinearDiscriminantAnalysis()
lda_clf.fit(X=X_train, y=y_train)

In [15]:
# Predict the test labels with the fitted LDA classifier and derive the
# metrics from the 2x2 confusion matrix, unpacked row by row.
y_pred = lda_clf.predict(X_test)
(tn, fp), (fn, tp) = confusion_matrix(y_test, y_pred)

specificity_lda = tn / (tn + fp)
sensitivity_lda = tp / (tp + fn)
accuracy_lda = (tp + tn) / (tp + tn + fp + fn)

print(f'specificity: {specificity_lda}')
print(f'sensitivity: {sensitivity_lda}')
print(f'accuracy: {accuracy_lda}')

specificity: 0.9533898305084746
sensitivity: 0.4369565217391304
accuracy: 0.894


# Logistic regression model

In [35]:
# Fit a binomial-family GLM on the 0/1-encoded target, using the training
# predictors with the intercept column already added.
binomial_family = sm.families.Binomial()
glm = sm.GLM(endog=y_train_encoded, exog=X_train_with_constant, family=binomial_family)
glm_res = glm.fit()

In [36]:
# Print the statsmodels summary of the binomial GLM fit (fit statistics
# followed by the coefficient table).
print(glm_res.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                      y   No. Observations:                36000
Model:                            GLM   Df Residuals:                    35957
Model Family:                Binomial   Df Model:                           42
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -8560.2
Date:                Fri, 10 May 2024   Deviance:                       17120.
Time:                        11:38:37   Pearson chi2:                 1.56e+07
No. Iterations:                     7   Pseudo R-squ. (CS):             0.2181
Covariance Type:            nonrobust                                         
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                  -2.4892    

In [37]:
# Convert the coefficient table of the GLM summary into a pandas DataFrame.
# tables[1] of the summary is rendered to HTML and parsed back with
# read_html, taking the first row as header and the first column (the term
# names) as index.
# The HTML is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated since pandas 2.1.
# Note: the values are parsed from the rendered summary text, so they carry
# the rounding of that table.
glm_res_as_html = glm_res.summary().tables[1].as_html()
glm_res_as_df = pd.read_html(StringIO(glm_res_as_html), header=0, index_col=0)[0]
glm_res_as_df

Unnamed: 0,coef,std err,z,P>|z|,[0.025,0.975]
const,-2.4892,0.205,-12.122,0.0,-2.892,-2.087
age,-3.1e-05,0.002,-0.013,0.99,-0.005,0.005
balance,1e-05,6e-06,1.682,0.093,-2e-06,2.1e-05
day,0.0086,0.003,3.081,0.002,0.003,0.014
duration,0.0042,7.3e-05,58.018,0.0,0.004,0.004
campaign,-0.0889,0.011,-7.812,0.0,-0.111,-0.067
pdays,3.5e-05,0.0,0.102,0.919,-0.001,0.001
previous,0.007,0.006,1.112,0.266,-0.005,0.019
job_blue-collar,-0.3421,0.081,-4.202,0.0,-0.502,-0.183
job_entrepreneur,-0.3822,0.141,-2.714,0.007,-0.658,-0.106


In [38]:
# Keep only the coefficient rows of the GLM table whose p-value is above 0.05.
insignificant_features_glm = glm_res_as_df.loc[lambda table: table['P>|z|'] > 0.05]
insignificant_features_glm

Unnamed: 0,coef,std err,z,P>|z|,[0.025,0.975]
age,-3.1e-05,0.002,-0.013,0.99,-0.005,0.005
balance,1e-05,6e-06,1.682,0.093,-2e-06,2.1e-05
pdays,3.5e-05,0.0,0.102,0.919,-0.001,0.001
previous,0.007,0.006,1.112,0.266,-0.005,0.019
job_retired,0.205,0.109,1.874,0.061,-0.009,0.419
job_unemployed,-0.2282,0.126,-1.814,0.07,-0.475,0.018
job_unknown,-0.225,0.262,-0.858,0.391,-0.739,0.289
marital_single,0.0572,0.075,0.762,0.446,-0.09,0.204
default_yes,-0.0674,0.185,-0.364,0.716,-0.431,0.296
month_feb,-0.1686,0.1,-1.686,0.092,-0.365,0.027


In [39]:
# get latex syntax for implementation of table
#print(glm_res.summary().as_latex())

In [41]:
# Score the test rows with the fitted GLM and count a predicted value of 0.5
# or more as a predicted 1.
y_pred = glm_res.predict(X_test_with_constant)
y_pred_encoded = [int(score >= 0.5) for score in y_pred]

# Unpack the 2x2 confusion matrix row by row.
(tn, fp), (fn, tp) = confusion_matrix(y_test_encoded, y_pred_encoded)

specificity_glm = tn / (tn + fp)
sensitivity_glm = tp / (tp + fn)
accuracy_glm = (tp + tn) / (tp + tn + fp + fn)

print(f'specificity: {specificity_glm}')
print(f'sensitivity: {sensitivity_glm}')
print(f'accuracy: {accuracy_glm}')

specificity: 0.9697740112994351
sensitivity: 0.35434782608695653
accuracy: 0.899


# Model results

In [42]:
# Collect specificity, sensitivity and accuracy of the three models into one
# comparison table, one row per model.
model_list = ['Linear discriminant model', 'Linear prediction model', 'Logistic regression model']
spec_list = [specificity_lda, specificity_mls, specificity_glm]
sens_list = [sensitivity_lda, sensitivity_ml, sensitivity_glm]
acc_list = [accuracy_lda, accuracy_ml, accuracy_glm]

df_results = pd.DataFrame({
    'Model': model_list,
    'Specificity': spec_list,
    'Sensitivity': sens_list,
    'Accuracy': acc_list,
})

In [43]:
# Show the model comparison table.
df_results

Unnamed: 0,Model,Specificity,Sensitivity,Accuracy
0,Linear discriminant model,0.95339,0.436957,0.894
1,Linear prediction model,0.976271,0.306522,0.89925
2,Logistic regression model,0.969774,0.354348,0.899


In [44]:
# Print the comparison table as LaTeX tabular source.
print(df_results.to_latex())

\begin{tabular}{llrrr}
\toprule
{} &                      Model &  Specificity &  Sensitivity &  Accuracy \\
\midrule
0 &  Linear discriminant model &     0.953390 &     0.436957 &   0.89400 \\
1 &    Linear prediction model &     0.976271 &     0.306522 &   0.89925 \\
2 &  Logistic regression model &     0.969774 &     0.354348 &   0.89900 \\
\bottomrule
\end{tabular}



  print(df_results.to_latex())
