# In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from patsy import dmatrices
from scipy import stats

# Logistic-regression (binomial GLM) analysis of the Spam data set.
#
# The data file has no header row, so the columns are named V1..V58 here
# (the same names R's read.table would assign): V1..V57 are predictors and
# V58 is the 0/1 spam indicator used as the response.

# Define the directories
sourcedir = "J:/SYS 4021 - 6021/Sessions 13-15/R code"
datadir = "J:/SYS 4021 - 6021/Sessions 13-15/Data"

RESPONSE = "V58"
PREDICTORS = [f"V{i}" for i in range(1, 58)]
CAPITAL_PREDICTORS = ["V55", "V56", "V57"]
INTERACTION_TERMS = " + V5*V6 + V5*V7 + V6*V7"


def _fit_binomial(model_formula, data):
    """Fit a binomial-family GLM for ``model_formula`` on ``data``."""
    return smf.glm(
        formula=model_formula, data=data, family=sm.families.Binomial()
    ).fit()


def _deviance_test(reduced, full):
    """Chi-square test of two nested GLM fits based on their deviances.

    ``sm.stats.anova_lm`` only handles linear (OLS) models, so the drop in
    deviance between the nested fits is compared against a chi-square
    distribution directly.

    Parameters:
        reduced: fitted GLM results for the smaller (nested) model.
        full: fitted GLM results for the larger model.

    Returns:
        A two-row DataFrame (reduced, full) with residual degrees of
        freedom, residual deviance, and, on the second row, the difference
        in degrees of freedom, the deviance drop and its p-value.
    """
    df_diff = reduced.df_resid - full.df_resid
    dev_diff = reduced.deviance - full.deviance
    p_value = stats.chi2.sf(dev_diff, df_diff)
    return pd.DataFrame(
        {
            "Resid. Df": [reduced.df_resid, full.df_resid],
            "Resid. Dev": [reduced.deviance, full.deviance],
            "Df": [np.nan, df_diff],
            "Deviance": [np.nan, dev_diff],
            "Pr(>Chi)": [np.nan, p_value],
        },
        index=["reduced", "full"],
    )


def _backward_step_aic(response, predictors, data):
    """Backward elimination by AIC, starting from all ``predictors``.

    At each step the single predictor whose removal gives the lowest AIC is
    dropped; the search stops when no removal lowers the AIC.

    Returns:
        The fitted GLM results of the selected model.
    """

    def fit_terms(terms):
        rhs = " + ".join(terms) if terms else "1"
        return _fit_binomial(f"{response} ~ {rhs}", data)

    remaining = list(predictors)
    best = fit_terms(remaining)
    while remaining:
        candidates = [
            (fit_terms([t for t in remaining if t != dropped]), dropped)
            for dropped in remaining
        ]
        candidate_fit, dropped = min(candidates, key=lambda pair: pair[0].aic)
        if candidate_fit.aic >= best.aic:
            break
        best = candidate_fit
        remaining.remove(dropped)
    return best


# Load the data
spam = pd.read_csv(f"{datadir}/Spam.txt", sep=" ", header=None)
if spam.shape[1] != len(PREDICTORS) + 1:
    raise ValueError(
        f"Expected {len(PREDICTORS) + 1} columns in Spam.txt, "
        f"found {spam.shape[1]}"
    )
# header=None labels the columns 0..57; the formulas below need V1..V58.
spam.columns = PREDICTORS + [RESPONSE]

# Define the formula for GLM: the response against all 57 predictors
formula = f"{RESPONSE} ~ " + " + ".join(PREDICTORS)

# Fit the GLM model
spam_glm_main = _fit_binomial(formula, spam)

# Print the summary
print(spam_glm_main.summary())

# Contribution of variable 51 to the model: percent change in the odds for a
# one-unit increase. Look the coefficient up by name; position 0 is the
# intercept, so positional indexing is easy to get wrong.
v51_contribution = (np.exp(spam_glm_main.params["V51"]) - 1) * 100
print(f"Contribution of variable V51: {v51_contribution}%")

# Predicting for the first observation. predict() returns the fitted
# probability by default, so the linear predictor (log-odds) is requested
# explicitly.
first_observation = spam.iloc[[0]]
log_odds = float(
    np.ravel(spam_glm_main.predict(first_observation, which="linear"))[0]
)
odds = np.exp(log_odds)
print(f"Log-odds of being spam for observation 1: {log_odds}")
print(f"Odds of being spam for observation 1: {odds}")

# Model utility test using Chi2 statistic
spam_null = _fit_binomial(f"{RESPONSE} ~ 1", spam)
anova_results = _deviance_test(spam_null, spam_glm_main)
print(anova_results)

# Model with just the capital letters variables, V55 - V57, as predictors
cap_formula = f"{RESPONSE} ~ " + " + ".join(CAPITAL_PREDICTORS)
spam_cap = _fit_binomial(cap_formula, spam)

# Model utility test for capital letters model
anova_cap = _deviance_test(spam_null, spam_cap)
print(anova_cap)

# Compare full model with capital letters model
anova_full_vs_cap = _deviance_test(spam_cap, spam_glm_main)
print(anova_full_vs_cap)

# Test the individual coefficient of V57 by dropping it from the full model
# and comparing the nested fits with the chi-square deviance test.
no57_formula = f"{RESPONSE} ~ " + " + ".join(
    name for name in PREDICTORS if name != "V57"
)
spam_no57 = _fit_binomial(no57_formula, spam)
anova_no57 = _deviance_test(spam_no57, spam_glm_main)
print(anova_no57)

# Contribution of variable 57 to the model
v57_contribution = (np.exp(spam_glm_main.params["V57"]) - 1) * 100
print(f"Contribution of variable V57: {v57_contribution}%")

# Stepwise (backward, AIC-based) model selection for capital letter predictors
step_cap = _backward_step_aic(RESPONSE, CAPITAL_PREDICTORS, spam)
print(step_cap.summary())

# GLM with interactions
spam_glm2 = _fit_binomial(formula + INTERACTION_TERMS, spam)
print(spam_glm2.summary())

# Log transformed predictors with interactions. Only the predictors are
# transformed (offset by 0.1 to avoid log(0)); the response is copied over
# unchanged under its own name so it does not overwrite V57.
Lspam = np.log(spam[PREDICTORS] + 0.1)
Lspam[RESPONSE] = spam[RESPONSE]

Lspam_glm = _fit_binomial(formula, Lspam)
Lspam_glm2 = _fit_binomial(formula + INTERACTION_TERMS, Lspam)

anova_Lspam = _deviance_test(Lspam_glm, Lspam_glm2)
print(anova_Lspam)
