# In [None]:
# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# Load data
# Only `datadir` is read below; `sourcedir` is defined but not used in this
# part of the script.
sourcedir = '/Volumes/sb5ce/SYS 4021 - 6021/Sessions 13-15/R code'
datadir = "/Volumes/sb5ce/SYS 4021 - 6021/Sessions 13-15/Data"

# The file is space-delimited with no header row, so pandas labels the
# columns with integers starting at 0.
spam = pd.read_csv(datadir + '/Spam.txt', header=None, sep=' ')

# Split data into training and test sets
# The last column is the response; every other column is a predictor.
y = spam.iloc[:, -1]
X = spam.iloc[:, :-1]

# Hold out 33% of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=123,
)

# Compare the response variable in the test and training sets to the original complete data set
# The mean of the response is reported as the "Spam" share and its complement
# as the "Ham" share (assumes the response is coded 1 = spam, 0 = ham).
splits = {'Full': y, 'Train': y_train, 'Test': y_test}
spam_share = [response.mean() for response in splits.values()]
ham_share = [1 - response.mean() for response in splits.values()]

portions = pd.DataFrame({
    'dataset': list(splits) * 2,
    'class': ['Spam'] * len(splits) + ['Ham'] * len(splits),
    'portion': spam_share + ham_share,
})

# One group of bars per data set, one bar per class.
sns.barplot(data=portions, x='dataset', y='portion', hue='class')
plt.show()

# Look at a random sample of predictors and compare training, testing, and original data sets
# Four distinct predictor columns are drawn at random (unseeded, so the
# selection changes from run to run).
samples = np.random.choice(X.columns, 4, replace=False)
for col in samples:
    plt.figure()
    # One box per data set for the same predictor, in the order given by the
    # tick labels below.
    sns.boxplot(data=[X[col], X_train[col], X_test[col]], palette='Set3')
    plt.xticks([0, 1, 2], ['Original', 'Train', 'Test'])
    # Column labels are 0-based, but the rest of the script names variables
    # 1-based (column 0 is V1, column 57 is V58), so shift by one to keep
    # the plot titles consistent with that naming.
    plt.title(f'V{col + 1}')
    plt.show()

# GLM with training data
# Logistic regression of the response on all predictors plus an intercept.
spam_glm = sm.Logit(y_train, sm.add_constant(X_train)).fit()
print(spam_glm.summary())

# Intercept-only model: the baseline for the model utility test.
spam_null = sm.Logit(y_train, np.ones(len(y_train))).fit()

# anova_lm only handles linear-model results (it reads `ssr`), so it cannot
# compare Logit fits. Use a likelihood-ratio (analysis of deviance) test:
# twice the gain in log-likelihood is referred to a chi-square distribution
# with as many degrees of freedom as parameters were added.
lr_stat = 2 * (spam_glm.llf - spam_null.llf)
lr_df = spam_null.df_resid - spam_glm.df_resid
anova_result = pd.DataFrame(
    {
        'Resid. Df': [spam_null.df_resid, spam_glm.df_resid],
        'Resid. Dev': [-2 * spam_null.llf, -2 * spam_glm.llf],
        'Df': [np.nan, lr_df],
        'Deviance': [np.nan, lr_stat],
        'Pr(>Chi)': [np.nan, stats.chi2.sf(lr_stat, lr_df)],
    },
    index=['spam_null', 'spam_glm'],
)
print(anova_result)

# GLM with log transform of all predictor variables
# A 0.1 offset is added before taking logs so that zero-valued predictors
# stay finite.
LSpam_train = np.log(X_train + 0.1)
# Append the (untransformed) response as the last column, named V58.
LSpam_train['V58'] = y_train

# Fit on every column except the appended response.
Lspam_glm = sm.Logit(LSpam_train['V58'], sm.add_constant(LSpam_train.iloc[:, :-1])).fit()
print(Lspam_glm.summary())

# Intercept-only baseline for the model utility test.
Lspam_null = sm.Logit(LSpam_train['V58'], np.ones(len(LSpam_train['V58']))).fit()

# anova_lm only handles linear-model results (it reads `ssr`), so it cannot
# compare Logit fits. Use a likelihood-ratio (analysis of deviance) test
# instead: twice the gain in log-likelihood against a chi-square with as
# many degrees of freedom as parameters were added.
log_lr_stat = 2 * (Lspam_glm.llf - Lspam_null.llf)
log_lr_df = Lspam_null.df_resid - Lspam_glm.df_resid
anova_result_log = pd.DataFrame(
    {
        'Resid. Df': [Lspam_null.df_resid, Lspam_glm.df_resid],
        'Resid. Dev': [-2 * Lspam_null.llf, -2 * Lspam_glm.llf],
        'Df': [np.nan, log_lr_df],
        'Deviance': [np.nan, log_lr_stat],
        'Pr(>Chi)': [np.nan, stats.chi2.sf(log_lr_stat, log_lr_df)],
    },
    index=['Lspam_null', 'Lspam_glm'],
)
print(anova_result_log)

# Variable Selection with Stepwise
# statsmodels has no stepwise routine, so an L1-penalised refit is used to
# drop predictors. Logit.fit_regularized only supports method='l1' (or
# 'l1_cvxopt_cp'); 'elastic_net', `L1_wt` and `steps` are not options for
# discrete models and made the original calls fail.
# NOTE(review): a scalar alpha penalises the intercept as well; pass an array
# with a leading 0 if the intercept should be left unpenalised.
spam_step = spam_glm.model.fit_regularized(method='l1', alpha=0.01, maxiter=1000)
# The penalised fit keeps one coefficient per column and sets the dropped
# ones to zero, so count the non-zero slopes (position 0 is the constant)
# rather than the length of the parameter vector.
print(np.count_nonzero(np.asarray(spam_step.params)[1:]))

# There is no counterpart to a limited number of selection steps, so the
# "step1" variant is the same fit; the name is kept because later sections
# refer to it.
spam_step1 = spam_step
print(np.count_nonzero(np.asarray(spam_step1.params)[1:]))

# Same selection for the log-transformed model.
Lspam_step = Lspam_glm.model.fit_regularized(method='l1', alpha=0.01, maxiter=1000)
Lspam_step1 = Lspam_step
print(np.count_nonzero(np.asarray(Lspam_step1.params)[1:]))

# Interaction terms
# Dichotomise two predictors at their medians: values up to and including
# the median are 'low', values above it are 'high'.
low_high = ['low', 'high']
v1_median = spam[0].median()
v38_median = spam[37].median()
V1_factor = pd.cut(spam[0], bins=[-np.inf, v1_median, np.inf], labels=low_high)
V38_factor = pd.cut(spam[37], bins=[-np.inf, v38_median, np.inf], labels=low_high)

# Pair the two factors with the response (column 57) for plotting.
interaction_data = pd.DataFrame({
    'V1_factor': V1_factor,
    'V38_factor': V38_factor,
    'V58': spam[57],
})

# Non-parallel lines across the V38 levels would suggest an interaction.
sns.pointplot(data=interaction_data, x='V38_factor', y='V58', hue='V1_factor')
plt.show()

# GLM Principal Components Regression
# Keep the leading components that together explain 90% of the variance.
# The projection is learned on the training predictors only and then
# applied unchanged to the test predictors.
pca = PCA(n_components=0.90)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

spampca_glm90 = sm.Logit(y_train, sm.add_constant(X_train_pca)).fit()
# Intercept-only baseline for the model utility test.
spampca_null = sm.Logit(y_train, np.ones(len(y_train))).fit()

# anova_lm only handles linear-model results (it reads `ssr`), so it cannot
# compare Logit fits. Use a likelihood-ratio (analysis of deviance) test
# instead: twice the gain in log-likelihood against a chi-square with as
# many degrees of freedom as parameters were added.
pca90_lr_stat = 2 * (spampca_glm90.llf - spampca_null.llf)
pca90_lr_df = spampca_null.df_resid - spampca_glm90.df_resid
anova_result_pca90 = pd.DataFrame(
    {
        'Resid. Df': [spampca_null.df_resid, spampca_glm90.df_resid],
        'Resid. Dev': [-2 * spampca_null.llf, -2 * spampca_glm90.llf],
        'Df': [np.nan, pca90_lr_df],
        'Deviance': [np.nan, pca90_lr_stat],
        'Pr(>Chi)': [np.nan, stats.chi2.sf(pca90_lr_stat, pca90_lr_df)],
    },
    index=['spampca_null', 'spampca_glm90'],
)
print(anova_result_pca90)

# PCA for 98% variance
# Same procedure as the 90% model, keeping enough components to explain 98%
# of the variance; fitted on the training predictors, applied to the test
# predictors.
pca98 = PCA(n_components=0.98)
X_train_pca98 = pca98.fit_transform(X_train)
X_test_pca98 = pca98.transform(X_test)

spampca_glm98 = sm.Logit(y_train, sm.add_constant(X_train_pca98)).fit()
# Intercept-only baseline for the model utility test.
spampca_null98 = sm.Logit(y_train, np.ones(len(y_train))).fit()

# anova_lm only handles linear-model results (it reads `ssr`), so it cannot
# compare Logit fits. Use a likelihood-ratio (analysis of deviance) test
# instead: twice the gain in log-likelihood against a chi-square with as
# many degrees of freedom as parameters were added.
pca98_lr_stat = 2 * (spampca_glm98.llf - spampca_null98.llf)
pca98_lr_df = spampca_null98.df_resid - spampca_glm98.df_resid
anova_result_pca98 = pd.DataFrame(
    {
        'Resid. Df': [spampca_null98.df_resid, spampca_glm98.df_resid],
        'Resid. Dev': [-2 * spampca_null98.llf, -2 * spampca_glm98.llf],
        'Df': [np.nan, pca98_lr_df],
        'Deviance': [np.nan, pca98_lr_stat],
        'Pr(>Chi)': [np.nan, stats.chi2.sf(pca98_lr_stat, pca98_lr_df)],
    },
    index=['spampca_null98', 'spampca_glm98'],
)
print(anova_result_pca98)

# Partial likelihood test between 98% and 90% models
# The 90% model is treated as nested in the 98% model (both projections are
# fitted to the same training predictors). anova_lm only handles
# linear-model results, so the comparison is a likelihood-ratio test: twice
# the gain in log-likelihood against a chi-square with as many degrees of
# freedom as components were added.
# NOTE(review): if both thresholds select the same number of components the
# degrees of freedom are 0 and the p-value is not defined.
pca_lr_stat = 2 * (spampca_glm98.llf - spampca_glm90.llf)
pca_lr_df = spampca_glm90.df_resid - spampca_glm98.df_resid
anova_result_90_98 = pd.DataFrame(
    {
        'Resid. Df': [spampca_glm90.df_resid, spampca_glm98.df_resid],
        'Resid. Dev': [-2 * spampca_glm90.llf, -2 * spampca_glm98.llf],
        'Df': [np.nan, pca_lr_df],
        'Deviance': [np.nan, pca_lr_stat],
        'Pr(>Chi)': [np.nan, stats.chi2.sf(pca_lr_stat, pca_lr_df)],
    },
    index=['spampca_glm90', 'spampca_glm98'],
)
print(anova_result_90_98)

# Compare AIC and BIC
# The two printed lists follow the order of `models`.
models = [
    spam_glm,
    spam_step1,
    Lspam_glm,
    Lspam_step1,
    spampca_glm98,
    spampca_glm90,
]

aic_values = []
for fitted in models:
    aic_values.append(fitted.aic)

bic_values = []
for fitted in models:
    bic_values.append(fitted.bic)

print('AIC:', aic_values)
print('BIC:', bic_values)

# Diagnostic plots
# 2x2 grid: top row shows deviance residuals against fitted values with a
# lowess smoother, bottom row shows them in observation order; left column
# is the raw model, right column is the log-transformed model.
fig, ax = plt.subplots(2, 2, figsize=(10, 10))

# Logit results expose deviance residuals as `resid_dev`, and residplot takes
# its data as keyword arguments.
sns.residplot(x=spam_glm.fittedvalues, y=spam_glm.resid_dev, lowess=True, ax=ax[0, 0], line_kws={'color': 'red'})
sns.residplot(x=Lspam_glm.fittedvalues, y=Lspam_glm.resid_dev, lowess=True, ax=ax[0, 1], line_kws={'color': 'red'})
ax[0, 0].set_title('spam_glm Residuals')
ax[0, 1].set_title('Lspam_glm Residuals')

sns.scatterplot(x=range(len(spam_glm.fittedvalues)), y=spam_glm.resid_dev, ax=ax[1, 0])
sns.scatterplot(x=range(len(Lspam_glm.fittedvalues)), y=Lspam_glm.resid_dev, ax=ax[1, 1])
ax[1, 0].set_title('spam_glm Deviance Residuals')
ax[1, 1].set_title('Lspam_glm Deviance Residuals')

plt.tight_layout()
plt.show()

# Cook's distance
# Observations whose distance exceeds this cutoff are reported as influential.
COOKS_CUTOFF = 0.5

influence_spam = spam_glm.get_influence()
influence_Lspam = Lspam_glm.get_influence()

# Only the first element of `cooks_distance` (the distances) is used.
cooks_d_spam = influence_spam.cooks_distance[0]
cooks_d_Lspam = influence_Lspam.cooks_distance[0]

# Print the positions of the flagged observations for each model.
for model_name, distances in (('spam_glm', cooks_d_spam), ('Lspam_glm', cooks_d_Lspam)):
    print(f'Influential points in {model_name}:', np.where(distances > COOKS_CUTOFF))

# Predictions with test data
# The log model needs the test predictors on the same scale it was trained
# on: log(x + 0.1).
Lspam_test = np.log(X_test + 0.1)

# Build each test design matrix (with constant column) once and reuse it.
raw_test_design = sm.add_constant(X_test)
log_test_design = sm.add_constant(Lspam_test)
pca98_test_design = sm.add_constant(X_test_pca98)

# Full models.
spam_pred = spam_glm.predict(raw_test_design)
Lspam_pred = Lspam_glm.predict(log_test_design)

# Principal-components model (98% variance) and the selected models.
spampca_pred = spampca_glm98.predict(pca98_test_design)
step_pred1 = spam_step1.predict(raw_test_design)
Lstep_pred1 = Lspam_step1.predict(log_test_design)

# Confusion Matrices
# A predicted probability above the threshold is classified as positive.
threshold = 0.5
labels = ['spam_glm', 'Lspam_glm', 'spampca_glm98', 'spam_step1', 'Lspam_step1']
predictions = [spam_pred, Lspam_pred, spampca_pred, step_pred1, Lstep_pred1]

# `labels` and `predictions` are parallel lists: one entry per model.
for label, pred in zip(labels, predictions):
    predicted_class = pred > threshold
    print(f'Confusion Matrix for {label}:')
    print(confusion_matrix(y_test, predicted_class))
    print(classification_report(y_test, predicted_class))

# ROC Curves
# One curve per model on shared axes, with the area under each curve shown
# in the legend.
plt.figure()
for label, pred in zip(labels, predictions):
    fpr, tpr, _ = roc_curve(y_test, pred)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'{label} (area = {roc_auc:.2f})')

# Dashed diagonal as the reference line.
plt.plot([0, 1], [0, 1], 'k--')

plt.title('ROC Curve - SPAM Filter')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc='lower right')
plt.show()
