In [None]:
import pandas as pd

# Directory holding the data files; Spam.txt is read as a space-separated
# table without a header row, so pandas assigns integer column labels.
datadir = "E:/GoogleDrive/SYS4021/2021/GLM/Data"
spam_path = f"{datadir}/Spam.txt"
spam = pd.read_csv(spam_path, sep=" ", header=None)

In [None]:
# First look at the table: its dimensions, then per-column summary statistics.
table_shape = spam.shape
column_summary = spam.describe()
print(table_shape)
print(column_summary)

In [None]:
# The column at position 57 is used as the response. Report how often each
# value occurs, then the column total divided by the number of rows.
response_variable = spam.iloc[:, 57]
class_counts = response_variable.value_counts()
print(class_counts)
response_share = response_variable.sum() / len(response_variable)
print(response_share)

In [None]:
import os
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
import numpy as np

# Folder (created on demand) that receives the figures written by this notebook.
figures_dir = os.path.join(datadir, "figures")
os.makedirs(figures_dir, exist_ok=True)

# Column positions shown in both scatter matrices: the first ten columns plus column 57.
pair_columns = np.r_[0:10, 57]

# scatter_matrix creates its own figure (sized through `figsize`), so no
# plt.figure() call is made beforehand: a pre-created figure would stay blank
# and would never be closed. The figure is recovered from the returned axes so
# that exactly the plotted figure is saved and closed.
raw_axes = scatter_matrix(spam.iloc[:, pair_columns], figsize=(20, 10), diagonal='kde')
raw_fig = raw_axes[0, 0].get_figure()
raw_fig.savefig(os.path.join(figures_dir, 'Image1.png'))
plt.close(raw_fig)

# Same matrix on the log scale; the small offset keeps log() finite for zero entries.
# Note that column 57 is log-transformed here as well.
log_spam = np.log(spam.iloc[:, pair_columns] + 0.00001)
log_axes = scatter_matrix(log_spam, figsize=(20, 10), diagonal='kde')
log_fig = log_axes[0, 0].get_figure()
log_fig.savefig(os.path.join(figures_dir, 'Image2.png'))
plt.close(log_fig)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Values of column 57, used on the x-axis of every box plot in this cell.
spam_class = spam.iloc[:, 57]

# Stand-alone box plots for the first nine columns (V1..V9), one PNG each.
for var_number, (_, column_values) in enumerate(spam.iloc[:, 0:9].items(), start=1):
    plt.figure()
    sns.boxplot(x=spam_class, y=column_values)
    plt.title(f'V{var_number}')
    plt.savefig(os.path.join(figures_dir, f'V{var_number}.png'))
    plt.close()

# The same nine box plots laid out on a 3x3 grid and saved as a single image.
fig, axs = plt.subplots(3, 3, figsize=(15, 15))
for var_number, ax in enumerate(axs.ravel(), start=1):
    sns.boxplot(x=spam_class, y=spam.iloc[:, var_number - 1], ax=ax)
    ax.set_title(f'V{var_number}')
plt.tight_layout()
plt.savefig(os.path.join(figures_dir, 'Boxplots_V1_to_V9.png'))
plt.close()


In [None]:
# Values of column 57, used on the x-axis of every box plot in this cell.
spam_class = spam.iloc[:, 57]

# Stand-alone box plots for column positions 48..56 (V49..V57), one PNG each.
for var_number, (_, column_values) in enumerate(spam.iloc[:, 48:57].items(), start=49):
    plt.figure()
    sns.boxplot(x=spam_class, y=column_values)
    plt.title(f'V{var_number}')
    plt.savefig(os.path.join(figures_dir, f'V{var_number}.png'))
    plt.close()

# The same nine box plots laid out on a 3x3 grid and saved as a single image.
fig, axs = plt.subplots(3, 3, figsize=(15, 15))
for var_number, ax in enumerate(axs.ravel(), start=49):
    sns.boxplot(x=spam_class, y=spam.iloc[:, var_number - 1], ax=ax)
    ax.set_title(f'V{var_number}')
plt.tight_layout()
plt.savefig(os.path.join(figures_dir, 'Boxplots_V49_to_V57.png'))
plt.close()


In [None]:
# Log-transform every column except the last (offset 0.1 keeps log() finite for
# zeros), then attach column 57 of the original table unchanged under label 57.
shifted_predictors = spam.iloc[:, :-1].add(0.1)
Lspam = np.log(shifted_predictors)
Lspam[57] = spam.iloc[:, 57]


In [None]:
# Stand-alone box plots of the log-transformed V1..V9, split by column 57.
# Each plot gets its own figure, which is saved and closed explicitly. No
# extra figure is opened before the loop: it would stay blank and unclosed.
for i in range(1, 10):
    fig, ax = plt.subplots()
    sns.boxplot(x=Lspam[57], y=Lspam[i-1], ax=ax)
    # Titled "Log V{i}" to match the grid below and the saved Log_V{i}.png file name.
    ax.set_title(f'Log V{i}')
    fig.savefig(os.path.join(figures_dir, f'Log_V{i}.png'))
    plt.close(fig)

# The same nine box plots on a 3x3 grid, saved as a single image.
fig, axs = plt.subplots(3, 3, figsize=(15, 15))
for i, ax in zip(range(1, 10), axs.flatten()):
    sns.boxplot(x=Lspam[57], y=Lspam[i-1], ax=ax)
    ax.set_title(f'Log V{i}')
fig.tight_layout()
fig.savefig(os.path.join(figures_dir, 'Boxplots_Log_V1_to_V9.png'))
plt.close(fig)


In [None]:
# Stand-alone box plots of the log-transformed V49..V57, split by column 57.
# Each plot gets its own figure, which is saved and closed explicitly. No
# extra figure is opened before the loop: it would stay blank and unclosed.
for i in range(49, 58):
    fig, ax = plt.subplots()
    sns.boxplot(x=Lspam[57], y=Lspam[i-1], ax=ax)
    ax.set_title(f'Log V{i}')
    fig.savefig(os.path.join(figures_dir, f'Log_V{i}.png'))
    plt.close(fig)

# The same nine box plots on a 3x3 grid, saved as a single image.
fig, axs = plt.subplots(3, 3, figsize=(15, 15))
for i, ax in zip(range(49, 58), axs.flatten()):
    sns.boxplot(x=Lspam[57], y=Lspam[i-1], ax=ax)
    ax.set_title(f'Log V{i}')
fig.tight_layout()
fig.savefig(os.path.join(figures_dir, 'Boxplots_Log_V49_to_V57.png'))
plt.close(fig)


In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Two-component PCA fitted on every column except the last; the last column
# is used only to colour the points.
spam_pca = PCA(n_components=2)
spam_features = spam.iloc[:, :-1]
principal_components = spam_pca.fit_transform(spam_features)

# Scatter of the two component scores, coloured by the last column.
pc1_scores = principal_components[:, 0]
pc2_scores = principal_components[:, 1]
plt.figure(figsize=(10, 7))
plt.scatter(pc1_scores, pc2_scores, c=spam.iloc[:, -1])
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA of Spam Dataset')
plt.colorbar()
plt.show()


In [None]:
# Loadings of the second principal component; keep the column positions whose
# absolute loading exceeds 0.2 and summarise those columns.
loadings = spam_pca.components_[1]
outliers = np.flatnonzero(np.abs(loadings) > 0.2)
high_loading_summary = spam.iloc[:, outliers].describe()
print(high_loading_summary)

# Distribution of the column at position 55 (titled V56).
plt.figure()
plt.boxplot(spam.iloc[:, 55])
plt.title('Boxplot of V56')
plt.show()

# Remove the row with index label 1754 and refit the two-component PCA.
# NOTE(review): 1754 is a hard-coded index label (pandas labels start at 0
# here) -- confirm it identifies the intended outlier row.
spam_no_outlier = spam.drop(index=1754)

spam_pca_no_outlier = PCA(n_components=2)
features_no_outlier = spam_no_outlier.iloc[:, :-1]
principal_components_no_outlier = spam_pca_no_outlier.fit_transform(features_no_outlier)

# Scatter of the refitted component scores, coloured by the last column.
pc1_no_outlier = principal_components_no_outlier[:, 0]
pc2_no_outlier = principal_components_no_outlier[:, 1]
plt.figure(figsize=(10, 7))
plt.scatter(pc1_no_outlier, pc2_no_outlier, c=spam_no_outlier.iloc[:, -1])
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA of Spam Dataset (Outlier Removed)')
plt.colorbar()
plt.show()


In [None]:
import seaborn as sns

# Translate the class codes (0 -> 'Ham', 1 -> 'Spam') before plotting so that
# seaborn itself builds a legend whose colours and markers match the names.
# Passing labels=[...] to plt.legend() without handles pairs the labels with
# whatever artists are on the axes, in order, which can mislabel the classes.
class_names = spam_no_outlier.iloc[:, -1].map({0: 'Ham', 1: 'Spam'}).to_numpy()
class_order = ['Ham', 'Spam']

fig, ax = plt.subplots(figsize=(10, 7))
sns.scatterplot(
    x=principal_components_no_outlier[:, 0],
    y=principal_components_no_outlier[:, 1],
    hue=class_names,
    style=class_names,
    hue_order=class_order,    # fixed order keeps Ham first, as with codes 0/1
    style_order=class_order,
    palette={'Ham': 'blue', 'Spam': 'red'},
    ax=ax,
)
ax.set_title('PCA of Spam Dataset (Outlier Removed) with Labels')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
# Only retitle the legend seaborn created; its handles and labels stay untouched.
ax.get_legend().set_title('Type')
plt.show()


In [None]:
# Two-component PCA on the log-transformed table, excluding its last column,
# which is used only to colour the points.
Lspam_pca = PCA(n_components=2)
log_features = Lspam.iloc[:, :-1]
principal_components_Lspam = Lspam_pca.fit_transform(log_features)

# Scatter of the two component scores, coloured by the last column.
log_pc1_scores = principal_components_Lspam[:, 0]
log_pc2_scores = principal_components_Lspam[:, 1]
plt.figure(figsize=(10, 7))
plt.scatter(log_pc1_scores, log_pc2_scores, c=Lspam.iloc[:, -1])
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA of Log-Transformed Spam Dataset')
plt.colorbar()
plt.show()


In [None]:
# Translate the class codes (0 -> 'Ham', 1 -> 'Spam') before plotting so that
# seaborn itself builds a legend whose colours and markers match the names.
# Passing labels=[...] to plt.legend() without handles pairs the labels with
# whatever artists are on the axes, in order, which can mislabel the classes.
class_names = Lspam.iloc[:, -1].map({0: 'Ham', 1: 'Spam'}).to_numpy()
class_order = ['Ham', 'Spam']

fig, ax = plt.subplots(figsize=(10, 7))
sns.scatterplot(
    x=principal_components_Lspam[:, 0],
    y=principal_components_Lspam[:, 1],
    hue=class_names,
    style=class_names,
    hue_order=class_order,    # fixed order keeps Ham first, as with codes 0/1
    style_order=class_order,
    palette={'Ham': 'blue', 'Spam': 'red'},
    ax=ax,
)
ax.set_title('PCA of Log-Transformed Spam Dataset with Labels')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
# Only retitle the legend seaborn created; its handles and labels stay untouched.
ax.get_legend().set_title('Type')
plt.show()
