# In [None]:
#************************************************************
#
#  			SPAM Filter  
#			Time Series Analysis 
#
#************************************************************

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import statsmodels.api as sm
from scipy import stats
from statsmodels.tsa.stattools import acf
from statsmodels.graphics.tsaplots import plot_acf

# Read the ham and spam count tables from the course data directory
datadir = "J:/private/SYS4021/2021/Data/Spam/"
ham, spam = (
    pd.read_csv(datadir + csv_name)
    for csv_name in ('ham_ts.csv', 'spam_ts.csv')
)

# Print descriptive statistics for each table (ham first, then spam)
print(ham.describe())
print(spam.describe())

# Pull the 'count' column out of each table as the series to analyse
ham_ts = pd.Series(ham['count'])
spam_ts = pd.Series(spam['count'])

# Line plot of the raw ham counts against their position in the series
plt.figure(figsize=(10, 6))
plt.plot(ham_ts)
plt.gca().set(
    title='Time Series of Ham Emails',
    xlabel='Day',
    ylabel='Number of Ham Emails',
)
plt.show()

# Combine the year/month/day columns into a single datetime column
ham['date'] = pd.to_datetime(ham[['year', 'month', 'day']])

# Same series drawn with seaborn, this time against calendar date
sns.lineplot(x='date', y='count', data=ham)
plt.xlabel('')
plt.ylabel('Ham Count')
plt.show()

# Notched box plots comparing the first 464 ham observations with the rest
plt.figure(figsize=(10, 6))
plt.boxplot(
    [ham_ts.iloc[:464], ham_ts.iloc[464:]],
    notch=True,
    labels=['Weeks 1/13/00 - 4/20/01', 'Weeks 4/20/01 - 6/1/01'],
)
plt.title('Notched Box Plots for Ham Emails')
plt.ylabel('Ham Emails')
plt.show()

# Same comparison with seaborn: rows labelled 464 and above are flagged
# as set 1, everything before them stays in set 0
ham['set'] = 0
ham.loc[464:, 'set'] = 1
sns.boxplot(x='set', y='count', data=ham)
plt.xlabel('Pre 4/20/01 (0) and post (1)')
plt.ylabel('Ham Emails')
plt.show()

# Wilcoxon rank-sum (Mann-Whitney U) test comparing the first 464 ham
# observations with the remainder.  The alternative is stated explicitly
# so the reported p-value is two-sided regardless of the installed SciPy
# version (older releases did not default to a two-sided test).
stat, p_value = stats.mannwhitneyu(
    ham_ts[:464], ham_ts[464:], alternative='two-sided'
)
print('Wilcoxon test p-value:', p_value)

# Remove the last six weeks from ham_ts: keep only the first 464
# observations of both the series and the data frame.
ham_ts = ham_ts.iloc[:464]
# Take an explicit copy so that columns added to `ham` later are written
# to an independent frame rather than to a slice of the original
# (avoids SettingWithCopyWarning and view/copy ambiguity).
ham = ham.iloc[:464].copy()

# Autocorrelation plot of the ham series out to lag 30
plot_acf(x=ham_ts, lags=30)
plt.show()

# Build a datetime column for spam and plot the spam counts by date
spam['date'] = pd.to_datetime(spam[['year', 'month', 'day']])
sns.lineplot(x='date', y='count', data=spam)
plt.xlabel('')
plt.ylabel('Spam Count')
plt.show()

# Autocorrelation plot of the spam series out to lag 30
plot_acf(x=spam_ts, lags=30)
plt.show()

# Linear trend for spam: regress the counts on a 0-based time index
# plus an intercept
time_spam = np.arange(spam_ts.shape[0])
spam_trend = sm.OLS(endog=spam_ts, exog=sm.add_constant(time_spam)).fit()
print(spam_trend.summary())

# Overlay the fitted trend (red) on the observed spam series
plt.figure(figsize=(10, 6))
plt.plot(spam_ts, label='Spam Emails')
plt.plot(spam_trend.fittedvalues, label='Trend', color='red')
plt.legend()
plt.show()

# Linear trend for ham: regress the counts on a 0-based time index
# plus an intercept
time_ham = np.arange(ham_ts.shape[0])
ham_trend = sm.OLS(endog=ham_ts, exog=sm.add_constant(time_ham)).fit()
print(ham_trend.summary())

# Overlay the fitted trend (red) on the observed ham series
plt.figure(figsize=(10, 6))
plt.plot(ham_ts, label='Ham Emails')
plt.plot(ham_trend.fittedvalues, label='Trend', color='red')
plt.legend()
plt.show()

# Model the seasonality for ham data set using dummy variables.
# Each observation gets a position 1..7 within a repeating 7-step cycle.
ham['day_of_week'] = (time_ham % 7) + 1

# Convert numerical day of week to actual day names starting from Thursday
days = {1: 'Th', 2: 'F', 3: 'Sa', 4: 'S', 5: 'M', 6: 'T', 7: 'W'}
ham['day_of_week'] = ham['day_of_week'].map(days)

# Build the design matrix: one 0/1 indicator per day plus the time index.
# - dtype=float keeps the matrix numeric (bool dummies mixed with an integer
#   column would be cast to object dtype, which statsmodels rejects).
# - The Thursday indicator is dropped explicitly so that Thursday (Th) is the
#   base case; keeping all seven indicators together with the constant would
#   make the regressors perfectly collinear.
# - time_ham is attached here because it is an array, not a column of `ham`.
ham_season_dummies = pd.get_dummies(
    ham['day_of_week'], prefix='day_of_week', dtype=float
).drop(columns='day_of_week_Th')
ham_design = ham_season_dummies.assign(time_ham=time_ham)

# Base case is Thursday (Th)
ham_trendseason = sm.OLS(ham_ts, sm.add_constant(ham_design)).fit()
print(ham_trendseason.summary())

# Observed ham series with the trend + seasonality fit overlaid (red),
# plotted against position in the series
plt.figure(figsize=(10, 6))
plt.plot(ham_ts, label='Ham Emails')
plt.plot(
    ham_trendseason.fittedvalues,
    label='Trend + Seasonality',
    color='red',
)
plt.legend()
plt.show()

# Same comparison plotted against calendar date
sns.lineplot(x='date', y='count', data=ham)
plt.plot(ham['date'], ham_trendseason.fittedvalues, color='red')
plt.ylabel('Ham Emails')
plt.show()
