# Get and check Data

In [1]:
# import necessary libraries
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# get raw unprocessed data
dataset = pd.read_csv('spambase.data')
datanames = open('spambase.names', 'r')
num_attr = dataset.shape[1]
dataset

Unnamed: 0,0,0.64,0.64.1,0.1,0.32,0.2,0.3,0.4,0.5,0.6,...,0.40,0.41,0.42,0.778,0.43,0.44,3.756,61,278,1
0,0.21,0.28,0.50,0.0,0.14,0.28,0.21,0.07,0.00,0.94,...,0.000,0.132,0.0,0.372,0.180,0.048,5.114,101,1028,1
1,0.06,0.00,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.010,0.143,0.0,0.276,0.184,0.010,9.821,485,2259,1
2,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.137,0.0,0.137,0.000,0.000,3.537,40,191,1
3,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.135,0.0,0.135,0.000,0.000,3.537,40,191,1
4,0.00,0.00,0.00,0.0,1.85,0.00,0.00,1.85,0.00,0.00,...,0.000,0.223,0.0,0.000,0.000,0.000,3.000,15,54,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4595,0.31,0.00,0.62,0.0,0.00,0.31,0.00,0.00,0.00,0.00,...,0.000,0.232,0.0,0.000,0.000,0.000,1.142,3,88,0
4596,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.353,0.000,0.000,1.555,4,14,0
4597,0.30,0.00,0.30,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.102,0.718,0.0,0.000,0.000,0.000,1.404,6,118,0
4598,0.96,0.00,0.00,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.057,0.0,0.000,0.000,0.000,1.147,5,78,0


### Title Attributes
The dataset intepreted by pandas is misaligned,
with the column label being the features of the first email rather than its official labels

In [3]:
# extracting keys for each column of the data
def get_keys(datanames):
    """
    Function which gets the column labels (attributes) from datanames file
    Args:
        datanames:   file object (Can be any string iterable)
    
    Return:
        keys:    list
    """
    keys = []

    for line in datanames:
        line = re.match(r"^(word|char|cap)",line)
        if line:
            line = line.string # convert back into string
            attribute = re.split(r'\s+', line)[0]
            keys.append(attribute)


    # last attribute is whether email is spam or not
    keys.append("is_spam:")
    return keys

datanames = list(datanames)
copy_datanames = datanames[:] # makes a copy of datanames as get_keys function seems to modify the input

keys = get_keys(copy_datanames)
#  appends attribute/column keys onto raw dataset dataframe
labelled_dataset = pd.DataFrame(data=dataset.to_numpy(), columns=keys) 
labelled_dataset


Unnamed: 0,word_freq_make:,word_freq_address:,word_freq_all:,word_freq_3d:,word_freq_our:,word_freq_over:,word_freq_remove:,word_freq_internet:,word_freq_order:,word_freq_mail:,...,char_freq_;:,char_freq_(:,char_freq_[:,char_freq_!:,char_freq_$:,char_freq_#:,capital_run_length_average:,capital_run_length_longest:,capital_run_length_total:,is_spam:
0,0.21,0.28,0.50,0.0,0.14,0.28,0.21,0.07,0.00,0.94,...,0.000,0.132,0.0,0.372,0.180,0.048,5.114,101.0,1028.0,1.0
1,0.06,0.00,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.010,0.143,0.0,0.276,0.184,0.010,9.821,485.0,2259.0,1.0
2,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.137,0.0,0.137,0.000,0.000,3.537,40.0,191.0,1.0
3,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.135,0.0,0.135,0.000,0.000,3.537,40.0,191.0,1.0
4,0.00,0.00,0.00,0.0,1.85,0.00,0.00,1.85,0.00,0.00,...,0.000,0.223,0.0,0.000,0.000,0.000,3.000,15.0,54.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4595,0.31,0.00,0.62,0.0,0.00,0.31,0.00,0.00,0.00,0.00,...,0.000,0.232,0.0,0.000,0.000,0.000,1.142,3.0,88.0,0.0
4596,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.353,0.000,0.000,1.555,4.0,14.0,0.0
4597,0.30,0.00,0.30,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.102,0.718,0.0,0.000,0.000,0.000,1.404,6.0,118.0,0.0
4598,0.96,0.00,0.00,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.057,0.0,0.000,0.000,0.000,1.147,5.0,78.0,0.0


In [4]:
# check how many null values in data
num_nulls = labelled_dataset.isnull().sum()

# for loop which checks whether there is a any null fields
for nulls in num_nulls:
    if 0:
        print("Data not properly processed")

num_nulls

word_freq_make:                0
word_freq_address:             0
word_freq_all:                 0
word_freq_3d:                  0
word_freq_our:                 0
word_freq_over:                0
word_freq_remove:              0
word_freq_internet:            0
word_freq_order:               0
word_freq_mail:                0
word_freq_receive:             0
word_freq_will:                0
word_freq_people:              0
word_freq_report:              0
word_freq_addresses:           0
word_freq_free:                0
word_freq_business:            0
word_freq_email:               0
word_freq_you:                 0
word_freq_credit:              0
word_freq_your:                0
word_freq_font:                0
word_freq_000:                 0
word_freq_money:               0
word_freq_hp:                  0
word_freq_hpl:                 0
word_freq_george:              0
word_freq_650:                 0
word_freq_lab:                 0
word_freq_labs:                0
word_freq_

# Split data into training data and testing data

In [5]:
# seperate features from result
X = labelled_dataset.iloc[:,:57] #
y = labelled_dataset.iloc[:,57] # this is a binary array 

In [6]:
# Train and Test splitting of data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Machine Learning

###  using randomForestClassifier

In [7]:
# using randomForestClassifier
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=30)
clf = clf.fit(X_train, y_train)
pred_clf = clf.predict(X_test)
pred_clf[:20]

array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0., 1.,
       1., 1., 0.])

In [8]:
# check performance of ml model
print(classification_report(y_test, pred_clf))
print(confusion_matrix(y_test, pred_clf))

              precision    recall  f1-score   support

         0.0       0.92      0.96      0.94       530
         1.0       0.95      0.89      0.92       390

    accuracy                           0.93       920
   macro avg       0.94      0.93      0.93       920
weighted avg       0.93      0.93      0.93       920

[[511  19]
 [ 42 348]]


### Using Decision Tree

In [9]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf.fit(X_train, y_train)
pred_clf = clf.predict(X_test)
pred_clf[:20]

array([0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 0., 1.,
       1., 1., 0.])

In [10]:
# check performance of ml model
print(classification_report(y_test, pred_clf))
print(confusion_matrix(y_test, pred_clf))

              precision    recall  f1-score   support

         0.0       0.92      0.95      0.93       530
         1.0       0.93      0.89      0.91       390

    accuracy                           0.92       920
   macro avg       0.92      0.92      0.92       920
weighted avg       0.92      0.92      0.92       920

[[503  27]
 [ 44 346]]


### using neural network
IMPORTANT: still a work in progress, isn't as accurate as non neural nets

In [11]:
# using neural network
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(alpha=1e-5, hidden_layer_sizes=(num_attr, num_attr, num_attr), random_state=1)

# convert X_train array into double in order to use MLPClassifier
X_train = X_train.astype(np.float64)
X_test = X_test.astype(np.float64)


clf.fit(X_train, y_train)
pred_clf = clf.predict(X_test)
pred_clf[:20]

array([0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 1.,
       1., 1., 0.])

In [12]:
# check performance of ml model
print(classification_report(y_test, pred_clf))
print(confusion_matrix(y_test, pred_clf))

              precision    recall  f1-score   support

         0.0       0.92      0.90      0.91       530
         1.0       0.86      0.89      0.88       390

    accuracy                           0.89       920
   macro avg       0.89      0.89      0.89       920
weighted avg       0.90      0.89      0.89       920

[[475  55]
 [ 42 348]]
