# Get and check Data

In [1]:
# import necessary libraries
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
dataset = pd.read_csv('spambase.data')
datanames = open('spambase.names', 'r')
print(dataset.head())
num_attr = dataset.shape[1]

      0  0.64  0.64.1  0.1  0.32   0.2   0.3   0.4   0.5   0.6  ...  0.40  \
0  0.21  0.28    0.50  0.0  0.14  0.28  0.21  0.07  0.00  0.94  ...  0.00   
1  0.06  0.00    0.71  0.0  1.23  0.19  0.19  0.12  0.64  0.25  ...  0.01   
2  0.00  0.00    0.00  0.0  0.63  0.00  0.31  0.63  0.31  0.63  ...  0.00   
3  0.00  0.00    0.00  0.0  0.63  0.00  0.31  0.63  0.31  0.63  ...  0.00   
4  0.00  0.00    0.00  0.0  1.85  0.00  0.00  1.85  0.00  0.00  ...  0.00   

    0.41  0.42  0.778   0.43   0.44  3.756   61   278  1  
0  0.132   0.0  0.372  0.180  0.048  5.114  101  1028  1  
1  0.143   0.0  0.276  0.184  0.010  9.821  485  2259  1  
2  0.137   0.0  0.137  0.000  0.000  3.537   40   191  1  
3  0.135   0.0  0.135  0.000  0.000  3.537   40   191  1  
4  0.223   0.0  0.000  0.000  0.000  3.000   15    54  1  

[5 rows x 58 columns]


In [3]:
# check how many null values in data
dataset.isnull().sum()


0         0
0.64      0
0.64.1    0
0.1       0
0.32      0
0.2       0
0.3       0
0.4       0
0.5       0
0.6       0
0.7       0
0.64.2    0
0.8       0
0.9       0
0.10      0
0.32.1    0
0.11      0
1.29      0
1.93      0
0.12      0
0.96      0
0.13      0
0.14      0
0.15      0
0.16      0
0.17      0
0.18      0
0.19      0
0.20      0
0.21      0
0.22      0
0.23      0
0.24      0
0.25      0
0.26      0
0.27      0
0.28      0
0.29      0
0.30      0
0.31      0
0.32.2    0
0.33      0
0.34      0
0.35      0
0.36      0
0.37      0
0.38      0
0.39      0
0.40      0
0.41      0
0.42      0
0.778     0
0.43      0
0.44      0
3.756     0
61        0
278       0
1         0
dtype: int64

# Title Attributes
This section is probably not necessary

In [4]:
# extracting keys for each column of the data
keys = []

for line in datanames:
    line = re.match(r"^(word|char|cap)",line)
    if line:
        line = line.string # convert back into string
        attribute = re.split(r'\s+', line)[0]
        keys.append(attribute)


# last attribute is whether email is spam or not
keys.append("is_spam:")

# attach keys to data set
labelled_dataset = np.vstack((keys,dataset))

print(keys)
print(len(keys))

['word_freq_make:', 'word_freq_address:', 'word_freq_all:', 'word_freq_3d:', 'word_freq_our:', 'word_freq_over:', 'word_freq_remove:', 'word_freq_internet:', 'word_freq_order:', 'word_freq_mail:', 'word_freq_receive:', 'word_freq_will:', 'word_freq_people:', 'word_freq_report:', 'word_freq_addresses:', 'word_freq_free:', 'word_freq_business:', 'word_freq_email:', 'word_freq_you:', 'word_freq_credit:', 'word_freq_your:', 'word_freq_font:', 'word_freq_000:', 'word_freq_money:', 'word_freq_hp:', 'word_freq_hpl:', 'word_freq_george:', 'word_freq_650:', 'word_freq_lab:', 'word_freq_labs:', 'word_freq_telnet:', 'word_freq_857:', 'word_freq_data:', 'word_freq_415:', 'word_freq_85:', 'word_freq_technology:', 'word_freq_1999:', 'word_freq_parts:', 'word_freq_pm:', 'word_freq_direct:', 'word_freq_cs:', 'word_freq_meeting:', 'word_freq_original:', 'word_freq_project:', 'word_freq_re:', 'word_freq_edu:', 'word_freq_table:', 'word_freq_conference:', 'char_freq_;:', 'char_freq_(:', 'char_freq_[:', '

# Split data into training data and testing data

In [5]:
# seperate features from result
X = labelled_dataset[1:,:57] #
y = labelled_dataset[1:,57] # this is a binary array 
print(X)

[['0.21' '0.28' '0.5' ... '5.114' '101.0' '1028.0']
 ['0.06' '0.0' '0.71' ... '9.821' '485.0' '2259.0']
 ['0.0' '0.0' '0.0' ... '3.537' '40.0' '191.0']
 ...
 ['0.3' '0.0' '0.3' ... '1.4040000000000001' '6.0' '118.0']
 ['0.96' '0.0' '0.0' ... '1.147' '5.0' '78.0']
 ['0.0' '0.0' '0.65' ... '1.25' '5.0' '40.0']]


In [6]:
# Train and Test splitting of data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
print(X_train)
print(y_train)

[['0.0' '0.0' '0.0' ... '5.5' '10.0' '11.0']
 ['0.0' '0.0' '0.0' ... '3.95' '23.0' '79.0']
 ['0.0' '0.0' '0.0' ... '1.526' '7.0' '87.0']
 ...
 ['0.0' '14.28' '0.0' ... '1.8' '5.0' '9.0']
 ['0.0' '0.0' '0.0' ... '1.058' '2.0' '18.0']
 ['0.14' '0.0' '0.28' ... '1.867' '14.0' '521.0']]
['0.0' '1.0' '0.0' ... '0.0' '0.0' '1.0']


# Machine Learning

###  using randomForestClassifier

In [7]:
# using randomForestClassifier
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=30)
clf = clf.fit(X_train, y_train)
pred_clf = clf.predict(X_test)
pred_clf[:20]



array(['0.0', '0.0', '0.0', '1.0', '0.0', '0.0', '0.0', '0.0', '0.0',
       '0.0', '1.0', '0.0', '0.0', '1.0', '1.0', '0.0', '1.0', '1.0',
       '1.0', '0.0'], dtype='<U32')

In [8]:
# check performance of ml model
print(classification_report(y_test, pred_clf))
print(confusion_matrix(y_test, pred_clf))

              precision    recall  f1-score   support

         0.0       0.93      0.97      0.95       530
         1.0       0.95      0.90      0.93       390

    accuracy                           0.94       920
   macro avg       0.94      0.94      0.94       920
weighted avg       0.94      0.94      0.94       920

[[513  17]
 [ 38 352]]


### Using Decision Tree

In [9]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf.fit(X_train, y_train)
pred_clf = clf.predict(X_test)
pred_clf[:20]

array(['0.0', '1.0', '0.0', '1.0', '0.0', '0.0', '0.0', '0.0', '0.0',
       '0.0', '1.0', '1.0', '0.0', '1.0', '1.0', '0.0', '1.0', '1.0',
       '1.0', '0.0'], dtype='<U32')

In [10]:
# check performance of ml model
print(classification_report(y_test, pred_clf))
print(confusion_matrix(y_test, pred_clf))

              precision    recall  f1-score   support

         0.0       0.92      0.95      0.94       530
         1.0       0.93      0.89      0.91       390

    accuracy                           0.93       920
   macro avg       0.93      0.92      0.92       920
weighted avg       0.93      0.93      0.92       920

[[502  28]
 [ 41 349]]


### using neural network

In [11]:
# using neural network
from sklearn.neural_network import MLPClassifier

In [12]:
clf = MLPClassifier(alpha=1e-5, hidden_layer_sizes=(num_attr, num_attr, num_attr), random_state=1)

# convert X_train array into double in order to use MLPClassifier
X_train = X_train.astype(np.float64)
X_test = X_test.astype(np.float64)


clf.fit(X_train, y_train)
pred_clf = clf.predict(X_test)
pred_clf[:20]

array(['0.0', '0.0', '0.0', '1.0', '0.0', '0.0', '0.0', '0.0', '0.0',
       '1.0', '0.0', '0.0', '0.0', '1.0', '1.0', '0.0', '1.0', '1.0',
       '1.0', '0.0'], dtype='<U3')

In [13]:
# check performance of ml model
print(classification_report(y_test, pred_clf))
print(confusion_matrix(y_test, pred_clf))

              precision    recall  f1-score   support

         0.0       0.92      0.90      0.91       530
         1.0       0.86      0.89      0.88       390

    accuracy                           0.89       920
   macro avg       0.89      0.89      0.89       920
weighted avg       0.90      0.89      0.89       920

[[475  55]
 [ 42 348]]
