In [1]:
''' This Jupyter Notebook creates a Multinomial Naive Bayes Model for Emailing Phishing Detection.'''

import pandas # reading in the dataset
import numpy as np # matrix creation
import string # obtaining accepted punctuation types

from sklearn import metrics # metrics displaying
from sklearn.naive_bayes import MultinomialNB # chosen model
from sklearn.model_selection import train_test_split # spliting dataset

In [2]:
# read in CSV dataset into a dataframe and create training and testing datasets for our model
dataframe = pandas.read_csv("Phishing_Email.csv")

# trainemails = training dataset
# testemails = testing dataset
trainemails, testemails = train_test_split(dataframe, test_size=0.2)

In [3]:
dataframe.head()

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,1,the other side of * galicismos * * galicismo *...,Safe Email
2,2,re : equistar deal tickets are you still avail...,Safe Email
3,3,\r\nHello I am your hot lil horny toy.\r\n ...,Phishing Email
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email


In [4]:
# separate email text from their labels
trainemailstext = trainemails['Email Text'].values.tolist()   # '0' refers to the review text
trainemailstype = trainemails['Email Type'].values.tolist()   # '1' corresponds to Label (1 - positive and 0 - negative)

testemailstext = testemails['Email Text'].values.tolist()
testemailstype = testemails['Email Type'].values.tolist()

In [5]:
# there are 32 acceptable punctuation types
print(string.punctuation)
print(len(string.punctuation))

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
32


In [16]:
# filter training text to only include uses of punctuation
traintext=[]
for i in range(0, len(trainemailstext)):
    punctuation = []
    for char in str(trainemailstext[i]):
        if char in string.punctuation:
            punctuation.append(char)
    review = ' '.join(punctuation)
    traintext.append(review)

# example view of new training text
print(traintext[0])

- - - - - - ? . . . . . . . . . . . . . . $ . . . . . . . . . . . . . . . . . . . . . . . $ . . . . $ . . . . . . . . . . . . . $ . . . - - - - - -


In [7]:
# filter testing text to only include uses of punctuation
testtext=[]
for i in range(0, len(testemailstext)):
    punctuation = []
    for char in str(testemailstext[i]):
        if char in string.punctuation:
            punctuation.append(char)
    review = ' '.join(punctuation)
    testtext.append(review)

In [8]:
# create feature matrix for each training email

# each row is an email
# each column corresponds to one type of punctuation
train_feature_matrix = []
for text in traintext:
    counts = [0 for i in range(len(string.punctuation))]
    for char in text:
        for i in range(len(string.punctuation)):
            if char == string.punctuation[i]:
                counts[i] += 1
    train_feature_matrix.append(counts)
train_feature_matrix = np.array(train_feature_matrix)

# test print of first training email punctuation counts
print(train_feature_matrix[0])

[ 0  0  0  4  0  0  0  0  0  0  0  0 12 57  0  0  0  0  0  0  1  0  0  0
  0  0  0  0  0  0  0  0]


In [9]:
# create feature matrix for each training email

test_feature_matrix = []
for text in testtext:
    counts = [0 for i in range(len(string.punctuation))]
    for char in text:
        for i in range(len(string.punctuation)):
            if char == string.punctuation[i]:
                counts[i] += 1
    test_feature_matrix.append(counts)
test_feature_matrix = np.array(test_feature_matrix)

In [10]:
# create Multinomial Naive Bayes model, train the model on our training matrix and labels

naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(train_feature_matrix, trainemailstype)

In [11]:
# test predictions of the model
y_pred = naive_bayes_classifier.predict(test_feature_matrix)
print(y_pred)

['Phishing Email' 'Safe Email' 'Safe Email' ... 'Safe Email'
 'Phishing Email' 'Safe Email']


In [12]:
# print metrics report and confusion matrix

'''
Confusion Matrix Legend:
|TruePositive FalseNegative|
|FalsePositive TrueNegative|

'''

print(metrics.classification_report(testemailstype, y_pred, target_names=['Positive', 'Negative']))
print("Confusion Matrix:")
print(metrics.confusion_matrix(testemailstype, y_pred))

              precision    recall  f1-score   support

    Positive       0.74      0.62      0.68      1453
    Negative       0.78      0.86      0.82      2277

    accuracy                           0.77      3730
   macro avg       0.76      0.74      0.75      3730
weighted avg       0.77      0.77      0.76      3730

Confusion matrix:
[[ 901  552]
 [ 312 1965]]


In [13]:

results = [0,0,0,0]
for i in range(len(testemailstype)):
    if testemailstype[i] == y_pred[i] and testemailstype[i] == 'Phishing Email':
        results[0] += 1 #true positive
    elif testemailstype[i] == y_pred[i] and testemailstype[i] == 'Safe Email':
        results[1] += 1 #true negative
    elif testemailstype[i] != y_pred[i] and testemailstype[i] == 'Phishing Email':
        results[2] += 1 #false negative
    else:
        results[3] += 1 #false positive
print(results)

# print test results as percentages
for total in results:
    print(total/len(testemailstype) * 100)

[901, 1965, 552, 312]
24.15549597855228
52.68096514745309
14.798927613941018
8.36461126005362


## Of 3730 test values:
24.15549597855228% True Positive\
52.68096514745309% True Negative\
14.798927613941018% False Negative\
8.36461126005362% False Positive\
\
76.83646113% Correct Detection

In [14]:
# one-off feature creation

oneOff = ['hi michael, please find enclosed vendor banking instructions for a payment that was suppose to go out in the previous week. i need you to process it immediately. i am a bit busy now but will give you a call within the hour regarding the payment. regards, sent from my mobile',
        'dear michael, phishing attacks are a growing threat to online security, but there are steps you can take to protect yourself. here are some best practices to help you stay safe online: keep your software up to date: make sure that your operating system, antivirus software, and other applications are always up to date to reduce your vulnerability to malware. use strong passwords: use strong, unique passwords for each of your online accounts and consider using a password manager to keep them secure. think before you click: always be wary of emails, links and attachments from unknown or suspicious sources. if in doubt, do not click. be cautious of public wi-fi: public wi-fi networks are often secured, so avoid accessing sensitve information when using them. remember, the best defense against phishing attacks is awareness and education. stay vigilant and stay safe online. best regards, hung']

oneOff_feature_matrix = []
for text in oneOff:
    counts = [0 for i in range(len(string.punctuation))]
    for char in text:
        for i in range(len(string.punctuation)):
            if char == string.punctuation[i]:
                counts[i] += 1
    oneOff_feature_matrix.append(counts)
oneOff_feature_matrix = np.array(oneOff_feature_matrix)

print(len(oneOff_feature_matrix))

2


In [15]:
# one-off testing and results

oneOff_Y = naive_bayes_classifier.predict(oneOff_feature_matrix)

print(oneOff_Y)

['Safe Email' 'Safe Email']
