In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

import nltk
import pandas
import re
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

In [24]:
dataframe = pandas.read_csv("Phishing_Email.csv")
trainemails, testemails = train_test_split(dataframe, test_size=0.2)

In [25]:
nltk.download('stopwords')
nltk.download('wordnet')
stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nguye\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nguye\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [26]:
trainemails.head()

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
9334,9335,hot penny piay on strong breakout the oi | and...,Phishing Email
11987,11988,URL: http://scriptingnews.userland.com/backiss...,Safe Email
6867,6868,"Hey there,If you're like me, you've tried EVER...",Phishing Email
6491,6492,use Perl Daily Headline MailerUnderground Movi...,Safe Email
11408,11409,sell advertising space on your website did you...,Phishing Email


In [27]:
trainemailstext = trainemails['Email Text'].values.tolist()
trainemailstype = trainemails['Email Type'].values.tolist()
testemailstext = testemails['Email Text'].values.tolist()
testemailstype = testemails['Email Type'].values.tolist()

In [None]:
trainemailstext

In [29]:
#Clean the Train data
traintext=[]
for i in range(0, len(trainemailstext)):
    review = re.sub('[^a-zA-Z0-9 ]', ' ', str(trainemailstext[i]))
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review if not word in set(stopwords)]
    review = ' '.join(review)
    traintext.append(review)

In [30]:
#Clean the Test data
testtext=[]
for i in range(0, len(testemailstext)):
    review = re.sub('[^a-zA-Z0-9 ]', ' ', str(testemailstext[i]))
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review if not word in set(stopwords)]
    review = ' '.join(review)
    testtext.append(review)

In [42]:
tf_idf = TfidfVectorizer()

In [None]:
#Test calls to test output
testemailstext

In [None]:
#Test calls to test output
traintext

In [None]:
#Test calls to test output
testtext

In [35]:
traintf = tf_idf.fit_transform(traintext)

In [36]:
print("n_samples: %d, n_features: %d" % traintf.shape)

n_samples: 14920, n_features: 137927


In [37]:
#Transform tf_idf into input for bayes
testtf = tf_idf.transform(testtext)

In [38]:
print("n_samples: %d, n_features: %d" % testtf.shape)

n_samples: 3730, n_features: 137927


In [39]:
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(traintf, trainemailstype)

In [40]:
y_pred = naive_bayes_classifier.predict(testtf)

In [41]:
#print out metrics
print(metrics.classification_report(testemailstype, y_pred, target_names=['Positive', 'Negative']))

              precision    recall  f1-score   support

    Positive       0.97      0.84      0.90      1458
    Negative       0.91      0.98      0.94      2272

    accuracy                           0.93      3730
   macro avg       0.94      0.91      0.92      3730
weighted avg       0.93      0.93      0.93      3730



In [42]:
#Confusion matrix
print("Confusion matrix:")
print(metrics.confusion_matrix(testemailstype, y_pred))

Confusion matrix:
[[1230  228]
 [  41 2231]]


## of 3730 test values:
32.9222520107% true positive\
5.92493297587% false positive\
1.20643431635% false negative\
59.9463806971% true negative\
\
A 92.8686327078% detection rate.

In [43]:
oneOff = ['hi michael, please find enclosed vendor banking instructions for a payment that was suppose to go out in the previous week. i need you to process it immediately. i am a bit busy now but will give you a call within the hour regarding the payment. regards, sent from my mobile',
        'dear michael, phishing attacks are a growing threat to online security, but there are steps you can take to protect yourself. here are some best practices to help you stay safe online: keep your software up to date: make sure that your operating system, antivirus software, and other applications are always up to date to reduce your vulnerability to malware. use strong passwords: use strong, unique passwords for each of your online accounts and consider using a password manager to keep them secure. think before you click: always be wary of emails, links and attachments from unknown or suspicious sources. if in doubt, do not click. be cautious of public wi-fi: public wi-fi networks are often secured, so avoid accessing sensitve information when using them. remember, the best defense against phishing attacks is awareness and education. stay vigilant and stay safe online. best regards, hung']

oneOffText=[]
for i in range(0, len(oneOff)):
    review = re.sub('[^a-zA-Z0-9 ]', ' ', str(testemailstext[i]))
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review if not word in set(stopwords)]
    review = ' '.join(review)
    oneOffText.append(review)

In [44]:
# print(testemailstype)

oneOffTf = tf_idf.transform(oneOffText)
print("n_samples: %d, n_features: %d" % oneOffTf.shape)
oneOffPred = naive_bayes_classifier.predict(oneOffTf)
print(oneOffPred)

n_samples: 2, n_features: 137927
['Safe Email' 'Safe Email']
