In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

import nltk
import pandas
import re
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

In [24]:
dataframe = pandas.read_csv("Phishing_Email.csv")
trainemails, testemails = train_test_split(dataframe, test_size=0.2)

In [25]:
nltk.download('stopwords')
nltk.download('wordnet')
stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nguye\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nguye\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [26]:
trainemails.head()

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
9334,9335,hot penny piay on strong breakout the oi | and...,Phishing Email
11987,11988,URL: http://scriptingnews.userland.com/backiss...,Safe Email
6867,6868,"Hey there,If you're like me, you've tried EVER...",Phishing Email
6491,6492,use Perl Daily Headline MailerUnderground Movi...,Safe Email
11408,11409,sell advertising space on your website did you...,Phishing Email


In [27]:
trainemailstext = trainemails['Email Text'].values.tolist()   # '0' refers to the review text
trainemailstype = trainemails['Email Type'].values.tolist()   # '1' corresponds to Label (1 - positive and 0 - negative)
testemailstext = testemails['Email Text'].values.tolist()
testemailstype = testemails['Email Type'].values.tolist()

In [28]:
trainemailstext

['hot penny piay on strong breakout the oi | and gas advisory now that oil and gas has entered a | ong - term buil market , our specialty in pinpointing the hottest companies of the few remaining undervaiued energy piays has produced soaring returns . emerson oil and gas ( eogi ) is an energy developer in the us " oi | belt " and in canada \' s most highly coveted reservoirs with generating potentia | of miliions per week . breaking news ! ! ! emerson oi | and gas , inc . , ( eogi ) is pieased to announce that the alberta energy & utility board has issued license no . 033 o 206 for the company \' s we | | 11 - 16 - 24 - 2 the acadia project . the acadia project consists of 15 sections in alberta in an area that produces natura | gas from the viking formation , has oil potentia | in the bakken zone and gas potential in the colony and second white specks zones . the viking contains natura | gas in weils around the acadia project and has the potential for 13 bcf gas in the reservoir under

In [29]:
traintext=[]
for i in range(0, len(trainemailstext)):
    review = re.sub('[^a-zA-Z0-9 ]', ' ', str(trainemailstext[i]))
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review if not word in set(stopwords)]
    review = ' '.join(review)
    traintext.append(review)

In [30]:
testtext=[]
for i in range(0, len(testemailstext)):
    review = re.sub('[^a-zA-Z0-9 ]', ' ', str(testemailstext[i]))
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review if not word in set(stopwords)]
    review = ' '.join(review)
    testtext.append(review)

In [31]:
testemailstext

 "URL: http://jeremy.zawodny.com/blog/archives/000225.html\r\nDate: 2002-10-07T12:30:42-08:00For the past week, I've been waiting for one of my FreeBSD servers to reproduce \r\na problem we've been seeing with MySQL + LinuxThreads. This particular machine \r\nis running a custom build of MySQL 4.0.4 (or MySQL 404, as I...\r\n",
 'albert trevino 4 - 12 , tejas pipeline , trevino plant gas sales fyi . . . . . . . . . . . . . . . - - - - - - - - - - - - - - - - - - - - - - forwarded by beverly beaty / hou / ect on 07 / 25 / 2000 07 : 57 am - - - - - - - - - - - - - - - - - - - - - - - - - - - enron capital & trade resources corp . from : " evelyn daniel " 07 / 24 / 2000 04 : 53 pm to : cc : subject : albert trevino 4 - 12 , tejas pipeline , trevino plant gas sales hello : steve holmes said i should e - mail you to let you know the above well availability will change from 2 mmbtu / d to 8000 mmbtu / d effective 7 / 26 / 00 . it was in the process of fractionation . please call me at ( 405 

In [32]:
tf_idf = TfidfVectorizer()

In [33]:
traintext

['hot penny piay strong breakout oi gas advisory oil gas entered ong term buil market specialty pinpointing hottest company remaining undervaiued energy piays produced soaring return emerson oil gas eogi energy developer u oi belt canada highly coveted reservoir generating potentia miliions per week breaking news emerson oi gas inc eogi pieased announce alberta energy utility board issued license 033 206 company 11 16 24 2 acadia project acadia project consists 15 section alberta area produce natura gas viking formation oil potentia bakken zone gas potential colony second white speck zone viking contains natura gas weil around acadia project potential 13 bcf gas reservoir eas gas area caicuiated aof rate 14 mmcf per day project ocated eastern alberta year round access established production equipment infrastructure cost expected 6 oo ooo drilied cased compieted advanced fund go towards dri ing first lease earns emerson 49 working interest one section emerson oil gas inc eogi pleased an

In [34]:
testtext

['use perl daily newsletterin issue new perl monger web site java v perl week perl5 porter 9 15 september 2002 new perl monger web site posted km monday september 16 08 41 group http use perl org article pl sid 02 09 16 1243234 0 davorg writes leon brocard working hard update 1 perl monger web site still going thru process cleaning data perl monger group see something quite right please 2 let u know discus story http use perl org comment pl sid 02 09 16 1243234links 0 mailto dave dave org uk 1 http www pm org 2 mailto user group pm org java v perl posted pudge monday september 16 11 15 java http use perl org article pl sid 02 09 16 1448246 seems older perl get willing people believe suck without reasonable fact 0 davorg writes may seen article 1 java technology beat perl home turf pattern matching large file debate perl comp lang perl misc today one biggest criticism article author published perl code comparing java story continues http use perl org article pl sid 02 09 16 1448246discu

In [35]:
traintf = tf_idf.fit_transform(traintext)

In [36]:
print("n_samples: %d, n_features: %d" % traintf.shape)

n_samples: 14920, n_features: 137927


In [37]:
testtf = tf_idf.transform(testtext)

In [38]:
print("n_samples: %d, n_features: %d" % testtf.shape)

n_samples: 3730, n_features: 137927


In [39]:
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(traintf, trainemailstype)

In [40]:
y_pred = naive_bayes_classifier.predict(testtf)

In [41]:
print(metrics.classification_report(testemailstype, y_pred, target_names=['Positive', 'Negative']))

              precision    recall  f1-score   support

    Positive       0.97      0.84      0.90      1458
    Negative       0.91      0.98      0.94      2272

    accuracy                           0.93      3730
   macro avg       0.94      0.91      0.92      3730
weighted avg       0.93      0.93      0.93      3730



In [42]:
print("Confusion matrix:")
print(metrics.confusion_matrix(testemailstype, y_pred))

Confusion matrix:
[[1230  228]
 [  41 2231]]


## of 3730 test values:
32.9222520107% true positive\
5.92493297587% false positive\
1.20643431635% false negative\
59.9463806971% true negative\
\
A 92.8686327078% detection rate.

In [43]:
oneOff = ['hi michael, please find enclosed vendor banking instructions for a payment that was suppose to go out in the previous week. i need you to process it immediately. i am a bit busy now but will give you a call within the hour regarding the payment. regards, sent from my mobile',
        'dear michael, phishing attacks are a growing threat to online security, but there are steps you can take to protect yourself. here are some best practices to help you stay safe online: keep your software up to date: make sure that your operating system, antivirus software, and other applications are always up to date to reduce your vulnerability to malware. use strong passwords: use strong, unique passwords for each of your online accounts and consider using a password manager to keep them secure. think before you click: always be wary of emails, links and attachments from unknown or suspicious sources. if in doubt, do not click. be cautious of public wi-fi: public wi-fi networks are often secured, so avoid accessing sensitve information when using them. remember, the best defense against phishing attacks is awareness and education. stay vigilant and stay safe online. best regards, hung']

oneOffText=[]
for i in range(0, len(oneOff)):
    review = re.sub('[^a-zA-Z0-9 ]', ' ', str(testemailstext[i]))
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review if not word in set(stopwords)]
    review = ' '.join(review)
    oneOffText.append(review)

In [44]:
# print(testemailstype)

oneOffTf = tf_idf.transform(oneOffText)
print("n_samples: %d, n_features: %d" % oneOffTf.shape)
oneOffPred = naive_bayes_classifier.predict(oneOffTf)
print(oneOffPred)

n_samples: 2, n_features: 137927
['Safe Email' 'Safe Email']
