In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

import nltk
import pandas
import re
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split


In [11]:
dataframe = pandas.read_csv("Phishing_Email.csv")
trainemails, testemails = train_test_split(dataframe, test_size=0.2)

In [17]:
nltk.download('stopwords')
nltk.download('wordnet')
stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mario\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mario\AppData\Roaming\nltk_data...


In [12]:
trainemails.head()

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
2418,2418,"invoice processing as you are aware , the sap ...",Safe Email
1928,1928,Ah yes..Yet another case where 'marriage' is a...,Safe Email
260,260,rachael are you looking for more ? give her so...,Phishing Email
17876,17877,"kitahara review . hisatsugu kitahara , ( 1997 ...",Safe Email
1552,1552,epe lending / cali short for 8 / 29 in the sou...,Safe Email


In [50]:
trainemailstext = trainemails['Email Text'].values.tolist()   # '0' refers to the review text
trainemailstype = trainemails['Email Type'].values.tolist()   # '1' corresponds to Label (1 - positive and 0 - negative)
testemailstext = testemails['Email Text'].values.tolist()
testemailstype = testemails['Email Type'].values.tolist()

In [51]:
trainemailstext

['invoice processing as you are aware , the sap financial systems will be implemented for the enron gas pipeline group , enron north america and enron energy services companies on july 1 , 2000 . to effectively transfer the invoice processing from the legacy msa system to the sap system , the following dates should be followed : june 23 : invoice processing on the msa system will stop except for emergency payments , wire transfers and achs . june 26 : invoice processing on the sap system will begin for the gpg , ena and ees companies . accounts payables will input invoices into the sap system starting on this date , but checks will not be printed until the first payment run is processed on july 5 . june 28 : last day to input employee expense reports with ach in the msa system . june 29 : last day to input emergency payments and wire transfers in the msa system . june 30 : accounts payable to clean up outstanding open balances in the msa system . july 5 : all invoices will be entered i

In [52]:
traintext=[]
for i in range(0, len(trainemailstext)):
    review = re.sub('[^a-zA-Z0-9 ]', ' ', str(trainemailstext[i]))
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review if not word in set(stopwords)]
    review = ' '.join(review)
    traintext.append(review)

In [57]:
testtext=[]
for i in range(0, len(testemailstext)):
    review = re.sub('[^a-zA-Z0-9 ]', ' ', str(testemailstext[i]))
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review if not word in set(stopwords)]
    review = ' '.join(review)
    testtext.append(review)

In [56]:
testemailstext

['commercial adroit bacilli decry less than two hours later , two veiled women - - reportedly the man \' s sister and fiancee - - attacked a tour bus . egyptian police officials and the government - guided al - ahram newspaper said the bus was carrying israeli tourists . after firing on the tour bus , negat yassin then shot and wounded her companion before killing herself . khamis died later of her wounds . officials said they acted in revenge for yassin \' s death . it said police earlier in the day captured two suspects - - ashraf saeed youssef and gamal ahmed abdel aal - - in connection with that attack and were chasing a third , ehab yousri yassin , on a highway overpass when he jumped off , setting off the nail - filled bomb . " we only hope that these attacks do not stand in the way of political reform , " he said in a statement , acknowledging that mubarak had no plans to end emergency law " whether these attacks take place or they don \' t . " two militant groups posted web sta

In [42]:
tf_idf = TfidfVectorizer()

In [54]:
traintext

['invoice processing aware sap financial system implemented enron gas pipeline group enron north america enron energy service company july 1 2000 effectively transfer invoice processing legacy msa system sap system following date followed june 23 invoice processing msa system stop except emergency payment wire transfer achs june 26 invoice processing sap system begin gpg ena ee company account payable input invoice sap system starting date check printed first payment run processed july 5 june 28 last day input employee expense report ach msa system june 29 last day input emergency payment wire transfer msa system june 30 account payable clean outstanding open balance msa system july 5 invoice entered sap first payment run processed please remember invoice sent account payable june 23 must sap general ledger coding applied invoice question please contact larry dallman 713 853 7222 ben gwaltney 713 853 1550 andy zabriskie 713 853 6892 linda martin 713 853 7038',
 'ah yes yet another case

In [58]:
testtext

['commercial adroit bacillus decry le two hour later two veiled woman reportedly man sister fiancee attacked tour bus egyptian police official government guided al ahram newspaper said bus carrying israeli tourist firing tour bus negat yassin shot wounded companion killing khamis died later wound official said acted revenge yassin death said police earlier day captured two suspect ashraf saeed youssef gamal ahmed abdel aal connection attack chasing third ehab yousri yassin highway overpass jumped setting nail filled bomb hope attack stand way political reform said statement acknowledging mubarak plan end emergency law whether attack take place two militant group posted web statement claiming responsibility twin attack mujahedeen egypt abdullah azzam brigade neither claim authenticity could verified woman known carried past attack egypt fouad allam retired general egypt anti terrorism security apparatus said way compare recent attack 1990 led larger organized group since marginalized pr

In [59]:
traintf = tf_idf.fit_transform(traintext)

In [60]:
print("n_samples: %d, n_features: %d" % traintf.shape)

n_samples: 14920, n_features: 138621


In [61]:
testtf = tf_idf.transform(testtext)

In [62]:
print("n_samples: %d, n_features: %d" % testtf.shape)

n_samples: 3730, n_features: 138621


In [63]:
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(traintf, trainemailstype)

MultinomialNB()

In [64]:
y_pred = naive_bayes_classifier.predict(testtf)

In [65]:
print(metrics.classification_report(testemailstype, y_pred, target_names=['Positive', 'Negative']))

              precision    recall  f1-score   support

    Positive       0.96      0.85      0.90      1449
    Negative       0.91      0.98      0.94      2281

    accuracy                           0.93      3730
   macro avg       0.94      0.91      0.92      3730
weighted avg       0.93      0.93      0.93      3730



In [66]:
print("Confusion matrix:")
print(metrics.confusion_matrix(testemailstype, y_pred))

Confusion matrix:
[[1228  221]
 [  45 2236]]


## of 3730 test values:
32.9222520107% true positive\
5.92493297587% false positive\
1.20643431635% false negative\
59.9463806971% true negative\
\
A 92.8686327078% detection rate.