In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

import nltk
import pandas
import re
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

In [2]:
dataframe = pandas.read_csv("Phishing_Email.csv")
trainemails, testemails = train_test_split(dataframe, test_size=0.2)

In [3]:
nltk.download('stopwords')
nltk.download('wordnet')
stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nguye\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nguye\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
trainemails.head()

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
9549,9550,power trading hi folks : very glad to hear abo...,Safe Email
15151,15152,the only solution to penis growth limited time...,Phishing Email
18171,18172,Take Control Of Your Conference Calls\r\nCryst...,Phishing Email
14402,14403,its not too late just following up with you ab...,Phishing Email
4100,4100,"On Mon, 2002-08-12 at 19:56, Kenn Humborg wrot...",Safe Email


In [5]:
trainemailstext = trainemails['Email Text'].values.tolist()   # '0' refers to the review text
trainemailstype = trainemails['Email Type'].values.tolist()   # '1' corresponds to Label (1 - positive and 0 - negative)
testemailstext = testemails['Email Text'].values.tolist()
testemailstype = testemails['Email Type'].values.tolist()

In [6]:
trainemailstext

["power trading hi folks : very glad to hear about the new developments . just to recap what we discussed this morning about different things you need to look into to set up trading operations and the contacts : 1 . licence to trade : regulatory people : i guess you know about this part better than me . 2 . trading & risk mgmt : global risk mgmt oversight : john sherrif in london has the overall responsibility outside western hemisphere . research group can help with the structuring models used for trading . 3 . risk conrols : before trading group is operational , it needs to get the authorization from the board of directors of enron along with total position limits and possibly value @ risk limits . these limits are typically by commodity type and by region . risk assessment & control ( rac ) under rick buy performs the internal control function ensuring that the businesses adhere to these trading limits . ted murphy is the vp in rac overseeing the trading operations . the value @ ris

In [7]:
traintext=[]
for i in range(0, len(trainemailstext)):
    review = re.sub('[^a-zA-Z0-9 ]', ' ', str(trainemailstext[i]))
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review if not word in set(stopwords)]
    review = ' '.join(review)
    traintext.append(review)

In [8]:
testtext=[]
for i in range(0, len(testemailstext)):
    review = re.sub('[^a-zA-Z0-9 ]', ' ', str(testemailstext[i]))
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review if not word in set(stopwords)]
    review = ' '.join(review)
    testtext.append(review)

In [9]:
testemailstext

['empty',
 "query : different in one of the novels by ( walter m . ) greeley , i noted _ it 's not much different a life than that of the students _ . in many ways this just confirms the status of _ different _ as a crypto-comparative : cf . * it 's not easy a life , ok it 's not much easier a life than . . . however , what are native speakers ' intuitions about this when _ different _ is biased towards being interpreted as an ' ordinary ' adjective : ( i ) ? it 's entirely different a life from that of the students cf . ( ii ) * ? it 's not entirely novel a notion for me and what about : ( iii ) ? it 's not entirely separate a notion from that of greed or , for those speakers who accept ( iv ) , ( iv ) it 's not much easier of a life than that of the students , what about : ( v ) ? it 's not much different of a life than that of the students i also feel unsure about _ other _ : ( vi ) ? * it 's not ( much ) other a life than . . . ( vii ) ? it 's no other a life than . . . ( viii ) ? 

In [10]:
tf_idf = TfidfVectorizer()

In [11]:
traintext

['power trading hi folk glad hear new development recap discussed morning different thing need look set trading operation contact 1 licence trade regulatory people guess know part better 2 trading risk mgmt global risk mgmt oversight john sherrif london overall responsibility outside western hemisphere research group help structuring model used trading 3 risk conrols trading group operational need get authorization board director enron along total position limit possibly value risk limit limit typically commodity type region risk assessment control rac rick buy performs internal control function ensuring business adhere trading limit ted murphy vp rac overseeing trading operation value risk model come research group day day monitoring number done ted group 4 credit risk credit reserve number computed deal model created research transferred credit group bill bradford also rac basically like buying insurance corp couterparty default loss covered reserve pool credit dept 5 legal name give

In [12]:
testtext

['empty',
 'query different one novel walter greeley noted much different life student many way confirms status different crypto comparative cf easy life ok much easier life however native speaker intuition different biased towards interpreted ordinary adjective entirely different life student cf ii entirely novel notion iii entirely separate notion greed speaker accept iv iv much easier life student v much different life student also feel unsure vi much life vii life viii life let alone vi viii life frits',
 'impotency embarressing issue buy online cater cattleman optimistic detestation cominform flight petulant beatitude limp refrigerate weaponry sidelong aureomycin ppm bequeath desicate design recession assemblage accent babel desuetude shrine reel persuade debtor abound commutate crusty latin housewife sightsee amp pluton dispensary antiquity intricacy expository canoga liquefaction faust landowner columbia phycomycetes extemporaneous bialystok linkage valedictory metabolic balfour

In [13]:
traintf = tf_idf.fit_transform(traintext)

In [14]:
print("n_samples: %d, n_features: %d" % traintf.shape)

n_samples: 14920, n_features: 140298


In [15]:
testtf = tf_idf.transform(testtext)

In [16]:
print("n_samples: %d, n_features: %d" % testtf.shape)

n_samples: 3730, n_features: 140298


In [17]:
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(traintf, trainemailstype)

In [18]:
y_pred = naive_bayes_classifier.predict(testtf)

In [19]:
print(metrics.classification_report(testemailstype, y_pred, target_names=['Positive', 'Negative']))

              precision    recall  f1-score   support

    Positive       0.96      0.84      0.90      1481
    Negative       0.90      0.98      0.94      2249

    accuracy                           0.92      3730
   macro avg       0.93      0.91      0.92      3730
weighted avg       0.93      0.92      0.92      3730



In [20]:
print("Confusion matrix:")
print(metrics.confusion_matrix(testemailstype, y_pred))

Confusion matrix:
[[1244  237]
 [  49 2200]]


## of 3730 test values:
32.9222520107% true positive\
5.92493297587% false positive\
1.20643431635% false negative\
59.9463806971% true negative\
\
A 92.8686327078% detection rate.

In [28]:
oneOff = ['hi michael, please find enclosed vendor banking instructions for a payment that was suppose to go out in the previous week. i need you to process it immediately. i am a bit busy now but will give you a call within the hour regarding the payment. regards, sent from my mobile',
        'dear michael, phishing attacks are a growing threat to online security, but there are steps you can take to protect yourself. here are some best practices to help you stay safe online: keep your software up to date: make sure that your operating system, antivirus software, and other applications are always up to date to reduce your vulnerability to malware. use strong passwords: use strong, unique passwords for each of your online accounts and consider using a password manager to keep them secure. think before you click: always be wary of emails, links and attachments from unknown or suspicious sources. if in doubt, do not click. be cautious of public wi-fi: public wi-fi networks are often secured, so avoid accessing sensitve information when using them. remember, the best defense against phishing attacks is awareness and education. stay vigilant and stay safe online. best regards, hung']
oneOffType = ['Phishing Email', 'Safe Email']
# print(testemailstype)

oneOffTf = tf_idf.fit_transform(oneOff)
print("n_samples: %d, n_features: %d" % oneOffTf.shape)
naive_bayes_classifier.fit(oneOffTf, oneOffType)
oneOffPred = naive_bayes_classifier.predict(oneOffTf)
print(oneOffPred)

n_samples: 2, n_features: 129
['Phishing Email' 'Safe Email']
