# Model

In [1]:
#type: ignore
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
# Load the raw e-mail dataset; the path is relative to the notebook's working directory.
data = pd.read_csv("./datasets/phishing_email.csv")

In [3]:
# Lift pandas' column-width limit so complete e-mail bodies are visible when a frame is displayed.
pd.set_option('display.max_colwidth', None)

In [4]:
# "Unnamed: 0" only repeats the row number (apparently an index column saved with the CSV),
# so it carries no signal. errors="ignore" keeps this cell re-runnable: without it a second
# execution raises KeyError because the column has already been removed.
data = data.drop(columns="Unnamed: 0", errors="ignore")
data

Unnamed: 0,Email Text,Email Type
0,"re : 6 . 1100 , disc : uniformitarianism , re : 1086 ; sex / lang dick hudson 's observations on us use of 's on ' but not 'd aughter ' as a vocative are very thought-provoking , but i am not sure that it is fair to attribute this to "" sons "" being "" treated like senior relatives "" . for one thing , we do n't normally use ' brother ' in this way any more than we do 'd aughter ' , and it is hard to imagine a natural class comprising senior relatives and 's on ' but excluding ' brother ' . for another , there seem to me to be differences here . if i am not imagining a distinction that is not there , it seems to me that the senior relative terms are used in a wider variety of contexts , e . g . , calling out from a distance to get someone 's attention , and hence at the beginning of an utterance , whereas 's on ' seems more natural in utterances like ' yes , son ' , ' hand me that , son ' than in ones like ' son ! ' or ' son , help me ! ' ( although perhaps these latter ones are not completely impossible ) . alexis mr",Safe Email
1,"the other side of * galicismos * * galicismo * is a spanish term which names the improper introduction of french words which are spanish sounding and thus very deceptive to the ear . * galicismo * is often considered to be a * barbarismo * . what would be the term which designates the opposite phenomenon , that is unlawful words of spanish origin which may have crept into french ? can someone provide examples ? thank you joseph m kozono < kozonoj @ gunet . georgetown . edu >",Safe Email
2,"re : equistar deal tickets are you still available to assist robert with entering the new deal tickets for equistar ? after talking with bryan hull and anita luong , kyle and i decided we only need 1 additional sale ticket and 1 additional buyback ticket set up . - - - - - - - - - - - - - - - - - - - - - - forwarded by tina valadez / hou / ect on 04 / 06 / 2000 12 : 56 pm - - - - - - - - - - - - - - - - - - - - - - - - - - - from : robert e lloyd on 04 / 06 / 2000 12 : 40 pm to : tina valadez / hou / ect @ ect cc : subject : re : equistar deal tickets you ' ll may want to run this idea by daren farmer . i don ' t normally add tickets into sitara . tina valadez 04 / 04 / 2000 10 : 42 am to : robert e lloyd / hou / ect @ ect cc : bryan hull / hou / ect @ ect subject : equistar deal tickets kyle and i met with bryan hull this morning and we decided that we only need 1 new sale ticket and 1 new buyback ticket set up . the time period for both tickets should be july 1999 - forward . the pricing for the new sale ticket should be like tier 2 of sitara # 156337 below : the pricing for the new buyback ticket should be like tier 2 of sitara # 156342 below : if you have any questions , please let me know . thanks , tina valadez 3 - 7548",Safe Email
3,"\nHello I am your hot lil horny toy.\n I am the one you dream About,\n I am a very open minded person,\n Love to talk about and any subject.\n Fantasy is my way of life, \n Ultimate in sex play. Ummmmmmmmmmmmmm\n I am Wet and ready for you. It is not your looks but your imagination that matters most,\n With My sexy voice I can make your dream come true...\n \n Hurry Up! call me let me Cummmmm for you..........................\nTOLL-FREE: 1-877-451-TEEN (1-877-451-8336)For phone billing: 1-900-993-2582\n-- \n_______________________________________________\nSign-up for your own FREE Personalized E-mail at Mail.com\nhttp://www.mail.com/?sr=signup",Phishing Email
4,"software at incredibly low prices ( 86 % lower ) . drapery seventeen term represent any sing . feet wild break able build . tail , send subtract represent . job cow student inch gave . let still warm , family draw , land book . glass plan include . sentence is , hat silent nothing . order , wild famous long their . inch such , saw , person , save . face , especially sentence science . certain , cry does . two depend yes , written carry .",Phishing Email
...,...,...
18645,date a lonely housewife always wanted to date a lonely housewife ? well this is your chance . thousands of lonely housewifes are waiting for you at cheating housewife personals . go here : www . liveadulthosting . biz / lm / signup . php,Phishing Email
18646,request submitted : access request for anita . dupont @ enron . com you have received this email because you are listed as an alternate data approver . please click approval to review and act upon this request . request id : 000000000012735 approver : stinson . gibner @ enron . com request create date : 1 / 8 / 01 4 : 26 : 26 pm requested for : anita . dupont @ enron . com resource name : \ \ enehou \ houston \ common \ research - [ read / write ] resource type : directory,Safe Email
18647,"re : important - prc mtg hi dorn & john , as you discovered recently , i am still ' officially ' in vince kaminski ' s group ( my original enron corp . group ) . this holds true for shalesh ganjoo as well . i did not explicitly pick dorn or john as reviewers thinking that they will show up automatically as a result of my assumed reporting structure . so , vince has agreed to ' host ' the review in his group and proceed to transfer me over to ebs officially when this quarter is overs ( apprently that was scheduled to be automatic ) . in the mean time , vasant , stinson or vince would like to get a e - mail from either dorn or john regarding my performance from their perspective for consideration as soon as possible . i had plan on being on vacation starting tomorrow and have made arrangement with my family already . since i am not reviewing shalesh directly ( since he is in research under stionson ) , i am assuming i don ' t have to attend the review meetin tommorrow . i ' ll be on the road starting in the morning . if i change this , i am told at home , that i will be in the market for another family ! i can phone in if that is okay . kayla , could you page me with the details ? regards , ravi .",Safe Email
18648,"press clippings - letter on californian utilities please find attached the following article : ' californian utilities ' - financial times ( also sent to california distribution team ) kind regards , kuldeep chana",Safe Email


In [5]:
def clean_text(text):
	"""Normalise a raw e-mail body into lowercase, letters-only text.

	Args:
		text: The e-mail body. Non-string values are converted with ``str``;
			missing values (``None`` or a float NaN) are treated as empty.

	Returns:
		str: The lowercased text with every character other than ASCII
		letters and whitespace removed, and newlines replaced by spaces.
	"""
	# A missing cell must not be stringified: str(nan) == "nan" and str(None) == "None"
	# would otherwise inject spurious words into the vocabulary.
	if text is None or (isinstance(text, float) and text != text):
		return ''

	text = str(text).lower()
	# Strip digits, punctuation and symbols; whitespace is kept so word boundaries survive.
	text = re.sub(r'[^A-Za-z\s]', '', text)
	# Flatten the message onto one line.
	text = re.sub(r'\n', ' ', text)
	return text

# Switch to snake_case column names, then clean every body and encode the label
# (Safe Email -> 0, Phishing Email -> 1) in a single pass.
data = data.rename(columns={"Email Text": "email_content", "Email Type": "email_type"})
data = data.assign(
	email_content=data["email_content"].apply(clean_text),
	email_type=data["email_type"].map({'Safe Email': 0, 'Phishing Email': 1}),
)

# Discard rows whose cleaned body is nothing but the word "empty".
data = data[data["email_content"].str.strip().ne("empty")]

In [6]:
# Cleaned bodies are the features, the 0/1 label is the target; 20% is held out for evaluation.
X, y = data['email_content'], data['email_type']
X_train, X_test, y_train, y_test = train_test_split(
	X,
	y,
	test_size=0.2,
	random_state=42,  # fixed seed so the split is the same on every run
)

In [7]:
# The vocabulary and IDF weights are learned from the training split only;
# the test split is merely projected onto them.
vectorizer = TfidfVectorizer(
	max_features=5000,
	stop_words='english',
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [8]:
# Candidate classifiers, each constructed with the library's default hyper-parameters.
# NOTE(review): RandomForestClassifier and GradientBoostingClassifier are built without a
# random_state, so their scores presumably differ between runs — consider seeding them.
nb_model = MultinomialNB()
lr_model = LogisticRegression()
rf_model = RandomForestClassifier()
svm_model = SVC()
gb_model = GradientBoostingClassifier()

In [9]:
def test_model(model):
	"""Fit ``model`` on the training matrix and print its accuracy on the test split.

	Uses the notebook-level ``X_train_tfidf``, ``y_train``, ``X_test_tfidf`` and
	``y_test``. The model is fitted in place, so it is ready for prediction afterwards.

	Args:
		model: An estimator exposing ``fit(X, y)`` and ``predict(X)``.
	"""
	model.fit(X_train_tfidf, y_train)
	y_pred = model.predict(X_test_tfidf)

	# Take the label from the class itself. Slicing the repr of a bound method only
	# works for estimators built with default arguments and requires the estimator
	# to expose get_metadata_routing.
	model_name = type(model).__name__
	print(f'{model_name} accuracy: {accuracy_score(y_test, y_pred)}')


In [10]:
# Only the SVM is fitted and scored here; uncomment a line below to evaluate another candidate.
# test_model(nb_model)
# test_model(lr_model)
# test_model(rf_model)
test_model(svm_model)
# test_model(gb_model)

SVC accuracy: 0.9804083885209713


In [24]:
def test_model_email(model, email_text=None):
	"""Classify a single e-mail with a fitted ``model`` and print the verdict.

	The text goes through the same ``clean_text`` + ``vectorizer`` pipeline that
	was used for training (both are notebook-level names).

	Args:
		model: A fitted estimator exposing ``predict(X)``.
		email_text: The e-mail body to classify. When omitted, a built-in
			sample message (an AWS marketing e-mail) is used.
	"""
	if email_text is None:
		email_text = """See how companies are moving fast and confidently with secure gen AI Join us at AWS Innovate to discover how Amazon Web Services (AWS) can help you harness the full potential of generative AI and data. Learn about the tools and infrastructure needed to build and scale gen AI, how to make your data AI-ready, and create gen AI applications that drive real business value. See how customers are using the most comprehensive set of AI and data services on AWS to innovate faster and build a competitive advantage with gen AI."""

	features = vectorizer.transform([clean_text(email_text)])

	# Take the label from the class itself; parsing a bound-method repr breaks for
	# estimators constructed with non-default arguments.
	model_name = type(model).__name__
	# A truthy prediction is label 1, i.e. phishing.
	verdict = "phishing" if model.predict(features)[0] else "safe"
	print(f"{model_name} detected email as {verdict}")

In [25]:
# Only the SVM is checked against the sample e-mail; the other candidates must be fitted
# first (see the evaluation cell above) before their lines are uncommented.
# test_model_email(nb_model)
# test_model_email(lr_model)
# test_model_email(rf_model)
test_model_email(svm_model)

SVC detected email as safe


In [26]:
import os

import joblib

# Persist the fitted classifier together with the vectorizer it was trained with;
# both are needed to classify new text later.
# joblib.dump does not create missing directories, so make sure the target exists.
os.makedirs("saved_data", exist_ok=True)
joblib.dump(svm_model, "saved_data/svm_model.pkl")
joblib.dump(vectorizer, "saved_data/tfidf_vectorizer.pkl")
print("Done!")

Done!


# Server

In [14]:
import socket
import signal

def handle_interrupt(signum, frame):
	"""SIGINT handler: announce the interruption and unwind the serving loop.

	Raising is required: if the handler simply returns, the blocking
	``accept``/``recv`` call is retried and the server keeps running.
	"""
	print("Cell execution interrupted!")
	raise KeyboardInterrupt

sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) # socket.AF_INET->ipv4, socket.SOCK_STREAM->tcp connection

# Remember the handler that was active so it can be put back once the server stops.
previous_handler = signal.signal(signal.SIGINT, handle_interrupt)

try:
	# Allow the cell to be re-run right away without "Address already in use".
	sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
	# NOTE(review): "" binds every interface and the service has no authentication;
	# bind to "127.0.0.1" if only local clients are expected.
	sock.bind(("", 3001)) # ""->all interfaces, 3001->port
	sock.listen(5) # backlog of pending connections

	while True:
		conn, addr = sock.accept() # conn->new socket object for this client
		# One request per connection; the context manager closes it on every path.
		with conn:
			print ('connected:', addr)

			payload = conn.recv(4096)
			if not payload:
				# Client connected and left without sending anything: keep serving.
				continue
			# Undecodable bytes become U+FFFD, which clean_text strips anyway.
			message = payload.decode("utf-8", errors="replace")
			if message == "exit":
				# Explicit shutdown request from a client.
				break

			# Local names are used on purpose so the `data` DataFrame is not overwritten.
			features = vectorizer.transform([clean_text(message)])

			# The reply strings are what clients receive on the wire; spelling kept as-is.
			if svm_model.predict(features)[0]:
				conn.sendall(b"pishing")
			else:
				conn.sendall(b"safe")
except KeyboardInterrupt:
	# Already reported by handle_interrupt; fall through to cleanup.
	pass
finally:
	sock.close()
	if previous_handler is not None:
		signal.signal(signal.SIGINT, previous_handler)