In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Input, LSTM
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [24]:
import joblib
import pickle


In [11]:
# Location of the raw dataset, relative to the notebook's working directory.
file_path = 'data/Phishing_Email.csv'

# Load the whole CSV into memory; later cells clean this frame and pull the
# 'Email Text' / 'Email Type' columns out of it.
phishing_email_df = pd.read_csv(filepath_or_buffer=file_path)


In [12]:
# Remove the 'Unnamed: 0' column (presumably a row index saved into the CSV --
# confirm against the data source) and every row containing a missing value.
#
# This is written as a non-mutating chain with `errors='ignore'` so the cell is
# idempotent: the previous in-place `drop(..., inplace=True)` raised KeyError
# when the cell was executed a second time, because the column had already
# been removed from the shared frame.
phishing_email_df = (
    phishing_email_df
    .drop(columns=['Unnamed: 0'], errors='ignore')
    .dropna()
)


In [26]:
# Preview the first five rows of the cleaned frame (5 is the default for head()).
phishing_email_df.head(n=5)


Unnamed: 0,Email Text,Email Type
0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,the other side of * galicismos * * galicismo *...,Safe Email
2,re : equistar deal tickets are you still avail...,Safe Email
3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,software at incredibly low prices ( 86 % lower...,Phishing Email


In [13]:
# Pull the raw email bodies and their class names out of the frame as arrays:
# `texts` feeds the tokenizer, `labels` feeds the label encoder.
texts, labels = (
    phishing_email_df[column].values
    for column in ('Email Text', 'Email Type')
)


In [14]:
# Build the word -> integer-id vocabulary with no cap on its size
# (num_words=None) and no out-of-vocabulary token (oov_token=None); both are
# the Tokenizer defaults, spelled out here for the reader.
# NOTE(review): the vocabulary is fit on every email, i.e. before the
# train/test split performed later in the notebook -- confirm that is intended.
tokenizer = Tokenizer(num_words=None, oov_token=None)
tokenizer.fit_on_texts(texts)

# Convert each email into its list of integer word ids.
sequences = tokenizer.texts_to_sequences(texts)


In [15]:
# Number of tokens in each tokenized email; used to choose a padding length.
sequence_lengths = list(map(len, sequences))

# Summary statistics of the length distribution.
print(f'Max sequence length: {max(sequence_lengths)}')
print(f'Mean sequence length: {np.mean(sequence_lengths)}')
print(f'Median sequence length: {np.median(sequence_lengths)}')

# Upper-tail percentiles show how many emails a given cut-off would truncate.
for pct in (90, 95, 99):
    print(f'{pct}th percentile sequence length: {np.percentile(sequence_lengths, pct)}')


Max sequence length: 2516361
Mean sequence length: 423.53606311044325
Median sequence length: 140.0
90th percentile sequence length: 598.0
95th percentile sequence length: 970.3499999999985
99th percentile sequence length: 2463.0099999999948


In [15]:
# Fixed model input length. 1000 sits just above the 95th-percentile length
# printed by the statistics cell (~970 tokens).
max_sequence_length = 1000

# Force every sequence to exactly `max_sequence_length` ids. 'pre' is the
# default for both options and is written out for clarity: shorter emails are
# left-padded with zeros, longer ones keep only their final tokens.
data = pad_sequences(
    sequences,
    maxlen=max_sequence_length,
    padding='pre',
    truncating='pre',
)


In [16]:
# Learn the mapping from class-name strings to integer ids, then replace the
# string labels with those ids. LabelEncoder numbers classes in sorted order,
# so the id -> name mapping stays available via `label_encoder.classes_`.
# NOTE(review): `label_encoder` is not persisted alongside the tokenizer later
# in the notebook -- confirm inference code does not need it.
label_encoder = LabelEncoder().fit(labels)
labels = label_encoder.transform(labels)


In [17]:
# Hold out 20% of the padded sequences for final evaluation; the fixed
# random_state makes the shuffle-and-split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=42,
)


In [18]:
def create_mode(vocab_size=None, sequence_length=None, embedding_dim=128,
                lstm_units=128, dropout_rate=0.2):
    """Build the (uncompiled) Embedding -> LSTM -> sigmoid binary classifier.

    Calling ``create_mode()`` with no arguments produces the same architecture
    as before. The parameters only make explicit the values that used to be
    hard-coded or read silently from notebook globals.

    Args:
        vocab_size: Size of the embedding table. Defaults to
            ``len(tokenizer.word_index) + 1`` using the notebook-level
            ``tokenizer`` (the ``+ 1`` accounts for index 0, which the padded
            sequences use as filler).
        sequence_length: Number of token ids per input sample. Defaults to the
            notebook-level ``max_sequence_length``.
        embedding_dim: Width of each word embedding vector.
        lstm_units: Number of LSTM units.
        dropout_rate: Rate applied to both the LSTM input dropout and its
            recurrent dropout.

    Returns:
        A ``Sequential`` model that still has to be compiled by the caller.
    """
    # Resolve defaults at call time so the zero-argument call keeps working
    # against whatever tokenizer / padding length the notebook currently holds.
    if vocab_size is None:
        vocab_size = len(tokenizer.word_index) + 1
    if sequence_length is None:
        sequence_length = max_sequence_length

    return Sequential([
        # The input shape is declared with an explicit Input layer rather than
        # Embedding(input_length=...): that argument is deprecated in Keras 3
        # (which the training log format suggests is in use -- confirm).
        Input(shape=(sequence_length,)),
        Embedding(input_dim=vocab_size, output_dim=embedding_dim),
        LSTM(units=lstm_units, dropout=dropout_rate, recurrent_dropout=dropout_rate),
        # Single sigmoid unit: one probability per email for the binary task.
        Dense(1, activation='sigmoid'),
    ])


In [19]:
# Build and compile the classifier for binary classification.
model = create_mode()
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 5 epochs; 20% of the training set is used for validation
# (validation_split=0.2).
model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=5,
    validation_split=0.2,
)

# Score on the held-out test split and report accuracy as a percentage.
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print(f'Test Accuracy: {accuracy * 100:.2f}%')


Epoch 1/5




[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m165s[0m 876ms/step - accuracy: 0.7951 - loss: 0.4161 - val_accuracy: 0.9557 - val_loss: 0.1120
Epoch 2/5
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m163s[0m 873ms/step - accuracy: 0.9756 - loss: 0.0706 - val_accuracy: 0.9564 - val_loss: 0.1295
Epoch 3/5
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m161s[0m 863ms/step - accuracy: 0.9782 - loss: 0.0587 - val_accuracy: 0.9655 - val_loss: 0.0887
Epoch 4/5
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m163s[0m 874ms/step - accuracy: 0.9859 - loss: 0.0305 - val_accuracy: 0.9621 - val_loss: 0.0903
Epoch 5/5
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m163s[0m 870ms/step - accuracy: 0.9890 - loss: 0.0280 - val_accuracy: 0.9608 - val_loss: 0.1038
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 160ms/step - accuracy: 0.9631 - loss: 0.1215
Test Accuracy: 95.95%


In [22]:
# Persist the trained model next to the notebook in the `.keras` format.
model.save(filepath='phishing_detection_model.keras')


In [25]:
# Persist the fitted tokenizer so the same word -> id mapping can be reloaded
# later. Only unpickle this file from a trusted location: loading a pickle
# executes arbitrary code.
with open('tokenizer.pkl', mode='wb') as handle:
    pickler = pickle.Pickler(handle, protocol=pickle.HIGHEST_PROTOCOL)
    pickler.dump(tokenizer)
