In [5]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout

# Load the pilot dataset from Excel
pilot_dataset_file = "hand_coded_dataset.xlsx"
pilot_df = pd.read_excel(pilot_dataset_file)

# Shuffle the data to avoid any bias in the ordering
pilot_df = pilot_df.sample(frac=1).reset_index(drop=True)

# Split data into features and labels
X_text_features = pilot_df[["subject", "sender", "sender_email", "body", "phishing"]]
y = pilot_df["persuation"].values

# Combine text features into a single column
X = X_text_features.apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

# Tokenize the text data
max_words = 10000  # Maximum number of words to keep in the vocabulary
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X)

# Convert text data to sequences of integers
X_sequences = tokenizer.texts_to_sequences(X)

# Pad sequences to make them of the same length
max_sequence_length = 100  # Maximum length of sequences
X_padded = pad_sequences(X_sequences, maxlen=max_sequence_length, padding="post")

# Define the BiLSTM model
embedding_dim = 100  # Dimensionality of word embeddings
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_sequence_length))
model.add(Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(32, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(1, activation="sigmoid"))

# Compile the model
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Split data into training and test sets
test_size = 0.2
split_idx = int(len(X_padded) * (1 - test_size))
X_train, X_test = X_padded[:split_idx], X_padded[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

# Train the model
epochs = 10
batch_size = 32
model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2)

# Evaluate the model on the test data
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.8192, Test Accuracy: 0.8203


In [13]:
large_dataset_file = "../Data_Collection/final_data.csv"
large_dataset = pd.read_csv(large_dataset_file, encoding="ISO-8859-1")

# Preprocess the large dataset
X_large_text_features = large_dataset[["subject", "sender", "sender_email", "body", "phishing"]]
X_large = X_large_text_features.apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
X_large_sequences = tokenizer.texts_to_sequences(X_large)
X_large_padded = pad_sequences(X_large_sequences, maxlen=max_sequence_length, padding="post")

# Predict on the large dataset
threshold = 0.5
predictions = model.predict(X_large_padded)
predicted_labels = (predictions >= threshold).astype(int)

# Add the predicted labels to the large dataset DataFrame
large_dataset["persuation"] = predicted_labels

# Save the labeled dataset to a new Excel file
labeled_dataset_file = "labeled_dataset.xlsx"
large_dataset.to_excel(labeled_dataset_file, index=False)

# Display the labeled dataset
print(large_dataset)


                                                 subject  \
0                                 Re: Credit Derivatives   
1                              Meter #1591 Lamay Gaslift   
2                                   Re: man night again?   
3                                Enron 480, 1480 charges   
4                                         Transport Deal   
...                                                  ...   
33325                        Online Banking Notification   
33326  HSBC Internet Banking : Temporary Access Suspe...   
33327   Please Read: Important Message From Halifax Bank   
33328                  HSBC Mail - Alert Account Locked!   
33329                           CUSTOMER SERVICE MESSAGE   

                                    sender               sender_email  \
0                          Sara Shackleton  sara.shackleton@enron.com   
1                               Pat Clynes       pat.clynes@enron.com   
2              chad knipe <knipe3@msn.com>             knipe