In [58]:
import pandas as pd
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers, models, callbacks
from tensorflow.keras.models import load_model
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sentence_transformers import SentenceTransformer
import os

# Spam E-Mail Detector
This notebooks contains a model which is able to predict whether a given embedding of an email is a Phishing Mail or is safe. To achieve this, it loads previously created embeddings of emails that are classified either as Phishing or Safe. It then splits up the data into a train, validate and test split before defining and training a model with the data. Finally the models performance is evaluated. The basic structure is oriented towards CRISP DM.
## CRISP-DM Phases

1. Business Understanding
2. Data Understanding
3. Data Preparation
4. Modeling
5. Evaluation
6. Deployment


### 1. Business Understanding

##### Objective
To develop a model that accurately predicts whether an email is a phishing email or not in order to enhance email security and protect users from potential threats.

##### Goals
- Reduce the number of phishing emails that reach users' inboxes.
- Minimize false positives to ensure legitimate emails are not incorrectly marked as phishing.
- Improve overall email security and user trust.

##### Business Questions
- What are the common characteristics of phishing emails?
- How frequently do phishing emails occur in the current email system?
- What impact do phishing emails have on user security and business operations?

##### Success Criteria
- Achieve a high accuracy rate in detecting phishing emails (e.g., over 95%).
- Maintain a low false positive rate (e.g., below 1%).
- Demonstrate improvement in email security metrics post-implementation.

##### Constraints
- Ensure the model can process and classify emails in real-time.
- Maintain user privacy and data protection while analyzing email content.
- Integrate seamlessly with existing email infrastructure and systems.

##### Key Stakeholders
- Email security team
- IT and cybersecurity departments
- End-users (employees, customers)
- Business executives

##### Requirements
- Access to historical email data, including labeled examples of phishing and non-phishing emails.
- Collaboration with the cybersecurity team to identify key features indicative of phishing.
- Tools and infrastructure for developing, testing, and deploying the model.


### 2. Data Understanding
The data comes from the following dataset: https://www.kaggle.com/datasets/subhajournal/phishingemails/data. It contains a total of 18,600 emails with 61% being labeled safe and 39% being labeled Phishing.

In [59]:
data = pd.read_csv("data/Phishing_Email.csv", skiprows=0)
data_size = len(data)

print(f"The amount of data entries is: {data_size}")
print(data.head())

The amount of data entries is: 18650
   Unnamed: 0                                         Email Text  \
0           0  re : 6 . 1100 , disc : uniformitarianism , re ...   
1           1  the other side of * galicismos * * galicismo *...   
2           2  re : equistar deal tickets are you still avail...   
3           3  \nHello I am your hot lil horny toy.\n    I am...   
4           4  software at incredibly low prices ( 86 % lower...   

       Email Type  
0      Safe Email  
1      Safe Email  
2      Safe Email  
3  Phishing Email  
4  Phishing Email  


### 3. Data Preparation

#### Data cleanup
The following data preparation steps include:
- Removing empty texts
- Making all letters lowercase
- Removing Punctuation
- Removing stopwords
- Stemming
- Lemmatization

In [60]:
# Remove rows with missing text
data = data.dropna(subset=["Email Text"])

# Convert to lowercase
data["Email Text"] = data["Email Text"].str.lower()

# Remove punctuation
data["Email Text"] = data["Email Text"].str.translate(str.maketrans("", "", string.punctuation))

# Remove stop words
stop_words = set(stopwords.words('english'))
data["Email Text"] = data["Email Text"].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))

# Tokenize
data["Email Text"] = data["Email Text"].apply(word_tokenize)

# Stemming
stemmer = PorterStemmer()
data["Email Text"] = data["Email Text"].apply(lambda x: [stemmer.stem(word) for word in x])

# Lemmatization
lemmatizer = WordNetLemmatizer()
data["Email Text"] = data["Email Text"].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

#### Load/Create the embeddings

In [61]:
# Check if the embeddings exist and if not, create them
if not os.path.exists("data/embeddings.csv"):
    # Load the sentence transformer model
    sentence_transformer_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

    # Encode the text data to get embeddings
    embeddings = sentence_transformer_model.encode(data["Email Text"].tolist(), show_progress_bar=True)

    # Convert embeddings to a list of lists with Python floats
    embeddings_list = [list(map(float, embedding)) for embedding in embeddings]

    # Convert embeddings to a single string representation
    embeddings_str_list = [str(embedding) for embedding in embeddings_list]

    # Create a new DataFrame with a single column for embeddings
    embeddings_df = pd.DataFrame({"embedding": embeddings_str_list})

    # Concatenate the label column with the embeddings
    final_df = pd.concat([data["Email Type"].reset_index(drop=True), embeddings_df], axis=1)

    # Save the final DataFrame to a CSV file
    final_df.to_csv("data/embeddings.csv", index=False, sep=";")


# Load the embeddings and skip the header
data_embeddings = pd.read_csv('data/embeddings.csv', sep=';', skiprows=0)

#### Create a train, validate and test dataset

In [62]:
def format_and_load_data(train_split=0.3, validate_split=0.2, test_split=0.5):
    # Calculate the number of samples for each dataset split
    train_amount = int(data_size * train_split)
    validate_amount = int(data_size * validate_split)
    test_amount = int(data_size * test_split)
    
    examples = []

    # Process each embedding
    for _, embedding in data_embeddings.iterrows():
        label = 1 if embedding.iloc[0] == 'Phishing Email' else 0
        embedding_list = [float(i) for i in embedding.iloc[1].strip('[]').split(', ')]
        examples.append((embedding_list, label))

    # Shuffle the examples
    np.random.shuffle(examples)

    # Split data into features and labels
    X, y = zip(*examples)

    # Convert lists to numpy arrays
    X = np.array(X)
    y = np.array(y)

    # Split data into training, validation and testing sets
    X_train = X[:train_amount]
    X_val = X[train_amount:train_amount + validate_amount]
    X_test = X[train_amount + validate_amount:]
    y_train = y[:train_amount]
    y_val = y[train_amount:train_amount + validate_amount]
    y_test = y[train_amount + validate_amount:]

    return X_train, y_train, X_val, y_val, X_test, y_test

### 4. Modeling

#### Create a binary classifier model that can predict whether a mail is spam or not

In [63]:
# Load the data splits
X_train, y_train, X_val, y_val, X_test, y_test = format_and_load_data()

# Reshape data to make it acceptable for the model
y_train = np.array(y_train, dtype=np.int32)
y_val = np.array(y_val, dtype=np.int32)
y_test = np.array(y_test, dtype=np.int32)

# Define model architecture
no_embedding_dim = len(X_train[0])
model = models.Sequential([
    layers.Dense(no_embedding_dim, activation='relu', input_shape=(no_embedding_dim,)),
    layers.Dense(128, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-3),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Define callbacks
early_stopping = callbacks.EarlyStopping(patience=3, monitor='val_loss', restore_best_weights=True)
reduce_lr = callbacks.ReduceLROnPlateau(factor=0.5, patience=2, monitor='val_loss', verbose=1)

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_val, y_val), callbacks=[early_stopping, reduce_lr])

# Save the model for later use
model.save("models/phishing_email_classifier.h5")

model_summary = model.summary()
model_summary

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 5: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 6/50
Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_12 (Dense)            (None, 384)               147840    
                                                                 
 dense_13 (Dense)            (None, 128)               49280     
                                                                 
 dense_14 (Dense)            (None, 1)                 129       
                                                                 
Total params: 197249 (770.50 KB)
Trainable params: 197249 (770.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


  saving_api.save_model(


#### Evaluate the model

In [64]:
# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_accuracy}')

Test Loss: 0.12684206664562225
Test Accuracy: 0.9509919285774231


### 5. Evaluation
The model shows good results with a loss of 0.14 and an accuracy of 0.95.

### 6. Deployment
The model can be retrieved for usage from the models folder.

In [67]:
# Load the saved model for phishing email classification
model = keras.models.load_model("models/phishing_email_classifier.h5")

# Load the Sentence Transformer model
sentence_transformer_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Function to preprocess input data
def preprocess_input(email_text):
    # Your preprocessing steps here (e.g., tokenization, cleaning)
    return email_text

# Example email data (you should replace this with your own email data)
email_text = "This is an example email. It might contain phishing links."

# Preprocess the input data
preprocessed_email = preprocess_input(email_text)

# Encode the preprocessed email text to get its embedding
email_embedding = sentence_transformer_model.encode([preprocessed_email])[0]

# Reshape the embedding to match the model input shape
X_test = np.array([email_embedding])  # Adding an extra dimension to match model input shape

# Make predictions
predictions = model.predict(X_test)

# Thresholding the predictions (assuming it's binary classification)
threshold = 0.5
predicted_label = "Phishing" if predictions[0][0] >= threshold else "Not Phishing"

print("Prediction:", predicted_label)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Prediction: Phishing
