# Spam E-Mail Detector
This notebook contains a model that predicts whether a given email embedding belongs to a phishing email or a safe email. To achieve this, it loads previously created embeddings of emails that are labelled either as Phishing or Safe. It then splits the data into training, validation and test sets before defining and training a model on the data. Finally, the model's performance is evaluated.

In [98]:
import pandas as pd
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers, models, callbacks

### Load the embeddings

In [99]:
# Load the embeddings; the first line of the CSV is the header row and is used
# as the column names (nothing is skipped)
data_embeddings = pd.read_csv('data/embeddings.csv', sep=';')
data_size = len(data_embeddings)

# Fail early with a clear message: the dataset-building step reads the label
# from the first column and the serialized embedding from the second one
if data_size == 0 or data_embeddings.shape[1] < 2:
    raise ValueError(
        "'data/embeddings.csv' must contain at least one row and two "
        "';'-separated columns (label, embedding)"
    )

print(data_size)
print(data_embeddings.head())

5
       Email Type                                          embedding
0      Safe Email  [-0.08889792859554291, -0.001748952199704945, ...
1      Safe Email  [0.040912602096796036, -0.10533677786588669, 0...
2      Safe Email  [-0.0216156505048275, -0.02926367148756981, -0...
3  Phishing Email  [-0.0783514603972435, -0.06557461619377136, 0....
4  Phishing Email  [0.038430340588092804, 0.05078845098614693, 0....


### Create a train, validate and test dataset

In [100]:
def format_and_load_data(train_split=0.6, validate_split=0.2, test_split=0.2, seed=None):
    """Parse the loaded embeddings and split them into train/validation/test sets.

    Reads the notebook-level ``data_embeddings`` frame. Its first column holds
    the class name and its second column holds the embedding serialized as a
    bracketed, comma-separated list of floats.

    Args:
        train_split: Fraction of the samples assigned to the training set.
        validate_split: Fraction of the samples assigned to the validation set.
        test_split: Fraction of the samples assigned to the test set. When the
            three fractions sum to 1 the test set receives every remaining
            sample, so integer truncation never drops data; otherwise it
            receives ``int(n * test_split)`` samples.
        seed: Optional seed for a reproducible shuffle. With ``None`` the
            global NumPy random state is used.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test)`` of NumPy
        arrays. Features are float arrays with one row per sample; labels are
        1 for ``'Phishing Email'`` and 0 for anything else.

    Raises:
        ValueError: If a fraction is negative, the fractions sum to more than
            1, the frame is empty, or the embeddings have differing lengths.
    """
    fractions = (train_split, validate_split, test_split)
    if any(fraction < 0 for fraction in fractions):
        raise ValueError('split fractions must be non-negative')
    total_fraction = sum(fractions)
    if total_fraction > 1 + 1e-9:
        raise ValueError('split fractions must not sum to more than 1')

    # Take the sample count from the frame itself so a stale size variable
    # from an earlier cell run cannot skew the split boundaries
    n_samples = len(data_embeddings)
    if n_samples == 0:
        raise ValueError('data_embeddings is empty; nothing to split')

    # Label: 1 for phishing, 0 otherwise (first column, read positionally)
    y = (data_embeddings.iloc[:, 0] == 'Phishing Email').astype(int).to_numpy()

    # Embedding: split on ',' only; float() tolerates surrounding whitespace,
    # so both "a, b" and "a,b" serializations parse. dtype=float makes ragged
    # rows raise instead of silently producing an object array.
    X = np.array(
        [
            [float(value) for value in raw.strip().strip('[]').split(',')]
            for raw in data_embeddings.iloc[:, 1]
        ],
        dtype=float,
    )

    # Shuffle features and labels with one shared permutation so that every
    # label stays attached to its embedding
    if seed is None:
        order = np.random.permutation(n_samples)
    else:
        order = np.random.default_rng(seed).permutation(n_samples)
    X, y = X[order], y[order]

    # Calculate the split boundaries
    train_end = int(n_samples * train_split)
    val_end = train_end + int(n_samples * validate_split)
    if abs(total_fraction - 1.0) <= 1e-9:
        test_end = n_samples
    else:
        test_end = min(n_samples, val_end + int(n_samples * test_split))

    X_train, y_train = X[:train_end], y[:train_end]
    X_val, y_val = X[train_end:val_end], y[train_end:val_end]
    X_test, y_test = X[val_end:test_end], y[val_end:test_end]

    return X_train, y_train, X_val, y_val, X_test, y_test

### Create a binary classifier model that can predict whether a mail is spam or not

In [101]:
# Seed Python, NumPy and TensorFlow so that the data shuffle and the weight
# initialisation are repeatable after a kernel restart
SEED = 42
keras.utils.set_random_seed(SEED)

# Load the data splits
X_train, y_train, X_val, y_val, X_test, y_test = format_and_load_data()

# Cast the labels to 32-bit integers (dtype change only, shapes are untouched)
y_train = np.array(y_train, dtype=np.int32)
y_val = np.array(y_val, dtype=np.int32)
y_test = np.array(y_test, dtype=np.int32)

# The input width is taken from the training features, so they must form a
# non-empty 2-D array (samples x embedding dimensions)
if X_train.ndim != 2 or X_train.shape[0] == 0:
    raise ValueError('Training split is empty; cannot infer the embedding dimension')

# Define model architecture: two hidden ReLU layers followed by a single
# sigmoid unit that outputs the phishing probability
no_embedding_dim = X_train.shape[1]
model = models.Sequential([
    layers.Dense(no_embedding_dim, activation='relu', input_shape=(no_embedding_dim,)),
    layers.Dense(128, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-3),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Define callbacks: stop once val_loss has not improved for 3 epochs (keeping
# the best weights) and halve the learning rate after 2 epochs without improvement
early_stopping = callbacks.EarlyStopping(patience=3, monitor='val_loss', restore_best_weights=True)
reduce_lr = callbacks.ReduceLROnPlateau(factor=0.5, patience=2, monitor='val_loss', verbose=1)

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_val, y_val), callbacks=[early_stopping, reduce_lr])

model.summary()

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 3: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 4/50
Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_28 (Dense)            (None, 384)               147840    
                                                                 
 dense_29 (Dense)            (None, 128)               49280     
                                                                 
 dense_30 (Dense)            (None, 1)                 129       
                                                                 
Total params: 197249 (770.50 KB)
Trainable params: 197249 (770.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


### Evaluate the model

In [102]:
# Score the trained model on the test split without a progress bar; the two
# unpacked values are reported as the loss and the accuracy
test_loss, test_accuracy = model.evaluate(x=X_test, y=y_test, verbose=0)
print(f'Test Loss: {test_loss}', f'Test Accuracy: {test_accuracy}', sep='\n')

Test Loss: 0.7040073871612549
Test Accuracy: 0.0
