In [1]:
pip install pandas scikit-learn transformers tensorflow py3-validate-email joblib tqdm

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install pathos

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import re
from pathos.multiprocessing import ProcessingPool as Pool
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import RobertaTokenizer, TFRobertaModel
import tensorflow as tf
from validate_email import validate_email
import joblib
from tqdm import tqdm
import time
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import pandas as pd
import re
from pathos.multiprocessing import ProcessingPool as Pool
from validate_email import validate_email
from tqdm import tqdm
import time

def validate_email_with_timeout(email):
    """Validate one address, reporting any validator failure as ``False``.

    Calls ``validate_email`` with the format, blacklist and DNS checks enabled
    and the SMTP check disabled. Despite the function's name, no explicit
    timeout argument is passed here; whatever timeout applies is the
    validator's own. Any exception raised by the validator is swallowed so
    that one bad address cannot abort a bulk run.

    Args:
        email: The address to check.

    Returns:
        Whatever ``validate_email`` returns, or ``False`` if it raised.
    """
    checks = {
        'check_format': True,
        'check_blacklist': True,
        'check_dns': True,
        'check_smtp': False,
        'smtp_debug': False,
    }
    try:
        verdict = validate_email(email_address=email, **checks)
    except Exception:
        # Best-effort: treat every failure (including lookups that error out) as invalid.
        verdict = False
    return verdict

# Captures the text between the first pair of angle brackets, e.g. "Name <user@host>".
_ANGLE_ADDRESS = re.compile(r'<(.*?)>')
# Marks a body as containing at least one http:// or https:// link.
_URL_SCHEME = re.compile(r'http[s]?://')


def _extract_sender_email(sender):
    """Return the address inside the first ``<...>`` of ``sender``, else ``sender`` itself.

    A non-string value (such as NaN for a missing sender) yields ``''`` instead
    of raising ``TypeError`` from the regex call.
    """
    if not isinstance(sender, str):
        return ''
    match = _ANGLE_ADDRESS.search(sender)
    return match.group(1) if match else sender


def _build_validated_frame(source_path, max_rows, processes):
    """Load the raw dataset, derive the feature columns and validate every sender address."""
    raw = pd.read_csv(source_path)
    if max_rows is not None:
        raw = raw.head(max_rows)
    # Work on an explicit copy so the column assignments below never target a slice.
    df = raw.copy()

    # Data preprocessing
    df['sender_email'] = df['sender'].apply(_extract_sender_email)
    df['date'] = pd.to_datetime(df['date'], errors='coerce', utc=True)
    # Rows whose date could not be parsed were coerced to NaT above; drop them.
    df = df.dropna(subset=['date']).copy()

    df['domain'] = df['sender_email'].apply(lambda address: address.split('@')[-1])
    df['day_of_week'] = df['date'].dt.dayofweek
    df['hour'] = df['date'].dt.hour
    df['urls'] = df['body'].apply(lambda body: 1 if _URL_SCHEME.search(str(body)) else 0)

    print("Validating emails...")
    start_time = time.time()
    with Pool(processes=processes) as pool:
        # A plain list is assigned positionally, so the non-contiguous index left by dropna is fine.
        df['email_validity'] = list(
            tqdm(pool.imap(validate_email_with_timeout, df['sender_email']), total=len(df))
        )
    end_time = time.time()
    print(f"Email validation completed in {end_time - start_time:.2f} seconds")
    return df


def validate_emails(source_path='CEAS_08.csv', cache_path='validated_emails.csv', max_rows=50000, processes=4):
    """Return the email DataFrame with derived features and a sender-validity column.

    If ``cache_path`` exists it is loaded and only its ``date`` column is
    re-parsed (as UTC). Otherwise the frame is built from ``source_path``,
    validated in a process pool, and written to ``cache_path``.

    Args:
        source_path: CSV holding the raw emails; read only when the cache is missing.
        cache_path: CSV used to cache the validated frame.
        max_rows: Keep only the first ``max_rows`` raw rows; ``None`` keeps all.
        processes: Number of worker processes used for address validation.

    Returns:
        pandas.DataFrame: the cached or freshly built frame.

    Raises:
        FileNotFoundError: if neither ``cache_path`` nor ``source_path`` exists.
    """
    try:
        # Try to load the saved DataFrame
        df = pd.read_csv(cache_path)
        df['date'] = pd.to_datetime(df['date'], utc=True)
        print(f"Loaded the DataFrame from '{cache_path}'.")
    except FileNotFoundError:
        df = _build_validated_frame(source_path, max_rows, processes)

        # Save the validated DataFrame
        df.to_csv(cache_path, index=False)
        print(f"Saved the validated DataFrame to '{cache_path}'.")

    print("Columns after validation:", df.columns.tolist())
    return df

# Load the cached 'validated_emails.csv' if it exists; otherwise build it from
# 'CEAS_08.csv' (this runs the sender-address validation in a process pool).
df = validate_emails()


Loaded the DataFrame from 'validated_emails.csv'.
Columns after validation: ['sender', 'receiver', 'date', 'subject', 'body', 'label', 'urls', 'sender_email', 'domain', 'day_of_week', 'hour', 'email_validity']


In [5]:
import pandas as pd

def load_dataset():
    """Read the cached validation results back from disk.

    Loads 'validated_emails.csv' from the working directory, re-parses its
    ``date`` column as UTC timestamps, and prints the resulting column list.

    Returns:
        pandas.DataFrame: the cached frame with ``date`` parsed.
    """
    cached = pd.read_csv('validated_emails.csv')
    parsed_dates = pd.to_datetime(cached['date'], utc=True)
    cached['date'] = parsed_dates

    print("Loaded the DataFrame from 'validated_emails.csv'.")
    column_names = cached.columns.tolist()
    print("Columns after loading dataset:", column_names)
    return cached

# Rebind `df` to a fresh read of 'validated_emails.csv' (replaces the frame from the previous cell).
df = load_dataset()


Loaded the DataFrame from 'validated_emails.csv'.
Columns after loading dataset: ['sender', 'receiver', 'date', 'subject', 'body', 'label', 'urls', 'sender_email', 'domain', 'day_of_week', 'hour', 'email_validity']


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, TFRobertaModel
import tensorflow as tf
import joblib

def encode_texts(texts, tokenizer, max_length=128):
    """Tokenize ``texts`` with padding and truncation enabled, returning TensorFlow tensors.

    Args:
        texts: The strings to tokenize.
        tokenizer: Callable tokenizer; invoked as ``tokenizer(texts, **options)``.
        max_length: Value forwarded as the tokenizer's ``max_length`` option.

    Returns:
        Whatever the tokenizer returns for ``return_tensors='tf'``.
    """
    tokenizer_options = {
        'padding': True,
        'truncation': True,
        'max_length': max_length,
        'return_tensors': 'tf',
    }
    return tokenizer(texts, **tokenizer_options)

def get_roberta_embeddings(encoded_texts, roberta_model, batch_size=32):
    """Embed tokenized texts batch by batch using the model's first-token hidden state.

    Args:
        encoded_texts: Mapping of tokenizer outputs; must contain 'input_ids',
            and every value is sliced along its first axis.
        roberta_model: Callable model whose output exposes ``last_hidden_state``.
        batch_size: Number of rows sent to the model per call.

    Returns:
        A tensor built from the per-batch embeddings concatenated along axis 0.
    """
    row_count = len(encoded_texts['input_ids'])
    first_token_chunks = []
    for start in tqdm(range(0, row_count, batch_size), desc="Generating embeddings"):
        stop = start + batch_size
        # Slice every tokenizer output identically so the model sees aligned rows.
        chunk = {}
        for name, tensor in encoded_texts.items():
            chunk[name] = tensor[start:stop]
        hidden_states = roberta_model(chunk).last_hidden_state
        # Position 0 of each sequence is kept as that text's embedding.
        first_token_chunks.append(hidden_states[:, 0, :].numpy())
    stacked = np.concatenate(first_token_chunks, axis=0)
    return tf.convert_to_tensor(stacked)

def train_model(df):
    """Train the metadata random forest and the embedding-based meta classifier.

    Steps, in order:
      1. Check that every column read below is present in ``df``.
      2. Split ``df`` 70/30 with ``random_state=42``.
      3. Tokenize subjects and bodies with the 'roberta-base' tokenizer
         (``max_length=64``).
      4. One-hot encode the metadata columns and fit a random forest on them.
      5. Embed subjects and bodies with 'roberta-base' and fit a logistic
         regression on the embeddings concatenated with the encoded metadata.
      6. Save both classifiers, the RoBERTa model and tokenizer, and the list
         of encoded metadata column names to the working directory.

    Note: the random forest is fit, saved and returned, but its predictions are
    not an input to the meta classifier; the meta classifier sees the encoded
    metadata columns directly.

    Args:
        df: DataFrame with the columns 'domain', 'day_of_week', 'hour',
            'email_validity', 'urls', 'subject', 'body' and 'label'.

    Returns:
        tuple: ``(rf_model, meta_classifier, roberta_model, tokenizer, X_test,
        y_test, meta_features_test)``.

    Raises:
        KeyError: if any of the columns listed above is missing from ``df``.
    """
    # Metadata columns used as features.
    required_columns = ['domain', 'day_of_week', 'hour', 'email_validity', 'urls']
    # The text and target columns are read further down as well, so validate
    # them here too; otherwise a missing one surfaces later as a bare KeyError.
    for column in required_columns + ['subject', 'body', 'label']:
        if column not in df.columns:
            raise KeyError(f"'{column}' not found in DataFrame columns. Available columns: {df.columns.tolist()}")

    # Train-test split before tokenization
    X_train, X_test, y_train, y_test = train_test_split(df, df['label'], test_size=0.3, random_state=42)

    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    # astype(str) turns missing subjects/bodies into text the tokenizer accepts.
    encoded_subjects_train = encode_texts(X_train['subject'].astype(str).tolist(), tokenizer, max_length=64)
    encoded_bodies_train = encode_texts(X_train['body'].astype(str).tolist(), tokenizer, max_length=64)
    encoded_subjects_test = encode_texts(X_test['subject'].astype(str).tolist(), tokenizer, max_length=64)
    encoded_bodies_test = encode_texts(X_test['body'].astype(str).tolist(), tokenizer, max_length=64)

    X_meta_train = X_train[required_columns]
    X_meta_test = X_test[required_columns]

    X_meta_train = pd.get_dummies(X_meta_train)
    X_meta_test = pd.get_dummies(X_meta_test)

    # Align the test dummies to the training columns: categories unseen in
    # training are dropped, categories absent from the test split are zero-filled.
    X_meta_test = X_meta_test.reindex(columns=X_meta_train.columns, fill_value=0)
    X_meta_train.columns = X_meta_train.columns.astype(str)
    X_meta_test.columns = X_meta_test.columns.astype(str)

    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_meta_train, y_train)

    roberta_model = TFRobertaModel.from_pretrained('roberta-base')

    print("Generating RoBERTa embeddings...")
    start_time = time.time()

    train_embeddings_subjects = get_roberta_embeddings(encoded_subjects_train, roberta_model, batch_size=16)
    test_embeddings_subjects = get_roberta_embeddings(encoded_subjects_test, roberta_model, batch_size=16)

    train_embeddings_bodies = get_roberta_embeddings(encoded_bodies_train, roberta_model, batch_size=16)
    test_embeddings_bodies = get_roberta_embeddings(encoded_bodies_test, roberta_model, batch_size=16)

    end_time = time.time()
    print(f"RoBERTa embeddings generated in {end_time - start_time:.2f} seconds")

    # Subject and body embeddings sit side by side in one row per email.
    train_embeddings = tf.concat([train_embeddings_subjects, train_embeddings_bodies], axis=1)
    test_embeddings = tf.concat([test_embeddings_subjects, test_embeddings_bodies], axis=1)

    # reset_index makes the metadata rows line up positionally with the
    # embedding rows, which carry a fresh 0..n-1 index.
    meta_features_train = pd.concat([pd.DataFrame(train_embeddings.numpy()), X_meta_train.reset_index(drop=True)], axis=1)
    meta_features_test = pd.concat([pd.DataFrame(test_embeddings.numpy()), X_meta_test.reset_index(drop=True)], axis=1)

    # Embedding columns are integers; cast every column name to str so the frame has uniform names.
    meta_features_train.columns = meta_features_train.columns.astype(str)
    meta_features_test.columns = meta_features_test.columns.astype(str)

    meta_classifier = LogisticRegression(max_iter=1000)
    meta_classifier.fit(meta_features_train, y_train)

    joblib.dump(rf_model, 'random_forest_model.pkl')
    joblib.dump(meta_classifier, 'meta_classifier_model.pkl')
    roberta_model.save_pretrained('roberta_model')
    tokenizer.save_pretrained('roberta_tokenizer')

    # Save columns used in metadata training
    with open('metadata_columns.txt', 'w') as f:
        for column in X_meta_train.columns:
            f.write(f"{column}\n")

    return rf_model, meta_classifier, roberta_model, tokenizer, X_test, y_test, meta_features_test

# Run the whole training pipeline on `df`. This loads 'roberta-base' and embeds every
# subject and body, and writes the fitted models to the working directory.
rf_model, meta_classifier, roberta_model, tokenizer, X_test, y_test, meta_features_test = train_model(df)


2024-05-15 21:52:43.607752: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2024-05-15 21:52:43.607780: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-05-15 21:52:43.607785: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-05-15 21:52:43.607817: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-05-15 21:52:43.607838: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.laye

Generating RoBERTa embeddings...


Generating embeddings: 100%|████████████████| 1713/1713 [02:24<00:00, 11.83it/s]
Generating embeddings: 100%|██████████████████| 734/734 [01:01<00:00, 12.02it/s]
Generating embeddings: 100%|████████████████| 1713/1713 [02:21<00:00, 12.09it/s]
Generating embeddings: 100%|██████████████████| 734/734 [01:00<00:00, 12.11it/s]


RoBERTa embeddings generated in 408.37 seconds


In [7]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, confusion_matrix

def evaluate_model(meta_classifier, meta_features_test, y_test):
    """Score the meta classifier on held-out features and print a short report.

    Args:
        meta_classifier: Fitted estimator exposing ``predict``.
        meta_features_test: Feature matrix passed to ``predict``.
        y_test: True labels for the same rows.

    Returns:
        tuple: ``(accuracy, f1, recall, cm)`` where F1 and recall use
        ``average='weighted'`` and ``cm`` is the confusion matrix.
    """
    predictions = meta_classifier.predict(meta_features_test)

    weighted = {'average': 'weighted'}
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions, **weighted)
    recall = recall_score(y_test, predictions, **weighted)
    cm = confusion_matrix(y_test, predictions)

    report_lines = (
        f'Accuracy: {accuracy * 100:.2f}%',
        f'F1 Score: {f1:.2f}',
        f'Recall: {recall:.2f}',
        f'Confusion Matrix:\n{cm}',
    )
    for line in report_lines:
        print(line)

    return accuracy, f1, recall, cm

# Score the meta classifier on the held-out features and labels returned by train_model.
accuracy, f1, recall, cm = evaluate_model(meta_classifier, meta_features_test, y_test)


Accuracy: 99.13%
F1 Score: 0.99
Recall: 0.99
Confusion Matrix:
[[5135   68]
 [  34 6505]]
