In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [25]:
# Read the CEAS_08 CSV once; keep an untouched copy alongside the working frame
# so later cells can mutate `df` without losing the raw data.
df_original = pd.read_csv("CEAS_08.csv")
df = df_original.copy()

In [20]:
# Print column names, dtypes and non-null counts of `df`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39154 entries, 0 to 39153
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sender    39154 non-null  object
 1   receiver  38692 non-null  object
 2   date      39154 non-null  object
 3   subject   39126 non-null  object
 4   body      39154 non-null  object
 5   label     39154 non-null  int64 
 6   urls      39154 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 2.1+ MB


# Preprocessing

In [None]:
def clean_text(text):
    """Normalise a raw text field for vectorisation.

    Steps, in order: remove anything that looks like an HTML/XML tag
    (``<...>``), drop every character that is not an ASCII letter, digit or
    whitespace, lowercase the result and strip surrounding whitespace.

    Parameters
    ----------
    text : object
        Usually a ``str``. Non-string scalars are converted with ``str()``.
        Missing values (``None``, ``NaN``, ``pd.NA``) are treated as empty.

    Returns
    -------
    str
        The cleaned text; ``""`` for missing input.
    """
    if not isinstance(text, str):
        # Without this guard str(NaN) would become the literal token "nan"
        # and leak into the text features as if it were real content.
        if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
            return ""
        text = str(text)
    text = re.sub(r"<[^>]+>", "", text)
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    return text.lower().strip()

def extract_domain(address):
    """Return the part of an e-mail address after the last ``@``.

    A single trailing ``>`` (left over from ``Name <user@host>`` style
    addresses) is removed; addresses without it are returned untouched, so a
    bare ``user@host`` no longer loses its final character. Missing values
    yield an empty string. If the value contains no ``@`` the whole string is
    returned, because ``split("@")[-1]`` is then the input itself.
    """
    if pd.isnull(address):
        return ""
    domain = address.split("@")[-1]
    return domain[:-1] if domain.endswith(">") else domain


# Normalised text columns.
df["clean_subject"] = df["subject"].apply(clean_text)
df["clean_body"] = df["body"].apply(clean_text)

# Extract sender / receiver domains with one shared rule so both columns are
# cleaned consistently.
df["sender_domain"] = df["sender"].apply(extract_domain)
df["receiver_domain"] = df["receiver"].apply(extract_domain)

# Parse date (handle inconsistent formats): each value is parsed on its own,
# unparseable values become NaT, and everything is converted to UTC.
df["date"] = df["date"].apply(lambda x: pd.to_datetime(x, errors="coerce", utc=True))
df["hour"] = df["date"].dt.hour
df["day_of_week"] = df["date"].dt.dayofweek
# Hours range over 0..23, so dividing by 23 scales the feature to [0, 1].
df["hour_normalized"] = df["hour"] / 23.0

# Drop rows with missing values in any of these columns. clean_text always
# returns a string, so clean_subject / clean_body never contain nulls; the
# rows actually removed are those lacking receiver, subject or a parseable date.
df = df.dropna(subset=["label", "clean_subject", "clean_body", "receiver", "subject", "date"])

In [7]:
# Show how many rows carry each label value in `df`.
df['label'].value_counts()

label
1    21812
0    16842
Name: count, dtype: int64

# Balancing

In [8]:
# Undersample: label 1 is the larger class and label 0 the smaller one, so a
# random subset of label-1 rows is drawn to match the label-0 row count.
minority_class = df.loc[df["label"].eq(0)]
majority_class = df.loc[df["label"].eq(1)]

balanced_majority_class = majority_class.sample(n=len(minority_class), random_state=42)

# Stack both classes, shuffle the rows with a fixed seed, and renumber the index.
df_balanced = (
    pd.concat([balanced_majority_class, minority_class])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

df_balanced["label"].value_counts()

label
1    16842
0    16842
Name: count, dtype: int64

In [None]:
# Remove the raw columns now that cleaned / derived versions of them exist.
# errors="ignore" keeps this cell re-runnable: without it a second execution
# raises KeyError because the columns are already gone.
df_balanced = df_balanced.drop(
    columns=["date", "sender", "receiver", "subject", "body"],
    errors="ignore",
)

# Vectorize

In [None]:
# Build the model inputs. Later cells read X_train, y_train and X_test, and
# index / slice them as dense arrays, so they are produced here.
# NOTE(review): MAX_FEATURES, TEST_SIZE and SEED are reviewer-chosen defaults,
# and only the cleaned text columns are used as features -- adjust as needed.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

MAX_FEATURES = 2000   # vocabulary cap; keeps the dense matrices small enough for memory
TEST_SIZE = 0.2
SEED = 42

text_all = df_balanced["clean_subject"] + " " + df_balanced["clean_body"]
labels_all = df_balanced["label"].to_numpy(dtype=np.float32)

text_train, text_test, y_train, y_test = train_test_split(
    text_all,
    labels_all,
    test_size=TEST_SIZE,
    stratify=labels_all,
    random_state=SEED,
)

# Fit the vocabulary / IDF weights on the training split only so that no
# information from the test split leaks into the features.
vectorizer = TfidfVectorizer(max_features=MAX_FEATURES, dtype=np.float32)
X_train = vectorizer.fit_transform(text_train).toarray()
X_test = vectorizer.transform(text_test).toarray()

X_train.shape, X_test.shape

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [None]:
import tensorflow as tf
import tensorflow_federated as tff
from sklearn.model_selection import StratifiedKFold

NUM_CLIENTS = 3
NUM_ROUNDS = 5
CLIENT_BATCH_SIZE = 32


def create_keras_model(compile_model=True):
    """Build the single-hidden-layer binary classifier.

    Parameters
    ----------
    compile_model : bool, default True
        When True the model is compiled with Adam / binary cross-entropy and
        is ready for ``model.fit``. Pass False to get an uncompiled model,
        which is what ``tff.learning.from_keras_model`` requires.

    Returns
    -------
    tf.keras.Model
        Dense(64, relu) -> Dense(1, sigmoid); the input width is taken from
        ``X_train.shape[1]``.
    """
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation="relu", input_shape=(X_train.shape[1],)),
        tf.keras.layers.Dense(1, activation="sigmoid")
    ])
    if compile_model:
        model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model


# Simulate NUM_CLIENTS clients with disjoint, label-stratified partitions of
# the training data: the "test" indices of each stratified fold form one
# client's shard, so no example is shared between clients and the split is
# reproducible.
# Assumes X_train / y_train are dense array-likes defined in an earlier cell.
features_all = np.asarray(X_train)
labels_all = np.asarray(y_train)
fold_splitter = StratifiedKFold(n_splits=NUM_CLIENTS, shuffle=True, random_state=42)
client_data = [
    tf.data.Dataset.from_tensor_slices(
        (features_all[shard_idx], labels_all[shard_idx])
    ).batch(CLIENT_BATCH_SIZE)
    for _, shard_idx in fold_splitter.split(features_all, labels_all)
]


def model_fn():
    """Wrap a fresh, uncompiled Keras model for TFF.

    The input spec is taken from the batched client datasets so that it
    matches exactly what the clients feed during training.
    """
    return tff.learning.from_keras_model(
        create_keras_model(compile_model=False),
        input_spec=client_data[0].element_spec,
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=[tf.keras.metrics.BinaryAccuracy()]
    )


# Initialize the FL process
trainer = tff.learning.build_federated_averaging_process(
    model_fn,
    client_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=0.02),
    server_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=1.0)
)

# Run FL rounds
state = trainer.initialize()
for round_num in range(NUM_ROUNDS):
    state, metrics = trainer.next(state, client_data)
    print(f"Round {round_num}: Loss={metrics['train']['loss']}, Accuracy={metrics['train']['binary_accuracy']}")

: 

In [None]:
import shap

# Train a final (centralised) model on the full training set.
final_model = create_keras_model()
final_model.fit(X_train, y_train, epochs=10, batch_size=32)

# Generate SHAP explanations: the first 100 training rows serve as the
# background sample, the first 10 test rows are the instances explained.
background_rows = X_train[:100]
explained_rows = X_test[:10]
explainer = shap.DeepExplainer(final_model, background_rows)
shap_values = explainer.shap_values(explained_rows)

# Depending on the installed shap version, a single-output model may yield a
# one-element list or an array with a trailing axis of length 1. Reduce both
# to a 2-D (rows, features) array so summary_plot draws the regular
# per-feature summary for the single sigmoid output.
if isinstance(shap_values, list):
    shap_values = shap_values[0]
shap_values = np.asarray(shap_values)
if shap_values.ndim == 3 and shap_values.shape[-1] == 1:
    shap_values = shap_values[..., 0]

# Visualize
shap.summary_plot(shap_values, explained_rows)

: 