In [1]:
!pip install plotly



In [4]:
import os
import pandas as pd
import plotly.graph_objects as go

# Path to your folder
   # change this to your folder
base_path = os.curdir
folder_path = "datasets"
folder_path = os.path.join(base_path, folder_path)
exclude = ['four_row_phishtank_scraped_dataset_670K.csv']
# Read all CSV files in the folder
dataframes = []
for file in os.listdir(folder_path):
    if file.endswith(".csv"):
        if file not in exclude:
            file_path = os.path.join(folder_path, file)

            print(file_path)
            df = pd.read_csv(file_path)
            dataframes.append((file,df))

# Concatenate all dataframes
if len(dataframes) == 0:
    raise ValueError("No CSV files found in the folder.")

# Choose a column to analyze (change this to your desired column)
column_name = "label"   # replace with your actual column
fig = go.Figure()

for file_name, df in dataframes:
    # Rename 0/1 to labels safely
    df[column_name] = df[column_name].astype(str)
    #df.loc[df[column_name] == 1, column_name] = "spam"
    #df.loc[df[column_name] == 0, column_name] = "legitimate"

    # Value counts
    value_counts = df[column_name].value_counts().reset_index()
    value_counts.columns = [column_name, "count"]

    # Add total row
    value_counts_with_total = pd.concat([
        value_counts,
        pd.DataFrame({column_name: ["total"], "count": [df.shape[0]]})
    ], ignore_index=True)

    # Compute percentage relative to total
    value_counts_with_total["percent"] = value_counts_with_total["count"] / df.shape[0] * 100

    # Add bar trace with percentages on top
    fig.add_bar(
        x=value_counts_with_total[column_name],
        y=value_counts_with_total["count"],
        name=file_name,
        text=value_counts_with_total["percent"].apply(lambda x: f"{x:.1f}%"),
        textposition="auto"
    )

fig.update_layout(
    title=f"Value Counts of '{column_name}' + Total Rows",
    xaxis_title=column_name,
    yaxis_title="Count",
    barmode="group"
)

fig.show()
column_text = "body"
column_label = "label"

.\datasets\CEAS_08.csv
.\datasets\Enron.csv
.\datasets\Ling.csv
.\datasets\Nazario.csv
.\datasets\Nazario_5.csv
.\datasets\Nigerian_5.csv
.\datasets\Nigerian_Fraud.csv
.\datasets\SpamAssasin.csv
.\datasets\TREC_07.csv


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np

# ---------------------------
# Custom dataset class
# ---------------------------
class EmailDataset(Dataset):
    def __init__(self, texts, labels, vectorizer):
        self.X = vectorizer.transform(texts).toarray()
        self.y = labels
    def __len__(self):
        return len(self.y)
    def __getitem__(self, idx):
        return torch.tensor(self.X[idx], dtype=torch.float32), torch.tensor(self.y[idx], dtype=torch.long)

# ---------------------------
# Simple Feedforward model
# ---------------------------
class SimpleFFNN(nn.Module):
    def __init__(self, input_dim, hidden_dim=128, output_dim=2):
        super(SimpleFFNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# ---------------------------
# Parameters
# ---------------------------
column_text = "body"
column_label = "label"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 32
epochs = 5
hidden_dim = 128
max_features = 2000

# ---------------------------
# Fill NaNs and map labels
# ---------------------------
for i in range(len(dataframes)):
    file_name, df = dataframes[i]
    df[column_text] = df[column_text].fillna("")  # fill NaN
    
    df.loc[df[column_label] == "spam", column_label] = 1
    df.loc[df[column_label] == "legitimate", column_label] = 0
    df[column_label] = df[column_label].astype(int)
    dataframes[i] = (file_name, df)

# ---------------------------
# Vectorizer on all data
# ---------------------------
all_texts = pd.concat([df[column_text] for _, df in dataframes])
vectorizer = CountVectorizer(max_features=max_features)
vectorizer.fit(all_texts)

# ---------------------------
# Cross-dataset evaluation
# ---------------------------
n = len(dataframes)
accuracy_matrix = np.zeros((n, n))

for i, (train_name, train_df) in enumerate(dataframes):
    # Encode train labels
    le = LabelEncoder()
    train_labels = le.fit_transform(train_df[column_label])
    train_dataset = EmailDataset(train_df[column_text], train_labels, vectorizer)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Model
    model = SimpleFFNN(input_dim=max_features, hidden_dim=hidden_dim, output_dim=2).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Train
    for epoch in range(epochs):
        model.train()
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

    # Test on all datasets
    for j, (test_name, test_df) in enumerate(dataframes):
        # Fit LabelEncoder on all labels
        all_labels = pd.concat([df[column_label] for _, df in dataframes])
        le = LabelEncoder()
        le.fit(all_labels)

        # Then use the same encoder for both train and test
        train_labels = le.transform(train_df[column_label])
        test_labels = le.transform(test_df[column_label])
        test_dataset = EmailDataset(test_df[column_text], test_labels, vectorizer)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for X_batch, y_batch in test_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)
                _, predicted = torch.max(outputs, 1)
                total += y_batch.size(0)
                correct += (predicted == y_batch).sum().item()
        accuracy_matrix[i, j] = correct / total * 100

# ---------------------------
# Print confusion-like matrix
# ---------------------------
df_accuracy = pd.DataFrame(accuracy_matrix,
                           index=[name for name, _ in dataframes],
                           columns=[name for name, _ in dataframes])
print("Cross-dataset accuracy matrix (%):")
print(df_accuracy)


KeyboardInterrupt: 

Cross-dataset accuracy matrix (%):
                    CEAS_08.csv  Enron.csv    Ling.csv  Nazario.csv  \
CEAS_08.csv           99.966798  76.000269   88.877230    23.003195   
Enron.csv             67.934311  99.697652   69.324939    81.725240   
Ling.csv              81.580426  79.752746  100.000000    70.415335   
Nazario.csv           44.215150  53.048678   83.980413     0.000000   
Nazario_5.csv         54.362262  63.483052   83.001049   100.000000   
Nigerian_5.csv        45.637738  59.807841   84.889822    27.539936   
Nigerian_Fraud.csv    44.215150  53.048678   83.980413     0.000000   
SpamAssasin.csv       77.404607  75.839016   88.387548    60.255591   
TREC_07.csv           77.894979  68.935398   45.015740    95.015974   

                    Nazario_5.csv  Nigerian_5.csv  Nigerian_Fraud.csv  \
CEAS_08.csv             57.781403       92.133944           90.486194   
Enron.csv               64.730832       75.848997           99.399760   
Ling.csv                74.681892       69.609856           57.923169   
Nazario.csv             48.939641       47.370084            0.000000   
Nazario_5.csv           99.934747       64.792292           33.943577   
Nigerian_5.csv          63.001631       99.936819           99.879952   
Nigerian_Fraud.csv      48.939641       47.370084            0.000000   
SpamAssasin.csv         75.040783       94.329490           97.959184   
TREC_07.csv             82.185971       83.067446           99.369748   

                    SpamAssasin.csv  TREC_07.csv  
CEAS_08.csv               79.015321    84.282977  
Enron.csv                 46.686177    61.398516  
Ling.csv                  70.184197    75.545510  
Nazario.csv               70.425202    45.311308  
Nazario_5.csv             75.210880    53.414439  
Nigerian_5.csv            76.674126    50.566438  
Nigerian_Fraud.csv        70.425202    45.311308  
SpamAssasin.csv           99.931141    78.867868  
TREC_07.csv               73.059046    99.910709  

In [27]:
import plotly.graph_objects as go

# df_accuracy is your DataFrame
fig = go.Figure(data=go.Heatmap(
    z=df_accuracy.values,
    x=df_accuracy.columns,
    y=df_accuracy.index,
    colorscale='Viridis',
    text=df_accuracy.values,
    texttemplate="%{text:.2f}%",
    hoverongaps=False,
))

fig.update_layout(
    title="Cross-Dataset Accuracy Matrix (%)",
    xaxis_title="Test Dataset",
    yaxis_title="Train Dataset",
    xaxis_tickangle=-45,
)

fig.show()


In [None]:
import pandas as pd

# -----------------------------
# Parameters
# -----------------------------
accuracy_threshold = 50  # include datasets with avg cross-accuracy above this
merged_filename = r"datasets\merged_emails.csv"

# -----------------------------
# Compute average cross-dataset accuracy per dataset
# -----------------------------
avg_accuracy = df_accuracy.mean(axis=1)
print("Average cross-dataset accuracy per dataset:")
print(avg_accuracy)

# Select datasets above threshold
datasets_to_include = avg_accuracy[avg_accuracy >= accuracy_threshold].index.tolist()
print("\nDatasets selected for merging (avg accuracy >= {}%):".format(accuracy_threshold))
print(datasets_to_include)



NameError: name 'df_accuracy' is not defined

In [7]:
datasets_to_include = ['CEAS_08.csv','Enron.csv','Ling.csv','Nigerian_5.csv','SpamAssasin.csv','TREC_07.csv']
# -----------------------------
# Merge selected datasets
# -----------------------------
merged_df = pd.concat([df for name, df in dataframes if name in datasets_to_include], ignore_index=True)

# -----------------------------
# Print metadata
# -----------------------------
print("\nMerged Dataset Metadata:")
print(f"Total rows: {len(merged_df)}")
print(f"Number of datasets merged: {len(datasets_to_include)}")
print("Rows per dataset:")
for name in datasets_to_include:
    df_count = len([df for n, df in dataframes if n == name][0])
    print(f" - {name}: {df_count} rows")

# Class balance
if column_label in merged_df.columns:
    print("\nClass distribution:")
    print(merged_df[column_label].value_counts())
    print("\nClass percentages:")
    print(merged_df[column_label].value_counts(normalize=True)*100)

# -----------------------------
# Save merged DataFrame
# -----------------------------
merged_df.to_csv(merged_filename, index=False)
print(f"\nMerged dataset saved as '{merged_filename}'")


Merged Dataset Metadata:
Total rows: 137677
Number of datasets merged: 6
Rows per dataset:
 - CEAS_08.csv: 39154 rows
 - Enron.csv: 29767 rows
 - Ling.csv: 2859 rows
 - Nigerian_5.csv: 6331 rows
 - SpamAssasin.csv: 5809 rows
 - TREC_07.csv: 53757 rows

Class distribution:
label
1    70725
0    66952
Name: count, dtype: int64

Class percentages:
label
1    51.370236
0    48.629764
Name: proportion, dtype: float64

Merged dataset saved as 'merged_emails.csv'
