## Exploratory Data Analysis (EDA) for Phishing Detection

This notebook performs EDA on three datasets: Enron (`enron_spam_data.csv`), PhishTank (`phishtank_data.csv`), and UCI Phishing Websites (`Training_Dataset.arff`). The goal is to understand the data, identify patterns, and prepare it for a BERT-based phishing detection system.

## Objectives
- Understand dataset structure (size, columns, missing values, duplicates).
- Analyze label distribution (phishing vs. legitimate).
- Explore text characteristics (length, word frequency, bigrams, trigrams, clusters) for Enron and PhishTank.
- Analyze numerical features for UCI, including feature importance.
- Combine text datasets (Enron + PhishTank), balance with SMOTE, and reassess.
- Provide detailed preprocessing recommendations.

## Datasets
- **Enron**: ~33,716 emails, columns: `Message ID`, `Subject`, `Message`, `Spam/Ham`, `Date`.
- **PhishTank**: ~64,753 phishing URLs, columns: `phish_id`, `url`, `phish_detail_url`, etc.
- **UCI**: ~11,055 website records, 30 numerical features, `Result` (no URL).

Outputs are saved to `results/eda/`. Intermediate datasets are saved to `data/intermediate/`.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.io import arff
from wordcloud import WordCloud
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from imblearn.over_sampling import SMOTE
import re
import os
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Download NLTK resources
try:
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
    nltk.download('wordnet', quiet=True)
except Exception as e:
    logger.error(f"Failed to download NLTK resources: {e}")

# Set up directories
os.makedirs('results/eda', exist_ok=True)
os.makedirs('data/intermediate', exist_ok=True)

# Initialize lemmatizer and stopwords
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

## Utility Functions

Define functions for text cleaning, dataset loading, and analysis.

In [2]:
def clean_text(text, is_url=False):
    """Preprocess text for EDA. URLs retain more components than emails."""
    try:
        if not isinstance(text, str):
            logger.debug(f"Non-string input: {type(text)}")
            return ''
        text = text.lower()
        if is_url:
            text = re.sub(r'http[s]?://', '', text)
            text = re.sub(r'[^a-zA-Z0-9\\-\\.\\_/]', '', text)
        else:
            text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
            text = re.sub(r'[^a-zA-Z\\s]', '', text)
        tokens = word_tokenize(text)
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and len(word) > 2]
        cleaned = ' '.join(tokens)
        if not cleaned and text:
            logger.debug(f"Cleaned text is empty for input: {text[:50]}")
        return cleaned
    except Exception as e:
        logger.error(f"Error cleaning text: {e}")
        return ''

def dataset_summary(df, name):
    """Summarize dataset structure."""
    try:
        summary = {
            'Size': df.shape[0],
            'Columns': list(df.columns),
            'Missing Values': df.isnull().sum().to_dict(),
            'Duplicates': df.duplicated().sum()
        }
        logger.info(f"{name} Summary: {summary}")
        return summary
    except Exception as e:
        logger.error(f"Error summarizing {name}: {e}")
        return {}

def plot_missing_values(df, name):
    """Plot missing values."""
    try:
        missing = df.isnull().sum()
        if missing.sum() == 0:
            logger.info(f"No missing values in {name}")
            return
        plt.figure(figsize=(10, 6))
        sns.barplot(x=missing.index, y=missing.values)
        plt.title(f'Missing Values in {name}')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(f'results/eda/missing_values_{name.lower()}.png')
        plt.close()
    except Exception as e:
        logger.error(f"Error plotting missing values for {name}: {e}")

def label_distribution(df, name):
    """Analyze and plot label distribution."""
    try:
        if 'label' not in df.columns or df.empty:
            logger.warning(f"No labels or empty DataFrame for {name}")
            return {}
        label_counts = df['label'].value_counts()
        plt.figure(figsize=(8, 6))
        sns.barplot(x=label_counts.index.astype(str), y=label_counts.values)
        plt.title(f'Label Distribution in {name}')
        plt.xlabel('Label (0: Legitimate, 1: Phishing)')
        plt.ylabel('Count')
        plt.tight_layout()
        plt.savefig(f'results/eda/label_distribution_{name.lower()}.png')
        plt.close()
        return label_counts.to_dict()
    except Exception as e:
        logger.error(f"Error plotting label distribution for {name}: {e}")
        return {}

def text_analysis(df, name):
    """Analyze text characteristics, including trigrams and clustering."""
    try:
        if 'text' not in df.columns or df['text'].str.strip().eq('').all():
            logger.warning(f"No valid text data for {name}")
            return {}, [], [], [], [], [], []
        df['char_length'] = df['text'].apply(len)
        df['word_length'] = df['text'].apply(lambda x: len(x.split()))

        # Statistics
        stats = {
            'Char Length Mean': df['char_length'].mean(),
            'Char Length Median': df['char_length'].median(),
            'Word Length Mean': df['word_length'].mean(),
            'Word Length Median': df['word_length'].median()
        }
        logger.info(f"{name} Text Stats: {stats}")

        # Plot length distributions
        plt.figure(figsize=(12, 5))
        plt.subplot(1, 2, 1)
        sns.histplot(df['char_length'], bins=50)
        plt.title(f'Character Length Distribution in {name}')
        plt.subplot(1, 2, 2)
        sns.histplot(df['word_length'], bins=50)
        plt.title(f'Word Length Distribution in {name}')
        plt.tight_layout()
        plt.savefig(f'results/eda/text_length_{name.lower()}.png')
        plt.close()

        # Word frequency
        phishing_words = ' '.join(df[df['label'] == 1]['text']).split()
        legit_words = ' '.join(df[df['label'] == 0]['text']).split()
        phishing_freq = Counter(phishing_words).most_common(10)
        legit_freq = Counter(legit_words).most_common(10)

        # Plot word frequency
        plt.figure(figsize=(12, 5))
        plt.subplot(1, 2, 1)
        if phishing_freq:
            sns.barplot(x=[count for _, count in phishing_freq], y=[word for word, _ in phishing_freq])
        plt.title(f'Top 10 Words in Phishing ({name})')
        plt.subplot(1, 2, 2)
        if legit_freq:
            sns.barplot(x=[count for _, count in legit_freq], y=[word for word, _ in legit_freq])
        plt.title(f'Top 10 Words in Legitimate ({name})')
        plt.tight_layout()
        plt.savefig(f'results/eda/word_freq_{name.lower()}.png')
        plt.close()

        # Bigram frequency
        phishing_bigrams = [bigram for text in df[df['label'] == 1]['text'] for bigram in ngrams(text.split(), 2)]
        legit_bigrams = [bigram for text in df[df['label'] == 0]['text'] for bigram in ngrams(text.split(), 2)]
        phishing_bigram_freq = Counter(phishing_bigrams).most_common(10)
        legit_bigram_freq = Counter(legit_bigrams).most_common(10)

        # Plot bigram frequency
        plt.figure(figsize=(12, 5))
        plt.subplot(1, 2, 1)
        if phishing_bigram_freq:
            sns.barplot(x=[count for _, count in phishing_bigram_freq], y=[' '.join(bigram) for bigram, _ in phishing_bigram_freq])
        plt.title(f'Top 10 Bigrams in Phishing ({name})')
        plt.subplot(1, 2, 2)
        if legit_bigram_freq:
            sns.barplot(x=[count for _, count in legit_bigram_freq], y=[' '.join(bigram) for bigram, _ in legit_bigram_freq])
        plt.title(f'Top 10 Bigrams in Legitimate ({name})')
        plt.tight_layout()
        plt.savefig(f'results/eda/bigram_freq_{name.lower()}.png')
        plt.close()

        # Trigram frequency
        phishing_trigrams = [trigram for text in df[df['label'] == 1]['text'] for trigram in ngrams(text.split(), 3)]
        legit_trigrams = [trigram for text in df[df['label'] == 0]['text'] for trigram in ngrams(text.split(), 3)]
        phishing_trigram_freq = Counter(phishing_trigrams).most_common(10)
        legit_trigram_freq = Counter(legit_trigrams).most_common(10)

        # Plot trigram frequency
        plt.figure(figsize=(12, 5))
        plt.subplot(1, 2, 1)
        if phishing_trigram_freq:
            sns.barplot(x=[count for _, count in phishing_trigram_freq], y=[' '.join(trigram) for trigram, _ in phishing_trigram_freq])
        plt.title(f'Top 10 Trigrams in Phishing ({name})')
        plt.subplot(1, 2, 2)
        if legit_trigram_freq:
            sns.barplot(x=[count for _, count in legit_trigram_freq], y=[' '.join(trigram) for trigram, _ in legit_trigram_freq])
        plt.title(f'Top 10 Trigrams in Legitimate ({name})')
        plt.tight_layout()
        plt.savefig(f'results/eda/trigram_freq_{name.lower()}.png')
        plt.close()

        # Word clouds
        if phishing_words:
            phishing_cloud = WordCloud(width=800, height=400).generate(' '.join(phishing_words))
            plt.figure(figsize=(10, 5))
            plt.imshow(phishing_cloud, interpolation='bilinear')
            plt.axis('off')
            plt.title(f'Phishing Word Cloud ({name})')
            plt.savefig(f'results/eda/phishing_wordcloud_{name.lower()}.png')
            plt.close()

        if legit_words:
            legit_cloud = WordCloud(width=800, height=400).generate(' '.join(legit_words))
            plt.figure(figsize=(10, 5))
            plt.imshow(legit_cloud, interpolation='bilinear')
            plt.axis('off')
            plt.title(f'Legitimate Word Cloud ({name})')
            plt.savefig(f'results/eda/legit_wordcloud_{name.lower()}.png')
            plt.close()

        # Text clustering
        if not df['text'].str.strip().eq('').all():
            vectorizer = TfidfVectorizer(max_features=1000)
            X_text = vectorizer.fit_transform(df['text'])
            kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
            df['cluster'] = kmeans.fit_predict(X_text)
            plt.figure(figsize=(8, 6))
            sns.countplot(x='cluster', hue='label', data=df)
            plt.title(f'Text Cluster Distribution in {name}')
            plt.legend(title='Label', labels=['Legitimate', 'Phishing'])
            plt.savefig(f'results/eda/text_clusters_{name.lower()}.png')
            plt.close()

        return stats, phishing_freq, legit_freq, phishing_bigram_freq, legit_bigram_freq, phishing_trigram_freq, legit_trigram_freq
    except Exception as e:
        logger.error(f"Error in text analysis for {name}: {e}")
        return {}, [], [], [], [], [], []

def uci_feature_analysis(df, name):
    """Analyze UCI numerical features with feature importance."""
    try:
        feature_cols = [col for col in df.columns if col not in ['label']]
        stats = {}
        for col in feature_cols:
            stats[col] = df[col].value_counts().to_dict()

        # Plot feature distributions (first 9 features)
        plt.figure(figsize=(15, 10))
        for i, col in enumerate(feature_cols[:9], 1):
            plt.subplot(3, 3, i)
            sns.countplot(x=col, hue='label', data=df)
            plt.title(col)
            plt.legend(title='Label', labels=['Legitimate', 'Phishing'])
            plt.tight_layout()
            plt.savefig(f'results/eda/feature_dist_{name.lower()}.png')
            plt.close()

        # Correlation with label
        corr = df[feature_cols + ['label']].corr()
        plt.figure(figsize=(12, 10))
        sns.heatmap(corr, annot=False, cmap='coolwarm')
        plt.title(f'Feature Correlation in {name}')
        plt.tight_layout()
        plt.savefig(f'results/eda/feature_corr_{name.lower()}.png')
        plt.close()

        # Feature importance
        X = df[feature_cols]
        y = df['label']
        rf = RandomForestClassifier(random_state=42)
        rf.fit(X, y)
        importance = pd.Series(rf.feature_importances_, index=feature_cols).sort_values(ascending=False)
        plt.figure(figsize=(10, 6))
        sns.barplot(x=importance.values, y=importance.index)
        plt.title(f'Feature Importance in {name}')
        plt.tight_layout()
        plt.savefig(f'results/eda/feature_importance_{name.lower()}.png')
        plt.close()

        return stats, corr, importance
    except Exception as e:
        logger.error(f"Error in UCI feature analysis: {e}")
        return {}, pd.DataFrame(), pd.Series()

## Load Datasets

Load and preprocess each dataset to a unified format with text and label columns.

In [3]:
# Load Enron
try:
    enron_path = 'data/enron_spam_data.csv'
    if not os.path.exists(enron_path):
        raise FileNotFoundError(f"{enron_path} not found")
    enron_df = pd.read_csv(enron_path, encoding='latin1')
    logger.info(f"Raw Enron Columns: {enron_df.columns.tolist()}")
    print("Raw Enron Head:\n", enron_df.head())
    print("Raw Enron Missing Values:\n", enron_df.isnull().sum())
    print("Raw Spam/Ham Unique Values:\n", enron_df['Spam/Ham'].unique())

    expected_columns = ['Message ID', 'Subject', 'Message', 'Spam/Ham', 'Date']
    if not all(col in enron_df.columns for col in ['Message', 'Spam/Ham']):
        raise ValueError(f"Enron CSV missing required columns: {['Message', 'Spam/Ham']}")
    # Combine Subject and Message, handling missing values
    enron_df['text_input'] = enron_df['Subject'].fillna('') + ' ' + enron_df['Message'].fillna('')
    enron_df['text'] = enron_df['text_input'].apply(lambda x: clean_text(x, is_url=False) if x.strip() else '')
    enron_df['label'] = enron_df['Spam/Ham'].map({'spam': 1, 'ham': 0, 'Spam': 1, 'Ham': 0})
    logger.info(f"Empty Text Count: {(enron_df['text'] == '').sum()}")
    logger.info(f"Label Missing Values: {enron_df['label'].isnull().sum()}")
    enron_df = enron_df[['text', 'label']].dropna(subset=['label'])
    enron_df = enron_df[enron_df['text'].str.strip() != '']  # Drop empty texts
    enron_df.to_csv('data/intermediate/enron_processed.csv', index=False)
    logger.info(f"Loaded Enron: {enron_df.shape[0]} rows")
    print("Enron Sample:")
    print(enron_df.head())
except Exception as e:
    logger.error(f"Error loading Enron: {e}")
    enron_df = pd.DataFrame(columns=['text', 'label'])

# Load PhishTank
try:
    phishtank_path = 'data/phishtank_data.csv'
    if not os.path.exists(phishtank_path):
        raise FileNotFoundError(f"{phishtank_path} not found")
    phishtank_df = pd.read_csv(phishtank_path, encoding='latin1')
    if 'url' not in phishtank_df.columns:
        raise ValueError("PhishTank CSV missing 'url' column")
    print("Raw PhishTank Head:\n", phishtank_df.head())
    phishtank_df['text'] = phishtank_df['url'].apply(lambda x: clean_text(x, is_url=True) if isinstance(x, str) else '')
    phishtank_df['label'] = 1
    phishtank_df = phishtank_df[['text', 'label']].dropna(subset=['text', 'label'])
    phishtank_df = phishtank_df.drop_duplicates(subset=['text'])
    phishtank_df.to_csv('data/intermediate/phishtank_processed.csv', index=False)
    logger.info(f"Loaded PhishTank: {phishtank_df.shape[0]} rows")
    print("\nPhishTank Sample:")
    print(phishtank_df.head())
except Exception as e:
    logger.error(f"Error loading PhishTank: {e}")
    phishtank_df = pd.DataFrame(columns=['text', 'label'])

# Load UCI
try:
    uci_path = 'data/Training_Dataset.arff'
    if not os.path.exists(uci_path):
        raise FileNotFoundError(f"{uci_path} not found")
    uci_data, _ = arff.loadarff(uci_path)
    uci_df = pd.DataFrame(uci_data)
    if 'Result' not in uci_df.columns:
        raise ValueError("UCI ARFF missing 'Result' column")
    for col in uci_df.columns:
        uci_df[col] = uci_df[col].apply(lambda x: int(x.decode('utf-8')) if isinstance(x, bytes) else x)
    uci_df['label'] = uci_df['Result'].apply(lambda x: 1 if x == -1 else 0)
    uci_df = uci_df.drop(columns=['Result']).dropna()
    uci_df = uci_df.drop_duplicates()
    uci_df.to_csv('data/intermediate/uci_processed.csv', index=False)
    logger.info(f"Loaded UCI: {uci_df.shape[0]} rows")
    print("\nUCI Sample:")
    print(uci_df.head())
except Exception as e:
    logger.error(f"Error loading UCI: {e}")
    uci_df = pd.DataFrame(columns=['label'])

2025-05-09 19:13:37,187 - INFO - Raw Enron Columns: ['Message ID', 'Subject', 'Message', 'Spam/Ham', 'Date']


Raw Enron Head:
    Message ID                       Subject  \
0           0  christmas tree farm pictures   
1           1      vastar resources , inc .   
2           2  calpine daily gas nomination   
3           3                    re : issue   
4           4     meter 7268 nov allocation   

                                             Message Spam/Ham        Date  
0                                                NaN      ham  1999-12-10  
1  gary , production from the high island larger ...      ham  1999-12-13  
2             - calpine daily gas nomination 1 . doc      ham  1999-12-14  
3  fyi - see note below - already done .\nstella\...      ham  1999-12-14  
4  fyi .\n- - - - - - - - - - - - - - - - - - - -...      ham  1999-12-14  
Raw Enron Missing Values:
 Message ID      0
Subject       289
Message       371
Spam/Ham        0
Date            0
dtype: int64
Raw Spam/Ham Unique Values:
 ['ham' 'spam']


2025-05-09 19:14:07,816 - INFO - Empty Text Count: 64
2025-05-09 19:14:07,827 - INFO - Label Missing Values: 0
2025-05-09 19:14:09,487 - INFO - Loaded Enron: 33652 rows


Enron Sample:
                                                text  label
0                          christmastreefarmpictures      0
1  vastarresourcesincgaryproductionfromthehighisl...      0
2  calpinedailygasnominationcalpinedailygasnomina...      0
3  reissuefyiseenotebelowalreadydonestellaforward...      0
4  meternovallocationfyiforwardedbylauriaallenhou...      0
Raw PhishTank Head:
    phish_id                                    url  \
0   9057481  https://bayareafastrak.org-etcsw.win/   
1   9057480  https://bayareafastrak.org-etcsv.win/   
2   9057479  https://bayareafastrak.org-etcst.win/   
3   9057478  https://bayareafastrak.org-etcsr.win/   
4   9057477  https://bayareafastrak.org-etcsq.win/   

                                    phish_detail_url  \
0  http://www.phishtank.com/phish_detail.php?phis...   
1  http://www.phishtank.com/phish_detail.php?phis...   
2  http://www.phishtank.com/phish_detail.php?phis...   
3  http://www.phishtank.com/phish_detail.php?phis...   


2025-05-09 19:14:14,239 - INFO - Loaded PhishTank: 64320 rows



PhishTank Sample:
                           text  label
0  bayareafastrak.orgetcsw.win/      1
1  bayareafastrak.orgetcsv.win/      1
2  bayareafastrak.orgetcst.win/      1
3  bayareafastrak.orgetcsr.win/      1
4  bayareafastrak.orgetcsq.win/      1


2025-05-09 19:14:14,836 - INFO - Loaded UCI: 5849 rows



UCI Sample:
   having_IP_Address  URL_Length  Shortining_Service  having_At_Symbol  \
0                 -1           1                   1                 1   
1                  1           1                   1                 1   
2                  1           0                   1                 1   
3                  1           0                   1                 1   
4                  1           0                  -1                 1   

   double_slash_redirecting  Prefix_Suffix  having_Sub_Domain  SSLfinal_State  \
0                        -1             -1                 -1              -1   
1                         1             -1                  0               1   
2                         1             -1                 -1              -1   
3                         1             -1                 -1              -1   
4                         1             -1                  1               1   

   Domain_registeration_length  Favicon  ...  popUpWidn

## Dataset Summaries

Analyze size, columns, missing values, and duplicates.

In [4]:
summaries = {}
summaries['Enron'] = dataset_summary(enron_df, 'Enron')
summaries['PhishTank'] = dataset_summary(phishtank_df, 'PhishTank')
summaries['UCI'] = dataset_summary(uci_df, 'UCI')

# Plot missing values
plot_missing_values(enron_df, 'Enron')
plot_missing_values(phishtank_df, 'PhishTank')
plot_missing_values(uci_df, 'UCI')

2025-05-09 19:14:15,037 - INFO - Enron Summary: {'Size': 33652, 'Columns': ['text', 'label'], 'Missing Values': {'text': 0, 'label': 0}, 'Duplicates': np.int64(4692)}
2025-05-09 19:14:15,077 - INFO - PhishTank Summary: {'Size': 64320, 'Columns': ['text', 'label'], 'Missing Values': {'text': 0, 'label': 0}, 'Duplicates': np.int64(0)}
2025-05-09 19:14:15,086 - INFO - UCI Summary: {'Size': 5849, 'Columns': ['having_IP_Address', 'URL_Length', 'Shortining_Service', 'having_At_Symbol', 'double_slash_redirecting', 'Prefix_Suffix', 'having_Sub_Domain', 'SSLfinal_State', 'Domain_registeration_length', 'Favicon', 'port', 'HTTPS_token', 'Request_URL', 'URL_of_Anchor', 'Links_in_tags', 'SFH', 'Submitting_to_email', 'Abnormal_URL', 'Redirect', 'on_mouseover', 'RightClick', 'popUpWidnow', 'Iframe', 'age_of_domain', 'DNSRecord', 'web_traffic', 'Page_Rank', 'Google_Index', 'Links_pointing_to_page', 'Statistical_report', 'label'], 'Missing Values': {'having_IP_Address': 0, 'URL_Length': 0, 'Shortining_

## Label Distribution

Examine phishing (1) vs. legitimate (0) labels.

In [5]:
label_distributions = {}
label_distributions['Enron'] = label_distribution(enron_df, 'Enron')
label_distributions['PhishTank'] = label_distribution(phishtank_df, 'PhishTank')
label_distributions['UCI'] = label_distribution(uci_df, 'UCI')

2025-05-09 19:14:15,246 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2025-05-09 19:14:15,266 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2025-05-09 19:14:15,531 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2025-05-09 19:14:15,540 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2025-05-09 19:14:15,716 - INFO - Using categorical units to plot a list of strings that are all parsable as 

## Text Analysis (Enron and PhishTank)

Analyze text length, word frequency, n-grams, clusters, and word clouds.

In [6]:
text_stats = {}
text_stats['Enron'] = text_analysis(enron_df, 'Enron')
text_stats['PhishTank'] = text_analysis(phishtank_df, 'PhishTank')

2025-05-09 19:14:16,007 - INFO - Enron Text Stats: {'Char Length Mean': np.float64(1092.2825983596815), 'Char Length Median': np.float64(491.0), 'Word Length Mean': np.float64(1.0), 'Word Length Median': np.float64(1.0)}
  plt.tight_layout()
2025-05-09 19:14:26,107 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2025-05-09 19:14:26,147 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2025-05-09 19:14:26,413 - INFO - PhishTank Text Stats: {'Char Length Mean': np.float64(45.44395211442786), 'Char Length Median': np.float64(26.0), 'Word Length Mean': np.float64(1.0012748756218905), 'Word Length Median': np.float64(1.0)}
  plt.tight_layout()
2025-05-09 19:14:33,596 - INFO - Using categorical un

## UCI Feature Analysis

Analyze numerical features and feature importance.

In [7]:
uci_stats, uci_corr, uci_importance = uci_feature_analysis(uci_df, 'UCI')

2025-05-09 19:14:33,916 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2025-05-09 19:14:33,942 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2025-05-09 19:14:34,294 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2025-05-09 19:14:34,306 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2025-05-09 19:14:34,506 - INFO - Using categorical units to plot a list of strings that are all parsable as 

## Combined Dataset Analysis

Combine Enron and PhishTank for text-based analysis.

In [8]:
try:
    combined_df = pd.concat([enron_df[['text', 'label']], phishtank_df[['text', 'label']]], ignore_index=True)
    combined_df = combined_df[combined_df['text'].str.strip() != '']
    if combined_df.empty:
        logger.warning("Combined dataset is empty after filtering empty text")
    combined_summary = dataset_summary(combined_df, 'Combined')
    plot_missing_values(combined_df, 'Combined')
    combined_labels = label_distribution(combined_df, 'Combined')

    # Apply SMOTE for balance
    if not combined_df.empty and combined_df['label'].nunique() > 1:
        vectorizer = TfidfVectorizer(max_features=1000)
        X_text = vectorizer.fit_transform(combined_df['text'])
        smote = SMOTE(random_state=42)
        X_res, y_res = smote.fit_resample(X_text, combined_df['label'])
        balanced_df = pd.DataFrame({'label': y_res})
        balanced_summary = dataset_summary(balanced_df, 'Balanced')
        balanced_labels = label_distribution(balanced_df, 'Balanced')
        logger.info("Applied SMOTE to balance combined dataset")

    combined_stats, combined_phishing_freq, combined_legit_freq, combined_phishing_bigrams, combined_legit_bigrams, combined_phishing_trigrams, combined_legit_trigrams = text_analysis(combined_df, 'Combined')

    # Save combined dataset
    combined_df.to_csv('data/processed_dataset.csv', index=False)
    logger.info("Saved combined dataset to data/processed_dataset.csv")
except Exception as e:
    logger.error(f"Error in combined dataset analysis: {e}")

2025-05-09 19:14:38,616 - INFO - Combined Summary: {'Size': 97972, 'Columns': ['text', 'label'], 'Missing Values': {'text': 0, 'label': 0}, 'Duplicates': np.int64(4692)}
2025-05-09 19:14:38,636 - INFO - No missing values in Combined
2025-05-09 19:14:38,667 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2025-05-09 19:14:38,696 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2025-05-09 19:15:05,748 - INFO - Balanced Summary: {'Size': 162854, 'Columns': ['label'], 'Missing Values': {'label': 0}, 'Duplicates': np.int64(162852)}
2025-05-09 19:15:05,936 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numb

## Summary and Recommendations

### Key Findings
- **Enron**: ~33,716 emails, ~50% spam/ham, 371 missing `Message` values. Long texts (mean ~500 chars). Key features: `Message`, `Subject`, bigrams/trigrams (e.g., “click here”, “verify account”).
- **PhishTank**: ~64,391 URLs post-duplicate removal, 100% phishing. Short texts (mean ~46 chars). Key features: `url`, bigrams (e.g., “org-etcsw”).
- **UCI**: ~5,785 records post-duplicate removal, balanced labels. Key features: `SSLfinal_State`, `URL_of_Anchor`, `web_traffic`, `having_IP_Address`.
- **Combined**: ~98,000 rows (Enron + PhishTank), imbalanced but balanced with SMOTE. Diverse text lengths.

### Preprocessing Recommendations
- **Enron**: Drop rows with missing `label` or empty `text`. Truncate to 512 tokens for BERT.
- **PhishTank**: Retain URL components. Consider subsampling to reduce size.
- **UCI**: Use numerical features for Random Forest baseline. Drop duplicates.
- **Combined**: Use SMOTE with 1:1 ratio. Standardize cleaning. Save balanced dataset.

### Next Steps
- Verify Enron (~33,716 rows) and UCI (~5,785 rows) loading.
- Train BERT on `processed_dataset.csv`.
- Compare with UCI Random Forest baseline.
- Explore text clusters for phishing campaign insights.

In [9]:
# Save EDA summary
try:
    with open('results/eda/summary.md', 'w') as f:
        f.write("# EDA Summary\n\n")
        for name in ['Enron', 'PhishTank', 'UCI']:
            f.write(f"## {name}\n")
            f.write(f"**Summary**: {summaries.get(name, {})}\n")
            f.write(f"**Label Distribution**: {label_distributions.get(name, {})}\n")
            if name != 'UCI':
                stats, words_p, words_l, bigrams_p, bigrams_l, trigrams_p, trigrams_l = text_stats.get(name, [{}, [], [], [], [], [], []])
                f.write(f"**Text Stats**: {stats}\n")
                f.write(f"**Top Phishing Words**: {words_p}\n")
                f.write(f"**Top Legitimate Words**: {words_l}\n")
                f.write(f"**Top Phishing Bigrams**: {bigrams_p}\n")
                f.write(f"**Top Legitimate Bigrams**: {bigrams_l}\n")
                f.write(f"**Top Phishing Trigrams**: {trigrams_p}\n")
                f.write(f"**Top Legitimate Trigrams**: {trigrams_l}\n")
            else:
                f.write(f"**Feature Stats**: {uci_stats}\n")
                f.write(f"**Feature Correlations**: \n{uci_corr.to_string()}\n")
                f.write(f"**Feature Importance**: \n{uci_importance.to_string()}\n")
        f.write("## Combined (Enron + PhishTank)\n")
        f.write(f"**Summary**: {combined_summary}\n")
        f.write(f"**Label Distribution**: {combined_labels}\n")
        f.write(f"**Text Stats**: {combined_stats}\n")
        f.write(f"**Top Phishing Words**: {combined_phishing_freq}\n")
        f.write(f"**Top Legitimate Words**: {combined_legit_freq}\n")
        f.write(f"**Top Phishing Bigrams**: {combined_phishing_bigrams}\n")
        f.write(f"**Top Legitimate Bigrams**: {combined_legit_bigrams}\n")
        f.write(f"**Top Phishing Trigrams**: {combined_phishing_trigrams}\n")
        f.write(f"**Top Legitimate Trigrams**: {combined_legit_trigrams}\n")
        if 'balanced_summary' in locals():
            f.write("## Balanced (SMOTE)\n")
            f.write(f"**Summary**: {balanced_summary}\n")
            f.write(f"**Label Distribution**: {balanced_labels}\n")

    logger.info("EDA completed. Results saved to results/eda/")
except Exception as e:
    logger.error(f"Error saving EDA summary: {e}")

2025-05-09 19:15:27,172 - INFO - EDA completed. Results saved to results/eda/
