# In [None]:
# [1] Enhanced Data Loading
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

# Load the Enron CSV once and split it by label (0 -> legitimate, 1 -> phishing).
# Parsing the file a single time avoids reading the same CSV from disk twice.
data_dir = Path("../data/extracted/phishing")
emails = pd.read_csv(data_dir / "enron.csv")
legit = emails[emails['label'] == 0]
phish = emails[emails['label'] == 1]


def _with_text(frame, kind):
    """Return a copy of *frame* with a combined ``text`` column and a ``type`` tag.

    ``text`` is the subject and body joined by a blank line; missing values
    are treated as empty strings so the concatenation never yields NaN.
    """
    return frame.assign(
        text=frame['subject'].fillna('') + "\n\n" + frame['body'].fillna(''),
        type=kind,
    )


# Combine subject + body and stack both classes into one frame.
# The original row index from the CSV is kept as-is.
df = pd.concat([
    _with_text(legit, "legitimate"),
    _with_text(phish, "phishing"),
])

# [2] Enhanced Class Analysis
# Two side-by-side panels: class balance (left) and text-length density (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: share of each class as a pie chart.
class_counts = df["type"].value_counts()
class_counts.plot.pie(autopct='%.1f%%', title="Class Distribution", ax=ax1)

# Right panel: one kernel-density curve of character counts per class.
text_lengths = df['text'].str.len()
for label in df['type'].unique():
    text_lengths[df['type'] == label].plot.kde(ax=ax2, label=label)
ax2.set_title("Text Length Distribution")
ax2.legend()

plt.tight_layout()
plt.show()

# [3] Vocabulary Analysis (Critical for LLMs)
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

# Top phishing terms: the 20 most frequent whitespace-separated, lower-cased
# tokens across phishing subjects and bodies.
# Missing subjects/bodies are replaced with '' first; otherwise the string
# concatenation produces NaN for that row and " ".join() raises TypeError.
phish_text = phish['subject'].fillna('') + " " + phish['body'].fillna('')
phish_words = Counter(" ".join(phish_text).lower().split()).most_common(20)
print("Top phishing terms:", phish_words)

# [4] Sample Comparison
# Print the first 500 characters of the first email of each class,
# legitimate first, then phishing.
for heading, kind in (
    ("\n=== Legitimate Sample ===", "legitimate"),
    ("\n=== Phishing Sample ===", "phishing"),
):
    print(heading)
    print(df.loc[df['type'] == kind, 'text'].iloc[0][:500] + "...")