# In [None]:
# [1] Setup and Data Loading
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from transformers import AutoTokenizer

# Verify paths match your actual structure
DATA_DIR = Path("../data/extracted/phishing")
# Raise explicitly instead of asserting: assert statements are removed when
# Python runs with -O, which would silently skip this check.
if not DATA_DIR.exists():
    raise FileNotFoundError(f"Path not found: {DATA_DIR}")

# Load the raw CSV and list its columns so the schema can be checked by eye.
df = pd.read_csv(DATA_DIR / "enron.csv")
print("Raw data loaded. Columns:", df.columns.tolist())

# [2] Preprocess and Class Balance Check
# Build one text field per row: subject, a blank line, then body.
# Missing subjects/bodies are replaced with empty strings before joining.
subject_part = df['subject'].fillna('')
body_part = df['body'].fillna('')
df['text'] = subject_part + "\n\n" + body_part

# Number of rows per distinct label value.
class_counts = df['label'].value_counts()
print(f"\nClass counts:\n{class_counts}")

# [3] Token Length Analysis (Critical!)
# Single source of truth for the length cutoff: it positions the red line and
# drives the over-length share shown in the plot title.
MAX_TOKEN_LEN = 256

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
# Per-row count of the pieces returned by tokenizer.tokenize().
# NOTE(review): tokenize() output presumably excludes the special tokens that
# full encoding adds -- confirm whether the cutoff should account for them.
df['token_count'] = df['text'].apply(lambda x: len(tokenizer.tokenize(x)))

token_counts = df['token_count']
longest = token_counts.max()
# Mean of a boolean mask = fraction of rows above the cutoff.
over_share = (token_counts > MAX_TOKEN_LEN).mean()

plt.figure(figsize=(10,4))
token_counts.hist(bins=50)
plt.axvline(x=MAX_TOKEN_LEN, color='r', linestyle='--')
plt.title(f"Token Counts (Max={longest}, >{MAX_TOKEN_LEN}={over_share:.1%})")
plt.show()

# [4] Phishing Keyword Check
keywords = ['click', 'urgent', 'password', 'verify', 'account']

# Select the label==1 texts once; the subset does not change between keywords.
phishing_text = df.loc[df['label'] == 1, 'text']

print("\nPhishing keyword frequency:")
for word in keywords:
    # Case-insensitive substring match. regex=False because the keywords are
    # plain literals, so they should never be interpreted as patterns.
    freq = phishing_text.str.contains(word, case=False, regex=False).mean()
    print(f"{word:<10}: {freq:.1%} in phishing emails")

# [5] Sample Inspection
# Print the first 300 characters of the first email for each label value
# (0 is printed under the "Legitimate" heading, 1 under "Phishing").
for label_value, class_name in ((0, "Legitimate"), (1, "Phishing")):
    print(f"\n=== Sample {class_name} Email ===")
    class_text = df.loc[df['label'] == label_value, 'text']
    if class_text.empty:
        # .iloc[0] would raise IndexError when no row carries this label.
        print("(no emails with this label)")
        continue
    print(class_text.iloc[0][:300] + "...")