# LLM Phishing Email Detection Evaluation

In [1]:
import pandas as pd
import numpy as np
import sys
import os
from pathlib import Path

# Add src directory to path to import our API modules.
# NOTE: 'src' is a relative path, so the two imports below only resolve when
# the notebook's working directory is the folder that contains src/.
sys.path.append('src')

# Project-local modules, located through the sys.path entry added above
from api_provider import LLM
from api_call import make_api_call

print("Dependencies imported successfully!")


Dependencies imported successfully!


In [2]:
# Load the unified phishing email dataset
df = pd.read_csv('unified_phishing_email_dataset.csv')
print(f"Dataset loaded with {len(df)} records")
print(f"Columns: {list(df.columns)}")
print(f"\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emits a stray "None" line.
df.info()


Dataset loaded with 217204 records
Columns: ['subject', 'body', 'label', 'source']

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217204 entries, 0 to 217203
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   subject  214600 non-null  object
 1   body     217201 non-null  object
 2   label    217204 non-null  int64 
 3   source   217204 non-null  object
dtypes: int64(1), object(3)
memory usage: 6.6+ MB
None


In [3]:
# Missing cells become empty strings so that the later string slicing and
# prompt formatting always receive str values instead of NaN.
df = df.fillna(value="")

# Confirm that every column now reports zero missing entries
remaining_nulls = df.isna().sum()
print("After replacing NaN values:")
print(remaining_nulls)

After replacing NaN values:
subject    0
body       0
label      0
source     0
dtype: int64


In [4]:
# Draw a fixed-size random sample of emails for evaluation
SAMPLE_SIZE = 1000   # number of emails sent to the LLM
SAMPLE_SEED = 611    # random_state for DataFrame.sample; this alone fixes which rows are drawn
PREVIEW_COUNT = 5    # how many of the sampled emails are printed below

# NOTE: this seeds NumPy's global RNG only. The sample itself is determined by
# SAMPLE_SEED, because an explicit random_state is passed to DataFrame.sample.
np.random.seed(42)
sample_df = df.sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED).reset_index(drop=True)

# Preview only the first few emails; printing the whole sample floods the
# notebook output without adding information.
print("Sample emails for evaluation:")
print(f"(showing the first {min(PREVIEW_COUNT, len(sample_df))} of {len(sample_df)})")
for i, row in sample_df.head(PREVIEW_COUNT).iterrows():
    print(f"\n--- Email {i+1} ---")
    print(f"Subject: {row['subject'][:100]}...")
    print(f"Body: {row['body'][:200]}...")
    print(f"True Label: {'Phishing' if row['label'] == 1 else 'Legitimate'}")
    print(f"Source: {row['source']}")


Sample emails for evaluation:

--- Email 1 ---
Subject: Re: [UAI] Imprecise Probabilities--A simple and yet computationally nontrivial problem...
Body: Dear Prof. Zadeh (and others),

My view is that the notion of imprecise probabilities is not well-defined.
It seems that you are imagining something like, before the experiment is
performed, the proba...
True Label: Legitimate
Source: CEAS_08

--- Email 2 ---
Subject: ...
Body: please don't write.  we're working on getting it released.

-rob

...
True Label: Legitimate
Source: TREC_06

--- Email 3 ---
Subject: But lonetree none kinnear...
Body: THIS ONE IS BEING PROMOTED, TAKE ADVANTAGE
Special Situation Alert

TMXO

Trimax. are providers of Broadband over Power Line (BPL) 
communication technologies. 

OTC:TMXO - Last: 0.20 (up 8%, it is ju...
True Label: Phishing
Source: TREC_07

--- Email 4 ---
Subject: poznan linguistic meeting 1999...
Body: plm 1999 32nd poznan linguistic meeting recent developments in linguistic theory 30 april-2 

In [5]:
# see src/api_provider to add more providers
PROVIDER = "local"
MODEL = "qwen3-30b-a3b"

try:
    llm_instance = LLM(provider=PROVIDER, model=MODEL)
    llm = llm_instance.get_llm()
except Exception as e:
    print(f"Error initializing LLM: {e}")
    print("Please make sure you have set the appropriate API key environment variable.")
    # Re-raise instead of continuing: without `llm` every request in the
    # evaluation loop would fail, producing a results table made only of errors.
    raise
else:
    print(f"LLM initialized successfully: {PROVIDER} - {MODEL}")


LLM initialized successfully: local - qwen3-30b-a3b


In [6]:
# Define the system prompt for phishing detection.
# It is passed unchanged as the system message of every make_api_call() in the
# evaluation loop. The "Respond with" section names the two fields (Phishing,
# Reason) that the loop later reads from the response object.
SYSTEM_PROMPT = """
You are an expert cybersecurity analyst specializing in phishing email detection.
Your task is to analyze email content and determine if it's a phishing email or legitimate.

Guidelines:
- Phishing emails typically contain deceptive content designed to steal credentials, personal information, or money
- Look for suspicious indicators like urgent language, suspicious links, grammatical errors, impersonation attempts
- Legitimate emails are from real organizations and don't attempt to deceive recipients

Respond with:
- Phishing: true if the email is phishing, false if legitimate
- Reason: Brief explanation of your decision
"""

def create_user_prompt(subject, body):
    """Build the user message asking the model to classify one email.

    Args:
        subject: Subject line of the email; interpolated as-is.
        body: Body text of the email; interpolated as-is.

    Returns:
        A string that starts and ends with a newline and contains the
        instruction, the subject, the body and a closing request, each
        separated by a blank line.
    """
    # Empty entries produce the leading/trailing newline and the blank
    # separator lines once everything is joined with "\n".
    prompt_lines = (
        "",
        "Please analyze the following email and determine if it's phishing or legitimate:",
        "",
        f"Subject: {subject}",
        "",
        f"Body: {body}",
        "",
        "Analyze this email and provide your assessment.",
        "",
    )
    return "\n".join(prompt_lines)

print("Prompt templates defined successfully!")


Prompt templates defined successfully!


In [7]:
# Evaluate LLM performance on sample emails
def _build_result(email_id, row, predicted_label, reason):
    """Assemble the result record for one evaluated email.

    Args:
        email_id: 1-based position of the email in the sample.
        row: The sample row; its 'subject', 'label' and 'source' fields are read.
        predicted_label: 1 (phishing) or 0 (legitimate), or None when the
            request failed and no prediction is available.
        reason: The model's explanation, or the error text for a failed request.

    Returns:
        A dict with the keys used by the reporting and metrics cells.
    """
    actual_label = row['label']
    if predicted_label is None:
        # Failed requests are recorded as incorrect 'Error' rows
        predicted_class = 'Error'
        is_correct = False
    else:
        predicted_class = 'Phishing' if predicted_label == 1 else 'Legitimate'
        is_correct = predicted_label == actual_label

    return {
        'email_id': email_id,
        'subject': row['subject'][:50] + '...',
        'actual_label': actual_label,
        'actual_class': 'Phishing' if actual_label == 1 else 'Legitimate',
        'predicted_label': predicted_label,
        'predicted_class': predicted_class,
        'is_correct': is_correct,
        'llm_reason': reason,
        'source': row['source']
    }


results = []

for i, row in sample_df.iterrows():
    email_id = i + 1
    print("="*80)
    print(f"\nEvaluating Email {email_id}...")

    # Create user prompt
    user_prompt = create_user_prompt(row['subject'], row['body'])

    # Only the API call and the reading of its fields are guarded. Keeping the
    # bookkeeping and printing outside the try guarantees exactly one record
    # per email: a failure after the append can no longer add a second
    # 'Error' record for an email that was already stored.
    try:
        response = make_api_call(llm, SYSTEM_PROMPT, user_prompt)
        predicted_label = 1 if response.Phishing else 0
        reason = response.Reason
    except Exception as e:
        print(f"  Error processing email {email_id}: {e}")
        results.append(_build_result(email_id, row, None, f'Error: {str(e)}'))
        continue

    result = _build_result(email_id, row, predicted_label, reason)
    results.append(result)

    print(f"  Actual: {result['actual_class']}")
    print(f"  Predicted: {result['predicted_class']}")
    print(f"  Correct: {result['is_correct']}")
    # str() keeps the preview from raising if the reason is not a string
    print(f"  Reason: {str(reason)[:100]}...")

print("\nEvaluation completed")

# Surface failures explicitly so they are not lost in the per-email log
failed_count = sum(1 for r in results if r['predicted_label'] is None)
if failed_count:
    print(f"{failed_count} email(s) could not be evaluated and are recorded as 'Error'")



Evaluating Email 1...
Phishing=False Reason="The email appears to be a legitimate academic discussion about imprecise probabilities. It's a follow-up to a previous message from Lotfi Zadeh, a well-known computer science professor, discussing technical aspects of probability theory. There are no suspicious links, urgent requests for personal information, or other typical phishing indicators."
  Actual: Legitimate
  Predicted: Legitimate
  Correct: True
  Reason: The email appears to be a legitimate academic discussion about imprecise probabilities. It's a follo...

Evaluating Email 2...
Phishing=False Reason="The email is extremely brief, lacks any specific details, and appears to be a simple message from someone named Rob. There's no indication of phishing attempts such as requests for sensitive information or suspicious links."
  Actual: Legitimate
  Predicted: Legitimate
  Correct: True
  Reason: The email is extremely brief, lacks any specific details, and appears to be a simple me

In [8]:
# Turn the collected records into a DataFrame and show them in two views:
# a compact comparison table, then the model's explanation for each email.
results_df = pd.DataFrame(results)

summary_columns = ['email_id', 'subject', 'actual_class', 'predicted_class', 'is_correct']
print("=== DETAILED RESULTS ===")
print(results_df[summary_columns].to_string(index=False))

print("\n=== LLM REASONING ===")
for record in results:
    # One print call with a newline separator emits the same three lines
    print(
        f"\nEmail {record['email_id']}: {record['subject']}",
        f"Prediction: {record['predicted_class']}",
        f"Reason: {record['llm_reason']}",
        sep="\n",
    )


=== DETAILED RESULTS ===
 email_id                                                subject actual_class predicted_class  is_correct
        1  Re: [UAI] Imprecise Probabilities--A simple and ye...   Legitimate      Legitimate        True
        2                                                    ...   Legitimate      Legitimate        True
        3                           But lonetree none kinnear...     Phishing      Legitimate       False
        4                      poznan linguistic meeting 1999...   Legitimate      Legitimate        True
        5           TRV Notification:  (EMW P/L - 11/21/2001)...   Legitimate      Legitimate        True
        6  Commissioner.COM E-Reports for Get Bad with Yourse...   Legitimate      Legitimate        True
        7                              Perfected RX Discounts...     Phishing        Phishing        True
        8              Re: [R] subset arg in (modified) evalq...   Legitimate      Legitimate        True
        9  [UAI] Curr

In [9]:
# Calculate performance metrics
# Only emails for which a prediction exists are scored; records whose
# 'predicted_label' is None (failed requests) are left out of every metric.
valid_results = [r for r in results if r['predicted_label'] is not None]
total_emails = len(valid_results)
correct_predictions = sum(1 for r in valid_results if r['is_correct'])
skipped_emails = len(results) - total_emails

if total_emails > 0:
    accuracy = correct_predictions / total_emails

    # Calculate confusion matrix components (label 1 = phishing, 0 = legitimate)
    tp = sum(1 for r in valid_results if r['actual_label'] == 1 and r['predicted_label'] == 1)  # True Positive
    tn = sum(1 for r in valid_results if r['actual_label'] == 0 and r['predicted_label'] == 0)  # True Negative
    fp = sum(1 for r in valid_results if r['actual_label'] == 0 and r['predicted_label'] == 1)  # False Positive
    fn = sum(1 for r in valid_results if r['actual_label'] == 1 and r['predicted_label'] == 0)  # False Negative

    print("=== PERFORMANCE METRICS ===")
    print(f"Total emails evaluated: {total_emails}")
    if skipped_emails:
        # Make the exclusion visible: accuracy is computed without these emails
        print(f"Emails excluded due to errors: {skipped_emails}")
    print(f"Correct predictions: {correct_predictions}")
    print(f"Accuracy: {accuracy:.2%}")

    print("\n=== CONFUSION MATRIX ===")
    print(f"True Positives (Phishing correctly identified): {tp}")
    print(f"True Negatives (Legitimate correctly identified): {tn}")
    print(f"False Positives (Legitimate misclassified as Phishing): {fp}")
    print(f"False Negatives (Phishing misclassified as Legitimate): {fn}")

    # Precision and recall are undefined when their denominator is zero;
    # None marks "not computable" and the metric is then not printed.
    precision = tp / (tp + fp) if tp + fp > 0 else None
    recall = tp / (tp + fn) if tp + fn > 0 else None

    if precision is not None:
        print(f"\nPrecision: {precision:.2%}")

    if recall is not None:
        print(f"Recall: {recall:.2%}")

    if precision is not None and recall is not None:
        # With tp == 0 both precision and recall are 0 and the harmonic mean
        # would divide by zero; F1 is defined as 0 in that case.
        if precision + recall > 0:
            f1_score = 2 * (precision * recall) / (precision + recall)
        else:
            f1_score = 0.0
        print(f"F1 Score: {f1_score:.2%}")

else:
    print("No valid results to calculate metrics.")


=== PERFORMANCE METRICS ===
Total emails evaluated: 1000
Correct predictions: 924
Accuracy: 92.40%

=== CONFUSION MATRIX ===
True Positives (Phishing correctly identified): 414
True Negatives (Legitimate correctly identified): 510
False Positives (Legitimate misclassified as Phishing): 12
False Negatives (Phishing misclassified as Legitimate): 64

Precision: 97.18%
Recall: 86.61%
F1 Score: 91.59%


In [10]:
# Persist the per-email records so they can be analysed without re-querying the LLM
results_df.to_csv('llm_evaluation_results.csv', index=False)
print("Results saved to 'llm_evaluation_results.csv'")

# Collect the run summary first, then emit it in a single print
summary_lines = [
    "\n=== EVALUATION SUMMARY ===",
    f"Provider: {PROVIDER}",
    f"Model: {MODEL}",
    f"Sample size: {len(sample_df)} emails",
    f"Successfully processed: {len(valid_results)} emails",
]
# accuracy only exists when at least one email was scored
if total_emails > 0:
    summary_lines.append(f"Overall accuracy: {accuracy:.2%}")
print("\n".join(summary_lines))


Results saved to 'llm_evaluation_results.csv'

=== EVALUATION SUMMARY ===
Provider: local
Model: qwen3-30b-a3b
Sample size: 1000 emails
Successfully processed: 1000 emails
Overall accuracy: 92.40%
