# LLM Phishing Email Detection Evaluation

In [1]:
import pandas as pd
import numpy as np
import sys
import os
from pathlib import Path

# Add src directory to path to import our API modules
sys.path.append('src')

from api_provider import LLM
from api_call import make_api_call

print("Dependencies imported successfully!")


Dependencies imported successfully!


In [2]:
# Load the unified phishing email dataset
df = pd.read_csv('unified_phishing_email_dataset.csv')
print(f"Dataset loaded with {len(df)} records")
print(f"Columns: {list(df.columns)}")
print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line after the report.
df.info()


Dataset loaded with 217204 records
Columns: ['subject', 'body', 'label', 'source']

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217204 entries, 0 to 217203
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   subject  214600 non-null  object
 1   body     217201 non-null  object
 2   label    217204 non-null  int64 
 3   source   217204 non-null  object
dtypes: int64(1), object(3)
memory usage: 6.6+ MB
None


In [3]:
# Sample 100 random records for evaluation
np.random.seed(42)  # For reproducibility
sample_df = df.sample(n=100, random_state=42).reset_index(drop=True)

print("Sample emails for evaluation:")
for i, row in sample_df.iterrows():
    # The 'subject' and 'body' columns contain nulls, which pandas represents
    # as NaN (a float). NaN cannot be sliced, so preview missing fields as
    # empty text instead of raising TypeError part-way through the listing.
    subject = row['subject'] if isinstance(row['subject'], str) else ''
    body = row['body'] if isinstance(row['body'], str) else ''

    print(f"\n--- Email {i+1} ---")
    print(f"Subject: {subject[:100]}...")
    print(f"Body: {body[:200]}...")
    print(f"True Label: {'Phishing' if row['label'] == 1 else 'Legitimate'}")
    print(f"Source: {row['source']}")


Sample emails for evaluation:

--- Email 1 ---
Subject: cognitive linguistics...
Body: possession : cognitive sources , forces and grammaticalization bernd heine ( university of cologne ) ; possession : cognitive sources , forces and grammaticalization ; isbn : 0-521 - 55037 - 8 ; hardb...
True Label: Legitimate
Source: Ling

--- Email 2 ---
Subject: powerisk 2000 - more cocktail info...
Body: - - - - - - - - - - - - - - - - - - - - - - forwarded by iona maclean / lon / ect on 22 / 09 / 2000 12 : 24
- - - - - - - - - - - - - - - - - - - - - - - - - - -
enron capital & trade resources corp ....
True Label: Legitimate
Source: Enron

--- Email 3 ---
Subject: Re: AC adaptor ...
Body: Please see the FAQ for the answer to the adapter question:   http://el.www.media.mit.edu/projects/handy-board/faq/ re: the L1, that substitution may or may not work.  try it; if your motors don't work...
True Label: Legitimate
Source: TREC_06

--- Email 4 ---
Subject: CNN.com Daily Top 10...
Body: >+=+=+=+=+=+

In [4]:
# see src/api_provider to add more providers
PROVIDER = "local"
MODEL = "qwen3-30b-a3b"

# Build the LLM client used by every later cell.
try:
    llm_instance = LLM(provider=PROVIDER, model=MODEL)
    llm = llm_instance.get_llm()
except Exception as e:
    print(f"Error initializing LLM: {e}")
    print("Please make sure you have set the appropriate API key environment variable.")
    # Re-raise so the notebook stops here. Swallowing the error would leave
    # `llm` undefined and every later API call would fail, producing a
    # results table made entirely of per-email errors.
    raise
else:
    print(f"LLM initialized successfully: {PROVIDER} - {MODEL}")


LLM initialized successfully: local - qwen3-30b-a3b


In [5]:
# System prompt for phishing detection.
# It asks the model to answer with a `Phishing` flag and a `Reason` text; the
# evaluation loop reads these back as `response.Phishing` and `response.Reason`.
SYSTEM_PROMPT = """
You are an expert cybersecurity analyst specializing in phishing email detection.
Your task is to analyze email content and determine if it's a phishing email or legitimate.

Guidelines:
- Phishing emails typically contain deceptive content designed to steal credentials, personal information, or money
- Look for suspicious indicators like urgent language, suspicious links, grammatical errors, impersonation attempts
- Legitimate emails are from real organizations and don't attempt to deceive recipients

Respond with:
- Phishing: true if the email is phishing, false if legitimate
- Reason: Brief explanation of your decision
"""

def _text_or_empty(value):
    """Return `value` as text, mapping missing values (None or NaN) to ''."""
    # NaN is the only float that is not equal to itself; pandas uses it for
    # empty CSV fields, and formatting it directly would yield the text "nan".
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value)


def create_user_prompt(subject, body):
    """Build the user message asking the model to classify one email.

    Args:
        subject: Email subject line. None/NaN is treated as an empty subject.
        body: Email body text. None/NaN is treated as an empty body.

    Returns:
        The prompt string containing the subject and body, followed by the
        request for an assessment.
    """
    subject = _text_or_empty(subject)
    body = _text_or_empty(body)
    return f"""
Please analyze the following email and determine if it's phishing or legitimate:

Subject: {subject}

Body: {body}

Analyze this email and provide your assessment.
"""

print("Prompt templates defined successfully!")


Prompt templates defined successfully!


In [6]:
# Evaluate LLM performance on sample emails.
# Exactly one record is appended to `results` per email; records for failed
# calls have predicted_label=None and predicted_class='Error'.
results = []

for i, row in sample_df.iterrows():
    print("="*80)
    print(f"\nEvaluating Email {i+1}...")

    # Empty CSV fields arrive as NaN (a float). Normalise them to '' up front
    # so that neither the prompt, the subject preview, nor the error path can
    # fail on a non-string value (the old error handler sliced row['subject']
    # itself and would crash the whole loop on a missing subject).
    subject = row['subject'] if isinstance(row['subject'], str) else ''
    body = row['body'] if isinstance(row['body'], str) else ''
    actual_label = row['label']

    # Create user prompt
    user_prompt = create_user_prompt(subject, body)

    try:
        # Make API call
        response = make_api_call(llm, SYSTEM_PROMPT, user_prompt)

        # Extract prediction
        predicted_phishing = response.Phishing
        predicted_label = 1 if predicted_phishing else 0
        predicted_class = 'Phishing' if predicted_label == 1 else 'Legitimate'

        # Calculate if prediction is correct
        is_correct = predicted_label == actual_label
        llm_reason = response.Reason

        # Build the console preview inside the try block: if the response is
        # malformed this fails *before* anything is appended, so the email is
        # recorded once (as an error) instead of twice.
        reason_preview = f"{llm_reason[:100]}..."
    except Exception as e:
        print(f"  Error processing email {i+1}: {e}")
        predicted_label = None
        predicted_class = 'Error'
        is_correct = False
        llm_reason = f'Error: {str(e)}'
        reason_preview = None

    # Single record layout shared by the success and error paths; key order
    # determines the column order of the results DataFrame/CSV.
    result = {
        'email_id': i+1,
        'subject': subject[:50] + '...',
        'actual_label': actual_label,
        'actual_class': 'Phishing' if actual_label == 1 else 'Legitimate',
        'predicted_label': predicted_label,
        'predicted_class': predicted_class,
        'is_correct': is_correct,
        'llm_reason': llm_reason,
        'source': row['source']
    }
    results.append(result)

    if predicted_label is not None:
        print(f"  Actual: {result['actual_class']}")
        print(f"  Predicted: {result['predicted_class']}")
        print(f"  Correct: {is_correct}")
        print(f"  Reason: {reason_preview}")

print("\nEvaluation completed")



Evaluating Email 1...
Phishing=False Reason='The email appears to be a legitimate academic book listing from Cambridge University Press, providing detailed information about various books on cognitive linguistics. It contains no suspicious links, urgent requests, or deceptive content.'
  Actual: Legitimate
  Predicted: Legitimate
  Correct: True
  Reason: The email appears to be a legitimate academic book listing from Cambridge University Press, providin...

Evaluating Email 2...
Phishing=False Reason='The email appears to be a legitimate business communication from Powerisk 2000 event organizers. It contains standard corporate formatting, proper contact information, and the subject line mentions an important invitation with high priority. The content is about event invitations and there are no obvious signs of phishing such as suspicious links or requests for sensitive information.'
  Actual: Legitimate
  Predicted: Legitimate
  Correct: True
  Reason: The email appears to be a legit

In [7]:
# Turn the per-email records into a DataFrame and print a compact overview
results_df = pd.DataFrame(results)

print("=== DETAILED RESULTS ===")
overview_columns = ['email_id', 'subject', 'actual_class', 'predicted_class', 'is_correct']
overview_table = results_df[overview_columns].to_string(index=False)
print(overview_table)

# Full model explanation for every email, one three-line entry each
print("\n=== LLM REASONING ===")
for result in results:
    print(
        f"\nEmail {result['email_id']}: {result['subject']}\n"
        f"Prediction: {result['predicted_class']}\n"
        f"Reason: {result['llm_reason']}"
    )


=== DETAILED RESULTS ===
 email_id                                               subject actual_class predicted_class  is_correct
        1                              cognitive linguistics...   Legitimate      Legitimate        True
        2                 powerisk 2000 - more cocktail info...   Legitimate      Legitimate        True
        3                                    Re: AC adaptor ...   Legitimate      Legitimate        True
        4                               CNN.com Daily Top 10...     Phishing      Legitimate       False
        5 ERV Notification:  (Violation/Notification Memo - ...   Legitimate      Legitimate        True
        6                                      RE: NOx Model...   Legitimate      Legitimate        True
        7                 REMINDER: Officers Meeting Tonight...   Legitimate      Legitimate        True
        8         Free Full Length Tee_n Movies For Download...     Phishing        Phishing        True
        9                 1 / 

In [8]:
# Calculate performance metrics
# Records whose API call failed carry predicted_label=None; they are excluded
# so the metrics describe only emails the model actually classified.
valid_results = [r for r in results if r['predicted_label'] is not None]
total_emails = len(valid_results)
correct_predictions = sum(1 for r in valid_results if r['is_correct'])

if total_emails > 0:
    accuracy = correct_predictions / total_emails

    # Calculate confusion matrix components (label 1 = phishing, 0 = legitimate)
    tp = sum(1 for r in valid_results if r['actual_label'] == 1 and r['predicted_label'] == 1)  # True Positive
    tn = sum(1 for r in valid_results if r['actual_label'] == 0 and r['predicted_label'] == 0)  # True Negative
    fp = sum(1 for r in valid_results if r['actual_label'] == 0 and r['predicted_label'] == 1)  # False Positive
    fn = sum(1 for r in valid_results if r['actual_label'] == 1 and r['predicted_label'] == 0)  # False Negative

    print("=== PERFORMANCE METRICS ===")
    print(f"Total emails evaluated: {total_emails}")
    print(f"Correct predictions: {correct_predictions}")
    print(f"Accuracy: {accuracy:.2%}")

    print("\n=== CONFUSION MATRIX ===")
    print(f"True Positives (Phishing correctly identified): {tp}")
    print(f"True Negatives (Legitimate correctly identified): {tn}")
    print(f"False Positives (Legitimate misclassified as Phishing): {fp}")
    print(f"False Negatives (Phishing misclassified as Legitimate): {fn}")

    # Calculate precision, recall, F1 if applicable
    # (each is skipped when its denominator would be zero)
    if tp + fp > 0:
        precision = tp / (tp + fp)
        print(f"\nPrecision: {precision:.2%}")

    if tp + fn > 0:
        recall = tp / (tp + fn)
        print(f"Recall: {recall:.2%}")

    if tp + fp > 0 and tp + fn > 0:
        # With tp == 0 both precision and recall are 0, so the harmonic mean
        # would divide by zero; F1 is reported as 0 in that case.
        if precision + recall > 0:
            f1_score = 2 * (precision * recall) / (precision + recall)
        else:
            f1_score = 0.0
        print(f"F1 Score: {f1_score:.2%}")

else:
    print("No valid results to calculate metrics.")


=== PERFORMANCE METRICS ===
Total emails evaluated: 100
Correct predictions: 89
Accuracy: 89.00%

=== CONFUSION MATRIX ===
True Positives (Phishing correctly identified): 34
True Negatives (Legitimate correctly identified): 55
False Positives (Legitimate misclassified as Phishing): 3
False Negatives (Phishing misclassified as Legitimate): 8

Precision: 91.89%
Recall: 80.95%
F1 Score: 86.08%


In [9]:
# Persist the per-email records so they can be analysed outside the notebook
results_df.to_csv('llm_evaluation_results.csv', index=False)
print("Results saved to 'llm_evaluation_results.csv'")

# Recap of the run: configuration, sample size and headline accuracy
summary_lines = [
    "\n=== EVALUATION SUMMARY ===",
    f"Provider: {PROVIDER}",
    f"Model: {MODEL}",
    f"Sample size: {len(sample_df)} emails",
    f"Successfully processed: {len(valid_results)} emails",
]
# Accuracy only exists when at least one email was classified
if total_emails > 0:
    summary_lines.append(f"Overall accuracy: {accuracy:.2%}")

for summary_line in summary_lines:
    print(summary_line)


Results saved to 'llm_evaluation_results.csv'

=== EVALUATION SUMMARY ===
Provider: local
Model: qwen3-30b-a3b
Sample size: 100 emails
Successfully processed: 100 emails
Overall accuracy: 89.00%
