In [None]:
%pip install python-dotenv langchain langchain_community langchain-openai langsmith langgraph faiss-cpu beautifulsoup4

### **Core imports and configuration**

In [None]:
# Cell 1: Core imports and configuration
import os
import json
import re
import pandas as pd
import numpy as np
import operator
from typing import List, Dict, Any, Optional, Tuple, Annotated, TypedDict, Sequence
from datetime import datetime
from dotenv import load_dotenv

# LangChain imports
from langchain_openai import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

from langchain.agents import create_tool_calling_agent
from langchain.tools import tool, BaseTool
from langchain.prompts import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.memory import ConversationBufferWindowMemory
from langchain.callbacks import StreamingStdOutCallbackHandler
from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage, ToolMessage
from langchain.schema import Document
from langchain.output_parsers import PydanticOutputParser
from langgraph.graph import StateGraph, END
from langgraph.prebuilt import ToolNode
from pydantic import BaseModel, Field

# Email processing imports
import base64
from dateutil.parser import parse
from langchain_community.agent_toolkits import GmailToolkit
from langchain_community.tools.gmail.utils import build_resource_service

# Load environment variables
load_dotenv()

# Configure LangSmith environment variables.
# os.getenv() returns None for a variable that is not set, and assigning None
# into os.environ raises TypeError. Export only the values that exist and
# report the missing ones instead of aborting the whole cell.
for _langsmith_var in (
    "LANGSMITH_TRACING",
    "LANGSMITH_ENDPOINT",
    "LANGSMITH_API_KEY",
    "LANGSMITH_PROJECT",
):
    _langsmith_value = os.getenv(_langsmith_var)
    if _langsmith_value is None:
        print(f"⚠️ {_langsmith_var} is not set - skipping")
    else:
        os.environ[_langsmith_var] = _langsmith_value

# Read the OpenAI API key from the environment and fail fast when it is missing,
# otherwise re-export it for the libraries used below.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise ValueError("OPENAI_API_KEY not found in .env file!")
os.environ["OPENAI_API_KEY"] = openai_api_key

# To test the LangSmith connection (best effort: a failure is reported, not raised)
try:
    from langsmith import Client
    client = Client()
    print("LangSmith connection successful!")
    print(f"Proje: {os.environ.get('LANGSMITH_PROJECT')}")
except Exception as e:
    print(f"LangSmith connection error: {e}")

from langchain.chat_models import init_chat_model
test_llm = init_chat_model("gpt-4o-mini", model_provider="openai")

# Send test message (to check tracing)
try:
    response = test_llm.invoke("Hi, this is a test message!")
    print("LLM test successful!")
    print("You can check the traces in LangSmith.")
    print(response)
except Exception as e:
    print(f"LLM test error: {e}")
finally:
    # Drop the temporary model whether or not the call succeeded, so a failed
    # smoke test does not leave `test_llm` behind in the notebook namespace.
    del test_llm

print("✅ Core configuration completed")

### **Email Fetcher Class**

In [None]:
# Cell 2: Email Fetcher class - structured approach for email retrieval
class EmailFetcher:
    """Gmail API email fetcher and processor class.

    Searches Gmail through the toolkit's ``search_gmail`` tool, downloads every
    matching message in ``full`` format and flattens each one into a DataFrame row.
    """

    def __init__(self):
        """Initialise Gmail API service.

        Raises:
            ValueError: If the toolkit exposes no tool named ``search_gmail``.
        """
        self.api_resource = build_resource_service()
        self.toolkit = GmailToolkit(api_resource=self.api_resource)
        self.search_tool = next(
            (candidate for candidate in self.toolkit.get_tools() if candidate.name == 'search_gmail'),
            None
        )

        if not self.search_tool:
            raise ValueError("Gmail search tool could not be initialised!")

    @staticmethod
    def _header_value(headers: List[Dict], name: str, default=None):
        """Return the value of the first header whose name equals ``name`` (case-insensitive)."""
        return next((h['value'] for h in headers if h['name'].lower() == name), default)

    @staticmethod
    def _collect_attachment_names(part: Dict) -> List[str]:
        """Return every non-empty ``filename`` found in ``part`` and its nested parts.

        A part that carries a filename is reported as one attachment and is not
        descended into; parts without a filename are searched recursively, so
        several attachments under the same container are all returned.
        """
        filename = part.get('filename')
        if filename:
            return [filename]

        names = []
        for subpart in part.get('parts') or []:
            names.extend(EmailFetcher._collect_attachment_names(subpart))
        return names

    def get_email_contents(self, payload: Dict) -> Dict[str, str]:
        """
        Recursively extract content from email payload

        Args:
            payload: Email payload from Gmail API

        Returns:
            dict: Dictionary containing 'text' and 'html' content
        """
        plain_text = ""
        html_text = ""

        # Recursively process parts if they exist
        if 'parts' in payload:
            for part in payload['parts']:
                nested_content = self.get_email_contents(part)
                plain_text += nested_content['text']
                html_text += nested_content['html']

        # Decode body if present
        elif 'body' in payload and 'data' in payload['body']:
            mime_type = payload.get('mimeType', '')
            body_data = payload['body'].get('data', '')

            if body_data:
                try:
                    # base64url data may arrive without '=' padding, which
                    # urlsafe_b64decode rejects; restore it before decoding.
                    padded_data = body_data + '=' * (-len(body_data) % 4)
                    decoded_body = base64.urlsafe_b64decode(padded_data).decode('utf-8', errors='ignore')
                    if 'text/plain' in mime_type:
                        plain_text += decoded_body
                    elif 'text/html' in mime_type:
                        html_text += decoded_body
                except Exception as e:
                    # Best effort: an undecodable part is reported and skipped
                    print(f"⚠️ Decode error: {e}")

        return {'text': plain_text.strip(), 'html': html_text.strip()}

    def fetch_emails(self, query: str = "in:inbox", max_results: int = 10) -> pd.DataFrame:
        """
        Fetch emails from Gmail and return as DataFrame

        Args:
            query: Gmail search query
            max_results: Maximum number of emails to fetch

        Returns:
            pd.DataFrame: Email data, newest first when the dates are comparable.
            An empty DataFrame is returned when nothing matches or when the
            search / download fails (the error is printed, not raised).
        """
        processed_emails = []

        try:
            # Search for emails
            search_params = {"query": query, "max_results": max_results}
            search_results = self.search_tool.run(search_params)

            print(f"📧 Found {len(search_results)} emails, fetching details...")

            # Process each email
            for summary in search_results:
                message_id = summary.get('id')
                message_detail = self.api_resource.users().messages().get(
                    userId='me',
                    id=message_id,
                    format='full'
                ).execute()

                payload = message_detail.get('payload', {})
                headers = payload.get('headers', [])

                # Extract header information
                subject = self._header_value(headers, 'subject', 'N/A')
                sender = self._header_value(headers, 'from', 'N/A')
                to = self._header_value(headers, 'to', 'N/A')
                cc = self._header_value(headers, 'cc', 'N/A')
                date_str = self._header_value(headers, 'date', None)

                # Parse date; an unparseable header yields None instead of
                # hiding unrelated errors behind a bare except.
                try:
                    email_date = parse(date_str) if date_str else None
                except (ValueError, OverflowError, TypeError):
                    email_date = None

                # Check labels and read status
                labels = message_detail.get('labelIds', [])
                is_unread = 'UNREAD' in labels

                # Extract content - FULL TEXT, no truncation for security
                contents = self.get_email_contents(payload)

                # Recursive attachment check (collects every filename, not just the first)
                attachment_names = self._collect_attachment_names(payload)
                has_attachment = bool(attachment_names)

                # Add email data - storing FULL content
                processed_emails.append({
                    'id': message_id,
                    'is_unread': is_unread,
                    'date': email_date,
                    'from': sender,
                    'to': to,
                    'cc': cc,
                    'labels': labels,
                    'subject': subject,
                    'body_text': contents['text'],  # Full text for analysis
                    'body_html': contents['html'],  # Full HTML for security checks
                    'has_attachment': has_attachment,
                    'attachment_names': attachment_names
                })

            # Create DataFrame
            if processed_emails:
                df = pd.DataFrame(processed_emails)
                try:
                    df = df.sort_values(by='date', ascending=False, na_position='last')
                except TypeError as sort_error:
                    # Timezone-naive and timezone-aware datetimes cannot be
                    # compared; keep the fetched rows in API order rather than
                    # losing them all to the outer error handler.
                    print(f"⚠️ Could not sort emails by date: {sort_error}")
                df = df.reset_index(drop=True)
                print(f"✅ Successfully processed {len(df)} emails")
                return df
            else:
                print("⚠️ No emails found")
                return pd.DataFrame()

        except Exception as e:
            print(f"❌ Email fetch error: {e}")
            return pd.DataFrame()

    # Added to the code for testing purposes, to better examine the emails received and their content.
    def save_to_csv(self, df: pd.DataFrame, filename: str = 'fetched_emails.csv'):
        """
        Save DataFrame to CSV file

        Args:
            df: DataFrame to save
            filename: Output filename
        """
        if not df.empty:
            df.to_csv(filename, index=False, encoding='utf-8-sig')
            print(f"✅ DataFrame successfully saved to '{filename}'")
        else:
            print("⚠️ No data to save")

# Test the fetcher: create the notebook-level instance. The constructor builds the
# Gmail API resource and toolkit, and raises ValueError when no 'search_gmail'
# tool is available.
email_fetcher = EmailFetcher()
print("✅ EmailFetcher successfully created")

In [None]:
# Cell 2a: Enhanced Query Parser for Dynamic Email Retrieval

class EmailQueryIntent(BaseModel):
    """Structured output for email query parsing.

    Every field is optional and defaults to ``None``. The explicit default is
    required: a field declared as ``Optional[...] = Field(description=...)``
    without one is mandatory under Pydantic v2, so ``EmailQueryIntent()`` and
    any LLM response that omits a key would fail validation.
    """

    email_count: Optional[int] = Field(
        default=None,
        description="Number of emails to fetch (None means all matching emails)"
    )
    time_filter: Optional[str] = Field(
        default=None,
        description="Time-based filter (e.g., 'today', 'this_week', 'last_month')"
    )
    status_filter: Optional[str] = Field(
        default=None,
        description="Email status filter (e.g., 'unread', 'important', 'starred')"
    )
    sender_filter: Optional[str] = Field(
        default=None,
        description="Specific sender to filter by"
    )
    subject_keywords: Optional[str] = Field(
        default=None,
        description="Keywords to search in subject"
    )
    label_filter: Optional[str] = Field(
        default=None,
        description="Gmail label to filter by"
    )

class IntelligentQueryParser:
    """
    Parse user queries to extract email retrieval parameters dynamically.
    Uses LLM for natural language understanding rather than rigid rules.
    """

    # Result cap used when the intent carries no usable (positive) email count
    DEFAULT_MAX_RESULTS = 100

    def __init__(self, llm_model: str = "gpt-4o-mini"):
        """
        Initialise the query parser with LLM.

        Args:
            llm_model: Model to use for parsing
        """
        self.llm = ChatOpenAI(model=llm_model, temperature=0.3)
        self.output_parser = PydanticOutputParser(pydantic_object=EmailQueryIntent)

    @staticmethod
    def _quote_term(term: str) -> str:
        """Collapse whitespace, drop embedded double quotes and quote multi-word terms.

        Returns an empty string when nothing usable is left.
        """
        cleaned = " ".join(term.replace('"', ' ').split())
        return f'"{cleaned}"' if ' ' in cleaned else cleaned

    def parse_user_query(self, user_query: str) -> "EmailQueryIntent":
        """
        Parse user query to extract email retrieval parameters.

        Args:
            user_query: Natural language query from user

        Returns:
            EmailQueryIntent with parsed parameters. When the LLM call or the
            parsing of its answer fails, the error is printed and an intent
            with every field set to None is returned.
        """

        format_instructions = self.output_parser.get_format_instructions()

        prompt = f"""Parse the following email query to extract retrieval parameters.

User Query: "{user_query}"

Extract the following information:
- email_count: If user specifies a number (e.g., "5 emails", "last 10"), extract it.
  If they say "all", set to None. If no number mentioned, set to None.
- time_filter: Extract time references like "today", "yesterday", "this week", "last week", "this month", "last month"
  IMPORTANT: Preserve the exact format (e.g., "this week" not "this_week")
- status_filter: Extract status like "unread", "important", "starred"
  NOTE: If user says "urgent" or "priority", map it to "important"
- sender_filter: Extract specific sender if mentioned (name or email)
- subject_keywords: Extract any subject-related keywords or topics
- label_filter: Extract Gmail labels if mentioned

Examples:
- "Show me my last 5 unread emails" → email_count: 5, status_filter: "unread"
- "Get all emails from today" → email_count: None, time_filter: "today"
- "Fetch emails about the project" → subject_keywords: "project"
- "Show all unread emails" → email_count: None, status_filter: "unread"
- "Get all urgent emails from today" → email_count: None, time_filter: "today", status_filter: "important"
- "Find emails from this week about meetings" → time_filter: "this week", subject_keywords: "meetings"
- "Urgent messages from John" → status_filter: "important", sender_filter: "John"
- "Show me priority emails" → status_filter: "important"

{format_instructions}

Parse the query and respond with the JSON object:"""

        try:
            # The LLM call sits inside the try so a transport/API failure
            # degrades to the default intent like a malformed answer does.
            response = self.llm.invoke(prompt)
            return self.output_parser.parse(response.content)
        except Exception as e:
            print(f"⚠️ Failed to parse query intent: {e}")
            # Return default intent; every field is passed explicitly so the
            # fallback does not depend on the model declaring defaults.
            return EmailQueryIntent(
                email_count=None,
                time_filter=None,
                status_filter=None,
                sender_filter=None,
                subject_keywords=None,
                label_filter=None,
            )

    def build_gmail_query(self, intent: "EmailQueryIntent") -> tuple[str, int]:
        """
        Build Gmail API query string from parsed intent.

        Args:
            intent: Parsed email query intent

        Returns:
            Tuple of (gmail_query_string, max_results). The query defaults to
            "in:inbox" when no filter applies; max_results defaults to
            DEFAULT_MAX_RESULTS when email_count is missing or not positive.
        """
        query_parts = []

        if intent.status_filter:
            # Map common terms to Gmail's actual filters
            status_mappings = {
                "unread": "is:unread",
                "important": "is:important",
                "urgent": "is:important",  # Map "urgent" to "important"
                "priority": "is:important",  # Map "priority" to "important"
                "starred": "is:starred",
                "read": "-is:unread",  # Negative filter for read emails
            }

            status_value = intent.status_filter.strip().lower()
            if status_value:
                # Unknown statuses are passed through as an is: filter
                query_parts.append(status_mappings.get(status_value, f"is:{status_value}"))

        if intent.time_filter:
            time_mappings = {
                "today": "newer_than:1d",
                "yesterday": "older_than:1d newer_than:2d",
                "this week": "newer_than:7d",
                "last week": "older_than:7d newer_than:14d",
                "this month": "newer_than:30d",
                "last month": "older_than:30d newer_than:60d",
            }

            # Accept "this week", "this_week", "This  Week", ... as the same key
            time_value = " ".join(intent.time_filter.replace("_", " ").lower().split())
            if time_value in time_mappings:
                query_parts.append(time_mappings[time_value])
            elif time_value:
                print(f"⚠️ Unsupported time filter ignored: {intent.time_filter!r}")

        if intent.sender_filter:
            # Addresses and single names are used as-is; multi-word names are
            # quoted so the second word is not read as a separate search term.
            sender = self._quote_term(intent.sender_filter)
            if sender:
                query_parts.append(f"from:{sender}")

        if intent.subject_keywords:
            # Multiple words are searched as a phrase
            keywords = self._quote_term(intent.subject_keywords)
            if keywords:
                query_parts.append(f"subject:{keywords}")

        if intent.label_filter:
            label = intent.label_filter.strip()
            if label:
                query_parts.append(f"label:{label}")

        # Default to inbox if no specific filters
        if not query_parts:
            query_parts.append("in:inbox")

        gmail_query = " ".join(query_parts)

        # A missing, zero or negative count means "no explicit limit requested"
        count = intent.email_count
        max_results = count if count and count > 0 else self.DEFAULT_MAX_RESULTS

        return gmail_query, max_results

# Enhanced fetch method for EmailFetcher
def enhanced_fetch_emails_node(email_agent_instance, state):
    """
    Fetch emails for the latest user message and record the outcome in ``state``.

    The last message's content (or an empty string when there are no messages)
    is parsed by a fresh IntelligentQueryParser, converted to a Gmail query and
    passed to ``email_agent_instance.email_fetcher.fetch_emails``. The result is
    stored under ``state["email_data"]``: the emails, their count, the query and
    the parsed intent on success, or ``{"error": ...}`` when fetching raises.
    The same ``state`` object is returned.
    """
    print("📧 Intelligently parsing email request...")

    # Latest user message, if any
    messages = state["messages"]
    user_message = messages[-1].content if messages else ""

    # Natural-language request -> structured intent -> Gmail query
    parser = IntelligentQueryParser()
    intent = parser.parse_user_query(user_message)
    print(f"  Parsed intent: {intent.model_dump()}")

    gmail_query, max_results = parser.build_gmail_query(intent)
    print(f"  Gmail query: '{gmail_query}' (max: {max_results})")

    try:
        df = email_agent_instance.email_fetcher.fetch_emails(
            query=gmail_query,
            max_results=max_results
        )

        # An empty frame simply yields an empty email list
        emails = [] if df.empty else df.to_dict('records')
        state["email_data"] = {
            "emails": emails,
            "count": len(emails),
            "query": gmail_query,
            "intent": intent.model_dump()
        }

        if emails:
            print(f"  ✅ Found {len(emails)} emails")
        else:
            print("  ⚠️ No emails found matching criteria")

    except Exception as e:
        state["email_data"] = {"error": str(e)}
        print(f"  ❌ Error: {e}")

    return state

# Confirmation shown once the definitions in this cell have been executed
print("✅ Intelligent query parser created")

### **Security Analysis Module**

In [None]:
# Cell 3: Security analysis module with hybrid LLM integration
class SecurityAnalyser:
    """Email security analysis class with hybrid deterministic + LLM approach.

    Regex and keyword heuristics score the email content, URLs and attachments;
    an LLM call gives a one-word verdict on the sender domain. All scores are
    combined into a 0-100 risk score with a textual risk level.
    """

    # Model used for domain assessment when the caller does not choose one
    DEFAULT_LLM_MODEL = "gpt-4o-mini"

    # Substrings identifying URL shorteners; kept in line with the shortener
    # entry in phishing_patterns.
    _URL_SHORTENERS = ('bit.ly', 'tinyurl', 'short.link', 'clck.ru')

    # Cyrillic letters that look like Latin a, e, o, p, c, y, x. Written as
    # escapes so they cannot be confused with their Latin twins in an editor.
    _HOMOGLYPH_CHARS = ('\u0430', '\u0435', '\u043e', '\u0440', '\u0441', '\u0443', '\u0445')

    def __init__(self, llm_model=None):
        """Initialise threat patterns and LLM integration.

        Args:
            llm_model: Chat model name for the domain assessment; falls back to
                DEFAULT_LLM_MODEL when None or empty.
        """

        # LLM for domain assessment
        self.security_llm = ChatOpenAI(
            model=llm_model or self.DEFAULT_LLM_MODEL,  # Lighter model by default for quick assessments
            temperature=0.1,  # Low temperature for consistent security decisions
        )

        # Phishing/Scam patterns
        self.phishing_patterns = [
            # URL patterns
            r'bit\.ly|tinyurl|short\.link|clck\.ru',  # URL shorteners
            r'[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}',  # IP addresses
            r'@[^@]*@',  # Double @ signs

            # Word patterns
            r'urgent.{0,20}action.{0,20}required',
            r'verify.{0,20}account.{0,20}immediately',
            r'suspended.{0,20}account',
            r'click.{0,20}here.{0,20}immediately',
            r'limited.{0,20}time.{0,20}offer',
            r'congratulations.{0,20}won',
            r'claim.{0,20}prize',
            r'tax.{0,20}refund',
            r'nigerian?.{0,20}prince',
            r'claim.{0,20}reward',
        ]

        # Prompt injection patterns
        self.injection_patterns = [
            r'ignore.{0,20}previous.{0,20}instructions',
            r'disregard.{0,20}all.{0,20}prior',
            r'forget.{0,20}everything',
            r'new.{0,20}instructions.{0,20}follow',
            r'system.{0,20}prompt.{0,20}override',
            r'admin.{0,20}mode',
            r'developer.{0,20}mode',
            r'bypass.{0,20}security',
            r'<script',  # XSS attempts
            r'javascript:',
            r'eval\(',
            r'onerror=',
        ]

    def assess_domain_with_llm(self, domain: str, email_context: Dict) -> Dict[str, Any]:
        """
        Use LLM to assess domain trustworthiness.
        This method will be enhanced by DomainSimilarityMatcher if enabled.

        Args:
            domain: Domain to assess
            email_context: Email context for better assessment

        Returns:
            Dict with 'domain', 'llm_assessment' (SUSPICIOUS, TRUSTED or
            UNKNOWN) and 'confidence'; 'method' is included when the LLM call
            succeeded. Failures are printed and reported as UNKNOWN.
        """
        try:
            # NOTE(security): subject and sender come from the email itself and
            # are untrusted text interpolated into the prompt; the verdict is
            # therefore validated against the three allowed words below.
            prompt = f"""Analyse this email sender domain for security risks.

Domain: {domain}
Email Subject: {email_context.get('subject', 'N/A')}
Sender Full Address: {email_context.get('from', 'N/A')}

Based on the domain name pattern and common phishing tactics, assess if this domain appears:
1. SUSPICIOUS (likely phishing/scam)
2. TRUSTED (legitimate business/service)
3. UNKNOWN (cannot determine)

Consider:
- Does the domain mimic known brands?
- Does it use suspicious patterns?
- Is it a legitimate business domain?

Respond with ONLY one word: SUSPICIOUS, TRUSTED, or UNKNOWN

CRITICAL NOTE: Try not to mark as ‘UNKNOWN’ as much as possible.

Decision:"""

            response = self.security_llm.invoke(prompt)
            assessment = response.content.strip().upper()
            # Tolerate decoration around the single word ("SUSPICIOUS.", "**TRUSTED**")
            assessment = re.sub(r'^[^A-Z]+|[^A-Z]+$', '', assessment)

            # Ensure assessment is one of the expected values
            if assessment not in ['SUSPICIOUS', 'TRUSTED', 'UNKNOWN']:
                assessment = 'UNKNOWN'

            return {
                'domain': domain,
                'llm_assessment': assessment,
                'confidence': 'low',  # Low confidence without similarity matching
                'method': 'basic_llm'
            }

        except Exception as e:
            print(f"⚠️ LLM assessment failed for {domain}: {e}")
            return {
                'domain': domain,
                'llm_assessment': 'UNKNOWN',
                'confidence': 'low'
            }

    def check_phishing_indicators(self, email_data: Dict) -> Dict[str, Any]:
        """Check for phishing indicators with hybrid approach.

        Args:
            email_data: Email record; reads 'subject', 'body_text', 'body_html',
                'from', 'has_attachment' and 'attachment_names'.

        Returns:
            Dict with 'indicators' (list of messages), 'risk_score' (capped at
            100), 'risk_level' and 'domain_assessment'.
        """
        indicators = []
        risk_score = 0
        domain_assessment = {}

        # Analyse full text
        text = f"{email_data.get('subject', '')} {email_data.get('body_text', '')} {email_data.get('body_html', '')}".lower()

        # Pattern checking
        for pattern in self.phishing_patterns:
            if re.search(pattern, text, re.IGNORECASE):
                indicators.append(f"Suspicious pattern detected: {pattern}")
                risk_score += 20

        # Urgency words analysis
        urgency_words = ['urgent', 'immediate', 'expire', 'suspend', 'limited time']
        urgency_count = sum(1 for word in urgency_words if word in text)
        if urgency_count > 2:
            indicators.append(f"High urgency level detected ({urgency_count} keywords)")
            risk_score += urgency_count * 10

        # Enhanced sender analysis ('from' may be missing or None)
        sender = email_data.get('from') or ''
        sender_domain = sender.split('@')[-1].split('>')[0].strip() if '@' in sender else ''

        if sender_domain:
            # Check display name vs actual email
            if '<' in sender and '>' in sender:
                display_name = sender.split('<')[0].strip()
                actual_email = sender.split('<')[1].split('>')[0].strip().lower()

                # Flag an '@' in the display name unless every address shown
                # there is the real sender address (a common, harmless format).
                if '@' in display_name:
                    shown_addresses = re.findall(r'[\w.+-]+@[\w.-]+', display_name)
                    only_repeats_sender = bool(shown_addresses) and all(
                        address.lower() == actual_email for address in shown_addresses
                    )
                    if not only_repeats_sender:
                        indicators.append("Display name contains different email address")
                        risk_score += 30

            # Domain assessment - now relies entirely on enhanced LLM method
            # (which will use similarity matching if Cell 3a is loaded)
            llm_result = self.assess_domain_with_llm(sender_domain, email_data)
            domain_assessment = {
                'status': llm_result['llm_assessment'].lower(),
                'source': llm_result.get('method', 'llm'),
                'confidence': llm_result.get('confidence', 'low')
            }

            if llm_result['llm_assessment'] == 'SUSPICIOUS':
                indicators.append(f"Domain assessed as suspicious: {sender_domain}")
                risk_score += 30

            elif llm_result['llm_assessment'] == 'TRUSTED':
                indicators.append(f"Domain assessed as trusted: {sender_domain}")
                risk_score -= 10
                risk_score = max(0, risk_score)

            else:  # UNKNOWN
                indicators.append(f"Domain assessment inconclusive: {sender_domain}")
                risk_score += 10

        # Attachment analysis
        if email_data.get('has_attachment'):
            attachments = email_data.get('attachment_names') or []
            dangerous_extensions = ('.exe', '.zip', '.rar', '.bat', '.cmd', '.scr', '.vbs')

            for att in attachments:
                if att.lower().endswith(dangerous_extensions):
                    indicators.append(f"Dangerous file extension detected: {att}")
                    risk_score += 40

        return {
            'indicators': indicators,
            'risk_score': min(risk_score, 100),
            'risk_level': self._calculate_risk_level(risk_score),
            'domain_assessment': domain_assessment
        }

    def check_url_safety(self, text: str) -> Dict[str, Any]:
        """Check URL safety in email content.

        Returns:
            Dict with 'total_urls', 'suspicious_urls' (one entry per URL and
            reason, so a URL can appear more than once) and 'risk_level'
            ('high' when anything was flagged, otherwise 'low').
        """
        urls = re.findall(r'https?://[^\s<>"{}|\\^`\[\]]+', text)
        suspicious_urls = []

        for url in urls:
            # Check for URL shorteners
            if any(short in url.lower() for short in self._URL_SHORTENERS):
                suspicious_urls.append({
                    'url': url,
                    'reason': 'URL shortener detected - could hide malicious destination'
                })

            # Check for IP addresses instead of domains
            if re.search(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', url):
                suspicious_urls.append({
                    'url': url,
                    'reason': 'Contains IP address instead of domain name'
                })

            # Check for homograph attacks (similar looking characters)
            if any(char in url for char in self._HOMOGLYPH_CHARS):
                suspicious_urls.append({
                    'url': url,
                    'reason': 'Possible homograph attack - contains lookalike characters'
                })

        return {
            'total_urls': len(urls),
            'suspicious_urls': suspicious_urls,
            'risk_level': 'high' if suspicious_urls else 'low'
        }

    def check_prompt_injection(self, text: str) -> Dict[str, Any]:
        """Check for prompt injection attempts.

        Returns:
            Dict with 'injection_detected', 'injection_patterns' (pattern plus
            up to three matches each) and 'risk_level': 'critical' for more
            than two matching patterns, 'high' for one or two, else 'none'.
        """
        injections_found = []

        for pattern in self.injection_patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            if matches:
                injections_found.append({
                    'pattern': pattern,
                    'matches': matches[:3]  # First 3 matches
                })

        return {
            'injection_detected': len(injections_found) > 0,
            'injection_patterns': injections_found,
            'risk_level': 'critical' if len(injections_found) > 2 else
                         'high' if len(injections_found) > 0 else 'none'
        }

    def _calculate_risk_level(self, score: int) -> str:
        """Calculate risk level from score (thresholds 70 / 50 / 30 / 10)."""
        if score >= 70:
            return 'critical'
        elif score >= 50:
            return 'high'
        elif score >= 30:
            return 'medium'
        elif score >= 10:
            return 'low'
        else:
            return 'safe'

    def analyse_email_security(self, email_data: Dict) -> Dict[str, Any]:
        """Complete security analysis with hybrid approach.

        Runs the URL, phishing and prompt-injection checks on the subject plus
        both body variants, then adds the URL and injection penalties to the
        phishing score (capped at 100).
        """

        # Run all analyses using FULL text
        full_text = f"{email_data.get('subject', '')} {email_data.get('body_text', '')} {email_data.get('body_html', '')}"

        url_analysis = self.check_url_safety(full_text)
        phishing_analysis = self.check_phishing_indicators(email_data)
        injection_analysis = self.check_prompt_injection(full_text)

        # Calculate overall risk score
        overall_risk_score = phishing_analysis['risk_score']

        if url_analysis['risk_level'] == 'high':
            overall_risk_score = min(overall_risk_score + 30, 100)

        if injection_analysis['risk_level'] == 'critical':
            overall_risk_score = min(overall_risk_score + 50, 100)
        elif injection_analysis['risk_level'] == 'high':
            overall_risk_score = min(overall_risk_score + 30, 100)

        return {
            'email_id': email_data.get('id'),
            'subject': email_data.get('subject'),
            'sender': email_data.get('from'),
            'overall_risk_score': overall_risk_score,
            'overall_risk_level': self._calculate_risk_level(overall_risk_score),
            'domain_assessment': phishing_analysis.get('domain_assessment', {}),
            'url_analysis': url_analysis,
            'phishing_analysis': phishing_analysis,
            'injection_analysis': injection_analysis,
            'recommendations': self._generate_recommendations(
                overall_risk_score,
                phishing_analysis,
                injection_analysis
            )
        }

    def _generate_recommendations(self, risk_score: int, phishing: Dict, injection: Dict) -> List[str]:
        """Generate security recommendations from the score and the individual analyses."""
        recommendations = []

        if risk_score >= 70:
            recommendations.append("⛔ CRITICAL: Do NOT open this email - DELETE immediately!")
            recommendations.append("🚨 Report to IT security team")
        elif risk_score >= 50:
            recommendations.append("⚠️ HIGH RISK: Do not click any links")
            recommendations.append("🔧 Verify sender identity independently")
        elif risk_score >= 30:
            recommendations.append("⚡ CAUTION: Suspicious content detected")

        if injection['injection_detected']:
            recommendations.append("🤖 PROMPT INJECTION detected - do not copy to AI systems")

        if phishing['indicators']:
            recommendations.append("🎣 Phishing indicators detected - do not share personal information")

        # Add domain-specific recommendation
        domain_assessment = phishing.get('domain_assessment', {})
        if domain_assessment.get('confidence') == 'high' and domain_assessment.get('status') == 'suspicious':
            recommendations.append("🔍 AI assessment with similarity matching suggests sender domain is suspicious")

        return recommendations

# Test the analyser: create the notebook-level instance. The constructor sets up
# the ChatOpenAI client and the phishing / prompt-injection pattern lists.
security_analyser = SecurityAnalyser()
print("✅ Cleaned SecurityAnalyser created - domain lists now managed in DomainSimilarityMatcher")

In [None]:
# Cell 3a: Enhanced Domain Assessment with Similarity Matching
class DomainSimilarityMatcher:
    """
    Domain similarity matcher using vector embeddings for intelligent domain assessment.
    Uses semantic similarity to find the most relevant examples for LLM decision-making.

    Two FAISS stores are built when the object is constructed: one from
    ``suspicious_domains_data`` and one from ``trusted_domains_data``. Each
    entry is embedded as ``"<domain> - <context>"`` with the configured
    OpenAI embeddings model.
    """

    # Upper bound on any email-derived value copied into the LLM prompt.
    MAX_PROMPT_FIELD_CHARS = 200

    def __init__(self, embeddings_model: str = "text-embedding-3-small"):
        """
        Initialise the domain similarity matcher with vector stores.

        Args:
            embeddings_model: OpenAI embeddings model to use
        """
        self.embeddings = OpenAIEmbeddings(model=embeddings_model)

        # Suspicious domains with context
        self.suspicious_domains_data = [
            ("109.197.125.34.bc.googleusercontent.com", "IP address subdomain impersonating Google"),
            ("accounts-mail.ru", "Russian domain mimicking account services"),
            ("adobe-jkwefnewkjnfkjewnfkejwnfkjew.pages.dev", "Gibberish subdomain impersonating Adobe"),
            ("business-facebook-covid19.com", "COVID-19 phishing using Facebook brand"),
            ("google-secure.org", "Fake Google security domain"),
            ("disceord.gift", "Discord typosquatting with gift scam"),
            ("microsoft-error-pages-check-errors.pages.dev", "Fake Microsoft error page"),
            ("steamcommunitc.com", "Steam typosquatting domain"),
            ("xn--gmai-88b.com", "Homograph attack on Gmail"),
            ("trust.twallet.cam", "Trust Wallet phishing domain"),
            # Add more as needed
        ]

        # Trusted domains with context
        self.trusted_domains_data = [
            ("gmail.com", "Official Google email service"),
            ("outlook.com", "Microsoft email service"),
            ("amazon.com", "Official Amazon domain"),
            ("github.com", "Official GitHub platform"),
            ("linkedin.com", "Professional networking platform"),
            ("paypal.com", "Official PayPal payment service"),
            ("dropbox.com", "Cloud storage service"),
            ("slack.com", "Team collaboration platform"),
            ("zoom.us", "Video conferencing service"),
            ("adobe.com", "Official Adobe domain"),
            # Add more as needed
        ]

        # Create vector stores
        self._initialise_vector_stores()

    @staticmethod
    def _build_documents(entries, label: str) -> list:
        """Turn ``(domain, context)`` pairs into Documents tagged with ``label``."""
        return [
            Document(
                page_content=f"{domain} - {context}",
                metadata={"domain": domain, "type": label, "context": context}
            )
            for domain, context in entries
        ]

    @staticmethod
    def _format_examples(examples) -> str:
        """Render retrieved Documents as an indented bullet list for the prompt."""
        if not examples:
            # Say so explicitly instead of leaving a blank section in the prompt.
            return "  (none found)"
        return "\n".join(
            f"  - {doc.metadata['domain']}: {doc.metadata['context']}"
            for doc in examples
        )

    @classmethod
    def _sanitise_for_prompt(cls, value: Any) -> str:
        """
        Collapse an email-derived value to one bounded line of text.

        Subject, sender and domain are attacker-controlled. Removing line
        breaks and capping the length stops a crafted header from appending
        its own instructions (or a fake ``Decision:`` line) to the prompt.
        """
        if value is None:
            return 'N/A'
        single_line = " ".join(str(value).split())
        return single_line[:cls.MAX_PROMPT_FIELD_CHARS] or 'N/A'

    def _initialise_vector_stores(self):
        """Create FAISS vector stores for domain similarity search."""
        suspicious_docs = self._build_documents(self.suspicious_domains_data, "suspicious")
        trusted_docs = self._build_documents(self.trusted_domains_data, "trusted")

        self.suspicious_vectorstore = FAISS.from_documents(
            suspicious_docs, self.embeddings
        )
        self.trusted_vectorstore = FAISS.from_documents(
            trusted_docs, self.embeddings
        )

    def get_similar_domains(
        self,
        query_domain: str,
        top_k: int = 5
    ) -> Tuple[List[Document], List[Document]]:
        """
        Retrieve the most similar suspicious and trusted domains.

        Args:
            query_domain: Domain to assess
            top_k: Number of similar examples to retrieve

        Returns:
            Tuple of (suspicious_examples, trusted_examples). Both lists are
            empty when ``query_domain`` is blank or ``top_k`` is below 1.
        """
        # Nothing sensible can be retrieved for a blank query or a non-positive
        # k, so skip the embedding call rather than pass bad input downstream.
        if top_k < 1 or not query_domain or not query_domain.strip():
            return [], []

        suspicious_similar = self.suspicious_vectorstore.similarity_search(
            query_domain, k=top_k
        )
        trusted_similar = self.trusted_vectorstore.similarity_search(
            query_domain, k=top_k
        )

        return suspicious_similar, trusted_similar

    def create_assessment_prompt(
        self,
        domain: str,
        email_context: dict,
        top_k: int = 5
    ) -> str:
        """
        Create an enhanced prompt with similar domain examples.

        Args:
            domain: Domain to assess
            email_context: Email context for assessment; reads ``subject`` and
                ``from``. ``None`` is treated as an empty context.
            top_k: Number of examples to include

        Returns:
            Enhanced prompt with examples
        """
        context = email_context or {}

        # Get similar domains
        suspicious_examples, trusted_examples = self.get_similar_domains(domain, top_k)

        # Format examples
        suspicious_text = self._format_examples(suspicious_examples)
        trusted_text = self._format_examples(trusted_examples)

        # Email-derived values are untrusted: flatten and bound them first.
        safe_domain = self._sanitise_for_prompt(domain)
        safe_subject = self._sanitise_for_prompt(context.get('subject'))
        safe_sender = self._sanitise_for_prompt(context.get('from'))

        prompt = f"""Analyse this email sender domain for security risks using the provided examples.

Domain to assess: {safe_domain}
Email Subject: {safe_subject}
Sender Full Address: {safe_sender}

SIMILAR SUSPICIOUS DOMAINS (for reference):
{suspicious_text}

SIMILAR TRUSTED DOMAINS (for reference):
{trusted_text}

Based on:
1. The similarity to the suspicious examples above
2. The similarity to the trusted examples above
3. Common phishing patterns (typosquatting, brand impersonation, etc.)
4. The email context provided

Assess if this domain appears:
- SUSPICIOUS (likely phishing/scam)
- TRUSTED (legitimate business/service)
- UNKNOWN (cannot determine with confidence)

Consider:
- Does it closely resemble any suspicious examples?
- Does it match patterns from trusted examples?
- Are there spelling variations of known brands?
- Does it use suspicious subdomain patterns?

Respond with ONLY one word: SUSPICIOUS, TRUSTED, or UNKNOWN

Decision:"""

        return prompt

# Integration with SecurityAnalyser
def integrate_similarity_matcher(security_analyser_class):
    """
    Monkey-patch the SecurityAnalyser to use similarity matching.
    In production, this should be properly integrated into the class.

    The class's ``assess_domain_with_llm`` is replaced by a wrapper that builds
    its prompt with a ``DomainSimilarityMatcher`` and falls back to the
    original method if anything in the enhanced path raises.

    The patch is idempotent. Calling this again on an already-patched class
    returns the matcher created the first time, instead of building new vector
    stores and wrapping the wrapper.

    Args:
        security_analyser_class: Class exposing ``assess_domain_with_llm`` and
            whose instances have a ``security_llm`` attribute.

    Returns:
        The ``DomainSimilarityMatcher`` used by the patched method.
    """
    current_assess = security_analyser_class.assess_domain_with_llm

    # The wrapper carries its matcher as a function attribute; finding it
    # means the class has already been patched.
    existing_matcher = getattr(current_assess, "_similarity_matcher", None)
    if existing_matcher is not None:
        return existing_matcher

    # Initialise the matcher
    domain_matcher = DomainSimilarityMatcher()

    # Store original method
    original_assess = current_assess

    # Accept the bare label, optionally wrapped in punctuation/markdown or
    # prefixed with "DECISION:". Anything longer (e.g. "NOT TRUSTED") does not
    # match and is treated as UNKNOWN.
    label_pattern = re.compile(
        r'(?:DECISION\s*:)?\W*(SUSPICIOUS|TRUSTED|UNKNOWN)\W*'
    )

    def enhanced_assess_domain(self, domain: str, email_context: dict) -> dict:
        """Enhanced domain assessment using similarity matching."""
        try:
            # Create enhanced prompt with examples
            prompt = domain_matcher.create_assessment_prompt(
                domain, email_context, top_k=5
            )

            # Get LLM assessment
            response = self.security_llm.invoke(prompt)
            raw_answer = str(response.content).strip().upper()

            # Validate response
            match = label_pattern.fullmatch(raw_answer)
            assessment = match.group(1) if match else 'UNKNOWN'

            return {
                'domain': domain,
                'llm_assessment': assessment,
                'confidence': 'high' if assessment != 'UNKNOWN' else 'medium',
                'method': 'similarity_enhanced'
            }

        except Exception as e:
            print(f"⚠️ Enhanced assessment failed for {domain}: {e}")
            # Fall back to original method
            return original_assess(self, domain, email_context)

    # Tag the wrapper so a repeated call can detect the patch and reuse the matcher.
    enhanced_assess_domain._similarity_matcher = domain_matcher

    # Replace method
    security_analyser_class.assess_domain_with_llm = enhanced_assess_domain

    return domain_matcher

# Cell status message: the class and helper above are now defined. No matcher
# instance exists yet - one is built inside integrate_similarity_matcher().
print("✅ Domain similarity matcher created")

### **LangChain Agent Tools**

In [None]:
# Cell 4: Enhanced Tools with @tool Decorator and Full Content Processing
# Define input schemas for tools
class EmailSummaryInput(BaseModel):
    """Input schema for email summarisation."""
    email_id: str = Field(description="Email ID to summarise")
    subject: str = Field(description="Email subject")
    sender: str = Field(description="Email sender")
    body_text: str = Field(description="Full email body text")
    # Explicit default: in Pydantic v2 an Optional[...] field without a default
    # is still *required*, which contradicted the tool signature's
    # `body_html=None` and rejected calls that omit the HTML body.
    body_html: Optional[str] = Field(default=None, description="Full email HTML content if available")

class EmailActionInput(BaseModel):
    """Input schema for action extraction."""
    # All four fields are required strings; the descriptions are part of the
    # schema attached to the extract_actions tool via args_schema.
    email_id: str = Field(description="Email ID")
    subject: str = Field(description="Email subject")
    body_text: str = Field(description="Full email body text")
    sender: str = Field(description="Email sender")

class EmailSecurityInput(BaseModel):
    """Input schema for security analysis."""
    # Single free-form dict; the security_analysis tool reads its "id" key.
    email_data: Dict[str, Any] = Field(description="Complete email data dictionary")

# Create LLM for summarisation (separate from security)
# Shared module-level client: both the email_summariser and extract_actions
# tools below call summarisation_llm.invoke().
summarisation_llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.3  # Slightly higher for more natural summaries
)

@tool("email_summariser", args_schema=EmailSummaryInput, return_direct=False)
def summarise_email(
    email_id: str,
    subject: str,
    sender: str,
    body_text: str,
    body_html: Optional[str] = None
) -> Dict[str, Any]:
    """
    Provide comprehensive email summarisation using full content.
    This tool reads the ENTIRE email content to create proper summaries.
    """
    # NOTE: the docstring above doubles as the tool description, so it is kept
    # verbatim. In practice only the first `max_prompt_chars` characters of the
    # body are sent to the model (see below).
    #
    # Returns a dict with email_id, subject, sender, summary, content_length
    # and summarisation_successful. On any failure the summary is reduced to
    # an "executive_summary" line plus the error text, and the flag is False.

    # Limit the prompt body for token management
    max_prompt_chars = 5000

    # Use full body text, not regex extracts
    full_content = body_text if body_text else ""

    # If HTML is available and the text part is empty (or only whitespace),
    # extract text from HTML
    if not full_content.strip() and body_html:
        # Basic HTML tag removal (in production, use BeautifulSoup)
        full_content = re.sub('<[^<]+?>', '', body_html)

    # Truncate outside the f-string: a "# ..." comment written inside the
    # template is literal prompt text, not a Python comment.
    prompt_content = full_content[:max_prompt_chars]

    # Create comprehensive summarisation prompt
    prompt = f"""Provide a comprehensive summary of this email.

Email Details:
- Subject: {subject}
- From: {sender}
- Email ID: {email_id}

Full Email Content:
{prompt_content}

Create a detailed summary that includes:
1. **Main Purpose**: What is the primary reason for this email?
2. **Key Points**: List the most important information (3-5 points)
3. **Context**: Any relevant background or context mentioned
4. **Tone**: Professional, casual, urgent, informative, etc.
5. **Important Details**: Dates, numbers, names, specific requirements
6. **Attachments Mentioned**: Any files or documents referenced

Provide a clear, comprehensive summary that would allow someone to understand
the email without reading it. Focus on clarity and completeness.

Format your response as JSON with the following structure:
{{
    "main_purpose": "...",
    "key_points": ["point1", "point2", ...],
    "context": "...",
    "tone": "...",
    "important_details": {{
        "dates": [...],
        "numbers": [...],
        "names": [...],
        "requirements": [...]
    }},
    "attachments_mentioned": [...],
    "executive_summary": "A 2-3 sentence overview",
    "word_count": <approximate word count of original email>
}}"""

    try:
        response = summarisation_llm.invoke(prompt)

        # Parse JSON response
        content = response.content
        # Clean potential markdown formatting
        if "```json" in content:
            content = content.split("```json")[1].split("```")[0]
        elif "```" in content:
            content = content.split("```")[1].split("```")[0]

        summary_data = json.loads(content.strip())

        # Consumers call .get() on the summary; valid JSON that is not an
        # object (a list, a bare string) is routed to the fallback below.
        if not isinstance(summary_data, dict):
            raise ValueError("summary response is not a JSON object")

        return {
            "email_id": email_id,
            "subject": subject,
            "sender": sender,
            "summary": summary_data,
            "content_length": len(full_content),
            "summarisation_successful": True
        }

    except Exception as e:
        print(f"⚠️ Summarisation failed: {e}")
        # Fallback to basic summary
        return {
            "email_id": email_id,
            "subject": subject,
            "sender": sender,
            "summary": {
                "executive_summary": f"Email from {sender} about: {subject}",
                "error": str(e)
            },
            "content_length": len(full_content),
            "summarisation_successful": False
        }

@tool("extract_actions", args_schema=EmailActionInput, return_direct=False)
def extract_action_items(
    email_id: str,
    subject: str,
    body_text: str,
    sender: str
) -> Dict[str, Any]:
    """
    Extract action items and to-dos from email using full content analysis.
    Uses LLM to understand context and extract meaningful actions.
    """
    # NOTE: the docstring above doubles as the tool description, so it is kept
    # verbatim.
    #
    # Returns a dict with email_id, subject, actions and extraction_successful.
    # "actions" always contains a list under "action_items" and an integer
    # "total_actions" equal to its length; on failure the list is empty and an
    # "error" string is included.

    # Limit the prompt body for token management; tolerate a missing body.
    prompt_content = (body_text or "")[:5000]

    prompt = f"""Extract all action items, tasks, and deadlines from this email.

Email Subject: {subject}
From: {sender}

Full Email Content:
{prompt_content}

Identify and extract:
1. **Direct Requests**: Things explicitly asked to be done
2. **Implied Tasks**: Actions implied but not directly stated
3. **Deadlines**: Any time-sensitive items with dates
4. **Follow-ups**: Items requiring response or follow-up
5. **Decisions Required**: Points needing decisions
6. **Information Requests**: Requests for information or documents

For each action item, specify:
- The specific action required
- Who needs to do it (if mentioned)
- Deadline or timeframe (if any)
- Priority level (High/Medium/Low based on context)
- Category (Request/Deadline/Follow-up/Decision/Information)

Format as JSON:
{{
    "action_items": [
        {{
            "action": "...",
            "assigned_to": "recipient/specific person/not specified",
            "deadline": "date or null",
            "priority": "High/Medium/Low",
            "category": "..."
        }}
    ],
    "total_actions": <number>,
    "has_urgent_items": true/false,
    "summary_of_requirements": "Brief overview of what's needed"
}}"""

    try:
        response = summarisation_llm.invoke(prompt)

        # Parse response
        content = response.content
        if "```json" in content:
            content = content.split("```json")[1].split("```")[0]
        elif "```" in content:
            content = content.split("```")[1].split("```")[0]

        actions_data = json.loads(content.strip())

        # Valid JSON that is not an object is routed to the fallback below.
        if not isinstance(actions_data, dict):
            raise ValueError("action response is not a JSON object")

        # Do not trust the model's own bookkeeping: make action_items a list
        # and derive the count from it, so callers can sum total_actions.
        action_items = actions_data.get("action_items")
        if not isinstance(action_items, list):
            action_items = []
        actions_data["action_items"] = action_items
        actions_data["total_actions"] = len(action_items)

        return {
            "email_id": email_id,
            "subject": subject,
            "actions": actions_data,
            "extraction_successful": True
        }

    except Exception as e:
        print(f"⚠️ Action extraction failed: {e}")
        return {
            "email_id": email_id,
            "subject": subject,
            "actions": {
                "action_items": [],
                "total_actions": 0,
                "error": str(e)
            },
            "extraction_successful": False
        }

@tool("security_analysis", args_schema=EmailSecurityInput, return_direct=False)
def analyse_email_security(email_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Perform comprehensive security analysis on email.
    Maintains existing security logic but as a proper tool.
    """
    # NOTE: the docstring above doubles as the tool description, so it is kept
    # verbatim.
    #
    # Delegates to the module-level `security_analyser` instance and maps its
    # result onto this tool's response shape. The earlier placeholder returned
    # "low" risk with security_check_complete=True for every email, which told
    # callers a check had passed when none had been run.
    email_data = email_data or {}

    try:
        analysis = security_analyser.analyse_email_security(email_data)
        phishing = analysis.get("phishing_analysis") or {}

        return {
            "email_id": email_data.get("id"),
            "risk_assessment": {
                "overall_risk": analysis.get("overall_risk_level", "unknown"),
                "risk_score": analysis.get("overall_risk_score", 0),
                "risk_factors": phishing.get("indicators") or []
            },
            "recommendations": analysis.get("recommendations") or [],
            "security_check_complete": True
        }

    except Exception as e:
        # Report the failure explicitly: an unchecked email must not look safe.
        print(f"⚠️ Security analysis failed: {e}")
        return {
            "email_id": email_data.get("id"),
            "risk_assessment": {
                "overall_risk": "unknown",
                "risk_score": 0,
                "risk_factors": []
            },
            "recommendations": [],
            "security_check_complete": False,
            "error": str(e)
        }

# Tool registration function for LangChain
def get_enhanced_tools():
    """
    Return the decorated email tools as a new list.

    Order is fixed: summariser, action extractor, security analysis.
    Registering them through this function is what makes them appear
    as named tools in LangSmith traces.
    """
    registered_tools = []
    for enhanced_tool in (summarise_email, extract_action_items, analyse_email_security):
        registered_tools.append(enhanced_tool)
    return registered_tools

# Cell status message: confirms the tool definitions above were executed.
print("✅ Enhanced tools with @tool decorator created")

### **Main Agent Configuration**

In [None]:
# Cell 5: Enhanced Email Agent with Proper Integration
class EnhancedAgentState(TypedDict):
    """Enhanced state for the email agent."""
    # Annotated with operator.add, which LangGraph uses as the reducer for this
    # key: whatever a node returns under "messages" is concatenated onto the
    # existing sequence rather than replacing it.
    messages: Annotated[Sequence[BaseMessage], operator.add]
    # Raw natural-language request passed to process_emails().
    user_query: str
    # model_dump() of the intent produced by the query parser.
    parsed_intent: Dict[str, Any]
    # Fetch result: "emails", "count" and "query" on success, "error" on failure.
    email_data: Dict[str, Any]
    # One entry per fetched email, in fetch order (report code pairs them by index).
    summaries: List[Dict[str, Any]]
    actions: List[Dict[str, Any]]
    security_results: List[Dict[str, Any]]
    # Rendered text report.
    final_output: str

class EnhancedEmailAgent:
    """
    Enhanced email management agent with proper tool integration,
    dynamic query parsing, and comprehensive summarisation.

    The agent runs a fixed LangGraph pipeline:
    parse_query -> fetch_emails -> security_check -> summarise_emails ->
    extract_actions -> generate_output.

    Every node returns only the state keys it changed. ``messages`` is
    declared with an ``operator.add`` reducer, so a node that handed back the
    whole state would have the message list appended to itself at each step.
    """

    # Risk levels that place an email in the security-alerts section.
    _ALERT_RISK_LEVELS = ("high", "critical")
    # Low-priority actions beyond this many are left out of the report.
    _MAX_LOW_PRIORITY_SHOWN = 5
    # Deadline strings that mean "no deadline" (the prompt allows "date or null").
    _EMPTY_DEADLINES = ("", "null", "none", "n/a")
    _RULE = "=" * 80

    def __init__(
        self,
        model_name: str = "gpt-4o-mini",
        temperature: float = 0.3,
        use_similarity_matching: bool = True
    ):
        """
        Initialise the enhanced agent.

        Args:
            model_name: LLM model to use
            temperature: Model temperature
            use_similarity_matching: Whether to use domain similarity matching
        """

        # Initialise LLM
        self.llm = ChatOpenAI(model=model_name, temperature=temperature)

        # Initialise components
        self.email_fetcher = EmailFetcher()
        self.query_parser = IntelligentQueryParser()

        # Initialise enhanced security analyser
        self.security_analyser = SecurityAnalyser()
        # Always define the attribute so callers can test it for None instead
        # of hitting AttributeError when similarity matching is disabled.
        self.domain_matcher = None
        if use_similarity_matching:
            # Patches the SecurityAnalyser *class*, so the instance created
            # above picks up the enhanced method as well.
            self.domain_matcher = integrate_similarity_matcher(SecurityAnalyser)

        # Get enhanced tools
        self.tools = get_enhanced_tools()

        # Build the enhanced graph
        self.app = self._build_enhanced_graph()

    # ------------------------------------------------------------------
    # Small formatting / coercion helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _text(value: Any, default: str = "") -> str:
        """Coerce an email field to ``str``; ``None`` and NaN become ``default``."""
        if value is None:
            return default
        # Missing DataFrame cells arrive as float NaN after to_dict('records').
        if isinstance(value, float) and value != value:
            return default
        return str(value)

    @staticmethod
    def _join(values: Any) -> str:
        """Comma-join a list of arbitrary items; a non-list is rendered with str()."""
        if isinstance(values, (list, tuple)):
            return ", ".join(str(item) for item in values)
        return str(values)

    @classmethod
    def _banner(cls, title: str) -> str:
        """Return ``title`` framed by two full-width rules, preceded by a blank line."""
        return "\n" + cls._RULE + "\n" + title + "\n" + cls._RULE + "\n"

    @classmethod
    def _has_deadline(cls, deadline: Any) -> bool:
        """True when ``deadline`` holds a real value rather than a null placeholder."""
        if not deadline:
            return False
        return str(deadline).strip().lower() not in cls._EMPTY_DEADLINES

    # ------------------------------------------------------------------
    # Graph construction
    # ------------------------------------------------------------------

    def _build_enhanced_graph(self) -> Any:
        """Build the enhanced LangGraph workflow and return the compiled graph."""

        # Create workflow
        workflow = StateGraph(EnhancedAgentState)

        # Add nodes
        workflow.add_node("parse_query", self._parse_query)
        workflow.add_node("fetch_emails", self._fetch_emails)
        workflow.add_node("security_check", self._security_analysis)
        workflow.add_node("summarise_emails", self._summarise_emails)
        workflow.add_node("extract_actions", self._extract_actions)
        workflow.add_node("generate_output", self._generate_final_output)

        # Define the flow
        workflow.set_entry_point("parse_query")
        workflow.add_edge("parse_query", "fetch_emails")
        workflow.add_edge("fetch_emails", "security_check")
        workflow.add_edge("security_check", "summarise_emails")
        workflow.add_edge("summarise_emails", "extract_actions")
        workflow.add_edge("extract_actions", "generate_output")
        workflow.add_edge("generate_output", END)

        # Compile
        return workflow.compile()

    # ------------------------------------------------------------------
    # Graph nodes (each returns a partial state update)
    # ------------------------------------------------------------------

    def _parse_query(self, state: EnhancedAgentState) -> Dict[str, Any]:
        """Parse user query to understand intent."""

        print("🧠 Parsing user query with intelligence...")

        user_query = state.get("user_query", "")

        # Parse the query
        intent = self.query_parser.parse_user_query(user_query)
        parsed_intent = intent.model_dump()

        print(f"  ✅ Intent parsed: {parsed_intent}")

        return {"parsed_intent": parsed_intent}

    def _fetch_emails(self, state: EnhancedAgentState) -> Dict[str, Any]:
        """Fetch emails based on parsed intent."""

        print("📧 Fetching emails based on parsed intent...")

        intent = EmailQueryIntent(**state["parsed_intent"])

        # Build Gmail query
        gmail_query, max_results = self.query_parser.build_gmail_query(intent)

        print(f"  Query: '{gmail_query}' (max: {max_results})")

        # Every outcome carries the same keys so later stages and the report
        # never need to special-case an error or an empty result.
        email_data = {"emails": [], "count": 0, "query": gmail_query}

        try:
            df = self.email_fetcher.fetch_emails(gmail_query, max_results)

            if not df.empty:
                emails = df.to_dict('records')
                email_data["emails"] = emails
                email_data["count"] = len(emails)
                print(f"  ✅ Fetched {len(emails)} emails")
            else:
                print("  ⚠️ No emails found")

        except Exception as e:
            email_data["error"] = str(e)
            print(f"  ❌ Error: {e}")

        return {"email_data": email_data}

    def _security_analysis(self, state: EnhancedAgentState) -> Dict[str, Any]:
        """Perform security analysis on all emails."""

        print("🔒 Performing security analysis...")

        emails = state.get("email_data", {}).get("emails", [])

        # Use the enhanced security analyser
        security_results = [
            self.security_analyser.analyse_email_security(email)
            for email in emails
        ]

        high_risk = sum(
            1 for result in security_results
            if result.get("overall_risk_level") in self._ALERT_RISK_LEVELS
        )
        print(f"  ✅ Security check complete: {high_risk} high-risk emails")

        return {"security_results": security_results}

    def _summarise_emails(self, state: EnhancedAgentState) -> Dict[str, Any]:
        """Create comprehensive summaries using full email content."""

        print("📝 Creating comprehensive summaries...")

        emails = state.get("email_data", {}).get("emails", [])
        summaries = []

        for email in emails:
            # The tool schema requires strings; coerce None/NaN fields so one
            # sparse email cannot abort the whole run with a validation error.
            summary = summarise_email.invoke({
                "email_id": self._text(email.get("id")),
                "subject": self._text(email.get("subject")),
                "sender": self._text(email.get("from")),
                "body_text": self._text(email.get("body_text")),  # FULL text
                "body_html": self._text(email.get("body_html"))   # FULL HTML
            })
            summaries.append(summary)

        print(f"  ✅ Created {len(summaries)} comprehensive summaries")

        return {"summaries": summaries}

    def _extract_actions(self, state: EnhancedAgentState) -> Dict[str, Any]:
        """Extract action items from emails."""

        print("📋 Extracting action items...")

        emails = state.get("email_data", {}).get("emails", [])
        actions = []

        for email in emails:
            # Use the enhanced action extraction tool
            action_result = extract_action_items.invoke({
                "email_id": self._text(email.get("id")),
                "subject": self._text(email.get("subject")),
                "body_text": self._text(email.get("body_text")),  # FULL text
                "sender": self._text(email.get("from"))
            })
            actions.append(action_result)

        # Count the items actually returned rather than the model-reported
        # "total_actions", which may be missing or not a number.
        total_actions = 0
        for action_result in actions:
            payload = action_result.get("actions")
            items = payload.get("action_items") if isinstance(payload, dict) else None
            if isinstance(items, list):
                total_actions += len(items)
        print(f"  ✅ Extracted {total_actions} total action items")

        return {"actions": actions}

    def _generate_final_output(self, state: EnhancedAgentState) -> Dict[str, Any]:
        """Generate comprehensive final output with proper structure."""

        print("📊 Generating comprehensive final report...")

        # Build structured output
        return {"final_output": self._build_comprehensive_output(state)}

    # ------------------------------------------------------------------
    # Report rendering
    # ------------------------------------------------------------------

    def _build_comprehensive_output(self, state: EnhancedAgentState) -> str:
        """
        Build comprehensive output with:
        1. Security warnings (if any)
        2. Full email summaries
        3. Consolidated action items
        4. Clear structure and formatting
        """

        email_data = state.get("email_data", {})
        emails = email_data.get("emails", [])
        summaries = state.get("summaries", [])
        actions = state.get("actions", [])
        security_results = state.get("security_results", [])

        parts = [self._banner("📊 COMPREHENSIVE EMAIL ANALYSIS REPORT"), "\n"]

        # Overview
        parts.append(f"📧 Total Emails Analysed: {len(emails)}\n")
        parts.append(f"🔍 Query Used: {email_data.get('query', 'N/A')}\n")
        if email_data.get("error"):
            # Without this line a failed fetch looks like an empty mailbox.
            parts.append(f"❌ Fetch Error: {email_data['error']}\n")

        # Security Section First (Priority)
        parts.append(self._format_security_alerts(emails, security_results))
        parts.append(self._format_email_summaries(emails, summaries, security_results))
        parts.append(self._format_action_items(emails, actions))
        parts.append(self._banner("✅ END OF REPORT"))

        return "".join(parts)

    def _format_security_alerts(self, emails: List[Dict[str, Any]],
                                security_results: List[Dict[str, Any]]) -> str:
        """Render the alert section for high/critical emails; '' when there are none."""
        flagged = [
            (index, result) for index, result in enumerate(security_results)
            if result.get("overall_risk_level") in self._ALERT_RISK_LEVELS
        ]
        if not flagged:
            return ""

        lines = [self._banner("⚠️ SECURITY ALERTS - IMMEDIATE ATTENTION REQUIRED")]
        for index, risk in flagged:
            email = emails[index] if index < len(emails) else {}
            lines.append(f"\n🔴 HIGH RISK: {self._text(email.get('subject'), 'No subject')}\n")
            lines.append(f"   From: {self._text(email.get('from'), 'Unknown')}\n")
            lines.append(f"   Risk Level: {self._text(risk.get('overall_risk_level')).upper()}\n")
            lines.append(f"   Risk Score: {risk.get('overall_risk_score', 0)}/100\n")

            recommendations = risk.get("recommendations") or []
            if recommendations:
                lines.append("   Recommendations:\n")
                for rec in recommendations[:3]:
                    lines.append(f"     • {rec}\n")
        return "".join(lines)

    def _format_email_summaries(self, emails: List[Dict[str, Any]],
                                summaries: List[Dict[str, Any]],
                                security_results: List[Dict[str, Any]]) -> str:
        """Render the per-email summary section."""
        lines = [self._banner("📝 DETAILED EMAIL SUMMARIES")]

        for number, (email, summary, security) in enumerate(
            zip(emails, summaries, security_results), 1
        ):
            lines.append(f"\n{number}. {self._text(email.get('subject'), 'No subject')}\n")
            lines.append("-" * 60 + "\n")
            lines.append(f"From: {self._text(email.get('from'), 'Unknown')}\n")
            lines.append(f"Date: {self._text(email.get('date'), 'Unknown')[:19]}\n")
            lines.append(f"Security Status: {security.get('overall_risk_level', 'unchecked')}\n")

            # The summary body is model output; check shapes before using them.
            sum_data = summary.get("summary", {})
            if not isinstance(sum_data, dict):
                sum_data = {}

            if not summary.get("summarisation_successful"):
                lines.append(f"\n⚠️ Summarisation failed: {sum_data.get('error', 'Unknown error')}\n")
                continue

            # Comprehensive Summary
            lines.append("\n📌 Executive Summary:\n")
            lines.append(f"   {sum_data.get('executive_summary', 'Not available')}\n")

            lines.append("\n📍 Main Purpose:\n")
            lines.append(f"   {sum_data.get('main_purpose', 'Not identified')}\n")

            key_points = sum_data.get("key_points") or []
            if isinstance(key_points, str):
                # A bare string would otherwise be iterated character by character.
                key_points = [key_points]
            if key_points:
                lines.append("\n🔑 Key Points:\n")
                for point in key_points:
                    lines.append(f"   • {point}\n")

            important = sum_data.get("important_details", {})
            if isinstance(important, dict) and any(important.values()):
                lines.append("\n📊 Important Details:\n")
                if important.get("dates"):
                    lines.append(f"   Dates: {self._join(important['dates'])}\n")
                if important.get("requirements"):
                    lines.append(f"   Requirements: {self._join(important['requirements'])}\n")

        return "".join(lines)

    def _format_action_items(self, emails: List[Dict[str, Any]],
                             actions: List[Dict[str, Any]]) -> str:
        """Render the consolidated to-do list grouped by priority; '' when empty."""
        buckets: Dict[str, List[Dict[str, Any]]] = {"High": [], "Medium": [], "Low": []}

        for index, action_set in enumerate(actions):
            if not action_set.get("extraction_successful"):
                continue
            subject = self._text(emails[index].get("subject")) if index < len(emails) else ""
            payload = action_set.get("actions")
            items = payload.get("action_items") if isinstance(payload, dict) else None
            if not isinstance(items, list):
                continue
            for item in items:
                if not isinstance(item, dict):
                    continue
                # Match priority case-insensitively. An unrecognised or missing
                # priority is listed as Medium rather than silently dropped.
                priority = self._text(item.get("priority")).strip().capitalize()
                if priority not in buckets:
                    priority = "Medium"
                # Copy so the dicts held in graph state are not mutated.
                buckets[priority].append({**item, "email_subject": subject})

        if not any(buckets.values()):
            return ""

        lines = [self._banner("📋 CONSOLIDATED ACTION ITEMS & TO-DO LIST")]

        if buckets["High"]:
            lines.append("\n🔴 HIGH PRIORITY:\n")
            for action in buckets["High"]:
                lines.append(f"   □ {self._text(action.get('action'), '(no description)')}\n")
                if self._has_deadline(action.get("deadline")):
                    lines.append(f"     ⏰ Deadline: {action['deadline']}\n")
                lines.append(f"     📧 From: {action['email_subject'][:50]}...\n")

        if buckets["Medium"]:
            lines.append("\n🟡 MEDIUM PRIORITY:\n")
            for action in buckets["Medium"]:
                lines.append(f"   □ {self._text(action.get('action'), '(no description)')}\n")
                if self._has_deadline(action.get("deadline")):
                    lines.append(f"     ⏰ Deadline: {action['deadline']}\n")

        if buckets["Low"]:
            lines.append("\n🟢 LOW PRIORITY:\n")
            for action in buckets["Low"][:self._MAX_LOW_PRIORITY_SHOWN]:  # Limit to 5
                lines.append(f"   □ {self._text(action.get('action'), '(no description)')}\n")

        return "".join(lines)

    # ------------------------------------------------------------------
    # Public entry point
    # ------------------------------------------------------------------

    def process_emails(self, user_query: str) -> Dict[str, Any]:
        """
        Process user email query with enhanced capabilities.

        Args:
            user_query: Natural language query from user

        Returns:
            Dict with results and status: ``success``, plus ``output`` and
            ``state`` on success or ``error`` on failure.
        """

        print("\n" + "="*80)
        print("🤖 ENHANCED EMAIL AGENT ACTIVE")
        print("="*80)
        print(f"\n📝 User Query: {user_query}\n")

        try:
            # Initial state
            initial_state = {
                "messages": [HumanMessage(content=user_query)],
                "user_query": user_query,
                "parsed_intent": {},
                "email_data": {},
                "summaries": [],
                "actions": [],
                "security_results": [],
                "final_output": ""
            }

            # Run the workflow
            final_state = self.app.invoke(initial_state)

            # Display result
            print(final_state.get("final_output", "No result"))

            return {
                'success': True,
                'output': final_state.get("final_output"),
                'state': final_state
            }

        except Exception as e:
            print(f"\n❌ ERROR: {str(e)}")
            return {
                'success': False,
                'error': str(e)
            }

# Cell status message: the agent class is defined; no agent instance exists yet.
print("✅ Enhanced Email Agent created with all improvements")

### **Initialise the Enhanced Agent**

In [None]:
# After all cells are loaded
# Builds the agent. With use_similarity_matching=True the constructor also
# calls integrate_similarity_matcher(SecurityAnalyser).
enhanced_agent = EnhancedEmailAgent(
    model_name="gpt-4o-mini",
    temperature=0.3,
    use_similarity_matching=True  # Enable domain similarity
)

# Test with various queries
# NOTE: each call rebinds `result`, so only the last query's return value is
# kept; process_emails() prints every report itself.
result = enhanced_agent.process_emails("Show me my last 5 unread emails")
result = enhanced_agent.process_emails("Get all emails from today")
result = enhanced_agent.process_emails("Summarise last 5 important emails")

### **Test**

In [None]:
# Smoke-test the domain matcher attached to the agent. Requires the agent
# creation cell to have been run in this kernel.
if 'enhanced_agent' in locals():
    # The agent only gets a matcher when built with use_similarity_matching=True,
    # so look the attribute up defensively instead of assuming it exists.
    domain_matcher = getattr(enhanced_agent, 'domain_matcher', None)

    if domain_matcher is None:
        print("⚠️ Domain similarity matching is disabled on this agent. Recreate it with:")
        print("enhanced_agent = EnhancedEmailAgent(use_similarity_matching=True)")
    else:
        # Now you can test it
        test_domain = "arnazon.com"  # Note the typo
        suspicious, trusted = domain_matcher.get_similar_domains(test_domain)

        print("✅ Domain Matcher Test Results:")
        print(f"\nTesting domain: {test_domain}")
        print("\nTop suspicious similar domains:")
        for doc in suspicious[:5]:
            print(f"  - {doc.metadata['domain']}: {doc.metadata['context']}")

        print("\nTop trusted similar domains:")
        for doc in trusted[:5]:
            print(f"  - {doc.metadata['domain']}: {doc.metadata['context']}")
else:
    print("⚠️ Enhanced agent not yet created. Create it first with:")
    print("enhanced_agent = EnhancedEmailAgent(use_similarity_matching=True)")

In [None]:
# Test various natural language queries
# Exercises the parser on its own: no emails are fetched in this cell.
parser = IntelligentQueryParser()
intent = parser.parse_user_query("Show me urgent emails from last week")
print(intent.model_dump())  # Changed from .dict() to .model_dump()

In [None]:
# Test complete pipeline
# Relies on `enhanced_agent` from the agent creation cell. `result` holds the
# return value of the last query only.
queries = [
    "Find emails from this week about meetings"
]
for query in queries:
    result = enhanced_agent.process_emails(query)