In this post we will see how to use a local LLM to extract structured information from emails.

My very first project when I started working at appliedAI Initiative in 2021 involved information extraction from emails for a company that makes a document management system. Back then, LLMs were not yet as widespread or as useful as they are now, so we decided to train a model from scratch. However, we didn't have any labelled training data because we couldn't use the company's customer data for privacy reasons. We had to resort to manually labelling emails from the [Enron email dataset](https://www.cs.cmu.edu/~enron/), and in the end the results were not very impressive.

Now, this type of application is simpler than ever and I want to demonstrate that in this blog post.

# Imports

In [None]:
import random
import yaml
from datetime import datetime
from difflib import SequenceMatcher
from email.parser import Parser
from functools import partial
from statistics import fmean
from typing import Any, Callable, Literal

import numpy as np
import seaborn as sns
from llama_cpp import Llama
from pydantic import BaseModel
from tqdm.notebook import tqdm, trange

# Seed Python's `random` module so the random train/test split later in the
# notebook is reproducible between runs.
random.seed(16)
# Apply a global seaborn theme to the plots drawn in this notebook.
sns.set_theme(style="ticks", palette="pastel")

# Email information model

We start by defining the model and by extension the schema of the structured information we want to extract.

For that we use [Pydantic](https://docs.pydantic.dev/) to define the email information, sender and recipient models, along with methods to compare two instances and compute a similarity score for each model type.

In [None]:
class EmailBaseModel(BaseModel, extra="forbid"):
    """Common pydantic base for the email models used in this notebook.

    Subclasses inherit a helper that scores how similar two optional string
    fields are.

    Note:
        ``extra="forbid"`` is passed in the class header so that attributes
        beyond the explicitly declared ones are rejected.
    """

    @staticmethod
    def _compare_strings(a: str | None, b: str | None) -> float:
        """Score the similarity of two optional strings.

        Args:
            a: First string, or None when the value is missing.
            b: Second string, or None when the value is missing.

        Returns:
            A ratio between 0.0 and 1.0:
                - 1.0 when both values are None,
                - 0.0 when exactly one of them is None,
                - otherwise the ``SequenceMatcher`` ratio of the two strings
                  (1.0 for identical strings).
        """
        missing = (a is None, b is None)
        # Both absent: the two sides agree that there is no value.
        if all(missing):
            return 1.0
        # Only one side absent: treated as a complete mismatch.
        if any(missing):
            return 0.0
        return SequenceMatcher(None, a, b).ratio()

In [None]:
class Sender(EmailBaseModel):
    """Sender of an email together with optional contact details.

    Attributes:
        name: Full name of the sender, if known.
        email: Email address of the sender (required).
        phone_number: Phone number of the sender, if known.
        role: Professional role of the sender, if known.
        organization: Organization of the sender, if known.
    """

    name: str | None = None
    email: str
    phone_number: str | None = None
    role: str | None = None
    organization: str | None = None

    def compare(self, other: "Sender") -> float:
        """Return the mean per-field similarity between two senders.

        Every attribute is scored with ``_compare_strings`` and the scores
        are averaged with equal weight.

        Args:
            other: The sender to compare against.

        Returns:
            A value between 0.0 and 1.0; 1.0 means every field matches
            exactly. 0.0 is returned immediately when ``other`` is not a
            ``Sender``.
        """
        if not isinstance(other, Sender):
            return 0.0

        # Same field order as the declaration above, so the averaged list is
        # built in a fixed, predictable order.
        compared_fields = ("name", "email", "phone_number", "role", "organization")
        field_scores = [
            self._compare_strings(getattr(self, field), getattr(other, field))
            for field in compared_fields
        ]
        return fmean(field_scores)

In [None]:
class Recipient(EmailBaseModel):
    """Recipient of an email together with optional contact details.

    Attributes:
        name: Full name of the recipient, if known.
        email: Email address of the recipient (required).
        phone_number: Phone number of the recipient, if known.
        role: Professional role of the recipient, if known.
        organization: Organization of the recipient, if known.
        type: How the recipient was addressed: "to", "cc" or "bcc"
            (defaults to "to").
    """

    name: str | None = None
    email: str
    phone_number: str | None = None
    role: str | None = None
    organization: str | None = None
    type: Literal["to", "cc", "bcc"] = "to"

    def compare(self, other: "Recipient") -> float:
        """Return the mean per-field similarity between two recipients.

        The string attributes are scored with ``_compare_strings``; the
        recipient type contributes 1.0 when equal and 0.0 otherwise. All six
        scores are averaged with equal weight.

        Args:
            other: The recipient to compare against.

        Returns:
            A value between 0.0 and 1.0; 1.0 means every field matches
            exactly. 0.0 is returned immediately when ``other`` is not a
            ``Recipient``.
        """
        if not isinstance(other, Recipient):
            return 0.0

        string_fields = ("name", "email", "phone_number", "role", "organization")
        field_scores = [
            self._compare_strings(getattr(self, field), getattr(other, field))
            for field in string_fields
        ]
        # The recipient type is categorical, so it is an all-or-nothing match.
        field_scores.append(float(self.type == other.type))
        return fmean(field_scores)

In [None]:
class EmailInformation(EmailBaseModel):
    """Structured information extracted from a single email.

    Attributes:
        date: Date of the email, stored as a string.
        subject: Subject line of the email.
        sender: ``Sender`` model describing who sent the email.
        recipients: ``Recipient`` models describing who received it.
    """

    date: str
    subject: str
    sender: Sender
    recipients: list[Recipient]

    def compare(self, other: "EmailInformation") -> float:
        """Return a similarity score between two email information objects.

        The score is the unweighted mean of four components: date, subject,
        sender and recipients.

        Args:
            other: The email information to compare against.

        Returns:
            A value between 0.0 and 1.0, where 1.0 means the two objects are
            equal.

        Note:
            - 0.0 is returned when ``other`` is not an ``EmailInformation``.
            - 1.0 is returned straight away when ``self == other``.
            - Recipient matching is one-directional: each recipient of
              ``self`` is paired with its most similar recipient in
              ``other`` and those best scores are averaged. Recipients that
              appear only in ``other`` therefore do not lower the score, and
              one recipient of ``other`` may be the best match for several
              recipients of ``self``.
        """
        if not isinstance(other, EmailInformation):
            return 0.0
        if self == other:
            return 1.0

        if self.recipients == other.recipients:
            recipients_score = 1.0
        elif not self.recipients:
            # Nothing on this side to match against a non-empty other side.
            recipients_score = 0.0
        else:
            best_match_scores = [
                max(
                    (own.compare(candidate) for candidate in other.recipients),
                    default=0.0,
                )
                for own in self.recipients
            ]
            recipients_score = fmean(best_match_scores)

        component_scores = [
            self._compare_strings(self.date, other.date),
            self._compare_strings(self.subject, other.subject),
            self.sender.compare(other.sender),
            recipients_score,
        ]
        return fmean(component_scores)

In order to evaluate our different approaches, we also define a helper function that runs an extraction approach over all emails in a given dataset and computes the accuracy (similarity) for each of them.

In [None]:
def evaluate_extraction(
    extract_fn: Callable[[str], EmailInformation],
    dataset: list[dict[str, Any]],
) -> list[float]:
    """Score an extraction function against every annotated email in a dataset.

    Each email is passed through ``extract_fn`` and the result is compared to
    the ground truth with ``EmailInformation.compare``, called on the ground
    truth object. A progress bar labelled "Emails" is shown while iterating.

    Args:
        extract_fn: Callable that maps a raw email string to an
            ``EmailInformation`` object.
        dataset: Samples to evaluate. Each sample is a dictionary with:
            - 'raw_email': the raw email text,
            - 'extracted_information': the ground truth ``EmailInformation``.

    Returns:
        One score per sample, in dataset order, each between 0.0 (nothing
        matched) and 1.0 (extraction equals the ground truth).
    """

    def score_sample(sample: dict[str, Any]) -> float:
        # Extract first, then let the ground truth rate the prediction.
        prediction = extract_fn(sample["raw_email"])
        return sample["extracted_information"].compare(prediction)

    progress = tqdm(dataset, desc="Emails", leave=False)
    return [score_sample(sample) for sample in progress]

# Data

As in my first project, we will use emails from the [Enron dataset](https://www.cs.cmu.edu/~enron/) as data.

I went ahead and created a sample of 20 emails and manually extracted information from them in order to be able to evaluate the different methods.

We load the dataset and split it into train and test sets with split sizes of 0.4 and 0.6, respectively.

In [None]:
# Load the manually annotated emails. Each entry holds the raw email text
# under "raw_email" and the hand-labelled ground truth under
# "extracted_information".
with open("enron_emails.yml", encoding="utf-8") as f:
    all_email_data = yaml.safe_load(f)

# Turn the plain ground truth dictionaries into validated EmailInformation
# models so they can be compared against extraction results later.
for email_data in all_email_data:
    email_data["extracted_information"] = EmailInformation.model_validate(
        email_data["extracted_information"]
    )

all_indices = list(range(len(all_email_data)))
# random.sample draws WITHOUT replacement. random.choices (used previously)
# draws with replacement, which could put the same email into the train set
# several times and made the test set larger than the intended 60%.
train_set_indices = random.sample(all_indices, k=int(0.4 * len(all_email_data)))
train_index_lookup = set(train_set_indices)
# Everything that is not a train email is a test email; keep dataset order.
test_set_indices = [i for i in all_indices if i not in train_index_lookup]
train_set = [all_email_data[i] for i in train_set_indices]
test_set = [all_email_data[i] for i in test_set_indices]

In [None]:
# Take the running example from the TEST set. It was previously taken from
# the train set, so the few-shot sanity check later in the notebook could be
# scored on an email that is also the in-prompt example.
sample_test_email = test_set[0]
sample_raw_email = sample_test_email["raw_email"]
sample_email_information = sample_test_email["extracted_information"]
print(f"Sample raw email:\n\n{sample_raw_email}")

In [None]:
# Pretty-print the hand-labelled ground truth of the sample email as JSON.
ground_truth_json = sample_email_information.model_dump_json(indent=4)
print(f"Sample ground truth extracted information:\n\n{ground_truth_json}")

## First approach - Use Python's builtin email parser

As a first approach, we will simply use Python's builtin email parser from the [email](https://docs.python.org/3/library/email.examples.html) package.

We define an extraction function that parses the emails and extracts information from them without much validation.

In [None]:
def extract_information_with_builtin_parser(raw_email: str) -> EmailInformation:
    """Extract structured information from a raw email with the stdlib email parser.

    Reads the Date, Subject, From, To, Cc and Bcc headers. The X-From, X-To,
    X-Cc and X-Bcc headers are used as a source of display names when present.

    Args:
        raw_email: Raw email text including headers and body.

    Returns:
        Structured object containing the extracted information with:
            - date: Formatted as DD.MM.YYYY
            - subject: Email subject line (empty string if the header is missing)
            - sender: Sender email and, if X-From differs from it, the name
            - recipients: List of recipients (to/cc/bcc) with email and optional
                name, sorted by email address

    Raises:
        ValueError: If the email has no Date or From header. The date parser
            may also raise if the Date header cannot be parsed.
    """
    # Imported locally so the function does not depend on a change to the
    # notebook's import cell.
    from email.utils import parsedate_to_datetime

    email = Parser().parsestr(raw_email)

    date_header = email["date"]
    if date_header is None:
        raise ValueError("Email has no Date header")
    # parsedate_to_datetime understands RFC 2822 dates, including a trailing
    # comment such as "(PDT)", so no manual splitting or fixed format is needed.
    formatted_date = parsedate_to_datetime(date_header).strftime("%d.%m.%Y")

    sender_header = email["from"]
    if sender_header is None:
        raise ValueError("Email has no From header")
    sender_email = sender_header.strip()

    email_dict = {
        "date": formatted_date,
        "subject": (email["subject"] or "").strip(),
    }

    sender = {"email": sender_email}
    # Strip both sides before comparing so stray whitespace does not make the
    # address look like a distinct display name.
    sender_name = (email["X-from"] or "").strip()
    if sender_name and sender_name != sender_email:
        sender["name"] = sender_name
    email_dict["sender"] = sender

    recipients = []
    for type_ in ("to", "cc", "bcc"):
        raw_addresses = email.get(type_)
        if raw_addresses is None:
            continue
        addresses = [address.strip() for address in raw_addresses.split(",")]
        names = [name.strip() for name in email.get(f"X-{type_}", "").split(",")]
        # Names can only be paired positionally; if the counts differ (for
        # example names that themselves contain commas), drop them all.
        if len(names) != len(addresses):
            names = [""] * len(addresses)
        for name, address in zip(names, addresses):
            if not address:
                # Empty header value or trailing comma: not a real recipient.
                continue
            recipient = {"type": type_, "email": address}
            if name and name != address:
                recipient["name"] = name
            recipients.append(recipient)

    email_dict["recipients"] = sorted(recipients, key=lambda x: x["email"])

    return EmailInformation.model_validate(email_dict)

In [None]:
# Run the builtin-parser extraction on the sample email and score it against
# the sample's ground truth.
extracted_information = extract_information_with_builtin_parser(sample_raw_email)
sample_accuracy = sample_email_information.compare(extracted_information)

builtin_sample_json = extracted_information.model_dump_json(indent=4)
builtin_sample_pct = sample_accuracy * 100
print(f"Sample extracted information with builtin parser:\n\n{builtin_sample_json}")
print(
    f"Sample email information extraction accuracy for builtin parser: {builtin_sample_pct:.2f}%"
)

In [None]:
# Score the builtin parser on every email of the test set.
builtin_test_accuracies = evaluate_extraction(
    extract_fn=extract_information_with_builtin_parser,
    dataset=test_set,
)

builtin_mean_pct = np.mean(builtin_test_accuracies) * 100
print(
    f"Mean email information extraction test accuracy for builtin parser: {builtin_mean_pct:.2f}%"
)

In [None]:
# Box plot of the per-email test accuracies of the builtin parser.
# NOTE(review): the trailing semicolon presumably suppresses the notebook's
# echo of the returned object — confirm before removing it.
sns.boxplot(x=builtin_test_accuracies);

## Second Approach - Use LLM zero-shot extraction with JSON schema

We will use [llama-cpp-python](https://github.com/abetlen/llama-cpp-python), a Python wrapper for [llama.cpp](https://github.com/ggerganov/llama.cpp), to run an LLM locally.

It has support for passing a JSON schema to enforce structured output generation, without having to play around with the prompt and retry in case of failed JSON generation.

> **Note**: There are [known performance issues](https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md#troubleshooting) with llama.cpp's structured output generation using grammars and, by extension, JSON schemas, especially with nested objects.

We use a quantized version of Llama 3.2 3B Instruct as the LLM and limit the context length to 16384 in order to handle long raw emails.

In [None]:
# Load the quantized (Q8_0) Llama 3.2 3B Instruct GGUF model from the given
# repository and file name. `llm` is used as a module-level model handle by
# the LLM extraction function defined later in the notebook.
llm = Llama.from_pretrained(
    "bartowski/Llama-3.2-3B-Instruct-GGUF",
    filename="Llama-3.2-3B-Instruct-Q8_0.gguf",
    n_ctx=16384,  # context length, chosen to fit long raw emails
    # NOTE(review): -1 presumably offloads all layers to the GPU — confirm
    # against the llama-cpp-python documentation.
    n_gpu_layers=-1,
    verbose=False,
)

We then define a system prompt with instructions for the LLM. Inspired by [this blog post](https://www.boundaryml.com/blog/type-definition-prompting-baml), I decided to use a JSON type definition in the prompt instead of a JSON schema because it is shorter, produced better results and is more human-readable.

In [None]:
# System prompt describing the expected output as a compact JSON type
# definition. The keys must mirror the fields of the EmailInformation, Sender
# and Recipient models. The recipient kind is therefore called "type" — it was
# previously written as "to", which contradicted the schema enforced at
# generation time.
system_prompt = """You are a helpful assistant that extract information from a user provided email in JSON format that adheres to the following schema:

{
    "date": string,
    "subject": string,
    "sender": {
        "name": string | null,
        "email": string,
        "phone_number": string | null,
        "role": string | null,
        "organization": string | null
    },
    "recipients": {
        "name": string | null,
        "email": string,
        "phone_number": string | null,
        "role": string | null,
        "organization": string | null,
        "type": enum(["to", "cc", "bcc"])
    }[]
}
"""
print(f"System prompt:\n---\n\n{system_prompt}")

In [None]:
def extract_information_with_llm(
    raw_email: str, *, system_prompt: str
) -> EmailInformation:
    """Extract structured information from a raw email with the local LLM.

    Sends the system prompt and the raw email to the module-level ``llm`` as
    a chat completion. The JSON schema of ``EmailInformation`` is passed as
    the response format, and the reply is validated into an
    ``EmailInformation`` object.

    Args:
        raw_email: Raw email text including headers and body.
        system_prompt: System prompt that describes the extraction task to
            the LLM.

    Returns:
        The extracted information, validated against the EmailInformation schema.
    """
    chat_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": raw_email},
    ]
    # The response format carries the JSON schema derived from the pydantic
    # model, so the completion is requested as a JSON object of that shape.
    json_constraint = {
        "type": "json_object",
        "schema": EmailInformation.model_json_schema(),
    }
    completion = llm.create_chat_completion_openai_v1(
        messages=chat_messages,
        response_format=json_constraint,
        temperature=0.3,
    )
    reply_json = completion.choices[0].message.content
    return EmailInformation.model_validate_json(reply_json)

In [None]:
# Run the zero-shot LLM extraction on the sample email and score it against
# the sample's ground truth.
extracted_information = extract_information_with_llm(
    sample_raw_email,
    system_prompt=system_prompt,
)
sample_accuracy = sample_email_information.compare(extracted_information)

zero_shot_sample_json = extracted_information.model_dump_json(indent=4)
zero_shot_sample_pct = sample_accuracy * 100
print(f"Sample extracted information with llm zero-shot:\n\n{zero_shot_sample_json}")
print(
    f"Sample email information extraction accuracy for llm zero-shot: {zero_shot_sample_pct:.2f}%"
)

In [None]:
# Score the zero-shot LLM extraction on every email of the test set. The
# system prompt is bound up front because evaluate_extraction calls the
# extraction function with the raw email only.
zero_shot_extract_fn = partial(extract_information_with_llm, system_prompt=system_prompt)
llm_test_accuracies = evaluate_extraction(zero_shot_extract_fn, test_set)

zero_shot_mean_pct = np.mean(llm_test_accuracies) * 100
print(f"Mean email information extraction test accuracy: {zero_shot_mean_pct:.2f}%")

In [None]:
# Box plot of the per-email test accuracies of the zero-shot LLM extraction.
sns.boxplot(x=llm_test_accuracies)

## Third approach - LLM few-shot extraction with JSON schema

In [None]:
# Leave-one-out search over the train set: every train email is used once as
# the in-prompt example and the resulting prompt is scored on the remaining
# train emails. Each entry is a (mean accuracy, prompt) tuple.
train_accuracies = []

for example_idx in trange(len(train_set), desc="Example"):
    shot = train_set[example_idx]
    # Evaluate on every train email except the one shown in the prompt.
    held_out_emails = [
        sample for idx, sample in enumerate(train_set) if idx != example_idx
    ]
    example_section = f"""
Use the following example of raw email and extracted information as reference:

# Raw email

{shot["raw_email"]}

# Extracted information

{shot["extracted_information"].model_dump_json(indent=2)}
"""
    candidate_prompt = system_prompt + example_section
    candidate_extract_fn = partial(
        extract_information_with_llm, system_prompt=candidate_prompt
    )
    candidate_scores = evaluate_extraction(candidate_extract_fn, held_out_emails)
    train_accuracies.append((float(np.mean(candidate_scores)), candidate_prompt))

In [None]:
# Box plot of the mean train accuracy obtained with each candidate example
# prompt (first element of every train_accuracies tuple).
sns.boxplot(x=[x[0] for x in train_accuracies])

In [None]:
# Select the prompt whose in-context example achieved the HIGHEST mean
# accuracy on the remaining train emails. argmin (used previously) selected
# the worst-performing prompt instead.
best_index = int(np.argmax([x[0] for x in train_accuracies]))
best_system_prompt_with_example = train_accuracies[best_index][1]
print(best_system_prompt_with_example)

In [None]:
# Run the few-shot LLM extraction (best example prompt) on the sample email
# and score it against the sample's ground truth. The printed labels say
# "few-shot"; they previously repeated the zero-shot wording.
extracted_information = extract_information_with_llm(
    sample_raw_email, system_prompt=best_system_prompt_with_example
)
sample_accuracy = sample_email_information.compare(extracted_information)
print(
    f"Sample extracted information with llm few-shot:\n\n{extracted_information.model_dump_json(indent=4)}"
)
print(
    f"Sample email information extraction accuracy for llm few-shot: {sample_accuracy * 100:.2f}%"
)

In [None]:
# Score the few-shot LLM extraction, using the selected example prompt, on
# every email of the test set.
few_shot_extract_fn = partial(
    extract_information_with_llm,
    system_prompt=best_system_prompt_with_example,
)
few_shot_llm_test_accuracies = evaluate_extraction(few_shot_extract_fn, test_set)

few_shot_mean_pct = np.mean(few_shot_llm_test_accuracies) * 100
print(f"Mean email information extraction test accuracy: {few_shot_mean_pct:.2f}%")

In [None]:
# Box plot of the per-email test accuracies of the few-shot LLM extraction.
sns.boxplot(x=few_shot_llm_test_accuracies)

In [None]:
import pandas as pd

# Collect the per-email test accuracies of the three approaches as columns of
# one table, keyed by the label used for each approach in the comparison plot.
accuracies_by_approach = {
    "Builtin": builtin_test_accuracies,
    "LLM Zero-Shot": llm_test_accuracies,
    "LLM Few-Shot": few_shot_llm_test_accuracies,
}
df = pd.DataFrame(accuracies_by_approach)

In [None]:
# Side-by-side box plots of the test accuracies, one box per column of df.
sns.boxplot(data=df)

# Conclusion

In this post, we have ...