In this post we will see how to use a local LLM to extract structured information from emails.

My very first project when I started working at appliedAI Initiative in 2021 involved information extraction from emails for a company that makes a document management system. Back then, LLMs were not yet as widespread or as useful as they are now, so we decided to train a model from scratch. However, we didn't have any labelled data for training because we couldn't use their customer data for privacy reasons, so we had to resort to manually labelling emails from the [Enron email dataset](https://www.cs.cmu.edu/~enron/). In the end, the results were not very impressive.

Today, building this type of application is simpler than ever, and I want to demonstrate that in this blog post.

# Imports

In [None]:
import json
import random
import yaml
from email.parser import Parser
from functools import partial
from typing import Any, Callable, Literal

import instructor
import numpy as np
from deepdiff import DeepDiff
from llama_cpp import Llama
from pydantic import BaseModel
from tqdm.notebook import tqdm

# Seed Python's `random` module so that the random train/test split drawn
# later in this notebook is reproducible across runs.
random.seed(16)

# Helper functions

In [None]:
# Contact details of a person involved in an email. Only `email` is required;
# every other field defaults to None.
# NOTE: documented with comments rather than a docstring because pydantic
# copies class docstrings into the generated JSON schema, and that schema is
# embedded in the LLM system prompt further down.
class Sender(BaseModel):
    name: str | None = None
    email: str  # the only mandatory field
    phone_number: str | None = None
    role: str | None = None
    organization: str | None = None


# A recipient carries the same contact fields as `Sender`, plus which of the
# "to" / "cc" / "bcc" lists it belongs to.
class Recipient(Sender):
    type: Literal["to", "cc", "bcc"] = "to"  # defaults to "to" when not given


# Structured information extracted from a single email; all four fields are
# required. This model is both the ground-truth format of the dataset and the
# response model requested from the LLM.
class EmailInformation(BaseModel):
    date: str  # kept as a plain string; the model itself does not parse it
    subject: str
    sender: Sender
    recipients: list[Recipient]

In [None]:
def compute_extracted_information_accuracy(
    *,
    expected_info: EmailInformation,
    extracted_info: EmailInformation,
) -> float:
    """Score how closely an extraction matches its reference.

    Both models are serialised to JSON-compatible dictionaries (fields that
    are ``None`` are dropped) and compared with ``DeepDiff``.

    Args:
        expected_info: Reference (ground-truth) information.
        extracted_info: Information produced by the extractor under test.

    Returns:
        One minus the ``deep_distance`` reported by ``DeepDiff`` for the two
        serialised models.
    """
    reference, candidate = (
        info.model_dump(mode="json", exclude_none=True)
        for info in (expected_info, extracted_info)
    )
    comparison = DeepDiff(
        reference,
        candidate,
        get_deep_distance=True,
        verbose_level=2,
    )
    distance = comparison["deep_distance"]
    return 1 - distance

In [None]:
def evaluate_extraction(
    extract_fn: Callable[[str], EmailInformation],
    dataset: list[dict[str, Any]],
) -> list[float]:
    """Run ``extract_fn`` on every sample of ``dataset`` and score each result.

    Args:
        extract_fn: Callable turning a raw email string into an
            ``EmailInformation``.
        dataset: Samples holding the raw text under ``"raw_email"`` and the
            reference annotation under ``"extracted_information"``.

    Returns:
        One accuracy value per sample, in dataset order.
    """
    scores = []
    progress = tqdm(dataset, desc="Emails")
    for sample in progress:
        # Echo the sample currently being processed.
        print(f"{sample=}")
        prediction = extract_fn(sample["raw_email"])
        scores.append(
            compute_extracted_information_accuracy(
                extracted_info=prediction,
                expected_info=sample["extracted_information"],
            )
        )
    return scores

# Data

As in my first project, we will use emails from the [Enron dataset](https://www.cs.cmu.edu/~enron/) as our data.

In [None]:
# Load the labelled emails. Each entry is read below through its "raw_email"
# and "extracted_information" keys.
with open("enron_emails.yml", encoding="utf-8") as f:
    all_email_data = yaml.safe_load(f)

# Turn the plain reference mappings into validated EmailInformation models.
for email_data in all_email_data:
    email_data["extracted_information"] = EmailInformation.model_validate(
        email_data["extracted_information"]
    )

# Split the emails 50/50 into train and test sets.
# `random.sample` draws WITHOUT replacement, so no email appears twice in the
# training set. (`random.choices` draws with replacement, which duplicated
# training emails and left more than half of the data in the test set.)
all_indices = list(range(len(all_email_data)))
train_set_indices = random.sample(all_indices, k=int(0.5 * len(all_email_data)))
# Sorted so the test-set order does not depend on set iteration order.
test_set_indices = sorted(set(all_indices).difference(train_set_indices))
train_set = [all_email_data[i] for i in train_set_indices]
test_set = [all_email_data[i] for i in test_set_indices]

In [None]:
# `train_set[0]` is a dict, so tuple-unpacking it would yield its *keys* (or
# raise if it does not hold exactly two entries). Read both fields by key.
sample_raw_email = train_set[0]["raw_email"]
sample_email_information = train_set[0]["extracted_information"]
print(f"Sample raw email:\n{sample_raw_email}")

In [None]:
# Display the reference annotation of the first training email (the bare
# expression is rendered as the notebook cell's output).
train_set[0]["extracted_information"]

## First approach - Use Python's builtin email parser

In [None]:
def extract_information_with_builtin_parser(raw_email: str) -> EmailInformation:
    """Extract email information from the headers with ``email.parser.Parser``.

    The date, subject and sender address are read from the ``Date``,
    ``Subject`` and ``From`` headers. Recipient addresses come from the
    ``To``/``Cc``/``Bcc`` headers. Names are taken from the matching
    ``X-From``/``X-To``/``X-cc``/``X-bcc`` headers when they are present and
    differ from the address itself.

    Args:
        raw_email: Full raw text of the email, headers included.

    Returns:
        The extracted information, with recipients sorted by email address.

    Raises:
        AttributeError: If the ``Date``, ``Subject`` or ``From`` header is
            missing (the header lookup then yields ``None``).
    """
    message = Parser().parsestr(raw_email)
    email_dict = {
        "date": message["date"].strip(),
        "subject": message["subject"].strip(),
    }

    sender_email = message["from"].strip()
    sender = {"email": sender_email}
    sender_name = (message["X-from"] or "").strip()
    # Compare stripped values on both sides, and only keep a name that adds
    # information beyond the address.
    if sender_name and sender_name != sender_email:
        sender["name"] = sender_name
    email_dict["sender"] = sender

    recipients = []
    for type_ in ["to", "cc", "bcc"]:
        raw_addresses = message.get(type_)
        if raw_addresses is None:
            continue
        addresses = [address.strip() for address in raw_addresses.split(",")]
        names = [name.strip() for name in message.get(f"X-{type_}", "").split(",")]
        # Names can only be paired positionally with addresses when both
        # headers list the same number of entries; otherwise drop the names.
        if len(names) != len(addresses):
            names = [""] * len(addresses)
        for name, address in zip(names, addresses):
            if not address:
                # Empty header or stray comma: there is no recipient here.
                continue
            recipient = {"type": type_, "email": address}
            if name and name != address:
                recipient["name"] = name
            recipients.append(recipient)

    email_dict["recipients"] = sorted(
        recipients, key=lambda recipient: recipient["email"]
    )

    return EmailInformation.model_validate(email_dict)

In [None]:
# Try the header-based parser on the first test email; the bare expression on
# the last line makes the notebook display the result.
extracted_information = extract_information_with_builtin_parser(
    test_set[0]["raw_email"]
)
extracted_information

In [None]:
# Score the extraction held in `extracted_information` against the reference
# annotation of the first test email and print it as a percentage.
sample_accuracy = compute_extracted_information_accuracy(
    extracted_info=extracted_information,
    expected_info=test_set[0]["extracted_information"],
)
print(f"Sample email information extraction accuracy: {sample_accuracy * 100:.2f}%")

In [None]:
# Score the header-based parser on every test email, then report the average.
test_accuracies = evaluate_extraction(
    extract_fn=extract_information_with_builtin_parser,
    dataset=test_set,
)

mean_test_accuracy = float(np.mean(test_accuracies))
print(
    "Mean email information extraction test accuracy for builtin parser: "
    f"{mean_test_accuracy * 100:.2f}%"
)

## Second approach - Use LLM with JSON schema

We will use [llama-cpp-python](https://github.com/abetlen/llama-cpp-python), a Python wrapper for [llama.cpp](https://github.com/ggerganov/llama.cpp), to run an LLM locally.

It supports passing a JSON schema to enforce structured output generation, without having to tweak the prompt or retry when JSON generation fails.

Unfortunately, due to [known performance issues](https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md#troubleshooting) with llama.cpp's grammars and, by extension, JSON schemas, we will instead use [instructor](https://github.com/jxnl/instructor) to obtain output that is validated against our Pydantic model.

In [None]:
# Load the GGUF model file named below through llama-cpp-python.
llm = Llama.from_pretrained(
    "bartowski/Llama-3.2-1B-Instruct-GGUF",
    filename="Llama-3.2-1B-Instruct-Q8_0.gguf",
    n_ctx=16384,
    n_gpu_layers=-1,  # presumably "offload all layers to the GPU" — confirm in llama-cpp-python docs
    verbose=False,
)
# Wrap the OpenAI-compatible chat-completion call with instructor so that it
# accepts a `response_model` argument (used by `extract_information_with_llm`).
# NOTE(review): no `mode=` is passed, so instructor's default mode applies —
# confirm that it is the intended one for llama-cpp-python.
llm_extract_information = instructor.patch(
    create=llm.create_chat_completion_openai_v1,
)

In [None]:
# Wrap the JSON schema generated from EmailInformation in a mapping that also
# carries a "json_object" type tag, then print it for inspection.
email_json_schema = dict(
    type="json_object",
    schema=EmailInformation.model_json_schema(),
)
print(email_json_schema)

In [None]:
# System prompt for the schema-only approach: a one-sentence instruction
# followed by the JSON dump of `email_json_schema` (4-space indented).
system_prompt = f"""You are a helpful assistant that extract information from a user provided email in JSON format that adheres to the following schema:

{json.dumps(email_json_schema, indent=4)}
"""

In [None]:
def extract_information_with_llm(
    raw_email: str, *, system_prompt: str
) -> EmailInformation:
    """Ask the local LLM to extract structured information from an email.

    Args:
        raw_email: Raw email text, sent as the user message.
        system_prompt: Instructions sent as the system message.

    Returns:
        The result of the instructor-patched completion call made with
        ``response_model=EmailInformation``.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": raw_email},
    ]
    return llm_extract_information(
        messages=conversation,
        response_model=EmailInformation,
        temperature=0.3,
    )

In [None]:
# Try the LLM-based extractor on the first test email; the bare expression on
# the last line makes the notebook display the result.
extracted_information = extract_information_with_llm(
    test_set[0]["raw_email"], system_prompt=system_prompt
)
extracted_information

In [None]:
# Score the extraction held in `extracted_information` against the reference
# annotation of the first test email and print it as a percentage.
sample_accuracy = compute_extracted_information_accuracy(
    extracted_info=extracted_information,
    expected_info=test_set[0]["extracted_information"],
)
print(f"Sample email information extraction accuracy: {sample_accuracy * 100:.2f}%")

In [None]:
# Score the schema-prompted LLM extractor on every test email, then report
# the average.
test_accuracies = evaluate_extraction(
    extract_fn=partial(extract_information_with_llm, system_prompt=system_prompt),
    dataset=test_set,
)

mean_test_accuracy = float(np.mean(test_accuracies))
print(
    "Mean email information extraction test accuracy: "
    f"{mean_test_accuracy * 100:.2f}%"
)

## Third approach - LLM with JSON schema and few-shot prompting

In [None]:
# Score the schema-only prompt on the training set.
# NOTE(review): `train_accuracies` is reassigned to an empty list in the next
# cell before these values are read, so this result is currently unused.
train_accuracies = evaluate_extraction(
    partial(extract_information_with_llm, system_prompt=system_prompt), train_set
)

In [None]:
# Search for the best single few-shot example: each training email in turn is
# shown in the system prompt, and that prompt is scored on the remaining
# training emails. Collects (mean accuracy, prompt) pairs.
train_accuracies = []

# `enumerate` replaces `range(train_set)`, which raises TypeError because
# `train_set` is a list, not an integer.
for i, example in enumerate(train_set):
    # Never evaluate on the email whose annotation is shown in the prompt.
    train_set_without_example = train_set[:i] + train_set[i + 1 :]
    # NOTE(review): the example is rendered with the model's default string
    # form, not as JSON, and without its raw email — confirm this is intended.
    system_prompt_with_example = f"""You are a helpful assistant that extract information from a user provided email in JSON format that adheres to the following schema:

{json.dumps(email_json_schema, indent=4)}

Use the following example as reference:
{example["extracted_information"]}
"""
    accuracies = evaluate_extraction(
        partial(extract_information_with_llm, system_prompt=system_prompt_with_example),
        train_set_without_example,
    )
    mean_accuracy = np.mean(accuracies).item()
    train_accuracies.append((mean_accuracy, system_prompt_with_example))

In [None]:
# Keep the prompt whose example gave the HIGHEST mean training accuracy.
# (`argmin` would select the worst-performing example instead.)
best_index = np.argmax([x[0] for x in train_accuracies])
best_system_prompt_with_example = train_accuracies[best_index][1]
print(best_system_prompt_with_example)

In [None]:
# Score the LLM extractor with the selected few-shot prompt on every test
# email, then report the average.
test_accuracies = evaluate_extraction(
    extract_fn=partial(
        extract_information_with_llm,
        system_prompt=best_system_prompt_with_example,
    ),
    dataset=test_set,
)

mean_test_accuracy = float(np.mean(test_accuracies))
print(
    "Mean email information extraction test accuracy: "
    f"{mean_test_accuracy * 100:.2f}%"
)

# Conclusion

In this post, we have ...