In this post we will see how to use a local LLM to extract structured information from emails.

My very first project when I started working at appliedAI Initiative in 2021 involved information extraction from emails for a company that makes a document management system. Back then, LLMs were not yet as widespread or as useful as they are now, so we decided to train a model from scratch. However, we didn't have any labelled data for training, because we couldn't use the company's customer data for privacy reasons. We had to resort to manually labelling emails from the [Enron email dataset](https://www.cs.cmu.edu/~enron/), and in the end the results were not very impressive.

Now, this type of application is simpler than ever and I want to demonstrate that in this blog post.

# Imports

In [None]:
import json
import os
import random
import shutil
import tarfile
import tempfile
from email.message import EmailMessage
from email.parser import Parser
from email.policy import default
from itertools import combinations
from pathlib import Path
from typing import Any

import numpy as np
import requests
from deepdiff import DeepDiff
from llama_cpp import Llama
from tqdm.notebook import tqdm

# Seed Python's global `random` module so the random train/test split below is repeatable.
random.seed(16)

# Helper functions

In [None]:
def download(
    url: str,
    filename: str | os.PathLike,
    chunk_size: int = 1024,
    timeout: float | None = 60,
) -> None:
    """Stream ``url`` to ``filename`` while showing a progress bar.

    The payload is first written to a sibling ``<name>.part`` file and only
    renamed to ``filename`` after the whole body has been written, so an
    interrupted download never leaves a truncated file under the final name.

    Args:
        url: Address of the resource to fetch.
        filename: Destination path of the downloaded file.
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: Forwarded to ``requests.get``; ``None`` disables the timeout.

    Raises:
        requests.HTTPError: If ``raise_for_status`` rejects the response.
    """
    filename = Path(filename)
    partial_file = filename.with_name(filename.name + ".part")

    # Using the response as a context manager releases the connection even
    # when writing to disk fails part-way through.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        # A missing content-length header yields total=0.
        total = int(response.headers.get("content-length", 0))
        with partial_file.open("wb") as file, tqdm(
            desc=filename.name,
            total=total,
            unit="iB",
            unit_scale=True,
            unit_divisor=1024,
        ) as bar:
            for data in response.iter_content(chunk_size=chunk_size):
                size = file.write(data)
                bar.update(size)

    # Only publish the file under its final name once it is complete.
    partial_file.replace(filename)

In [None]:
def convert_email_to_dict(email: EmailMessage) -> dict[str, Any]:
    """Build the reference dictionary for an email from its headers.

    Args:
        email: Parsed message whose ``Subject``, ``From``/``X-From`` and
            ``To``/``Cc``/``Bcc`` (plus ``X-To``/``X-cc``/``X-bcc``) headers
            are read.

    Returns:
        A dictionary with the keys ``subject``, ``sender`` (``email`` and,
        when available, ``name``) and ``recipients`` (a list sorted by email
        address; each entry has ``type``, ``email`` and optionally ``name``).
        Header types that are absent or empty contribute no recipients.
    """
    subject = email["subject"]
    # Store a built-in ``str`` (or ``None`` when the header is absent) so the
    # value has the same type as a string decoded from JSON; header objects
    # returned by the parser may be ``str`` subclasses — verify.
    email_dict: dict[str, Any] = {
        "subject": None if subject is None else str(subject)
    }

    # ``get`` with a default avoids calling ``.strip()`` on ``None`` when the
    # message has no ``From`` header.
    sender_email = str(email.get("from", "")).strip()
    sender_name = str(email.get("X-from", "")).strip()
    sender: dict[str, str] = {"email": sender_email}
    if sender_name and sender_name != sender_email:
        sender["name"] = sender_name
    email_dict["sender"] = sender

    recipients: list[dict[str, str]] = []
    for type_ in ("to", "cc", "bcc"):
        recipient_emails = [
            part.strip() for part in str(email.get(type_, "")).split(",")
        ]
        recipient_names = [
            part.strip() for part in str(email.get(f"X-{type_}", "")).split(",")
        ]
        # Names and addresses can only be paired when both lists line up;
        # otherwise the names are dropped.
        if len(recipient_emails) != len(recipient_names):
            recipient_names = [""] * len(recipient_emails)
        for recipient_name, recipient_email in zip(recipient_names, recipient_emails):
            # Splitting an empty header yields [""]; skip such placeholders
            # instead of recording a recipient without an address.
            if not recipient_email:
                continue
            recipient = {"type": type_, "email": recipient_email}
            if recipient_name and recipient_name != recipient_email:
                recipient["name"] = recipient_name
            recipients.append(recipient)

    email_dict["recipients"] = sorted(recipients, key=lambda x: x["email"])

    return email_dict

In [None]:
def compute_extracted_information_accuracy(
    extracted_info: dict,
    expected_info: dict,
) -> float:
    """Score an extraction against its reference as ``1 - deep distance``.

    The ``phone_number`` and ``role`` fields of the sender and of every
    recipient are excluded from the comparison, and list order is ignored.

    Args:
        extracted_info: Dictionary produced by the model.
        expected_info: Reference dictionary to compare against.

    Returns:
        One minus the ``deep_distance`` reported by ``DeepDiff``.
    """
    diff_result = DeepDiff(
        extracted_info,
        expected_info,
        get_deep_distance=True,
        verbose_level=2,
        exclude_paths=["root['sender']['phone_number']", "root['sender']['role']"],
        exclude_regex_paths=[
            r"root\['recipients'\]\[\d+\]\['phone_number'\]",
            r"root\['recipients'\]\[\d+\]\['role'\]",
        ],
        ignore_order=True,
    )
    # NOTE(review): some DeepDiff versions may omit ``deep_distance`` from the
    # result when the inputs are identical — treat a missing key as distance 0
    # instead of raising KeyError. Confirm against the installed version.
    return 1 - diff_result.get("deep_distance", 0)

# Constants

In [None]:
# Number of distinct people whose inbox email is extracted from the archive;
# also the size of the index range used for the train/test split.
N_TOTAL_EMAILS = 30
# Number of indices drawn for the training split.
N_TRAIN_EMAILS = 10

# Data

As in my first project, we will use emails from the [Enron dataset](https://www.cs.cmu.edu/~enron/) as our data.

In [None]:
# Locations of the downloaded archive and of the extracted subset, all kept
# under the system temporary directory.
dataset_url = "https://www.cs.cmu.edu/~enron/enron_mail_20150507.tar.gz"
dataset_dir = Path(tempfile.gettempdir()) / "llm_information_extraction"
dataset_dir.mkdir(exist_ok=True)
dataset_tar_file = dataset_dir / "enron_mail_20150507.tar.gz"
dataset_extracted_dir = dataset_dir / "enron_emails"

# Reuse a previously downloaded archive instead of fetching it again.
if not dataset_tar_file.is_file():
    download(dataset_url, dataset_tar_file)

# Always start from an empty extraction directory.
shutil.rmtree(dataset_extracted_dir, ignore_errors=True)
dataset_extracted_dir.mkdir(exist_ok=True)

with tarfile.open(dataset_tar_file, "r:gz") as tar:
    # The archive comes from the network: use the "data" extraction filter
    # where this Python version provides it, so members cannot be written
    # outside the destination directory.
    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
    already_visited_person = set()
    for tarinfo in tqdm(tar, desc="Tar archive files"):
        # Stop once one email per person has been collected for enough people.
        if len(already_visited_person) == N_TOTAL_EMAILS:
            break
        if not tarinfo.isfile():
            continue
        if "inbox" not in tarinfo.name:
            continue
        # The second path component is used as the person's identifier.
        person_name = tarinfo.name.split("/")[1]
        if person_name in already_visited_person:
            continue
        already_visited_person.add(person_name)
        tar.extract(tarinfo, dataset_extracted_dir, **extract_kwargs)

In [None]:
# Sort the paths so the seeded split below does not depend on the order in
# which the filesystem happens to list the files.
email_files = sorted(x for x in dataset_extracted_dir.rglob("*") if x.is_file())

email_parser = Parser(policy=default)
# Each entry pairs the parsed message with its reference dictionary.
parsed_emails: list[tuple[EmailMessage, dict[str, Any]]] = []

for email_file in email_files:
    with email_file.open() as f:
        parsed_email = email_parser.parse(f)
    parsed_email_dict = convert_email_to_dict(parsed_email)
    parsed_emails.append((parsed_email, parsed_email_dict))

# Index only the emails that were actually parsed, and sample WITHOUT
# replacement so the training set holds distinct emails (random.choices may
# return the same index several times).
n_parsed_emails = len(parsed_emails)
train_set_indices = random.sample(
    range(n_parsed_emails), k=min(N_TRAIN_EMAILS, n_parsed_emails)
)
test_set_indices = sorted(set(range(n_parsed_emails)).difference(train_set_indices))
train_set = [parsed_emails[i] for i in train_set_indices]
test_set = [parsed_emails[i] for i in test_set_indices]

In [None]:
# Take the first training pair as a running example and show its raw text.
first_training_pair = train_set[0]
sample_email = first_training_pair[0]
sample_email_dict = first_training_pair[1]
print(sample_email.as_string())

In [None]:
# Display the reference dictionary paired with the sample email.
sample_email_dict

# LLM

In [None]:
# Load the Q8_0-quantized Llama 3.2 1B Instruct GGUF model through
# llama-cpp-python with a 16384-token context window and quiet logging.
llm = Llama.from_pretrained(
    repo_id="bartowski/Llama-3.2-1B-Instruct-GGUF",
    filename="Llama-3.2-1B-Instruct-Q8_0.gguf",
    verbose=False,
    n_ctx=16384,
)

In [None]:
# Response format passed to the chat completion call: the outer mapping is the
# ``json_object`` envelope and ``schema`` holds the JSON Schema of the answer.
email_json_schema = {
    "type": "json_object",
    "schema": {
        "type": "object",
        "properties": {
            "subject": {"type": "string"},
            "sender": {
                "type": "object",
                "properties": {
                    "email": {"type": "string"},
                    "name": {"type": "string"},
                    "phone_number": {"type": "string"},
                    "role": {"type": "string"},
                },
                "required": ["email"],
            },
            "recipients": {
                "type": "array",
                # ``items`` must be a single schema (not a list of schemas) so
                # that it applies to every element of an array of any length;
                # the list form describes a fixed-position tuple instead.
                "items": {
                    "type": "object",
                    "properties": {
                        "type": {
                            "type": "string",
                            "enum": ["to", "cc", "bcc"],
                        },
                        "email": {"type": "string"},
                        "name": {"type": "string"},
                        "phone_number": {"type": "string"},
                        "role": {"type": "string"},
                    },
                    "required": [
                        "email",
                        "type",
                    ],
                },
            },
        },
        "required": ["subject", "sender", "recipients"],
    },
}

In [None]:
# Only the inner JSON Schema is shown to the model: the outer
# ``{"type": "json_object", "schema": ...}`` mapping is the response-format
# envelope handed to the completion call, not part of the schema itself.
system_prompt = f"""You are a helpful assistant that extract information from a user provided email in JSON format that adheres to the following schema:

{json.dumps(email_json_schema["schema"], indent=4)}
"""

In [None]:
# Run the zero-shot extraction on the sample email and decode the JSON answer.
sample_messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": sample_email.as_string()},
]
output = llm.create_chat_completion(
    messages=sample_messages,
    response_format=email_json_schema,
    temperature=0.3,
)
first_choice = output["choices"][0]
extracted_information = json.loads(first_choice["message"]["content"])
extracted_information

In [None]:
# Compare the model's answer for the sample email with its reference dictionary.
sample_accuracy = compute_extracted_information_accuracy(
    extracted_info=extracted_information,
    expected_info=sample_email_dict,
)
sample_accuracy_percent = sample_accuracy * 100
print(f"Sample email information extraction accuracy: {sample_accuracy_percent:.2f}%")

In [None]:
# Zero-shot baseline: score every test email with the plain system prompt.
test_accuracies = []

for parsed_email, parsed_email_dict in tqdm(test_set, desc="Emails"):
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": parsed_email.as_string()},
    ]
    output = llm.create_chat_completion(
        messages=messages,
        response_format=email_json_schema,
        temperature=0.3,
    )
    answer = output["choices"][0]["message"]["content"]
    extracted_information = json.loads(answer)

    accuracy = compute_extracted_information_accuracy(
        extracted_information, parsed_email_dict
    )
    test_accuracies.append(accuracy)

# Average the per-email scores and report them as a percentage.
mean_test_accuracy = np.mean(test_accuracies).item()
print(
    f"Mean email information extraction test accuracy: {mean_test_accuracy * 100:.2f}%"
)

# Prompt Optimization

As we have seen so far, the results are good but not great.

We can improve by using a few-shot prompt with some examples from our training data.

In [None]:
# Score every training email with the plain system prompt; the per-email
# scores are used afterwards to pick few-shot examples.
train_accuracies = []

for parsed_email, parsed_email_dict in tqdm(train_set, desc="Emails"):
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": parsed_email.as_string()},
    ]
    output = llm.create_chat_completion(
        messages=conversation,
        response_format=email_json_schema,
        temperature=0.3,
    )
    raw_answer = output["choices"][0]["message"]["content"]
    extracted_information = json.loads(raw_answer)

    accuracy = compute_extracted_information_accuracy(
        extracted_information, parsed_email_dict
    )
    train_accuracies.append(accuracy)

# Average the per-email scores and report them as a percentage.
mean_train_accuracy = np.mean(train_accuracies).item()
print(
    f"Mean email information extraction train accuracy: {mean_train_accuracy * 100:.2f}%"
)

In [None]:
# Training example on which the zero-shot prompt scored lowest.
worst_accuracy_index = np.asarray(train_accuracies).argmin()
worst_accuracy_email = train_set[worst_accuracy_index]

In [None]:
# Training example on which the zero-shot prompt scored highest.
best_accuracy_index = np.asarray(train_accuracies).argmax()
best_accuracy_email = train_set[best_accuracy_index]

In [None]:
# Pick one more training example at random, excluding the worst- and
# best-scoring ones that were already selected.
already_selected = [worst_accuracy_index, best_accuracy_index]
indices = list(set(range(len(train_set))).difference(already_selected))
rng = np.random.default_rng(16)
random_index = rng.choice(indices)
random_email = train_set[random_index]

In [None]:
# Every selected training example is an ``(EmailMessage, reference dict)``
# pair, so unpack it: the prompt needs the raw email text and the reference
# extraction, not the tuple itself.
worst_example_email, worst_example_dict = worst_accuracy_email
best_example_email, best_example_dict = best_accuracy_email
random_example_email, random_example_dict = random_email

# Few-shot prompt: the base system prompt followed by three worked examples.
system_prompt_with_examples = (
    system_prompt
    + f"""

Use the following examples as reference:

# Example 1
## Email
{worst_example_email.as_string()}
## Extracted Information
{json.dumps(worst_example_dict, indent=4)}

# Example 2
## Email
{best_example_email.as_string()}
## Extracted Information
{json.dumps(best_example_dict, indent=4)}

# Example 3
## Email
{random_example_email.as_string()}
## Extracted Information
{json.dumps(random_example_dict, indent=4)}
"""
)

In [None]:
# Print the complete few-shot system prompt for inspection.
print(system_prompt_with_examples)

In [None]:
# Few-shot run: score every test email again, this time with the system
# prompt that contains the worked examples.
test_accuracies = []

for parsed_email, parsed_email_dict in tqdm(test_set, desc="Emails"):
    few_shot_messages = [
        {"role": "system", "content": system_prompt_with_examples},
        {"role": "user", "content": parsed_email.as_string()},
    ]
    output = llm.create_chat_completion(
        messages=few_shot_messages,
        response_format=email_json_schema,
        temperature=0.3,
    )
    reply = output["choices"][0]["message"]["content"]
    extracted_information = json.loads(reply)

    accuracy = compute_extracted_information_accuracy(
        extracted_information, parsed_email_dict
    )
    test_accuracies.append(accuracy)

# Average the per-email scores and report them as a percentage.
mean_test_accuracy = np.mean(test_accuracies).item()
print(
    f"Mean email information extraction test accuracy: {mean_test_accuracy * 100:.2f}%"
)

# Conclusion

In this post, we have seen how to use a local LLM, via llama-cpp-python, to extract information from the raw content of emails, and how to use a few-shot prompt with well-chosen examples to improve the results.

However, optimizing the prompt by hand is generally a tedious process.

We could instead use DSPy to automatically optimize the prompt.