In this post we will see how to use a local LLM to extract structured information from emails.

My very first project when I started working at appliedAI Initiative in 2021 involved extracting information from emails for a company that makes a document management system. Back then, LLMs were not yet as widespread or as useful as they are now, so we decided to train a model from scratch. However, we didn't have any labelled training data: we couldn't use the company's customer data for privacy reasons, so we had to resort to manually labelling emails from the [Enron email dataset](https://www.cs.cmu.edu/~enron/). In the end, the results were not very impressive.

Now, this type of application is simpler than ever and I want to demonstrate that in this blog post.

# Imports

In [None]:
import json
import os
import random
import shutil
import tarfile
import tempfile
from email.message import EmailMessage
from email.parser import Parser
from email.policy import default
from pathlib import Path
from typing import Any

import dspy
import numpy as np
import requests
from deepdiff import DeepDiff
from llama_cpp import Llama
from tqdm.notebook import tqdm

# Seed Python's `random` module so the train/test split drawn further down is repeatable.
random.seed(16)

# Helper functions

In [None]:
def download(
    url: str,
    filename: str | os.PathLike,
    chunk_size: int = 1024,
    timeout: float = 30.0,
) -> None:
    """Stream ``url`` to ``filename`` while showing a progress bar.

    The payload is first written to a sibling ``<name>.part`` file and only
    renamed to ``filename`` once the whole body has been written, so an
    interrupted transfer never leaves a truncated file under the final name.

    Args:
        url: Address of the resource to fetch.
        filename: Destination path of the downloaded file.
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    filename = Path(filename)
    partial = filename.with_name(filename.name + ".part")
    # The context manager releases the connection even when writing fails.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        # A missing Content-Length header leaves the bar without a known total.
        total = int(response.headers.get("content-length", 0))
        with partial.open("wb") as file, tqdm(
            desc=filename.name,
            total=total,
            unit="iB",
            unit_scale=True,
            unit_divisor=1024,
        ) as bar:
            for data in response.iter_content(chunk_size=chunk_size):
                size = file.write(data)
                bar.update(size)
    # Publish the file under its final name only after a complete download.
    partial.replace(filename)

In [None]:
def convert_email_to_dict(email: EmailMessage) -> dict[str, Any]:
    """Build the reference dictionary for an email from its headers.

    Args:
        email: Parsed message whose ``Subject``, ``From``, ``To``, ``Cc`` and
            ``Bcc`` headers (plus the matching ``X-From``/``X-To``/``X-cc``/
            ``X-bcc`` display-name headers) are read.

    Returns:
        A dictionary with the keys ``subject`` (``None`` when the header is
        absent), ``sender`` (``email`` and, when known, ``name``) and
        ``recipients`` (a list sorted by address; each entry has ``type``,
        ``email`` and, when known, ``name``).
    """
    subject = email["subject"]
    email_dict: dict[str, Any] = {
        # Header values can be ``str`` subclasses; store a plain ``str`` so the
        # value has the same type as a string decoded from JSON.
        "subject": None if subject is None else str(subject),
    }

    sender_email = (email["from"] or "").strip()
    sender_name = (email["X-from"] or "").strip()
    sender = {"email": sender_email}
    # Compare the stripped values so stray whitespace cannot make an address
    # look like a distinct display name.
    if sender_name and sender_name != sender_email:
        sender["name"] = sender_name
    email_dict["sender"] = sender

    recipients = []
    for type_ in ("to", "cc", "bcc"):
        recipient_emails = [
            part.strip() for part in email.get(type_, "").split(",")
        ]
        recipient_names = [
            part.strip() for part in email.get(f"X-{type_}", "").split(",")
        ]
        # Names can only be paired with addresses when both lists line up
        # (display names may themselves contain commas); otherwise drop them.
        if len(recipient_emails) != len(recipient_names):
            recipient_names = [""] * len(recipient_emails)
        for recipient_name, recipient_email in zip(recipient_names, recipient_emails):
            # Splitting an absent or empty header yields one empty string;
            # that is not a recipient.
            if not recipient_email:
                continue
            recipient = {"type": type_, "email": recipient_email}
            if recipient_name and recipient_name != recipient_email:
                recipient["name"] = recipient_name
            recipients.append(recipient)

    email_dict["recipients"] = sorted(recipients, key=lambda x: x["email"])

    return email_dict

In [None]:
def compute_extracted_information_accuracy(
    extracted_info,
    expected_info: dict,
) -> float:
    """Score how closely ``extracted_info`` matches ``expected_info``.

    Both structures are compared with ``DeepDiff`` (ignoring list order) and
    the score is one minus the reported ``deep_distance``.

    Args:
        extracted_info: Structure produced by the model.
        expected_info: Reference structure to compare against.

    Returns:
        ``1 - deep_distance`` of the comparison.
    """
    # Sender fields that are left out of the comparison.
    ignored_sender_paths = [
        "root['sender']['phone_number']",
        "root['sender']['role']",
    ]
    # The same fields for every entry of the recipients list.
    ignored_recipient_patterns = [
        r"root\['recipients'\]\[\d+\]\['phone_number'\]",
        r"root\['recipients'\]\[\d+\]\['role'\]",
    ]
    comparison = DeepDiff(
        extracted_info,
        expected_info,
        ignore_order=True,
        verbose_level=2,
        get_deep_distance=True,
        exclude_paths=ignored_sender_paths,
        exclude_regex_paths=ignored_recipient_patterns,
    )
    distance = comparison["deep_distance"]
    return 1 - distance

# Constants

In [None]:
# Number of emails taken from the archive (one per mailbox owner).
N_TOTAL_EMAILS = 100
# Number of those emails drawn for the training set.
N_TRAIN_EMAILS = 70

# Data

As in my first project, we will use emails from the [Enron dataset](https://www.cs.cmu.edu/~enron/) as our data.

In [None]:
# Locations of the cached archive and of the directory the emails are unpacked into.
dataset_url = "https://www.cs.cmu.edu/~enron/enron_mail_20150507.tar.gz"
dataset_dir = Path(tempfile.gettempdir()) / "llm_information_extraction"
dataset_dir.mkdir(exist_ok=True)
dataset_tar_file = dataset_dir / "enron_mail_20150507.tar.gz"
dataset_extracted_dir = dataset_dir / "enron_emails"

# Re-use the archive when it is already present.
if not dataset_tar_file.is_file():
    download(dataset_url, dataset_tar_file)

# Start from an empty directory so files from a previous run are not mixed in.
shutil.rmtree(dataset_extracted_dir, ignore_errors=True)
dataset_extracted_dir.mkdir(exist_ok=True)

# The archive comes from the network: where the interpreter supports tarfile
# extraction filters, use the "data" filter so members cannot be written
# outside the target directory. Older interpreters fall back to the default.
extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}

with tarfile.open(dataset_tar_file, "r:gz") as tar:
    # Keep a single email per person, identified by the second path component.
    already_visited_person = set()
    for tarinfo in tqdm(tar, desc="Tar archive files"):
        if len(already_visited_person) >= N_TOTAL_EMAILS:
            break
        if not tarinfo.isfile():
            continue
        if "inbox" not in tarinfo.name:
            continue
        person_name = tarinfo.name.split("/")[1]
        if person_name in already_visited_person:
            continue
        already_visited_person.add(person_name)
        tar.extract(tarinfo, dataset_extracted_dir, **extract_kwargs)

In [None]:
# `rglob` yields files in file-system order; sort them so the seeded split
# below selects the same emails on every machine.
email_files = sorted(x for x in dataset_extracted_dir.rglob("*") if x.is_file())

email_parser = Parser(policy=default)
# Each entry pairs the parsed message with its reference dictionary.
parsed_emails: list[tuple[EmailMessage, dict[str, Any]]] = []

for email_file in email_files:
    with email_file.open() as f:
        parsed_email = email_parser.parse(f)
    parsed_email_dict = convert_email_to_dict(parsed_email)
    parsed_emails.append((parsed_email, parsed_email_dict))

# Never index past the emails that were actually parsed.
n_emails = min(N_TOTAL_EMAILS, len(parsed_emails))
# `random.sample` draws without replacement, so the training set holds
# distinct emails and the remaining ones form the test set.
train_set_indices = random.sample(range(n_emails), k=min(N_TRAIN_EMAILS, n_emails))
train_index_lookup = set(train_set_indices)
test_set_indices = [i for i in range(n_emails) if i not in train_index_lookup]
train_set = [parsed_emails[i] for i in train_set_indices]
test_set = [parsed_emails[i] for i in test_set_indices]

In [None]:
# Take the first training pair as a running example and print the raw message.
sample_email, sample_email_dict = train_set[0]
print(sample_email.as_string())

In [None]:
# Reference dictionary built from the sample email's headers.
sample_email_dict

# LLM

In [None]:
# Load the 8-bit quantised Llama 3.2 1B Instruct GGUF file with llama-cpp-python.
# NOTE(review): `from_pretrained` presumably downloads the file from the
# Hugging Face Hub on first use — confirm.
llm = Llama.from_pretrained(
    "bartowski/Llama-3.2-1B-Instruct-GGUF",
    filename="Llama-3.2-1B-Instruct-Q8_0.gguf",
    n_ctx=16384,  # presumably the context window in tokens — confirm
    verbose=False,
)

In [None]:
# Response format handed to the model: the reply must be a JSON object with a
# subject, a sender and a list of recipients.
email_json_schema = {
    "type": "json_object",
    "schema": {
        "type": "object",
        "properties": {
            "subject": {"type": "string"},
            "sender": {
                "type": "object",
                "properties": {
                    "email": {"type": "string"},
                    "name": {"type": "string"},
                    "phone_number": {"type": "string"},
                    "role": {"type": "string"},
                },
                "required": ["email"],
            },
            "recipients": {
                "type": "array",
                # `items` must be a single schema object to describe every
                # element of the array. A list-valued `items` is the
                # positional (tuple) form and only describes the entries at
                # the listed positions.
                "items": {
                    "type": "object",
                    "properties": {
                        "type": {
                            "type": "string",
                            "enum": ["to", "cc", "bcc"],
                        },
                        "email": {"type": "string"},
                        "name": {"type": "string"},
                        "phone_number": {"type": "string"},
                        "role": {"type": "string"},
                    },
                    "required": [
                        "email",
                        "type",
                    ],
                },
            },
        },
        "required": ["subject", "sender", "recipients"],
    },
}

In [None]:
# System message sent with every extraction request to the model.
system_prompt = "You are a helpful assistant that outputs in JSON information extracted from an email provided by the user."

In [None]:
# Try the extraction on the sample email: the system prompt states the task,
# the raw email text is the user message and `response_format` carries the
# JSON schema.
output = llm.create_chat_completion(
    messages=[
        {
            "role": "system",
            "content": system_prompt,
        },
        {"role": "user", "content": sample_email.as_string()},
    ],
    response_format=email_json_schema,
    temperature=0.3,
)
# Decode the text of the first choice as JSON.
extracted_information = json.loads(output["choices"][0]["message"]["content"])
extracted_information

In [None]:
# Score the sample extraction against the reference dictionary.
compute_extracted_information_accuracy(extracted_information, sample_email_dict)

In [None]:
# Baseline: run the plain prompt on every test email and average the scores.
accuracies = []

for parsed_email, parsed_email_dict in tqdm(test_set, desc="Emails"):
    output = llm.create_chat_completion(
        messages=[
            {
                "role": "system",
                "content": system_prompt,
            },
            {"role": "user", "content": parsed_email.as_string()},
        ],
        response_format=email_json_schema,
        temperature=0.3,
    )
    try:
        extracted_information = json.loads(
            output["choices"][0]["message"]["content"]
        )
    except json.JSONDecodeError:
        # A reply that is not valid JSON (for example one cut off mid-object)
        # counts as a complete miss instead of aborting the whole evaluation.
        accuracy = 0.0
    else:
        accuracy = compute_extracted_information_accuracy(
            extracted_information, parsed_email_dict
        )
    accuracies.append(accuracy)

mean_accuracy = np.mean(accuracies).item()
print(f"Mean email information extraction accuracy: {mean_accuracy * 100:.2f}%")

# Prompt Optimization

As we have seen so far, the results are good but not great.

We could improve them through prompt engineering, but that is a tedious, manual process.

We can instead use an optimizer to find a better prompt for us.

In [None]:
# Wrap the already loaded llama.cpp model (`llm`) in a dspy language-model
# client, using the same temperature as the baseline run.
lm = dspy.LlamaCpp(
    model="llama",
    llama_model=llm,
    model_type="chat",
    temperature=0.3,
    max_tokens=4096,
)
# Register it as the language model used by dspy modules.
dspy.configure(lm=lm)

In [None]:
# dspy signature for the extraction task: one input field (the raw email) and
# three output fields mirroring the keys of the reference dictionaries.
# NOTE(review): dspy appears to use a Signature's docstring as the task
# instructions, so this description is deliberately a comment — confirm
# before turning it into a docstring.
class EmailExtraction(dspy.Signature):
    # Input: the full text of the email.
    email: str = dspy.InputField(desc="Raw email content")
    # Outputs: subject line, sender details and the list of recipients.
    subject: str = dspy.OutputField(desc="Email subject")
    sender: dict[str, str] = dspy.OutputField(
        desc="Email sender's name, email address, phone number and role as a dictionary with keys 'email', 'name', 'phone_number', 'role'",
        examples=[{"name": "John Smith", "email": "john.smith@enron.com"}],
    )
    recipients: list[dict[str, str]] = dspy.OutputField(
        desc="Email recipients' name, email address, phone number, role and type (to, cc, bcc) as a list of dictionaries with keys 'type', 'email', 'name', 'phone_number', 'role'",
        examples=[
            [{"name": "John Smith", "email": "john.smith@enron.com", "type": "to"}]
        ],
    )

In [None]:
# Build the extraction program as a chain-of-thought module over the signature.
email_extractor = dspy.ChainOfThought(EmailExtraction)
email_extractor

In [None]:
def extraction_correctness_metric(
    example: dspy.Example, prediction: dspy.Prediction, trace=None
) -> float:
    """Score a prediction against its reference example.

    Only the ``subject``, ``sender`` and ``recipients`` fields of both objects
    are compared.

    Args:
        example: Reference example holding the expected field values.
        prediction: Program output holding the predicted field values.
        trace: Not used by this metric.

    Returns:
        The accuracy computed by ``compute_extracted_information_accuracy``.
    """
    compared_fields = ("subject", "sender", "recipients")
    expected = {field: example[field] for field in compared_fields}
    predicted = {field: prediction[field] for field in compared_fields}
    return compute_extracted_information_accuracy(predicted, expected)

In [None]:
# Turn every (message, reference dict) pair into a dspy example whose only
# input is the raw email text; the reference fields become the labels.
train_set_examples = [
    dspy.Example(email=message.as_string(), **reference).with_inputs("email")
    for message, reference in train_set
]

test_set_examples = [
    dspy.Example(email=message.as_string(), **reference).with_inputs("email")
    for message, reference in test_set
]

In [None]:
# Smoke test: run the unoptimised extractor on the second test email.
response = email_extractor(email=test_set_examples[1].email)
response

In [None]:
# Evaluator that scores a program on the test examples with
# `extraction_correctness_metric`, using a single thread.
evaluate_correctness = dspy.Evaluate(
    devset=test_set_examples,
    metric=extraction_correctness_metric,
    num_threads=1,
    display_progress=True,
    display_table=True,
)

In [None]:
# Score the unoptimised extractor on the test examples.
evaluate_correctness(email_extractor)

In [None]:
# Show the latest entry (n=1) of dspy's call history — presumably the last
# prompt/response pair of the evaluation above; confirm against the dspy docs.
dspy.inspect_history(n=1)

In [None]:
# Prompt optimizer driven by the same metric used for evaluation.
# NOTE(review): `auto="medium"` presumably selects a preset search budget — confirm.
mipro_optimizer = dspy.MIPROv2(
    metric=extraction_correctness_metric,
    auto="medium",
)

In [None]:
# Optimise the extractor on the training examples only; the test examples stay
# held out for the evaluation below.
# NOTE(review): `requires_permission_to_run=False` presumably skips an
# interactive confirmation and `minibatch=False` presumably evaluates each
# candidate on the full set — confirm against the dspy docs.
optimized_email_extractor = mipro_optimizer.compile(
    email_extractor,
    trainset=train_set_examples,
    max_bootstrapped_demos=3,
    requires_permission_to_run=False,
    minibatch=False,
)

In [None]:
# Score the optimised extractor on the same test examples for comparison.
evaluate_correctness(optimized_email_extractor)

In [None]:
# Show the latest entry (n=1) of dspy's call history — presumably a prompt
# produced by the optimised program; confirm against the dspy docs.
dspy.inspect_history(n=1)

# Conclusion

In this post, we have seen how to use a local LLM, using llama-cpp-python, to extract information from the raw content of emails and how to automatically improve the prompt by using dspy.