In [1]:
import pdfplumber
import re

In [2]:
# Open calendario.pdf; the resulting `pdf` handle is reused by the cells below
# (character survey, header search on the first page, per-page extraction).
# NOTE(review): the handle is never closed in the visible cells — consider
# calling pdf.close() once extraction is finished.
pdf = pdfplumber.open("calendario.pdf")

In [3]:
# Show the distinct font names, then the distinct font sizes, found among all
# characters of the document.
print({glyph["fontname"] for glyph in pdf.chars})
print({glyph["size"] for glyph in pdf.chars})

{'Helvetica-Bold', 'Helvetica'}
{8.0, 9.0, 10.0, 11.999999999999986, 12.000000000000007, 12.0, 11.999999999999943, 12.000000000000028, 20.0, 24.0}


In [4]:
# Locate each expected column header on the first page and print its bounding
# box. These coordinates appear to be the basis for the numeric thresholds used
# by the filter functions further down.
headers = ["Gara N", "Squadra A", "Squadra B", "Giorno", "Data", "Ora"]
coordinates = ["x0", "top", "x1", "bottom"]
first_page = pdf.pages[0]

for header in headers:
    print(f"=== {header} ===")
    matches = first_page.search(header)
    if not matches:
        # search() yields no matches when the text is absent from the page;
        # report that instead of failing with IndexError on [0].
        print("not found")
        print("\n")
        continue
    # Only the first match is shown.
    # NOTE(review): the first match is not necessarily the table-header cell —
    # compare its "top" value with the other headers to confirm.
    search = matches[0]
    for coordinate in coordinates:
        print(f"{coordinate}: {search[coordinate]}")
    print("\n")

=== Gara N ===
x0: 36.0
top: 171.48400000000004
x1: 75.348
bottom: 183.48400000000004


=== Squadra A ===
x0: 88.7
top: 171.48400000000004
x1: 145.4
bottom: 183.48400000000004


=== Squadra B ===
x0: 246.8
top: 171.48400000000004
x1: 303.50000000000006
bottom: 183.48400000000004


=== Giorno ===
x0: 404.9
top: 171.48400000000004
x1: 440.912
bottom: 183.48400000000004


=== Data ===
x0: 496.74
top: 72.65599999999995
x1: 514.076
bottom: 80.65599999999995


=== Ora ===
x0: 499.42
top: 84.65599999999995
x1: 513.2040000000001
bottom: 92.65599999999995




In [6]:
def filter_exclude_header(obj):
    """Return True when ``obj`` lies below the header cut-off.

    An object passes when its ``"bottom"`` coordinate is strictly greater
    than 184.
    """
    header_limit = 184
    return obj["bottom"] > header_limit


def filter_bold_chars(obj):
    """Return True when ``obj`` is drawn in a bold font.

    The check is a case-insensitive search for ``"bold"`` in the object's
    ``"fontname"``. Objects that carry no font name (a page filter is applied
    to every object kind, not only characters) are treated as not bold instead
    of raising ``KeyError``.
    """
    # Missing or empty font name -> empty string -> not bold.
    fontname = obj.get("fontname") or ""
    return "bold" in fontname.lower()


# def num_gara(obj):
#     return obj["x0"] < 88


def filter_squadra_a(obj):
    """Select bold objects below the header inside the first team column.

    An object passes when it clears ``filter_exclude_header``, is bold per
    ``filter_bold_chars``, starts right of x=88 and ends left of x=246.
    """
    if not filter_exclude_header(obj):
        return False
    if not filter_bold_chars(obj):
        return False
    if not obj["x0"] > 88:
        return False
    return obj["x1"] < 246


def filter_squadra_b(obj):
    """Select bold objects below the header inside the second team column.

    An object passes when it clears ``filter_exclude_header``, is bold per
    ``filter_bold_chars``, starts right of x=246 and ends left of x=404.
    """
    if not filter_exclude_header(obj):
        return False
    if not filter_bold_chars(obj):
        return False
    if not obj["x0"] > 246:
        return False
    return obj["x1"] < 404


def filter_date_time(obj):
    """Select bold objects below the header whose left edge is right of x=440."""
    if not filter_exclude_header(obj):
        return False
    if not filter_bold_chars(obj):
        return False
    return obj["x0"] > 440


def filter_address(obj):
    """Select non-bold objects located below the header cut-off."""
    if not filter_exclude_header(obj):
        return False
    return not filter_bold_chars(obj)

In [7]:
def postprocess_teams(text: str) -> list[str]:
    """Turn the raw text of a team column into a list of cleaned names.

    Steps, in order:
    1. blank out every line containing "Giornata di" (case-insensitive);
    2. split the text into chunks on lines that hold only spaces;
    3. strip each chunk, collapse runs of whitespace, and drop empty chunks;
    4. drop chunks that consist solely of digits.
    """
    without_round_titles = re.sub(
        r"^.*Giornata di.*$", "", text, flags=re.MULTILINE | re.IGNORECASE
    )

    cleaned = []
    for chunk in re.split(r"^ *\n", without_round_titles, flags=re.MULTILINE):
        stripped = chunk.strip()
        if not stripped:
            continue
        # Runs of 3+ whitespace characters become one space, then any
        # remaining double space is reduced as well.
        name = re.sub(r"\s{3,}", " ", stripped).replace("  ", " ")
        # Nothing left once the digits are removed -> purely numeric, skip it.
        if re.sub(r"\d+", "", name):
            cleaned.append(name)

    return cleaned


def postprocess_date_time(text: str) -> list[list[str]]:
    """Split the raw date/time column text into one list of fields per entry.

    The text is cut into chunks on lines that contain only spaces; each
    non-empty chunk is stripped and split on runs of spaces.

    Returns:
        One list of strings per chunk. The number of fields depends on how
        many space-separated tokens the chunk holds, so it is not guaranteed
        to be exactly two.
    """
    chunks = re.split(r"^ *\n", text, flags=re.MULTILINE)
    return [re.split(r" +", chunk.strip()) for chunk in chunks if chunk.strip()]


def postprocess_address(text: str) -> list[list[str]]:
    """Split the raw non-bold text into per-entry ``[before, after]`` parts.

    The text is cut into chunks on lines that contain only spaces. Each
    non-empty chunk is stripped and has its whitespace runs collapsed. Chunks
    containing "note" (case-insensitive) are discarded, and the rest are split
    once on the first hyphen, with surrounding spaces removed.

    Returns:
        One list per kept chunk: two strings when the chunk contains a
        hyphen, a single string otherwise.
    """
    chunks = re.split(r"^ *\n", text, flags=re.MULTILINE)
    normalized = [
        re.sub(r"\s{3,}", " ", chunk.strip()).replace("  ", " ")
        for chunk in chunks
        if chunk.strip()
    ]
    return [
        # maxsplit=1: only the first hyphen separates the two parts.
        re.split(r" *- *", entry, maxsplit=1)
        for entry in normalized
        if "note" not in entry.lower()
    ]

In [8]:
# Object filter to apply for each extracted column.
filters_func = dict(
    squadra_a=filter_squadra_a,
    squadra_b=filter_squadra_b,
    date_time=filter_date_time,
    address=filter_address,
)

# Text post-processor to apply for each extracted column (same keys as above).
postprocess_func = dict(
    squadra_a=postprocess_teams,
    squadra_b=postprocess_teams,
    date_time=postprocess_date_time,
    address=postprocess_address,
)

In [9]:
# Per-column accumulator for the extracted text; every column starts empty.
result = dict.fromkeys(("squadra_a", "squadra_b", "date_time", "address"), "")

In [10]:
# Reset every column to empty text first, so that re-running this cell does not
# append the same pages a second time (and so it also works after the
# post-processing cell has replaced the strings with lists).
result = dict.fromkeys(result, "")

# For each page, extract the text of every column through its object filter and
# append it, newline-separated, to that column's accumulated text.
for page in pdf.pages:
    for k in result:
        # Pass the filter function directly; no lambda closing over the loop
        # variable is needed.
        filter_page = page.filter(filters_func[k])
        text = filter_page.extract_text(layout=True, use_text_flow=False)
        result[k] += f"\n{text}"


In [11]:
# Replace each column's accumulated raw text with its parsed list of entries.
for k, v in result.items():
    # Only raw text is parsed. On a re-run the values are already lists, and
    # passing those to the regex-based post-processors would raise TypeError.
    if isinstance(v, str):
        result[k] = postprocess_func[k](v)

In [12]:
# Print the number of parsed entries per column, one "name: count" line each.
for k in result:
    print(f"{k}: {len(result[k])}")

squadra_a: 240
squadra_b: 240
date_time: 240
address: 240
