### Main

In [None]:
from __future__ import annotations
from dataclasses import dataclass
from typing import List, Dict, Optional
from pathlib import Path
import re, hashlib

import fitz
import pdfplumber

# for table extraction
try:
    import camelot  # optional table extractor (lattice/stream flavors)
except Exception:
    # Any failure while importing camelot disables the camelot code path.
    _has_camelot = False
else:
    _has_camelot = True

# tokenizer
try:
    import tiktoken

    _enc = tiktoken.get_encoding("cl100k_base")

    def tok_len(s: str) -> int:
        """Return the number of cl100k_base tokens that *s* encodes to."""
        return len(_enc.encode(s))

    def tok_split(s: str, n: int) -> List[str]:
        """Cut *s* into consecutive pieces of at most *n* tokens, decoded back to text."""
        token_ids = _enc.encode(s)
        pieces = []
        for start in range(0, len(token_ids), n):
            pieces.append(_enc.decode(token_ids[start:start + n]))
        return pieces
except Exception:
    # tiktoken (or its encoding) is unavailable: fall back to a rough
    # estimate of four characters per token.
    def tok_len(s: str) -> int:
        """Approximate the token count of *s* as len(s) // 4, never less than 1."""
        approx = len(s) // 4
        return approx if approx > 1 else 1

    def tok_split(s: str, n: int) -> List[str]:
        """Cut *s* into consecutive pieces of ``4 * n`` characters each."""
        width = 4 * n
        return [s[lo:lo + width] for lo in range(0, len(s), width)]

def md5(s: str) -> str:
    """Return the hexadecimal MD5 digest of *s* encoded as UTF-8."""
    digest = hashlib.md5()
    digest.update(s.encode("utf-8"))
    return digest.hexdigest()

@dataclass
class ParagraphBlock:
    """One blank-line-delimited chunk of page text, tagged with page and section."""
    # Stripped text of the chunk.
    text: str
    # Page number; extract_paragraphs_with_sections stores 1-based values (pno+1).
    page: int
    # First line of the most recent chunk that matched HEADING_RE;
    # "Front" until any heading has been seen.
    section_path: str

@dataclass
class TableBlock:
    """One extracted table plus the metadata detected from its leading rows."""
    rows: List[List[str]]  # rows[0] is table header
    # Page the table was found on (camelot's t.page, or the 1-based pdfplumber page index).
    page: int
    # Short first cell of a camelot table, or the placeholder "Table".
    title: str
    # First CURRENCY_HINT match found in the leading rows, if any.
    currency: Optional[str] = None
    # First UNIT_HINT match found in the leading rows, if any.
    unit: Optional[str] = None

# Regexes for the first line of a text chunk that starts a new report section.
# Each alternative is anchored with "^"; the list is not exhaustive yet (TODO).
HEADING_PATTERNS = [
    # "Item N" style headings
    r"^Item\s+1A?\b.*",  # Item 1 / Item 1A (Risk Factors)
    r"^Item\s+7\b.*",    # Item 7 (MD&A)
    r"^Item\s+7A\b.*",
    r"^Item\s+8\b.*",    # Item 8 (Financial Statements and Supplementary Data)
    # Free-text headings
    r"^Management.?s Discussion.*",
    r"^Consolidated\s+(Statements?|Balance Sheets?|Cash Flows?).*",
    r"^Notes?\s+to\s+Consolidated\s+Financial\s+Statements.*",
    r"^Risk\s+Factors.*",
]
# Single case-insensitive alternation over all heading patterns.
HEADING_RE = re.compile("|".join(HEADING_PATTERNS), flags=re.IGNORECASE)

# Currency marker found in a table's leading rows.
# Word boundaries are applied only where a word character is adjacent: "$" and
# the trailing "$" of "US$" are non-word characters, so wrapping them in \b
# would reject the common forms "$144,682" and "US$ millions".
CURRENCY_HINT = re.compile(r"(\b(?:USD|HKD|EUR|GBP|RMB|CNY)\b|\bUS\$|\$)")
# Unit marker found in a table's leading rows (case-insensitive).
# The apostrophe in "’000" is a non-word character, so no leading \b is used
# there; the plain ASCII apostrophe is accepted as well.
UNIT_HINT = re.compile(r"(\b(?:thousands|millions|billions)\b|[’']000\b|\b000s\b)", re.I)

def extract_paragraphs_with_sections(pdf_path: str) -> List[ParagraphBlock]:
    """Split every page of a PDF into text chunks tagged with their section.

    Page text is split on blank lines. When the first line of a chunk matches
    ``HEADING_RE`` it becomes the current section for that chunk and all
    following ones, until the next heading. Chunks seen before any heading
    are labelled ``"Front"``.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        One ``ParagraphBlock`` per non-empty chunk, in document order, with
        1-based page numbers.
    """
    blocks: List[ParagraphBlock] = []
    current_section = "Front"
    # The context manager releases the document handle even if text
    # extraction raises part-way through the file.
    with fitz.open(pdf_path) as doc:
        for page_no, page in enumerate(doc, start=1):
            text = page.get_text("text")
            for raw in re.split(r"\n\s*\n", text):
                chunk = raw.strip()
                if not chunk:
                    continue
                # A non-empty chunk always has at least one line.
                head = chunk.splitlines()[0].strip()
                if HEADING_RE.match(head):
                    current_section = head
                blocks.append(
                    ParagraphBlock(text=chunk, page=page_no, section_path=current_section)
                )
    return blocks

# TODO: check the extraction quality
def _scan_hint(pattern, rows, head_rows):
    """Return the first match of *pattern* in the leading rows of a table, else None.

    The first ``head_rows`` rows are flattened, joined with spaces and capped
    at 500 characters before searching.
    """
    meta = " ".join(cell for row in rows[:head_rows] for cell in row)[:500]
    found = pattern.search(meta)
    return found.group(0) if found else None


def extract_tables(pdf_path: str) -> List[TableBlock]:
    """Extract tables from a PDF, preferring camelot and falling back to pdfplumber.

    camelot (lattice flavor, all pages) is tried first when it was imported
    successfully. pdfplumber is used only when camelot produced no tables.
    Extraction is best-effort: backend failures are logged as warnings
    instead of being raised, so the caller always gets a list back.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        A list of ``TableBlock``; empty when no backend found a table.
    """
    import logging  # function-scope import; the module import block stays untouched

    log = logging.getLogger(__name__)
    tbls: List[TableBlock] = []

    if _has_camelot:
        try:
            for t in camelot.read_pdf(pdf_path, flavor="lattice", pages="all"):
                rows = [list(map(str, r)) for r in t.df.values.tolist()]
                if not rows:
                    continue
                # Use the first cell as the title only when it is short and
                # non-empty; otherwise fall back to the generic placeholder.
                first_cell = rows[0][0].strip() if rows[0] else ""
                title = first_cell if first_cell and len(rows[0][0]) < 120 else "Table"
                tbls.append(TableBlock(
                    rows=rows,
                    page=t.page,
                    title=title,
                    currency=_scan_hint(CURRENCY_HINT, rows, 3),
                    unit=_scan_hint(UNIT_HINT, rows, 3),
                ))
        except Exception:
            # Tables collected before the failure are kept.
            log.warning("camelot table extraction failed for %s", pdf_path, exc_info=True)

    if not tbls:
        with pdfplumber.open(pdf_path) as pdf:
            for pno, page in enumerate(pdf.pages, start=1):
                try:
                    for raw in page.extract_tables():
                        # Drop rows with no content and normalise None cells to "".
                        rows = [
                            [(cell or "").strip() for cell in row]
                            for row in raw
                            if any(cell for cell in row)
                        ]
                        if rows:
                            tbls.append(TableBlock(
                                rows=rows,
                                page=pno,
                                title="Table",
                                currency=_scan_hint(CURRENCY_HINT, rows, 2),
                                unit=_scan_hint(UNIT_HINT, rows, 2),
                            ))
                except Exception:
                    log.warning(
                        "pdfplumber table extraction failed on page %d of %s; skipping page",
                        pno, pdf_path, exc_info=True,
                    )
                    continue
    return tbls

def chunk_paragraph(text: str, size: int = 640, overlap: int = 96) -> List[str]:
    """Split *text* into overlapping chunks of at most *size* tokens.

    Token counting and splitting are delegated to ``tok_len`` / ``tok_split``.
    Text that already fits in *size* tokens is returned as a single chunk.
    Otherwise the text is cut into consecutive pieces of
    ``max(1, size - overlap)`` tokens, and every piece after the first is
    prefixed with the tail of the previous chunk. A chunk that still exceeds
    *size* after prefixing is cut back to its first *size* tokens.
    """
    if tok_len(text) <= size:
        return [text]

    stride = max(1, size - overlap)
    chunks = []
    carry = ""
    for idx, piece in enumerate(tok_split(text, stride)):
        window = piece if idx == 0 else carry + piece
        if tok_len(window) > size:
            window = tok_split(window, size)[0]
        chunks.append(window)
        # The tail carried forward is the last segment obtained by splitting
        # this chunk every (token count - overlap) tokens.
        segments = tok_split(window, max(1, tok_len(window) - overlap))
        carry = segments[-1] if segments else ""
    return chunks

def chunk_table_by_rows(rows: List[List[str]], window: int = 4) -> List[List[List[str]]]:
    """Split a table into row windows, repeating the header in each window.

    Args:
        rows: Table rows; ``rows[0]`` is treated as the header.
        window: Maximum number of data rows per chunk; must be >= 1 when the
            table has data rows.

    Returns:
        ``[]`` for an empty table, ``[rows]`` for a header-only table,
        otherwise a list of chunks, each being the header followed by up to
        *window* data rows.

    Raises:
        ValueError: If the table has data rows and *window* is below 1.
    """
    if not rows:
        return []
    header, data = rows[0], rows[1:]
    if not data:
        return [rows]
    # A negative step would make range() empty and silently drop every data
    # row; reject it the same way a zero step is rejected.
    if window < 1:
        raise ValueError(f"window must be a positive integer, got {window!r}")
    return [[header] + data[i:i + window] for i in range(0, len(data), window)]

def build_chunks_for_financial_report(
    pdf_path: str,
    doc_meta: Dict,
    para_size: int = 640,
    para_overlap: int = 96,
    table_window: int = 4
) -> List[Dict]:
    """Turn one PDF into a flat list of ``{"text", "metadata"}`` chunk records.

    Paragraph chunks come first (in document order), followed by table chunks.
    Every record's metadata starts from a copy of *doc_meta* and carries a
    ``doc_id`` (taken from *doc_meta* or, if absent/empty, the MD5 of
    *pdf_path*), a ``type``, the page, a token count and a content hash.

    Args:
        pdf_path: Path of the PDF to process.
        doc_meta: Extra metadata copied into every chunk.
        para_size: Maximum tokens per paragraph chunk.
        para_overlap: Token overlap between consecutive paragraph chunks.
        table_window: Data rows per table chunk.
    """
    doc_id = doc_meta.get("doc_id") or md5(pdf_path)
    records: List[Dict] = []

    # Narrative text: one record per paragraph split.
    for order, para in enumerate(extract_paragraphs_with_sections(pdf_path)):
        pieces = chunk_paragraph(para.text, para_size, para_overlap)
        for inner, piece in enumerate(pieces):
            meta = dict(doc_meta)
            meta.update({
                "doc_id": doc_id,
                "type": "paragraph",
                "section_path": para.section_path,
                "page": para.page,
                "order": order,
                "inner_index": inner,
                "tokens": tok_len(piece),
            })
            meta["hash"] = md5(f"{doc_id}|p|{para.page}|{order}|{inner}|{piece[:80]}")
            records.append({"text": piece, "metadata": meta})

    # Tables: one tab-separated record per row window, header repeated.
    for t_idx, table in enumerate(extract_tables(pdf_path)):
        windows = chunk_table_by_rows(table.rows, window=table_window)
        for k, block in enumerate(windows):
            tsv = "\n".join("\t".join(row) for row in block)
            header_row = block[0]
            meta = dict(doc_meta)
            meta.update({
                "doc_id": doc_id,
                "type": "table",
                "section_path": f"{table.title}",
                "page": table.page,
                "table_id": f"T{t_idx}",
                "table_chunk": k,
                "headers": header_row,
                "currency": table.currency,
                "unit": table.unit,
                "tokens": tok_len(tsv),
            })
            meta["headers_hash"] = md5("|".join(header_row))
            meta["hash"] = md5(f"{doc_id}|t|{table.page}|{t_idx}|{k}|{tsv[:80]}")
            records.append({"text": tsv, "metadata": meta})

    return records

# example usage
data_folder = "./tat_docs/"
# Join with pathlib so the trailing slash in data_folder does not produce
# a doubled separator in the resulting path.
example_pdf = Path(data_folder) / "adobe-systems-inc_2019.pdf"
chunks = build_chunks_for_financial_report(
    str(example_pdf),
    {"doc_id": "adobe-systems-inc_2019"} # can add more metadata here
)
# Preview the first chunk's metadata; a PDF that yields no chunks prints None
# instead of raising IndexError.
print(len(chunks), chunks[0]["metadata"] if chunks else None)

In [None]:
import os
# Remember to change `base` to the path of your local document folder
def get_files(base='./tat_docs'):
    """Yield the path of every PDF found under *base*, recursing into sub-folders.

    The extension check is case-insensitive, so ``report.PDF`` is included.
    Directories and files are visited in sorted order so repeated runs yield
    the same sequence. A missing *base* yields nothing.
    """
    for path, dir_list, file_list in os.walk(base):
        dir_list.sort()  # in-place sort fixes the order os.walk descends in
        for file_name in sorted(file_list):
            if file_name.lower().endswith('.pdf'):
                yield os.path.join(path, file_name)

# Chunk every PDF under the default folder and collect the results.
total_chunks = []
n_docs = 0
for f in get_files():
    print(f"Processing {f} ...")
    # doc_id is the file name without directory or extension, the same form
    # the single-file example uses; os.path keeps this portable across
    # path separators.
    doc_id = os.path.splitext(os.path.basename(f))[0]
    cks = build_chunks_for_financial_report(
        f,
        {"doc_id": doc_id}
    )
    total_chunks.extend(cks)
    n_docs += 1
# Count documents while iterating instead of walking the folder a second time.
print(f"Total {len(total_chunks)} chunks from {n_docs} documents.")

### Test for table extraction

In [None]:
import camelot, pdfplumber, tabula
from pathlib import Path
import tempfile, re

def extract_with_camelot(pdf_path, flavor, pages="1", **kwargs):
    """Return the DataFrames camelot extracts from *pdf_path*, or [] on any error.

    *flavor*, *pages* and any extra keyword arguments are forwarded to
    ``camelot.read_pdf`` unchanged.
    """
    frames = []
    try:
        for table in camelot.read_pdf(pdf_path, flavor=flavor, pages=pages, **kwargs):
            frames.append(table.df)
    except Exception:
        # Best-effort helper: a failure anywhere yields an empty result.
        return []
    return frames

def extract_with_tabula(pdf_path, pages="1", lattice=False, stream=False, area=None, columns=None):
    """Return the tables tabula reads from *pdf_path*, or [] on any error.

    ``area`` and ``columns`` are passed on only when truthy; tabula's
    automatic guessing is switched off as soon as either one is given.
    """
    try:
        extra = {
            key: value
            for key, value in (("area", area), ("columns", columns))
            if value
        }
        auto_guess = not (area or columns)
        frames = tabula.read_pdf(
            pdf_path,
            pages=pages,
            lattice=lattice,
            stream=stream,
            guess=auto_guess,
            multiple_tables=True,
            **extra,
        )
        return frames if frames else []
    except Exception:
        # Best-effort helper: a failure anywhere yields an empty result.
        return []

def extract_with_pdfplumber(pdf_path, page=1):
    """Return the tables pdfplumber finds on one page, with cells cleaned up.

    Args:
        pdf_path: Path of the PDF to read.
        page: 1-based page number.

    Returns:
        A list of tables; each table is a list of rows of strings in which
        ``None`` cells become ``""``, embedded newlines become spaces and
        surrounding whitespace is stripped.

    Raises:
        ValueError: If *page* is below 1.
        IndexError: If *page* is beyond the last page of the document.
    """
    # page - 1 is used as a list index: 0 or a negative page would silently
    # select a page counted from the end of the document.
    if page < 1:
        raise ValueError(f"page must be >= 1 (1-based), got {page!r}")
    with pdfplumber.open(pdf_path) as pdf:
        return [
            [[(c or "").replace("\n", " ").strip() for c in row] for row in rows]
            for rows in pdf.pages[page - 1].extract_tables()
        ]

In [23]:
# Read every page of the sample PDF with tabula (stream=True) and inspect
# what comes back: the value itself, its type and its length.
ret = tabula.read_pdf("a10-networks-inc_2019 (dragged).pdf", pages="all", stream=True)
print(ret, type(ret), len(ret))

Oct 18, 2025 12:27:09 AM org.apache.fontbox.ttf.CmapSubtable processSubtype4
Oct 18, 2025 12:27:09 AM org.apache.fontbox.ttf.CmapSubtable processSubtype4
Oct 18, 2025 12:27:09 AM org.apache.fontbox.ttf.CmapSubtable processSubtype4
Oct 18, 2025 12:27:09 AM org.apache.fontbox.ttf.CmapSubtable processSubtype4
Oct 18, 2025 12:27:09 AM org.apache.fontbox.ttf.CmapSubtable processSubtype4
Oct 18, 2025 12:27:09 AM org.apache.fontbox.ttf.CmapSubtable processSubtype4
Oct 18, 2025 12:27:09 AM org.apache.fontbox.ttf.CmapSubtable processSubtype4
Oct 18, 2025 12:27:10 AM org.apache.fontbox.ttf.CmapSubtable processSubtype4
Oct 18, 2025 12:27:10 AM org.apache.fontbox.ttf.CmapSubtable processSubtype4
Oct 18, 2025 12:27:10 AM org.apache.fontbox.ttf.CmapSubtable processSubtype4
Oct 18, 2025 12:27:10 AM org.apache.fontbox.ttf.CmapSubtable processSubtype4
Oct 18, 2025 12:27:10 AM org.apache.fontbox.ttf.CmapSubtable processSubtype4
Oct 18, 2025 12:27:10 AM org.apache.fontbox.ttf.CmapSubtable processSubtype4

[                                           Unnamed: 0        2018  \
0                                                 NaN         NaN   
1                                                 NaN         NaN   
2                                                 NaN      Amount   
3                                            Revenue:         NaN   
4   Products . . . . . . . . . . . . . . . . . . ....    $144,682   
5   Services . . . . . . . . . . . . . . . . . . ....      87,541   
6   Total revenue . . . . . . . . . . . . . . . . ...     232,223   
7                                    Cost of revenue:         NaN   
8   Products . . . . . . . . . . . . . . . . . . ....      34,066   
9   Services . . . . . . . . . . . . . . . . . . ....      17,830   
10  Total cost of revenue. . . . . . . . . . . . ....      51,896   
11  Gross profit . . . . . . . . . . . . . . . . ....     180,327   
12                                Operating expenses:         NaN   
13  Sales and marketing . . . . .

In [None]:
import tabula

# Input PDF and the CSV file that will receive the extracted tables.
pdf_file_path = "a10-networks-inc_2019 (dragged).pdf"
csv_output_path = "extracted_table.csv"

try:
    # Convert the tables from all pages (stream=True) straight into one CSV file.
    tabula.convert_into(pdf_file_path, csv_output_path, stream=True, output_format="csv", pages="all")
    print(f"Tables successfully extracted from '{pdf_file_path}' and saved to '{csv_output_path}'.")
except Exception as e:
    # Report the failure instead of stopping the notebook.
    print(f"An error occurred: {e}")

Oct 18, 2025 12:15:43 AM org.apache.fontbox.ttf.CmapSubtable processSubtype4
Oct 18, 2025 12:15:43 AM org.apache.fontbox.ttf.CmapSubtable processSubtype4
Oct 18, 2025 12:15:43 AM org.apache.fontbox.ttf.CmapSubtable processSubtype4
Oct 18, 2025 12:15:43 AM org.apache.fontbox.ttf.CmapSubtable processSubtype4
Oct 18, 2025 12:15:43 AM org.apache.fontbox.ttf.CmapSubtable processSubtype4
Oct 18, 2025 12:15:43 AM org.apache.fontbox.ttf.CmapSubtable processSubtype4
Oct 18, 2025 12:15:43 AM org.apache.fontbox.ttf.CmapSubtable processSubtype4
Oct 18, 2025 12:15:43 AM org.apache.fontbox.ttf.CmapSubtable processSubtype4
Oct 18, 2025 12:15:43 AM org.apache.fontbox.ttf.CmapSubtable processSubtype4
Oct 18, 2025 12:15:43 AM org.apache.fontbox.ttf.CmapSubtable processSubtype4
Oct 18, 2025 12:15:43 AM org.apache.fontbox.ttf.CmapSubtable processSubtype4
Oct 18, 2025 12:15:43 AM org.apache.fontbox.ttf.CmapSubtable processSubtype4
Oct 18, 2025 12:15:43 AM org.apache.fontbox.ttf.CmapSubtable processSubtype4

Tables successfully extracted from 'a10-networks-inc_2019 (dragged).pdf' and saved to 'extracted_table.csv'.


In [28]:
import os

def get_files(base='../tat_docs_filtered'):
    """Return how many files exist under *base*, counting sub-folders recursively."""
    return sum(len(names) for _root, _dirs, names in os.walk(base))

print(get_files())

149


In [None]:
from pathlib import Path
# Collect the PDFs to process, minus the ones listed in the exclusion file.
data_folder = Path("../../tat_docs/")
pdf_files = sorted(data_folder.glob("*.pdf"))
exclude_file_path = "./not_included.txt"
exclude_files = set()

exclude_path = Path(exclude_file_path)
if exclude_path.exists():
    with exclude_path.open("r", encoding="utf-8") as fh:
        for line in fh:
            name = line.strip()
            if not name:
                continue
            # Accept entries written with or without the ".pdf" extension;
            # blindly appending it would turn "x.pdf" into "x.pdf.pdf" and
            # the file would never be excluded.
            if name.lower().endswith(".pdf"):
                name = name[:-4]
            exclude_files.add(f"{name}.pdf")
else:
    # Make a missing list visible: otherwise "Excluded 0 files" looks like a
    # normal result.
    print(f"Warning: exclusion list '{exclude_file_path}' not found; no files will be excluded")

filtered_pdf_files = [pdf for pdf in pdf_files if pdf.name not in exclude_files]

print(f"Found {len(pdf_files)} PDF files")
print(f"Excluded {len(pdf_files) - len(filtered_pdf_files)} files")
print(f"Remaining {len(filtered_pdf_files)} files to process")

Found 170 PDF files
Excluded 0 files
Remaining 170 files to process
