In [1]:
pip install pdfplumber pandas


Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# **Task 1**

In [2]:
import pdfplumber
import re
import pandas as pd
from pathlib import Path

# A value-like token is either a run of digits with "." / "," separators or a
# dash that stands on its own (the placeholder used for an empty amount).
# A hyphen inside a word such as "lain-lain" and bare punctuation are plain text.
_VALUE_TOKEN_RE = re.compile(r"[\d.,]*\d[\d.,]*|(?<!\S)[–-](?!\S)")
# Used on the following line to decide whether a bold line heads a list of amounts.
_NEXT_LINE_NUMBER_RE = re.compile(r"\d[\d.,]+")
# Substrings of a font name that mark a word as bold.
_BOLD_FONT_MARKERS = ("Bold", "Bd", "Black")
# Column order of the result; also guarantees the columns exist when no row is found.
_RESULT_COLUMNS = ["page", "section", "subsection", "item", "2021", "2020"]


def _is_amount(token: str) -> bool:
    """Return True for a dash placeholder or a number with more than two digits.

    Short numbers (one or two digits once separators are removed) are skipped
    so they are not mistaken for one of the two year columns.
    """
    if token in ("-", "–"):
        return True
    return len(token.replace(".", "").replace(",", "")) > 2


def _page_lines(page):
    """Return the cropped page's text as ``(text, is_bold)`` tuples, top to bottom."""
    # Crop away the top, right and bottom margins before reading words.
    cropped = page.crop((0, 120, page.width - 120, page.height - 40))
    words = cropped.extract_words(extra_attrs=["fontname", "size"])

    # Words whose "top" rounds to the same multiple of ten share a visual line.
    rows = {}
    for word in words:
        rows.setdefault(round(word["top"], -1), []).append(word)

    lines = []
    for top in sorted(rows):
        row_words = sorted(rows[top], key=lambda w: w["x0"])
        text = " ".join(w["text"] for w in row_words).strip()
        is_bold = any(
            marker in w["fontname"]
            for w in row_words
            for marker in _BOLD_FONT_MARKERS
        )
        lines.append((text, is_bold))
    return lines


def _build_row(line_text: str, page_number: int, section, subsection):
    """Split ``line_text`` into an item label and its last two amounts.

    Returns ``None`` when the line holds fewer than two amount tokens.  The
    label is cut at the *position* of the second-to-last amount, so two equal
    amounts (or one amount being a suffix of the other) cannot shift the cut.
    """
    # NOTE(review): amounts written as "(1.234)" are still captured without the
    # parentheses, exactly as before — confirm whether the sign must be kept.
    amounts = [m for m in _VALUE_TOKEN_RE.finditer(line_text) if _is_amount(m.group())]
    if len(amounts) < 2:
        return None
    first_value, second_value = amounts[-2], amounts[-1]
    return {
        "page": page_number,
        "section": section,
        "subsection": subsection,
        "item": line_text[:first_value.start()].strip(),
        "2021": first_value.group(),
        "2020": second_value.group(),
    }


def extract(pdf_path: str, start_page: int = 6, end_page: int = 8,
            output_path: str = "output/extracted.csv") -> pd.DataFrame:
    """Extract line items with their 2021/2020 amounts from a range of PDF pages.

    Each page is cropped, its words are grouped into lines, and every line is
    classified as a section header, a subsection header, a bold TOTAL row or an
    ordinary item row.  Rows are collected into a DataFrame that is also written
    to ``output_path`` as CSV (UTF-8 with BOM).

    Args:
        pdf_path: Path of the PDF file to read.
        start_page: First page to process, 1-based and inclusive.
        end_page: Last page to process, 1-based and inclusive.
        output_path: Destination CSV file; missing parent folders are created.

    Returns:
        DataFrame with columns page, section, subsection, item, 2021, 2020.
        The columns are present even when no row was extracted.

    Raises:
        ValueError: If ``start_page`` is smaller than 1 (a negative index would
            otherwise silently read pages from the end of the document).
        IndexError: If the requested range runs past the last page.
    """
    if start_page < 1:
        raise ValueError(f"start_page must be >= 1, got {start_page}")

    results = []
    section, subsection = None, None

    with pdfplumber.open(pdf_path) as pdf:
        # Page numbers are 1-based for the caller, pdf.pages is 0-based.
        for page_index in range(start_page - 1, end_page):
            page_lines = _page_lines(pdf.pages[page_index])
            # Text of a label that wraps over several lines; it never carries
            # over to another page or across a header.
            pending_label = ""

            for i, (line_text, is_bold) in enumerate(page_lines):
                if not line_text:
                    continue

                first_token = _VALUE_TOKEN_RE.search(line_text)
                has_number = first_token is not None
                if i + 1 < len(page_lines):
                    next_line, next_is_bold = page_lines[i + 1]
                else:
                    next_line, next_is_bold = "", False
                next_has_number = bool(_NEXT_LINE_NUMBER_RE.search(next_line))

                # 1) Section: bold text without amounts, followed by another bold line.
                if is_bold and not has_number and next_is_bold:
                    section = line_text
                    subsection = None
                    pending_label = ""
                    continue

                # 2) Subsection: bold text without amounts, followed by a line with numbers.
                if is_bold and not has_number and next_has_number:
                    subsection = line_text
                    pending_label = ""
                    continue

                # 3) Bold TOTAL row: becomes the current subsection and a data row.
                if is_bold and has_number and "TOTAL" in line_text.upper():
                    subsection = line_text[:first_token.start()].strip()
                    pending_label = ""
                    row = _build_row(line_text, page_index + 1, section, subsection)
                    if row is not None:
                        results.append(row)
                    continue

                # 4) Ordinary item row (non-bold).
                if not is_bold:
                    if not has_number:
                        # Label continues on the next line; keep it for later.
                        pending_label += " " + line_text
                        continue
                    if pending_label:
                        line_text = pending_label.strip() + " " + line_text
                        pending_label = ""
                    row = _build_row(line_text, page_index + 1, section, subsection)
                    if row is not None:
                        results.append(row)

    # Save the result as CSV.
    out_file = Path(output_path)
    out_file.parent.mkdir(parents=True, exist_ok=True)
    df = pd.DataFrame(results, columns=_RESULT_COLUMNS)
    df.to_csv(out_file, index=False, encoding="utf-8-sig")

    print(f"✅ Extraction selesai — {len(df)} baris dari halaman {start_page}–{end_page} disimpan ke {output_path}")
    return df


# === Run: extract statement pages 6–8 and display the resulting table ===
df_page = extract(
    pdf_path="EKAD - LAPORAN KEUANGAN TAHUNAN 2021.pdf",
    start_page=6,
    end_page=8,
)
df_page


✅ Extraction selesai — 51 baris dari halaman 6–8 disimpan ke output/extracted.csv


Unnamed: 0,page,section,subsection,item,2021,2020
0,6,ASET,ASET LANCAR,"Kas dan setara kas 4,33",360.662.679.743,348.026.902.985
1,6,ASET,ASET LANCAR,Deposito berjangka yang dibatasi penggunaannya...,24.219.914.929,21.103.842.879
2,6,ASET,ASET LANCAR,31 Desember 2021 dan 2020,83.071.117.010,87.196.198.463
3,6,ASET,ASET LANCAR,Pihak berelasi 30,684.677.718,581.104.437
4,6,ASET,ASET LANCAR,Piutang lain-lain,-,-
5,6,ASET,ASET LANCAR,pihak ketiga,254.139.325,986.432.198
6,6,ASET,ASET LANCAR,dan 2020 7,168.288.992.651,86.409.350.807
7,6,ASET,ASET LANCAR,Pajak dibayar di muka 16a,11.330.792,472.020.184
8,6,ASET,ASET LANCAR,Beban dibayar di muka dan uang muka 8,5.864.472.120,6.980.448.125
9,6,ASET,ASET LANCAR,Aset lancar lainnya 33,716.097.870,737.558.020


# **Task 2**

In [5]:
from pathlib import Path

# Make sure the output folder exists before anything is written.
Path("output").mkdir(exist_ok=True)

# Keep rows whose item mentions the word "total" (any case) and is at most
# five words long.
mentions_total = df_page["item"].str.contains(r"\btotal\b", case=False, na=False)
is_short_label = df_page["item"].str.split().str.len() <= 5
filtered_df = df_page.loc[mentions_total & is_short_label]

# Only the label and the two year columns are exported.
filtered_subset = filtered_df.loc[:, ["item", "2021", "2020"]]

# Write the selection as a list of JSON records, keeping non-ASCII characters as-is.
output_path = "output/filtered_total.json"
filtered_subset.to_json(
    output_path,
    orient="records",
    force_ascii=False,
    indent=2,
)

print(f"✅ Disimpan ke {output_path} ({len(filtered_subset)} baris)")


✅ Disimpan ke output/filtered_total.json (7 baris)
