In [0]:
from pyspark.sql.types import *

# Column layout of one silver-layer transaction row; every column is nullable.
# NOTE(review): not referenced by the other cells visible in this notebook, and
# it repeats the struct inside `silver_udf_schema` -- keep the two in sync.
silver_schema = (
    StructType()
    .add("booking_date", DateType(), True)
    .add("description", StringType(), True)
    .add("category", StringType(), True)
    .add("amount", DoubleType(), True)
    .add("currency", StringType(), True)
    .add("iban", StringType(), True)
)


In [0]:
import re
import pandas as pd
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from datetime import datetime

# A line that consists solely of a dd.mm.yyyy date marks the start of a transaction.
DATE_RE = re.compile(r"^\d{2}\.\d{2}\.\d{4}$")
# First run of digits / separators (optionally signed) on the amount line.
AMOUNT_RE = re.compile(r"[-+]?[0-9.,]+")


def _extract_iban(lines):
    """Return the IBAN found on the last usable line starting with "IBAN".

    The value is the text between the first and second colon of the line; if
    the line has no colon, the text following the "IBAN" prefix is used
    instead of raising IndexError. Lines that yield an empty value are ignored
    so they cannot overwrite an IBAN found earlier. Returns None if no line
    yields a value.
    """
    iban = None
    for line in lines:
        if not line.startswith("IBAN"):
            continue
        if ":" in line:
            value = line.split(":")[1].strip()
        else:
            value = line[len("IBAN"):].strip()
        if value:
            iban = value
    return iban


def _parse_amount(token):
    """Convert a token matched by AMOUNT_RE to a float, or None if not numeric.

    By default "," is treated as a thousands separator and "." as the decimal
    point ("1,234.56" -> 1234.56). The comma is treated as the decimal
    separator only when that reading is unambiguous:

    * both separators occur and the comma comes last ("1.234,56"), or
    * the token has a single comma followed by one or two digits ("12,50"),
      which cannot be a thousands group.
    """
    last_dot = token.rfind(".")
    last_comma = token.rfind(",")
    digits_after_comma = len(token) - last_comma - 1
    comma_is_decimal = last_comma > last_dot and (
        last_dot != -1
        or (token.count(",") == 1 and digits_after_comma in (1, 2))
    )

    if comma_is_decimal:
        normalized = token.replace(".", "").replace(",", ".")
    else:
        normalized = token.replace(",", "")

    try:
        return float(normalized)
    except ValueError:
        # AMOUNT_RE also matches separator-only tokens such as "." or "1.2.3".
        return None


def extract_silver_records(raw_text):
    """Parse the extracted text of one statement into transaction tuples.

    The text is split on newlines and blank lines are dropped. A transaction
    is recognised as four consecutive non-empty lines: a dd.mm.yyyy date, a
    description, a category and a line containing the amount.

    Args:
        raw_text: Full text of one document; None or "" yields no records.

    Returns:
        A list of tuples
        ``(booking_date, description, category, amount, currency, iban)``
        where ``booking_date`` is a ``datetime.date``, ``amount`` is a float or
        None when no parsable number is found, ``currency`` is always "EUR"
        and ``iban`` is the document-level IBAN (or None), repeated on every
        record.

    The function never raises on malformed lines: a date-shaped line that is
    not a real calendar date (e.g. "31.02.2024") is skipped, and an
    unparsable amount becomes None, so one bad document cannot fail the whole
    Spark job.
    """
    if not raw_text:
        return []

    stripped = (part.strip() for part in raw_text.split("\n"))
    lines = [line for line in stripped if line]

    # The IBAN is a document-level attribute, so resolve it once up front.
    iban = _extract_iban(lines)

    tx_list = []
    i, n = 0, len(lines)

    # A record needs lines i .. i+3, hence the loop stops at n - 3.
    while i < n - 3:
        if not DATE_RE.match(lines[i]):
            i += 1
            continue

        try:
            booking_date = datetime.strptime(lines[i], "%d.%m.%Y").date()
        except ValueError:
            # Right shape but not a valid calendar date: not a record start.
            i += 1
            continue

        amt_match = AMOUNT_RE.search(lines[i + 3])
        amount = _parse_amount(amt_match.group()) if amt_match else None

        tx_list.append((
            booking_date,
            lines[i + 1],  # description
            lines[i + 2],  # category
            amount,
            "EUR",
            iban,
        ))

        # Jump past the four lines consumed by this record.
        i += 4

    return tx_list


In [0]:
# Return type of the extraction UDF: an array holding one struct per
# transaction. Field order must match the tuples produced by
# `extract_silver_records`; all fields are nullable.
# NOTE(review): the struct repeats `silver_schema` from the first cell.
silver_udf_schema = ArrayType(
    StructType([
        StructField(field_name, field_type, True)
        for field_name, field_type in (
            ("booking_date", DateType()),
            ("description", StringType()),
            ("category", StringType()),
            ("amount", DoubleType()),
            ("currency", StringType()),
            ("iban", StringType()),
        )
    ])
)


In [0]:
# Bronze layer: the raw PDF text is read from the `text` column below.
bronze_df = spark.table("catalog_anushka.pdf_fin_bronze.pdf_raw")

# Wrap the pure-Python parser as a Spark UDF returning an array of structs.
extract_udf = udf(extract_silver_records, returnType=silver_udf_schema)

# Parse each document, then flatten: one output row per transaction, with the
# struct fields promoted to top-level columns.
silver_df = (
    bronze_df
    .withColumn("transactions", extract_udf("text"))
    .selectExpr("explode(transactions) as tx")
    .select("tx.*")
)


In [0]:
# Persist the flattened transactions as a Delta table, replacing any existing
# contents of the target table.
(
    silver_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("catalog_anushka.pdf_fin_silver.pdf_silver")
)