# Spark Simple Read + Aggregate (Marquez Postgres)

This notebook:
- connects to the same Docker Postgres (`marquez` DB)
- selects one random table from the `public` schema (migration-history tables such as `schema_version*` and `flyway*` are excluded)
- reads it with Spark JDBC
- runs a small aggregation
- saves the aggregation result as Parquet


In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import os


In [2]:
# Postgres connection settings.
# Each value can be overridden with an environment variable of the same name;
# the fallbacks are the Docker Compose defaults, so the notebook behaves exactly
# as before when nothing is set, while real credentials can stay out of the file.
POSTGRES_HOST = os.environ.get("POSTGRES_HOST", "postgres")
POSTGRES_PORT = os.environ.get("POSTGRES_PORT", "5432")
POSTGRES_DB = os.environ.get("POSTGRES_DB", "marquez")
POSTGRES_USER = os.environ.get("POSTGRES_USER", "marquez")
POSTGRES_PASSWORD = os.environ.get("POSTGRES_PASSWORD", "marquez")

JDBC_URL = f"jdbc:postgresql://{POSTGRES_HOST}:{POSTGRES_PORT}/{POSTGRES_DB}"

# Credentials and driver class shared by every JDBC read in this notebook.
JDBC_PROPS = {
    "user": POSTGRES_USER,
    "password": POSTGRES_PASSWORD,
    "driver": "org.postgresql.Driver",
}

# spark.jars.packages takes Maven coordinates; here it requests the PostgreSQL
# JDBC driver. NOTE(review): getOrCreate() returns an already-running session
# if one exists, in which case this setting may not be applied — restart the
# kernel if the driver class cannot be found.
spark = (
    SparkSession.builder
    .appName("openlineage-postgres-simple-read-aggregate")
    .config("spark.jars.packages", "org.postgresql:postgresql:42.7.3")
    .getOrCreate()
)

# Displayed as the cell output to confirm which Spark version is running.
spark.version


'3.5.3'

In [3]:
def _read_jdbc(dbtable: str):
    """Load ``dbtable`` from the notebook's Postgres database via Spark JDBC.

    Single place where the connection options (URL, driver, user, password)
    are applied, so ``read_sql`` and ``read_table`` cannot drift apart.

    Args:
        dbtable: Value for the JDBC ``dbtable`` option — either a table
            reference or a parenthesised subquery with an alias.

    Returns:
        The object returned by ``spark.read...load()`` for that source.
    """
    return (
        spark.read
        .format("jdbc")
        .option("url", JDBC_URL)
        .option("driver", JDBC_PROPS["driver"])
        .option("user", JDBC_PROPS["user"])
        .option("password", JDBC_PROPS["password"])
        .option("dbtable", dbtable)
        .load()
    )


def read_sql(query: str):
    """Run an arbitrary SQL query against Postgres and load its result.

    The query text is embedded verbatim as ``(<query>) as q``, so it must be
    valid as a subquery. It is not escaped or parameterised; pass only
    trusted SQL.

    Args:
        query: SQL SELECT statement to execute.

    Returns:
        The loaded result of the query.
    """
    return _read_jdbc(f"({query}) as q")


def read_table(table_name: str, schema: str = "public"):
    """Load a whole table from Postgres.

    ``schema`` and ``table_name`` are interpolated as ``schema.table_name``
    without quoting, so they are used exactly as given.

    Args:
        table_name: Name of the table to read.
        schema: Schema that contains the table; defaults to ``"public"``.

    Returns:
        The loaded contents of the table.
    """
    return _read_jdbc(f"{schema}.{table_name}")


In [4]:
# Pick one random table from public schema (excluding migration history tables)
random_table_df = read_sql("""
    SELECT table_name
    FROM information_schema.tables
    WHERE table_schema = 'public'
      AND table_type = 'BASE TABLE'
      AND table_name NOT ILIKE 'schema_version%'
      AND table_name NOT ILIKE 'flyway%'
    ORDER BY random()
    LIMIT 1
""")

row = random_table_df.first()
if row is None:
    raise RuntimeError("No eligible tables found in public schema")

selected_table = row["table_name"]
print("Selected table:", selected_table)


Selected table: jobs_tag_mapping


In [5]:
# Load the selected table over JDBC, then report its size, schema and a sample.
source_df = read_table(selected_table)

# count() is a Spark action, so this line actually queries Postgres.
source_row_count = source_df.count()
print("Source row count:", source_row_count)

source_df.printSchema()
# Show at most 20 rows, with full (untruncated) cell values.
source_df.show(n=20, truncate=False)


Source row count: 0
root
 |-- job_uuid: string (nullable = true)
 |-- tag_uuid: string (nullable = true)
 |-- tagged_at: timestamp (nullable = true)

+--------+--------+---------+
|job_uuid|tag_uuid|tagged_at|
+--------+--------+---------+
+--------+--------+---------+



In [6]:
# Small, generic aggregation: choose one column to group by and count the rows
# for each of its values.
preferred_group_cols = ["namespace_name", "namespace", "type", "name", "event_type", "state"]
columns = source_df.columns

# The first preferred name that exists in the table wins; if none matches,
# fall back to the table's first column.
group_col = next((name for name in preferred_group_cols if name in columns), None)
if group_col is None:
    if not columns:
        raise RuntimeError(f"Table {selected_table} has no columns")
    group_col = columns[0]

# Cast to string so the output has the same shape whatever the column's type.
group_value_col = F.col(group_col).cast("string").alias("group_value")
row_count_col = F.count(F.lit(1)).alias("row_count")

# Largest groups first; ties are ordered by the group value.
agg_df = (
    source_df
    .groupBy(group_value_col)
    .agg(row_count_col)
    .orderBy(F.desc("row_count"), "group_value")
)

print("Grouping column used:", group_col)
agg_df.show(n=50, truncate=False)


Grouping column used: job_uuid
+-----------+---------+
|group_value|row_count|
+-----------+---------+
+-----------+---------+



In [7]:
# Save aggregated result as Parquet.
# The target directory can be redirected with the POSTGRES_SIMPLE_OUT_DIR
# environment variable; when it is unset or empty, the original location is used.
out_dir = os.environ.get("POSTGRES_SIMPLE_OUT_DIR") or "/home/jovyan/work/data/postgres_simple"
os.makedirs(out_dir, exist_ok=True)

# The path includes the table name, so each source table gets its own output.
# mode("overwrite") replaces whatever an earlier run wrote to the same path.
out_path = os.path.join(out_dir, f"{selected_table}_aggregation_parquet")
agg_df.write.mode("overwrite").parquet(out_path)

print("Parquet saved to:", out_path)
# Last expression: displayed as the cell's output value.
out_path


Parquet saved to: /home/jovyan/work/data/postgres_simple/jobs_tag_mapping_aggregation_parquet


'/home/jovyan/work/data/postgres_simple/jobs_tag_mapping_aggregation_parquet'

## Notes

- Run this from the Docker notebook container so host `postgres` resolves.
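- Re-running the notebook can pick a different random table; because the output path includes the table name, the Parquet result is then written to a different directory.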
