In [0]:
from pyspark.sql.functions import input_file_name
import psycopg2

In [0]:
# Root of the storage container that holds the input files, the schema log,
# and the streaming checkpoint.
_container_root = "abfss://gdrive-ingest@devdolphinstorage.dfs.core.windows.net"

# Directory the stream loads transaction files from.
source_path = f"{_container_root}/transactions"

# Schema-tracking directory passed as cloudFiles.schemaLocation.
# Original note: this must be a unique path if it is ever cleared.
schema_location = f"{_container_root}/_schema_log/transactions"

# Checkpoint directory for the streaming query.
checkpoint_location = f"{_container_root}/_checkpoint/transactions"


In [0]:
# Postgres connection settings used by the upsert sink.
# The user name and password are read from the "devDolphin" secret scope so
# that neither value appears in the notebook source.
_secret_scope = "devDolphin"

pg_host = "devdolphinpostgresdb.postgres.database.azure.com"
pg_db = "postgres"
pg_user = dbutils.secrets.get(scope=_secret_scope, key="kushagra")
pg_pass = dbutils.secrets.get(scope=_secret_scope, key="pg-password")


In [0]:
def upsert_to_postgres(batch_df, batch_id):
    """Count transactions per merchant in one micro-batch and upsert them into Postgres.

    Written to be used as a ``foreachBatch`` sink. The batch is grouped by
    ``merchant``, the per-merchant row counts are collected to the driver, and
    each (merchant, count) pair is written to ``merchant_txn_state`` in a
    single transaction.

    Args:
        batch_df: DataFrame for the current micro-batch; must contain a
            ``merchant`` column.
        batch_id: Identifier of the micro-batch; used only in the log line.

    Returns:
        None. Returns early, without opening a connection, when the batch
        produces no aggregated rows.

    Raises:
        Any exception raised by psycopg2 while connecting or writing is
        propagated to the caller. The transaction is rolled back and the
        connection is closed before the exception leaves this function.
    """
    print(f"🔄 Processing batch_id: {batch_id}")

    # Example: aggregate merchant txn counts
    agg_df = (
        batch_df.groupBy("merchant")
        .count()
        .withColumnRenamed("count", "total_txn")
    )

    rows = agg_df.collect()

    if not rows:
        print("⚠️  No rows to upsert this batch.")
        return

    # NOTE(review): on conflict this REPLACES the stored total_txn with the
    # count from the current batch; it does not add to the existing value.
    # If the table is meant to hold a running total across batches, confirm
    # whether overwrite is intended.
    upsert_sql = """
            INSERT INTO merchant_txn_state (merchant, total_txn)
            VALUES (%s, %s)
            ON CONFLICT (merchant)
            DO UPDATE SET total_txn = EXCLUDED.total_txn;
        """
    params = [(row["merchant"], row["total_txn"]) for row in rows]

    conn = psycopg2.connect(
        host=pg_host,
        dbname=pg_db,
        user=pg_user,
        password=pg_pass,
        sslmode="require"
    )
    try:
        # `with conn` commits when the block succeeds and rolls back when it
        # raises; it does NOT close the connection, hence the finally below.
        with conn:
            # `with cursor` closes the cursor on exit, success or failure.
            with conn.cursor() as cur:
                cur.executemany(upsert_sql, params)
    finally:
        # Always release the connection so a failing batch cannot leak it.
        conn.close()

    print(f"✅ Upserted {len(rows)} rows to Postgres.")

In [0]:

# Streaming read of the CSV files under source_path using the "cloudFiles"
# source. The files have a header row, column types are inferred, and the
# schema is tracked under schema_location. Each row is tagged with the file
# it came from in a `source_file` column.
df = (
    spark.readStream
    .format("cloudFiles")
    .options(**{
        "cloudFiles.format": "csv",
        "cloudFiles.schemaLocation": schema_location,
        "cloudFiles.inferColumnTypes": "true",
        "header": "true",
    })
    .load(source_path)
    .withColumn("source_file", input_file_name())
)

# Show the resulting schema so the inferred column types can be checked.
df.printSchema()


In [0]:
# Start the upsert stream: every micro-batch of `df` is handed to
# upsert_to_postgres, and stream progress is recorded under
# checkpoint_location.
query = (
    df.writeStream
    .option("checkpointLocation", checkpoint_location)
    .foreachBatch(upsert_to_postgres)
    .start()
)

# Block this cell until the stream stops or fails; cells below do not run
# while the stream is active.
query.awaitTermination()

In [0]:
def inspect_batch(batch_df, batch_id):
    """Debugging sink for ``foreachBatch``: log a micro-batch and render it.

    Args:
        batch_df: DataFrame for the current micro-batch.
        batch_id: Identifier of the micro-batch, shown in the log line.
    """
    banner = f"🔍 New batch {batch_id}"
    print(banner)

    # count() runs before the DataFrame is rendered, so the row total is
    # printed ahead of the table.
    row_total = batch_df.count()
    print(f"Rows: {row_total}")

    display(batch_df)

# Debug stream: pass every micro-batch of `df` to inspect_batch.
#
# The checkpoint path was previously the unfilled template placeholder
# "<your-checkpoint-path>". It now points at a dedicated directory next to
# the main checkpoint. It is deliberately NOT checkpoint_location itself,
# because that directory belongs to the upsert query and two queries must
# not share one checkpoint.
query = (
    df.writeStream
    .foreachBatch(inspect_batch)
    .option("checkpointLocation", f"{checkpoint_location}_inspect")
    .start()
)

# Block this cell until the debug stream stops or fails.
query.awaitTermination()
