# Iceberg Account Summary Query
This notebook queries the `default.summary` Iceberg table using the configuration from `iceberg_account_summary.py`.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    StructType, StructField, LongType, DateType, ArrayType,
    DecimalType, IntegerType, StringType, TimestampType
)
import pyspark.sql.functions as F

def create_spark_session() -> SparkSession:
    """
    Create a Spark session with Iceberg REST Catalog and MinIO S3 storage.
    """
    spark = SparkSession.builder.appName('Ascend')\
                                .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4") \
                                .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")\
                                .config("spark.hadoop.fs.s3a.access.key", "admin")\
                                .config("spark.hadoop.fs.s3a.secret.key", "password")\
                                .config("spark.hadoop.fs.s3a.path.style.access", "true")\
                                .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")\
                                .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")\
                                .config("spark.sql.adaptive.enabled", "true")\
                                .config("spark.sql.adaptive.skewJoin.enabled", "true")\
                                .config("spark.sql.adaptive.localShuffleReader.enabled", "true")\
                                .config("spark.sql.optimizer.dynamicPartitionPruning.enabled", "true")\
                                .getOrCreate()

    spark.sparkContext.setLogLevel("WARN")    
    return spark

spark = create_spark_session()
print("Spark Session Created")

Spark Session Created


26/01/15 07:06:48 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [16]:
# Query the table
print("Querying default.summary...")
df = spark.sql("SELECT * FROM default.summary order by base_ts desc")


df.show(1000,truncate=False)

Querying default.summary...


                                                                                

+-------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+-----------+--------------+-------------------+-------------------+-------------------+
|cons_a

In [20]:
df.select(F.size("bal_history").alias("bal_history_size")).show()



+----------------+
|bal_history_size|
+----------------+
|              36|
|              36|
|              36|
|              36|
|              36|
|              36|
|              36|
|              36|
|              36|
|              36|
|              36|
|              36|
|              36|
|              36|
|              36|
|              36|
|              36|
|              36|
|              36|
|              36|
+----------------+
only showing top 20 rows



                                                                                

In [21]:
df.count()

                                                                                

179

In [15]:
df.createOrReplaceTempView("account_summary")
spark.sql("SELECT max(base_ts) FROM account_summary").show()



+-------------------+
|       max(base_ts)|
+-------------------+
|2025-06-25 11:43:47|
+-------------------+



                                                                                

In [24]:
df.printSchema()

root
 |-- cons_acct_key: long (nullable = false)
 |-- rpt_as_of_mo: string (nullable = false)
 |-- bal_history: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- dpd_history: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- payment_history: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- status_history: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- current_balance: integer (nullable = true)
 |-- current_dpd: integer (nullable = true)
 |-- account_status: string (nullable = true)
 |-- created_ts: timestamp (nullable = true)
 |-- updated_ts: timestamp (nullable = true)
 |-- base_ts: timestamp (nullable = true)



In [11]:
spark.sql("show tables in default").show()

+---------+---------------+-----------+
|namespace|      tableName|isTemporary|
+---------+---------------+-----------+
|  default|account_summary|      false|
|  default|   accounts_all|      false|
|  default|        summary|      false|
+---------+---------------+-----------+



In [13]:
spark.read.table("default.default.accounts_all").orderBy("base_ts").show(1500,truncate=False)

                                                                                

+-------------+----------+------------+---------------+-----------+----------+---------+--------------+-------------------+-------------------+-------------------+
|cons_acct_key|acct_dt   |rpt_as_of_mo|current_balance|current_dpd|payment_am|status_cd|account_status|created_ts         |updated_ts         |base_ts            |
+-------------+----------+------------+---------------+-----------+----------+---------+--------------+-------------------+-------------------+-------------------+
|5            |2015-01-21|2015-01     |1000           |0          |50        |0        |CURRENT       |2015-01-21 21:02:01|2015-01-22 02:02:01|2015-01-21 21:02:01|
|5            |2015-02-23|2015-02     |1001           |0          |50        |0        |CURRENT       |2015-02-23 15:32:38|2015-02-23 16:32:38|2015-02-23 15:32:38|
|5            |2015-03-28|2015-03     |1002           |0          |50        |0        |CURRENT       |2015-03-28 14:45:41|2015-03-30 10:45:41|2015-03-28 14:45:41|
|5            |2

In [8]:
spark.read.table("default.default.accounts_all").count()

1065

In [10]:
spark.sql("DROP TABLE IF EXISTS default.default.accounts_all").show()

++
||
++
++

