In [1]:
# Preview the first five rows of the summary table.
(
    spark.read
    .table("default.summary")
    .show(5)
)

+-------------+------------+--------------------+--------------------+--------------------+--------------------+---------------+-----------+--------------+-------------------+-------------------+-------------------+
|cons_acct_key|rpt_as_of_mo|         bal_history|         dpd_history|     payment_history|      status_history|current_balance|current_dpd|account_status|         created_ts|         updated_ts|            base_ts|
+-------------+------------+--------------------+--------------------+--------------------+--------------------+---------------+-----------+--------------+-------------------+-------------------+-------------------+
|            5|     2018-12|[1047, 1046, 1045...|[0, 0, 0, 0, 0, 0...|[50, 50, 50, 50, ...|[0, 0, 0, 0, 0, 0...|           1047|          0|       CURRENT|2018-12-27 15:50:30|2018-12-28 17:50:30|2018-12-27 15:50:30|
|            5|     2018-11|[1046, 1045, 1044...|[0, 0, 0, 0, 0, 0...|[50, 50, 50, 50, ...|[0, 0, 0, 0, 0, 0...|           1046|        

In [29]:
# Copy default.summary into the Iceberg table default.summary_testing,
# overwriting whatever the target table currently holds.
(
    spark.read
    .table("default.summary")
    .write
    .format("iceberg")
    .mode("overwrite")
    .saveAsTable("default.summary_testing")
)

                                                                                

In [9]:
%%sql
-- List every table registered in the default namespace.
SHOW TABLES IN default

namespace,tableName,isTemporary
default,summary,False
default,summary_testing,False


In [34]:
%%sql
-- Total number of rows in the summary table.
SELECT count(1)
FROM default.summary;

count(1)
147


In [14]:
%%sql
-- Duplicate check: number the rows inside each (cons_acct_key, rpt_as_of_mo)
-- group by base_ts and return every row beyond the first. An empty result
-- means the key pair is unique in default.summary_testing.
WITH ranked AS (
    SELECT
        cons_acct_key,
        rpt_as_of_mo,
        row_number() OVER (
            PARTITION BY cons_acct_key, rpt_as_of_mo
            ORDER BY base_ts
        ) AS row_num
    FROM default.summary_testing
)
SELECT *
FROM ranked
WHERE row_num > 1

cons_acct_key,rpt_as_of_mo,row_num


In [30]:
%%sql
-- base_ts recorded for each reporting month of account 1, ordered by month.
SELECT
    cons_acct_key,
    base_ts,
    rpt_as_of_mo
FROM default.summary_testing
WHERE cons_acct_key = 1
ORDER BY rpt_as_of_mo

cons_acct_key,base_ts,rpt_as_of_mo
1,2025-05-20 11:01:47,2025-05
1,2025-06-25 11:43:47,2025-06


In [18]:
# Load the accounts CSV (first line is the header, column types inferred)
# and keep only the rows whose base_ts falls before 2025-07-05.
df = (
    spark.read
    .csv("/home/iceberg/data/accounts_all.csv", header=True, inferSchema=True)
    .filter("base_ts < date '2025-07-05'")
)

# Row count of the filtered frame, displayed as the cell result.
df.count()

153

In [19]:
%%sql
-- Make sure the `base` database exists; does nothing when it is already there.
CREATE DATABASE IF NOT EXISTS base;

In [24]:
# Write the rows of default.default.accounts_all with base_ts before
# 2025-07-05 to the Iceberg table base.accounts_all, overwriting its contents.
(
    spark.read
    .table("default.default.accounts_all")
    .filter("base_ts < date '2025-07-05'")
    .write
    .format("iceberg")
    .mode("overwrite")
    .saveAsTable("base.accounts_all")
)

                                                                                

In [25]:
%%sql
-- Number of rows now stored in base.accounts_all.
SELECT count(1)
FROM base.accounts_all

count(1)
153


In [26]:
%%sql
-- Column names and data types of base.accounts_all.
DESCRIBE base.accounts_all

col_name,data_type,comment
cons_acct_key,bigint,
acct_dt,date,
rpt_as_of_mo,string,
current_balance,int,
current_dpd,int,
payment_am,int,
status_cd,string,
account_status,string,
created_ts,timestamp,
updated_ts,timestamp,


In [23]:
%%sql
-- Column names and data types of the source table default.default.accounts_all.
DESCRIBE default.default.accounts_all

col_name,data_type,comment
cons_acct_key,bigint,
acct_dt,date,
rpt_as_of_mo,string,
current_balance,int,
current_dpd,int,
payment_am,int,
status_cd,string,
account_status,string,
created_ts,timestamp,
updated_ts,timestamp,


In [32]:
from pyspark.sql import functions as F

# Display the newest base_ts present in default.summary_testing.
(
    spark.read
    .table("default.summary_testing")
    .agg(F.max("base_ts").alias("max_base_ts"))
    .show()
)

+-------------------+
|        max_base_ts|
+-------------------+
|2025-07-03 14:30:00|
+-------------------+



In [39]:
from pyspark.sql.functions import col, row_number
from pyspark.sql.window import Window

# High-water mark: the newest base_ts already present in the summary table.
# A global aggregate always returns exactly one row, so [0][0] is safe; the
# value itself is None when the table has no rows (or no non-null base_ts).
max_base_ts = spark.table("default.summary_testing").selectExpr("max(base_ts)").collect()[0][0]

# Select the account rows that arrived after the high-water mark.
# Comparing base_ts against NULL would evaluate to NULL for every row and the
# filter would silently drop everything, so with no high-water mark we treat
# all account rows as new instead.
all_accounts_df = spark.table("default.default.accounts_all")
if max_base_ts is None:
    accounts_df = all_accounts_df
else:
    accounts_df = all_accounts_df.filter(col("base_ts") > max_base_ts)

# Keep one row per (cons_acct_key, rpt_as_of_mo): the one with the greatest
# base_ts, with created_ts and then updated_ts (both descending) as tie-breakers.
window_spec = Window.partitionBy("cons_acct_key", "rpt_as_of_mo").orderBy(
    col("base_ts").desc(),
    col("created_ts").desc(),
    col("updated_ts").desc(),
)
latest_accounts_df = (
    accounts_df
    .withColumn("row_num", row_number().over(window_spec))
    .filter(col("row_num") == 1)
    .drop("row_num")
)

# Number of deduplicated rows, displayed as the cell result.
latest_accounts_df.count()


                                                                                

108

In [40]:
# Print the deduplicated account rows (show()'s defaults spelled out:
# up to 20 rows, long cell values truncated).
latest_accounts_df.show(n=20, truncate=True)



+-------------+----------+------------+---------------+-----------+----------+---------+--------------+-------------------+-------------------+-------------------+
|cons_acct_key|   acct_dt|rpt_as_of_mo|current_balance|current_dpd|payment_am|status_cd|account_status|         created_ts|         updated_ts|            base_ts|
+-------------+----------+------------+---------------+-----------+----------+---------+--------------+-------------------+-------------------+-------------------+
|            1|2023-01-22|     2023-01|           4114|          0|        50|        0|       CURRENT|2025-07-05 09:00:00|2025-07-05 09:00:00|2025-07-05 09:00:00|
|            1|2023-02-22|     2023-02|           3002|          0|        50|        0|       CURRENT|2025-07-05 09:00:00|2025-07-05 09:00:00|2025-07-05 09:00:00|
|            1|2023-03-05|     2023-03|           2890|          0|        50|        0|       CURRENT|2025-07-05 09:00:00|2025-07-05 09:00:00|2025-07-05 09:00:00|
|            1|2

                                                                                