### Generate the latest_dpd_summary table

In [19]:
import datetime
import os
import time

from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql import functions as F
from pyspark.sql.functions import col, lit, date_format, expr, floor, to_date,date_sub,trunc, month
from pyspark.sql.types import IntegerType

# Object-store credentials are read from the environment when provided.
# The fallbacks are the values this notebook used before and are meant for the
# local development stack only; set S3A_ACCESS_KEY / S3A_SECRET_KEY to override
# them without editing the notebook.
s3a_access_key = os.environ.get("S3A_ACCESS_KEY", "admin")
s3a_secret_key = os.environ.get("S3A_SECRET_KEY", "password")

# Build (or reuse) the Spark session used by every cell below.
# getOrCreate() returns an already-running session if there is one; in that
# case Spark warns that only runtime SQL configurations take effect, so the
# jar / S3A settings below only apply when the session is created here.
spark = (
    SparkSession.builder.appName('Ascend')
    # S3A connector jars and endpoint settings.
    .config("spark.jars", "/opt/spark/jars/hadoop-aws-3.3.4.jar,/opt/spark/jars/aws-java-sdk-bundle-1.12.262.jar")
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.access.key", s3a_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", s3a_secret_key)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    # Adaptive query execution and partition pruning.
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.skewJoin.enabled", "true")
    .config("spark.sql.adaptive.localShuffleReader.enabled", "true")
    .config("spark.sql.optimizer.dynamicPartitionPruning.enabled", "true")
    .getOrCreate()
)

# Keep driver logging at WARN and above.
spark.sparkContext.setLogLevel("WARN")

# Load the complete dpd_summary history (a deliberate one-off full scan).
dpd_summary_df = spark.read.table("ascenddb2.dpd_summary")

# Within each account, order rows from the newest ACCT_DT to the oldest.
window_spec = Window.partitionBy("CONS_ACCT_KEY").orderBy(F.desc("ACCT_DT"))

# Number the rows per account and keep only the first (most recent) one.
latest_dpd = (
    dpd_summary_df
    .withColumn("rn", F.row_number().over(window_spec))
    .where(F.col("rn") == 1)
    .select("CONS_ACCT_KEY", "ACCT_DT", "DPD_GRID")
)

# Persist the result as the initial latest_dpd_summary table.
(
    latest_dpd.write
    .format("iceberg")
    .mode("overwrite")
    .saveAsTable("ascenddb2.latest_dpd_summary")
)


25/07/09 17:16:40 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [20]:
%%sql
SELECT *
FROM ascenddb2.latest_dpd_summary

25/07/09 17:16:43 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


CONS_ACCT_KEY,ACCT_DT,DPD_GRID


In [21]:
%%time
# Define month range
start_month = datetime.date(1998, 1, 1)
end_month = datetime.date(1998, 1, 1)  # Adjust end date as needed

month = start_month
result_table = "ascenddb2.dpd_summary"

while month <= end_month:
    next_month = (month.replace(day=28) + datetime.timedelta(days=4)).replace(day=1)
    # print(f"Next Month: {next_month}")
    
    print(f"Processing month: {month} ...")

    # Load current month data
    current_df = spark.sql(f"""
        SELECT CONS_ACCT_KEY, ACCT_DT, DPD
        FROM ascenddb2.dpd_data
        WHERE ACCT_DT >= DATE '{month}' AND ACCT_DT < DATE '{next_month}'
    """)
    
    # Load only latest record per account (tiny table)
    latest_prev = spark.read.table("ascenddb2.latest_dpd_summary").withColumnRenamed("ACCT_DT", "ACCT_DT_prev")
    
    # Join (no longer need to scan historical months)
    merged_df = current_df.join(latest_prev, on="CONS_ACCT_KEY", how="left")
    
    # Continue with DPD_GRID generation as before
    merged_df = merged_df.withColumn(
        "MONTH_DIFF",
        (F.month("ACCT_DT") - F.month("ACCT_DT_prev")) +
        (F.year("ACCT_DT") - F.year("ACCT_DT_prev")) * 12
    ).withColumn(
        "FILLER_ARRAY",
        F.when(F.col("MONTH_DIFF") > 1, F.expr("transform(sequence(1, MONTH_DIFF - 1), x -> '?')"))
        .otherwise(F.array())
    ).withColumn(
        "Merged_DPD_Array",
        F.concat(
            F.array(F.col("DPD")),
            F.col("FILLER_ARRAY"),
            F.when(F.col("DPD_GRID").isNotNull(), F.split(F.col("DPD_GRID"), "~")).otherwise(F.array())
        )
    )
    
    # Pad & trim as usual
    merged_df = merged_df.withColumn(
        "DPD_Array_Trimmed",
        F.when(
            F.size("Merged_DPD_Array") >= 36,
            F.slice("Merged_DPD_Array", 1, 36)
        ).otherwise(
            F.concat(F.col("Merged_DPD_Array"),
                     F.array([F.lit("?") for _ in range(35)])
            )
        )
    ).withColumn(
        "DPD_GRID",
        F.concat_ws("~", "DPD_Array_Trimmed")
    )
    
    # Save historical output (same as before)
    merged_df.select("CONS_ACCT_KEY", "ACCT_DT", "DPD", "DPD_GRID") \
        .write.format("iceberg").mode("append").saveAsTable("ascenddb2.dpd_summary")


    # Load current month processed data
    current_month_df = merged_df.select("CONS_ACCT_KEY", "ACCT_DT", "DPD_GRID")
    
    # Merge: Take latest between old and new per account
    merged_latest = latest_df.union(current_month_df) \
        .withColumn("rn", F.row_number().over(
            Window.partitionBy("CONS_ACCT_KEY").orderBy(F.col("ACCT_DT").desc())
        )) \
        .filter("rn = 1") \
        .drop("rn")
    
    # 4. Save merged latest back
    merged_latest.write.mode("overwrite").saveAsTable("ascenddb2.latest_dpd_summary")
    
    month = next_month

Processing month: 1998-01-01 ...


AnalysisException: [TABLE_OR_VIEW_ALREADY_EXISTS] Cannot create table or view `ascenddb2`.`dpd_summary` because it already exists.
Choose a different name, drop or replace the existing object, or add the IF NOT EXISTS clause to tolerate pre-existing objects.

In [27]:
%%sql
-- Remove the history table; IF EXISTS keeps this cell re-runnable when the table is already gone.
DROP TABLE IF EXISTS ascenddb2.dpd_summary;