In [1]:
from datetime import datetime
from typing import List, Optional, Tuple
from pyspark.sql.window import Window
from pyspark.sql import SparkSession, DataFrame # type: ignore
from pyspark.sql.functions import col, to_timestamp, year, month, max as max_, rank, desc, asc # type: ignore
from functools import reduce
from pyspark.sql.functions import to_date, current_date


# --- Notebook configuration -------------------------------------------------
# Source side: catalog name and the "gold" namespace inside it.
SOURCE_CATALOG = "datalake"
SOURCE_NAMESPACE = SOURCE_CATALOG + ".gold"

# Target side: same catalog/namespace pair, kept separate so it can diverge.
TARGET_CATALOG = "datalake"
TARGET_NAMESPACE = TARGET_CATALOG + ".gold"

# Date literal, stored as an ISO-formatted string.
TARGET_DATE_FILTER = "1995-01-01"

# Column-name settings.
FINANCIAL_COLUMNS = ["price"]
DATE_COLUMN = "Date"

In [11]:
# Configure the session step by step, then create (or reuse) it.
session_builder = SparkSession.builder.appName("Test_read")
session_builder = session_builder.enableHiveSupport()
# Both Parquet and Avro writers use the CORRECTED datetime rebase mode.
session_builder = session_builder.config(
    "spark.sql.parquet.datetimeRebaseModeInWrite", "CORRECTED"
)
session_builder = session_builder.config(
    "spark.sql.avro.datetimeRebaseModeInWrite", "CORRECTED"
)
spark = session_builder.getOrCreate()

# Ensure the target namespace exists; left as the cell's last expression so
# the notebook still echoes the (empty) result frame.
spark.sql(f"CREATE NAMESPACE IF NOT EXISTS {TARGET_NAMESPACE}")

DataFrame[]

In [15]:
# Short names of the bronze tables handled by the rewrite loop; each one is
# expanded to "datalake.bronze.<name>" there.
tables = [
    "dow_jones",
    "gold",
    "inflation",
    "interest",
    "msci_world",
    "nasdaq100",
    "oil",
    "russell2000",
]

In [16]:
# Rewrite every bronze table in place, keeping only rows whose `inserted`
# date is strictly before today.
for table in tables:
    table_name = f"datalake.bronze.{table}"
    print(table_name)
    # Load the bronze table. If loading fails, skip this table: falling
    # through would either raise NameError (first iteration) or overwrite
    # this table with the previous iteration's rows still bound to
    # `merged_data`.
    try:
        merged_data = spark.table(table_name)
    except Exception as e:
        print(f"Error loading {table_name}: {e}")
        continue
    print("Tables loaded successfully")

    # Keep rows inserted before the current date.
    df_filtered = merged_data.filter(to_date("inserted") < current_date())
    # Overwrite the same table with the filtered rows, partitioned by
    # year/month, using the Iceberg format.
    df_filtered.write.mode("overwrite").partitionBy("year", "month").format("iceberg").saveAsTable(table_name)


datalake.bronze.dow_jones
Tables loaded successfully


                                                                                

datalake.bronze.gold
Tables loaded successfully


                                                                                

datalake.bronze.inflation
Tables loaded successfully


                                                                                

datalake.bronze.interest
Tables loaded successfully


                                                                                

datalake.bronze.msci_world
Tables loaded successfully


                                                                                

datalake.bronze.nasdaq100
Tables loaded successfully


                                                                                

datalake.bronze.oil
Tables loaded successfully


                                                                                

datalake.bronze.russell2000
Tables loaded successfully


                                                                                

In [36]:
# Fully qualified name of the table that the following cells load via
# spark.table(table_name).
table_name = "datalake.gold.indices"

In [37]:
# Load the table named by `table_name` into `merged_data`.
try:
    merged_data = spark.table(table_name)
except Exception as e:
    print(f"Error loading {table_name}: {e}")
    # Re-raise: swallowing the failure would leave `merged_data` undefined
    # (or bound to a frame from an earlier cell), and every later cell would
    # silently operate on the wrong data.
    raise
print("Tables loaded successfully")

Tables loaded successfully


In [38]:
# Inspect the loaded frame: column types first, then the first five rows.
print("Merge data schema:")
merged_data.printSchema()
merged_data.show(n=5)

Merge data schema:
root
 |-- date: timestamp (nullable = true)
 |-- gold: float (nullable = true)
 |-- russell2000: float (nullable = true)
 |-- dow_jones: float (nullable = true)
 |-- msci_world: float (nullable = true)
 |-- nasdaq100: float (nullable = true)
 |-- s_p500: float (nullable = true)
 |-- id: long (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)

+-------------------+-----+-----------+---------+----------+---------+------+----+----+-----+
|               date| gold|russell2000|dow_jones|msci_world|nasdaq100|s_p500|  id|year|month|
+-------------------+-----+-----------+---------+----------+---------+------+----+----+-----+
|2002-10-04 00:00:00|323.3|     347.98|   7528.4|     727.0|    815.4| 800.6|2830|2002|   10|
|2002-10-07 00:00:00|323.1|     338.29|  7422.84|     742.2|   804.64| 785.3|2833|2002|   10|
|2002-10-09 00:00:00|320.8|     327.04|  7286.27|     704.7|   807.42| 776.8|2835|2002|   10|
|2002-10-10 00:00:00|317.4|   

In [39]:
# Number of rows in the frame currently bound to merged_data.
merged_data.count()

11101

In [40]:
# Preview the rows with the most recent dates first.
merged_data.sort(col("date").desc()).show()

+-------------------+------+-----------+---------+----------+---------+-------+-----+----+-----+
|               date|  gold|russell2000|dow_jones|msci_world|nasdaq100| s_p500|   id|year|month|
+-------------------+------+-----------+---------+----------+---------+-------+-----+----+-----+
|2025-05-27 00:00:00|3363.6|    2039.85| 41603.07|    3813.9| 18737.21|5802.82|11101|2025|    5|
|2025-05-26 00:00:00|3363.6|    2039.85| 41603.07|    3813.9| 18737.21|5802.82|11100|2025|    5|
|2025-05-25 00:00:00|3363.6|    2039.85| 41603.07|    3802.8| 18737.21|5802.82|11099|2025|    5|
|2025-05-24 00:00:00|3363.6|    2039.85| 41603.07|    3802.8| 18737.21|5802.82|11098|2025|    5|
|2025-05-23 00:00:00|3363.6|    2039.85| 41603.07|    3802.8| 18737.21|5802.82|11097|2025|    5|
|2025-05-22 00:00:00|3292.3|    2045.56| 41859.09|    3818.9| 18925.74|5842.01|11096|2025|    5|
|2025-05-21 00:00:00|3309.3|    2046.56| 41860.44|    3829.8| 18872.64|5844.61|11095|2025|    5|
|2025-05-20 00:00:00|3280.3|  

In [9]:
# Keep only rows whose `inserted` date is strictly before today, then count
# what is left.
# NOTE(review): the schema printed earlier in this notebook for
# datalake.gold.indices lists no `inserted` column, and this cell's execution
# count is lower than the load cell's — confirm which table `merged_data`
# pointed at when this ran.
inserted_before_today = to_date(col("inserted")) < current_date()
df_filtered = merged_data.where(inserted_before_today)
df_filtered.count()

11020

In [10]:
# Overwrite the table named by table_name with the filtered rows, written in
# the Iceberg format.
# NOTE(review): df_filtered is derived from spark.table(table_name), so this
# overwrites the same table it is read from — confirm that is intended.
# NOTE(review): "overwriteSchema" looks like a Delta Lake writer option;
# confirm the Iceberg writer honours it, otherwise it has no effect here.
# NOTE(review): unlike the bronze rewrite loop in this notebook, no
# partitionBy("year", "month") is passed — confirm the partition layout that
# results is the one you want.
# The recorded run of this cell ended with a JVM SIGSEGV (see the output).
df_filtered.write.format("iceberg").mode("overwrite").option("overwriteSchema", "true").saveAsTable(table_name)

#
# A fatal error has been detected by the Java Runtime Environment:
#
#  SIGSEGV (0xb) at pc=0x00007fd8b7d6e044, pid=222779, tid=222995
#
# JRE version: OpenJDK Runtime Environment (11.0.26+4) (build 11.0.26+4-post-Debian-1deb11u1)
# Java VM: OpenJDK 64-Bit Server VM (11.0.26+4-post-Debian-1deb11u1, mixed mode, sharing, tiered, compressed oops, g1 gc, linux-amd64)
# Problematic frame:
# V  [libjvm.so+0xc35044]  ProtectionDomainEntry::object_no_keepalive()+0x4
#
# Core dump will be written. Default location: Core dumps may be processed with "/usr/share/apport/apport -p%p -s%s -c%c -d%d -P%P -u%u -g%g -- %E" (or dumping to /src/notebooks/Bao/core.222779)
#
# An error report file with more information is saved as:
# /src/notebooks/Bao/hs_err_pid222779.log
#
# If you would like to submit a bug report, please visit:
#   https://bugs.debian.org/openjdk-11
#


ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.p

Py4JError: An error occurred while calling o115.saveAsTable

In [185]:
# Rank the rows that share a date by insertion time, newest first, so the
# most recently inserted row for each date gets rnk == 1.
# NOTE(review): rank() assigns the same rank to rows with equal `inserted`
# values, so a date can have several rnk == 1 rows — confirm that is acceptable.
window_spec = Window.partitionBy(DATE_COLUMN).orderBy(desc("inserted"))
insertion_rank = rank().over(window_spec)
df_ranked = merged_data.withColumn("rnk", insertion_rank)


In [186]:
# Preview the ranked frame, including the added `rnk` column.
df_ranked.show()

+----------+-------+-------+-------+-------+------+--------------------+----+-----+--------------------+---+
|      date|  price|   open|   high|    low|volume|         source_file|year|month|            inserted|rnk|
+----------+-------+-------+-------+-------+------+--------------------+----+-----+--------------------+---+
|1995-01-05|11040.0|11040.0|11040.0|11040.0|  NULL|file:///src/data/...|1995|    1|2025-05-25 08:44:...|  1|
|1995-01-06|11035.0|11035.0|11035.0|11035.0|  NULL|file:///src/data/...|1995|    1|2025-05-25 08:44:...|  1|
|1995-01-07|11035.0|11035.0|11035.0|11035.0|  NULL|                NULL|NULL| NULL|2025-05-25 08:44:...|  1|
|1995-01-08|11035.0|11035.0|11035.0|11035.0|  NULL|                NULL|NULL| NULL|2025-05-25 08:44:...|  1|
|1995-01-09|11035.0|11035.0|11035.0|11035.0|  NULL|file:///src/data/...|1995|    1|2025-05-25 08:44:...|  1|
|1995-01-10|11040.0|11040.0|11040.0|11040.0|  NULL|file:///src/data/...|1995|    1|2025-05-25 08:44:...|  1|
|1995-01-11|11045.0

In [188]:
# Keep the top-ranked row(s) per date and discard the helper rank column.
df = df_ranked.where("rnk = 1").drop("rnk")
# Preview the five earliest dates.
df.sort(DATE_COLUMN).limit(5).show()

+----------+-------+-------+-------+-------+------+--------------------+----+-----+--------------------+
|      date|  price|   open|   high|    low|volume|         source_file|year|month|            inserted|
+----------+-------+-------+-------+-------+------+--------------------+----+-----+--------------------+
|1995-01-05|11040.0|11040.0|11040.0|11040.0|  NULL|file:///src/data/...|1995|    1|2025-05-25 08:44:...|
|1995-01-06|11035.0|11035.0|11035.0|11035.0|  NULL|file:///src/data/...|1995|    1|2025-05-25 08:44:...|
|1995-01-07|11035.0|11035.0|11035.0|11035.0|  NULL|                NULL|NULL| NULL|2025-05-25 08:44:...|
|1995-01-08|11035.0|11035.0|11035.0|11035.0|  NULL|                NULL|NULL| NULL|2025-05-25 08:44:...|
|1995-01-09|11035.0|11035.0|11035.0|11035.0|  NULL|file:///src/data/...|1995|    1|2025-05-25 08:44:...|
+----------+-------+-------+-------+-------+------+--------------------+----+-----+--------------------+

