# Silver to Gold Data Pipeline
This notebook loads data from the silver layer, applies some transformations, and saves the result to the gold layer.

In [1]:
from datetime import datetime
from typing import List, Optional, Tuple

from pyspark.sql import SparkSession, DataFrame # type: ignore
from pyspark.sql.functions import col, input_file_name, year, month, to_timestamp # type: ignore

# --- Notebook-wide configuration ---
# Catalog that every table name in this notebook is qualified with.
TARGET_CATALOG = "datalake"
# Namespace that the session-setup cell creates if it is missing.
# NOTE(review): this is the silver namespace, while gold_table lives under
# "<catalog>.gold" — confirm which namespace is meant to be created.
TARGET_NAMESPACE = f"{TARGET_CATALOG}.silver"
# Candidate date/timestamp patterns, most specific first.
# NOTE(review): not referenced by any cell visible in this notebook.
DATE_FORMATS = [
    "yyyy-MM-dd'T'HH:mm:ss.SSSZ",
    "yyyy-MM-dd'T'HH:mm:ssZ",
    "yyyy-MM-dd HH:mm:ss.SSS",
    "yyyy-MM-dd HH:mm:ss",
    "MM/dd/yyyy HH:mm:ss",
    "MM/dd/yyyy",
    "yyyy-MM-dd",
    "dd/MM/yyyy",
    "dd-MM-yyyy",
    "MM-dd-yyyy",
]

In [2]:
# Create (or reuse) the Hive-enabled Spark session used by every later cell.
spark = (
    SparkSession.builder
    .appName("Test_read")
    .enableHiveSupport()
    # Use the CORRECTED datetime rebase mode when writing parquet and avro files.
    .config("spark.sql.parquet.datetimeRebaseModeInWrite", "CORRECTED")
    .config("spark.sql.avro.datetimeRebaseModeInWrite", "CORRECTED")
    .getOrCreate()
)

# Make sure the configured namespace exists before any table is touched.
spark.sql(f"CREATE NAMESPACE IF NOT EXISTS {TARGET_NAMESPACE}")

DataFrame[]

In [3]:
# Fully qualified name of the table this notebook reads from and later writes back to.
gold_table = "datalake.gold.merged_financial_data"

In [4]:
# Load the gold table into `df`. Every later cell depends on `df`, so a failed
# read must stop the run rather than be swallowed and surface later as a
# confusing NameError.
try:
    df = spark.table(gold_table)
except Exception as e:
    # Name the table that failed, then re-raise so Jupyter renders the full
    # traceback (no separate traceback import is required).
    print(f"Error reading from gold table {gold_table}: {e}")
    raise
else:
    print(f"Successfully loaded data from {gold_table}")

Successfully loaded data from datalake.gold.merged_financial_data


In [5]:
# Sanity-check what was loaded from the gold table:
# print the schema, then a five-row sample with untruncated column values.
print(f"Reading from {gold_table}:")
df.printSchema()
df.show(n=5, truncate=False)

Reading from datalake.gold.merged_financial_data:
root
 |-- Date: date (nullable = true)
 |-- gold: double (nullable = true)
 |-- oil: double (nullable = true)
 |-- us_dollar: double (nullable = true)
 |-- usd_vnd: double (nullable = true)



                                                                                

+----------+-----+-----+---------+-------+
|Date      |gold |oil  |us_dollar|usd_vnd|
+----------+-----+-----+---------+-------+
|1995-01-02|NULL |NULL |NULL     |11042.0|
|1995-01-03|380.9|17.44|89.21    |11042.0|
|1995-01-04|375.3|17.48|89.35    |11040.0|
|1995-01-05|376.6|17.72|89.04    |11040.0|
|1995-01-06|372.2|17.67|89.71    |11035.0|
+----------+-----+-----+---------+-------+
only showing top 5 rows



In [5]:
# Sort newest-first. `orderBy` only reads the `ascending` keyword; any other
# keyword (such as `descending`) is silently ignored and the frame stays in
# ascending order, so descending order must be requested with ascending=False.
df = df.orderBy("Date", ascending=False)

In [6]:
# Preview the first five rows of the re-ordered frame without truncating values.
df.show(n=5, truncate=False)

[Stage 0:>                                                          (0 + 1) / 1]

#
# A fatal error has been detected by the Java Runtime Environment:
#
#  SIGSEGV (0xb) at pc=0x00007f2b609903d4, pid=1659, tid=1869
#
# JRE version: OpenJDK Runtime Environment (17.0.14+7) (build 17.0.14+7-Debian-1deb11u1)
# Java VM: OpenJDK 64-Bit Server VM (17.0.14+7-Debian-1deb11u1, mixed mode, sharing, tiered, compressed oops, compressed class ptrs, g1 gc, linux-amd64)
# Problematic frame:
# j  org.apache.iceberg.shaded.io.netty.util.internal.InternalThreadLocalMap.slowGet()Lorg/apache/iceberg/shaded/io/netty/util/internal/InternalThreadLocalMap;+0
#
# Core dump will be written. Default location: Core dumps may be processed with "/usr/share/apport/apport -p%p -s%s -c%c -d%d -P%P -u%u -g%g -- %E" (or dumping to /src/notebooks/core.1659)
#
# An error report file with more information is saved as:
# /src/notebooks/hs_err_pid1659.log
#
# If you would like to submit a bug report, please visit:
#   https://bugs.debian.org/openjdk-17
#


ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.p

Py4JError: An error occurred while calling o73.showString

In [6]:
# Persist `df` to the gold Iceberg table, replacing its current contents.
# NOTE(review): `df` was read from this same table earlier in the notebook, so
# this overwrites the source of the read — confirm that is intended.
# NOTE(review): the option keys below look like Iceberg table properties rather
# than per-write options — confirm they take effect when passed via .option().
print(f"Writing to {gold_table}")
try:
    (
        df.write
        .format("iceberg")
        .option("write.format.default", "parquet")
        .option("write.metadata.compression-codec", "gzip")
        .option("write.parquet.row-group-size-bytes", "134217728")  # 128 MiB
        .option("write.distribution-mode", "hash")
        .option("write.object-storage.enabled", "true")
        .mode("overwrite")
        .saveAsTable(gold_table)
    )
except Exception as e:
    # Report the failure, then re-raise so the cell is marked as failed and
    # Jupyter renders the full traceback (no traceback import is required).
    print(f"Error writing to gold table: {e}")
    raise
else:
    print(f"Successfully wrote data to {gold_table}")

Writing to datalake.gold.merged_financial_data
#
# A fatal error has been detected by the Java Runtime Environment:
#
#  SIGSEGV (0xb) at pc=0x00007fa75bfc8a68, pid=5712, tid=5930
#
# JRE version: OpenJDK Runtime Environment (17.0.14+7) (build 17.0.14+7-Debian-1deb11u1)
# Java VM: OpenJDK 64-Bit Server VM (17.0.14+7-Debian-1deb11u1, mixed mode, sharing, tiered, compressed oops, compressed class ptrs, g1 gc, linux-amd64)
# Problematic frame:
# V  [libjvm.so+0xe2ca68]  SymbolTable::do_lookup(char const*, int, unsigned long)+0xd8
#
# Core dump will be written. Default location: Core dumps may be processed with "/usr/share/apport/apport -p%p -s%s -c%c -d%d -P%P -u%u -g%g -- %E" (or dumping to /src/notebooks/core.5712)
#
# An error report file with more information is saved as:
# /src/notebooks/hs_err_pid5712.log
#
# If you would like to submit a bug report, please visit:
#   https://bugs.debian.org/openjdk-17
#


ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving


Error writing to gold table: An error occurred while calling o78.saveAsTable


Traceback (most recent call last):
  File "/tmp/ipykernel_5660/1420764407.py", line 3, in <module>
    df.write \
  File "/opt/spark/python/pyspark/sql/readwriter.py", line 1586, in saveAsTable
    self._jwrite.saveAsTable(name)
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1322, in __call__
    return_value = get_return_value(
  File "/opt/spark/python/pyspark/errors/exceptions/captured.py", line 179, in deco
    return f(*a, **kw)
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/protocol.py", line 334, in get_return_value
    raise Py4JError(
py4j.protocol.Py4JError: An error occurred while calling o78.saveAsTable


In [14]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()