# Loading Data from Silver to Gold Layer

This notebook reads data from silver tables (gold, oil, us_dollar, usd_vnd), combines them based on date, and creates a gold table.

In [3]:
from datetime import datetime
from typing import List, Optional, Tuple

from pyspark.sql import SparkSession, DataFrame # type: ignore
from pyspark.sql.functions import col, to_timestamp, year, month # type: ignore
from functools import reduce

SOURCE_CATALOG = "datalake"
SOURCE_CATALOG = f"{SOURCE_CATALOG}.gold"
SOURCE_TABLE = f"{SOURCE_CATALOG}.indices"
SOURCE_TABLE = f"{SOURCE_CATALOG}.indices"
DATE_COLUMN = "date"
PRICE_COLUMN = "price" 

In [5]:
spark = (
        SparkSession.builder.appName("Test_read")
        .enableHiveSupport()
        .config("spark.sql.parquet.datetimeRebaseModeInWrite", "CORRECTED")
        .config("spark.sql.avro.datetimeRebaseModeInWrite", "CORRECTED")
        .getOrCreate()
    )
spark.sql(f"CREATE NAMESPACE IF NOT EXISTS {SOURCE_TABLE}")

DataFrame[]

## Read Silver Tables

Read the required silver tables: gold, oil, us_dollar, usd_vnd

In [6]:
indices_table = spark.table("datalake.gold.indices_predict")
indices_table = indices_table.orderBy("date")

In [7]:
indices_table.show(5)

SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
                                                                                

+-------------------+-----------+---------+----------+---------+------+-----+----+-----+------------------------+----------------------+-----------------------+----------------------+-------------------+-----------------+------------------------+----------------------+-----------------------+----------------------+-------------------+-----------------+-------------------------+-----------------------+------------------------+-----------------------+--------------------+------------------+-------------------------+-----------------------+------------------------+-----------------------+--------------------+------------------+
|               date|russell2000|dow_jones|msci_world|nasdaq100|s_p500| gold|year|month|russell2000_SrVAR_future|dow_jones_SrVAR_future|msci_world_SrVAR_future|nasdaq100_SrVAR_future|s_p500_SrVAR_future|gold_SrVAR_future|russell2000_VARNN_future|dow_jones_VARNN_future|msci_world_VARNN_future|nasdaq100_VARNN_future|s_p500_VARNN_future|gold_VARNN_future|russell2000_S

In [26]:
# Read silver tables
try:
    silver_gold = spark.table("datalake.silver.gold")
    silver_dow_jones = spark.table("datalake.silver.dow_jones")
    silver_msci_world = spark.table("datalake.silver.msci_world")
    silver_nasdaq100 = spark.table("datalake.silver.nasdaq100")
    
    print("Tables loaded successfully")
except Exception as e:
    print(f"Error loading silver tables: {e}")

Tables loaded successfully


In [29]:
# Display table schemas and sample data
print("gold schema:")
silver_gold.printSchema()
silver_gold.show(10)

# print("russell2000 schema:")
# silver_russell2000.printSchema()
# silver_russell2000.show(5)

print("dow_jones schema:")
silver_dow_jones.printSchema()
silver_dow_jones.show(10)

print("msci_world schema:")
silver_msci_world.printSchema()
silver_msci_world.show(10)

print("nasdaq100 schema:")
silver_nasdaq100.printSchema()
silver_nasdaq100.show(10)

gold schema:
root
 |-- date: date (nullable = true)
 |-- price: double (nullable = true)
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- volume: double (nullable = true)
 |-- change: double (nullable = true)
 |-- source_file: string (nullable = true)
 |-- year: double (nullable = true)
 |-- month: double (nullable = true)

+----------+------------------+------------------+------------------+------------------+------+--------------------+--------------------+------+-----+
|      date|             price|              open|              high|               low|volume|              change|         source_file|  year|month|
+----------+------------------+------------------+------------------+------------------+------+--------------------+--------------------+------+-----+
|2000-05-01|274.79998779296875|274.79998779296875| 276.1000061035156| 273.6000061035156|   NaN| 0.03999999910593033|file:///src/data/...|2000.0|  5.0|
|2000-0

## Extract and Prepare Data

Extract Date and Price columns from each table and rename them appropriately

In [5]:
gold_df = silver_gold.select(col("Date"), col("Price").alias("gold"))
oil_df = silver_oil.select(col("Date"), col("Price").alias("oil"))
us_dollar_df = silver_us_dollar.select(col("Date"), col("Price").alias("us_dollar"))
usd_vnd_df = silver_usd_vnd.select(col("Date"), col("Price").alias("usd_vnd"))

print("Prepared Gold dataframe:")
gold_df.show(5)

print("Prepared Oil dataframe:")
oil_df.show(5)

print("Prepared US Dollar dataframe:")
us_dollar_df.show(5)

print("Prepared USD-VND dataframe:")
usd_vnd_df.show(5)

Prepared Gold dataframe:
+----------+-----+
|      Date| gold|
+----------+-----+
|2002-10-01|994.4|
|2002-10-02|994.4|
|2002-10-03|994.4|
|2002-10-04|994.4|
|2002-10-05|994.4|
+----------+-----+
only showing top 5 rows

Prepared Oil dataframe:
+----------+-----+
|      Date|  oil|
+----------+-----+
|2002-10-01|30.83|
|2002-10-02|30.49|
|2002-10-03|29.76|
|2002-10-04|29.62|
|2002-10-05|29.62|
+----------+-----+
only showing top 5 rows

Prepared US Dollar dataframe:
+----------+---------+
|      Date|us_dollar|
+----------+---------+
|2002-10-01|   107.48|
|2002-10-02|   107.54|
|2002-10-03|   107.36|
|2002-10-04|   108.02|
|2002-10-05|   108.02|
+----------+---------+
only showing top 5 rows

Prepared USD-VND dataframe:
+----------+-------+
|      Date|usd_vnd|
+----------+-------+
|2002-10-01|15341.0|
|2002-10-02|15336.0|
|2002-10-03|15343.0|
|2002-10-04|15340.0|
|2002-10-05|15340.0|
+----------+-------+
only showing top 5 rows



## Determine Table With Most Data

Count records in each table to find which one has the most data

In [6]:
# Count records in each table
gold_count = gold_df.count()
oil_count = oil_df.count()
us_dollar_count = us_dollar_df.count()
usd_vnd_count = usd_vnd_df.count()

print(f"Gold records: {gold_count}")
print(f"Oil records: {oil_count}")
print(f"US Dollar records: {us_dollar_count}")
print(f"USD-VND records: {usd_vnd_count}")

# # Determine which table has the most records
counts = {
    "gold": gold_count,
    "oil": oil_count,
    "us_dollar": us_dollar_count,
    "usd_vnd": usd_vnd_count
}

base_table_name = max(counts, key=counts.get)
print(f"Base table with most records: {base_table_name}")

Gold records: 11022
Oil records: 11022
US Dollar records: 11022
USD-VND records: 11023
Base table with most records: usd_vnd


## Join Tables by Date

Join all tables by Date, using left outer joins to maintain all dates from the base table

final_df.show(5)

In [16]:
dfs = [oil_df, us_dollar_df, gold_df, usd_vnd_df]
final_df = reduce(lambda df1, df2: df1.join(df2, on='Date', how='outer'), dfs)
final_df = final_df.orderBy('Date') 
final_df.printSchema()

root
 |-- Date: date (nullable = true)
 |-- oil: float (nullable = true)
 |-- us_dollar: float (nullable = true)
 |-- gold: float (nullable = true)
 |-- usd_vnd: float (nullable = true)



In [17]:
final_df.show(5)

[Stage 29:>                                                         (0 + 1) / 1]

+----------+-----+---------+-----+-------+
|      Date|  oil|us_dollar| gold|usd_vnd|
+----------+-----+---------+-----+-------+
|1995-01-02| NULL|     NULL| NULL|   NULL|
|1995-01-03|17.44|    89.21|380.9|   NULL|
|1995-01-04|17.48|    89.35|375.3|   NULL|
|1995-01-05|17.72|    89.04|376.6|   NULL|
|1995-01-06|17.67|    89.71|372.2|   NULL|
+----------+-----+---------+-----+-------+
only showing top 5 rows



                                                                                

## Write to Gold Layer

Save the combined table to the gold layer as 'gold_oil_usd_vnd'

Write to gold layer using Iceberg format

In [11]:
try:
    final_df.write \
        .format("iceberg") \
        .mode("append") \
        .saveAsTable("datalake.gold.oil_usd")
    
    print("Successfully wrote combined data to datalake.gold.oil_usd")
except Exception as e:
    print(f"Error writing to gold table: {e}")

#
# A fatal error has been detected by the Java Runtime Environment:
#
#  SIGSEGV (0xb) at pc=0x00007f417e5dfa68, pid=5866, tid=5947
#
# JRE version: OpenJDK Runtime Environment (17.0.14+7) (build 17.0.14+7-Debian-1deb11u1)
# Java VM: OpenJDK 64-Bit Server VM (17.0.14+7-Debian-1deb11u1, mixed mode, sharing, tiered, compressed oops, compressed class ptrs, g1 gc, linux-amd64)
# Problematic frame:
# V  [libjvm.so+0xe2ca68]  SymbolTable::do_lookup(char const*, int, unsigned long)+0xd8
#
# Core dump will be written. Default location: Core dumps may be processed with "/usr/share/apport/apport -p%p -s%s -c%c -d%d -P%P -u%u -g%g -- %E" (or dumping to /src/notebooks/core.5866)
#
# An error report file with more information is saved as:
# /src/notebooks/hs_err_pid5866.log
#
# If you would like to submit a bug report, please visit:
#   https://bugs.debian.org/openjdk-17
#


ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving


Error writing to gold table: An error occurred while calling o101.saveAsTable


Verify the gold table was created successfully

In [None]:
try:
    gold_table = spark.table("datalake.gold.gold_oil_usd_vnd")
    print("Gold table schema:")
    gold_table.printSchema()
    
    print("Gold table sample data:")
    gold_table.show(10)
    
    print(f"Total records in gold table: {gold_table.count()}")
except Exception as e:
    print(f"Error reading gold table: {e}")

## Data Analysis

Perform some basic analysis on the combined data

In [None]:
# Check for null values in each column
from pyspark.sql.functions import count, when, col, isnan

null_counts = gold_table.select([count(when(col(c).isNull() | isnan(c), c)).alias(c) for c in gold_table.columns])
print("Null value counts by column:")
null_counts.show()

In [None]:
# Get summary statistics for each price column
print("Summary statistics:")
gold_table.select("gold", "oil", "us_dollar", "usd_vnd").summary().show()

In [None]:
# Add year column for yearly analysis
gold_table_with_year = gold_table.withColumn("Year", year("Date"))

# Calculate yearly averages
yearly_avgs = gold_table_with_year.groupBy("Year").agg(
    avg("gold").alias("gold_avg"),
    avg("oil").alias("oil_avg"),
    avg("us_dollar").alias("us_dollar_avg"),
    avg("usd_vnd").alias("usd_vnd_avg")
)

print("Yearly averages:")
yearly_avgs.orderBy("Year").show()

In [15]:
# Stop Spark session when done
spark.stop()
print("Spark session stopped.")

ConnectionRefusedError: [Errno 111] Connection refused

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
