In [1]:
import os
import sys
from pyspark.sql import SparkSession, DataFrame # type: ignore
from pyspark.sql.functions import col, last, row_number, to_timestamp, regexp_replace, lit # type: ignore
from pyspark.sql.window import Window # type: ignore

In [2]:
# Start (or reuse) the Spark session that every later cell relies on.
spark = (
    SparkSession.builder
    .appName("Testing_load_Silver")
    .getOrCreate()
)

# Make sure the datalake.silver namespace exists; IF NOT EXISTS keeps this
# statement safe to re-run.
spark.sql("CREATE DATABASE IF NOT EXISTS datalake.silver")

DataFrame[]

In [3]:
# Load the bronze-layer table datalake.bronze.gold and also register it as the
# temp view "bronze_table_gold" so it can be queried through Spark SQL.
bronze_table_gold = spark.table("datalake.bronze.gold")
bronze_table_gold.createOrReplaceTempView("bronze_table_gold")

# printSchema() writes the schema to stdout itself and returns None, so it is
# called directly; wrapping it in print() only appended a stray "None" line.
bronze_table_gold.printSchema()
bronze_table_gold.show(5)

root
 |-- Date: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Open: string (nullable = true)
 |-- High: string (nullable = true)
 |-- Low: string (nullable = true)
 |-- Vol: string (nullable = true)
 |-- Change: string (nullable = true)

None


                                                                                

+----------+--------+--------+--------+--------+-------+------+
|      Date|   Price|    Open|    High|     Low|    Vol|Change|
+----------+--------+--------+--------+--------+-------+------+
|11/25/2013|1,241.20|1,241.60|1,254.00|1,225.70|195.44K|-0.23%|
|11/22/2013|1,244.10|1,242.30|1,248.30|1,240.30|135.22K| 0.04%|
|11/21/2013|1,243.60|1,241.80|1,250.00|1,235.80|197.77K|-1.14%|
|11/20/2013|1,258.00|1,274.50|1,275.70|1,240.20|224.19K|-1.22%|
|11/19/2013|1,273.50|1,275.00|1,278.20|1,268.00|105.15K| 0.09%|
+----------+--------+--------+--------+--------+-------+------+
only showing top 5 rows



In [4]:
# Row count of the bronze table before any cleaning or filtering.
bronze_row_count = bronze_table_gold.count()
bronze_row_count

12645

In [5]:
# Typing step: keep only the date and the four price columns of the bronze
# table and convert them from raw strings to DATE / DOUBLE.
PRICE_COLUMNS = ["Price", "Open", "High", "Low"]

# All columns are built in a single select() instead of calling withColumn()
# repeatedly in a loop: every withColumn() call adds one more projection to the
# query plan, which the PySpark documentation advises against.
df_temp = bronze_table_gold.select(
    # Bronze dates are strings such as "11/25/2013".
    to_timestamp(col("Date"), "MM/dd/yyyy").cast("date").alias("Date"),
    # Bronze prices carry "," as a thousands separator (e.g. "1,241.20"); it
    # has to be removed before the string can be cast to double.
    *[
        regexp_replace(col(name), ",", "").cast("double").alias(name)
        for name in PRICE_COLUMNS
    ],
)

df_temp.printSchema()
df_temp.show(5)

root
 |-- Date: date (nullable = true)
 |-- Price: double (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)

+----------+------+------+------+------+
|      Date| Price|  Open|  High|   Low|
+----------+------+------+------+------+
|2013-11-25|1241.2|1241.6|1254.0|1225.7|
|2013-11-22|1244.1|1242.3|1248.3|1240.3|
|2013-11-21|1243.6|1241.8|1250.0|1235.8|
|2013-11-20|1258.0|1274.5|1275.7|1240.2|
|2013-11-19|1273.5|1275.0|1278.2|1268.0|
+----------+------+------+------+------+
only showing top 5 rows



In [6]:
# Earliest date kept in the output, expressed as a DATE literal so that the
# comparison below is date-to-date rather than date-to-string.
start_date = lit('1995-01-02').cast("date")

# Keep only rows dated on or after start_date, then preview the result.
on_or_after_start = col("Date") >= start_date
df_temp = df_temp.where(on_or_after_start)
df_temp.show(5)

+----------+------+------+------+------+
|      Date| Price|  Open|  High|   Low|
+----------+------+------+------+------+
|2013-11-25|1241.2|1241.6|1254.0|1225.7|
|2013-11-22|1244.1|1242.3|1248.3|1240.3|
|2013-11-21|1243.6|1241.8|1250.0|1235.8|
|2013-11-20|1258.0|1274.5|1275.7|1240.2|
|2013-11-19|1273.5|1275.0|1278.2|1268.0|
+----------+------+------+------+------+
only showing top 5 rows



In [7]:
# Row count that remains after the date filter has been applied.
filtered_row_count = df_temp.count()
filtered_row_count

7610

In [8]:
# Persist the cleaned, filtered frame as an Iceberg table, replacing whatever
# the table currently holds.
# NOTE(review): this notebook creates the datalake.silver namespace and its app
# name is "Testing_load_Silver", yet the table is written to datalake.gold --
# confirm that datalake.gold.gold_new is the intended target.
(
    df_temp.write
    .format("iceberg")
    .mode("overwrite")
    .saveAsTable("datalake.gold.gold_new")
)

In [9]:
# Shut down the Spark session now that the table has been written; any cell
# run after this point needs a new session.
spark.stop()