In [1]:
import os
import re
import logging
from functools import reduce
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import col, regexp_replace, year, avg

In [6]:
def read_from_iceberg(spark: SparkSession, table_name: str) -> DataFrame:
  """Load a table through the session's reader configured for the "iceberg" format.

  Args:
    spark: Session whose ``read`` interface is used to load the table.
    table_name: Table identifier, passed unchanged to the reader.

  Returns:
    The DataFrame the reader produces for ``table_name``.
  """
  iceberg_reader = spark.read.format("iceberg")
  return iceberg_reader.table(table_name)

In [3]:
def process_and_group(df: DataFrame, date_col: str = "date") -> DataFrame:
  """Group rows by calendar year and average every other column.

  Args:
    df: Input DataFrame containing the date column and the columns to average.
    date_col: Name of the column the year is extracted from. Defaults to
      ``"date"``.

  Returns:
    A DataFrame with a ``year`` column plus one ``<column>_avg`` column for
    each input column other than the date column.
  """
  # Match the date column case-insensitively. A plain `!=` is case-sensitive,
  # so a column named "Date" would slip through the filter and be passed to
  # avg(), even though Spark's default (case-insensitive) resolution lets
  # year("date") pick that same column for the grouping key.
  date_key = date_col.lower()
  value_columns = [name for name in df.columns if name.lower() != date_key]

  # Group by year and compute the average of each remaining column.
  yearly_averages = [avg(name).alias(f"{name}_avg") for name in value_columns]
  grouped_df = df.groupBy(year(date_col).alias("year")).agg(*yearly_averages)

  return grouped_df

In [4]:
def write_to_iceberg(df: DataFrame, table_name: str):
  """Save ``df`` as a table using the "iceberg" format and "overwrite" mode.

  Args:
    df: DataFrame to persist.
    table_name: Target table identifier, passed unchanged to ``saveAsTable``.
  """
  overwrite_writer = df.write.format("iceberg").mode("overwrite")
  overwrite_writer.saveAsTable(table_name)

In [10]:
# Obtain (or reuse) the Spark session for this job.
spark = (
    SparkSession.builder
    .appName("Silver_to_Gold")
    .getOrCreate()
)
# Make sure the target namespace exists before any table is written into it.
spark.sql("CREATE NAMESPACE IF NOT EXISTS datalake.gold")

DataFrame[]

In [12]:
# Load through the notebook's shared reader helper rather than repeating the
# reader chain inline.
df_gold = read_from_iceberg(spark, "datalake.silver.gold")
# NOTE(review): the output recorded for this cell shows Date as NULL in every
# displayed row. If that holds for the whole table, the yearly grouping
# downstream collapses into a single NULL-year group; verify the date parsing
# in the job that populates this table.
df_gold.show(10)

+---+----+-----+-----+-----+-----+
| ID|Date|Price| Open| High|  Low|
+---+----+-----+-----+-----+-----+
|  1|NULL|332.5|333.0|333.4|332.3|
|  2|NULL|329.6|329.1|329.7|328.3|
|  3|NULL|332.8|334.1|334.3|332.0|
|  4|NULL|445.3|446.8|449.0|443.6|
|  5|NULL|449.2|448.5|450.5|448.0|
|  6|NULL|413.3|415.0|416.2|412.8|
|  7|NULL|396.3|399.7|400.8|394.6|
|  8|NULL|405.5|401.0|407.3|400.0|
|  9|NULL|336.1|333.5|336.5|333.0|
| 10|NULL|362.0|366.0|367.5|360.5|
+---+----+-----+-----+-----+-----+
only showing top 10 rows



In [None]:
# Load through the notebook's shared reader helper rather than repeating the
# reader chain inline.
# NOTE(review): `df_vnd_use` looks like a typo for `df_vnd_usd`; the name is
# kept here because a later cell refers to it.
df_vnd_use = read_from_iceberg(spark, "datalake.silver.vnd_usd")

In [None]:
# Apply the same yearly aggregation to each loaded DataFrame, gold first.
df_gold_processed, df_vnd_usd_processed = map(
    process_and_group, (df_gold, df_vnd_use)
)

In [None]:
# Persist each aggregated DataFrame to its table in the gold namespace.
for frame, target_table in (
    (df_gold_processed, "datalake.gold.gold"),
    (df_vnd_usd_processed, "datalake.gold.vnd_usd"),
):
  write_to_iceberg(frame, target_table)

In [None]:
# Stop the Spark session now that all writes have been issued.
spark.stop()