In [17]:
from datetime import datetime
from typing import List, Optional, Tuple

from pyspark.sql import SparkSession, DataFrame # type: ignore
from pyspark.sql.functions import col, input_file_name, year, month, to_timestamp # type: ignore

# ---------------------------------------------------------------------------
# Notebook-wide constants
# ---------------------------------------------------------------------------

# Directory holding the raw input files.
BASE_RAW_DATA_DIR = "/src/data/raw/"

# Catalog name and the fully qualified namespace derived from it.
TARGET_CATALOG = "datalake"
TARGET_NAMESPACE = f"{TARGET_CATALOG}.silver"

# Date/timestamp patterns, one per line for easier diffing.
# NOTE(review): not referenced by the cells visible in this notebook -
# presumably consumed by parsing logic elsewhere; confirm before removing.
DATE_FORMATS = [
    "yyyy-MM-dd'T'HH:mm:ss.SSSZ",
    "yyyy-MM-dd'T'HH:mm:ssZ",
    "yyyy-MM-dd HH:mm:ss.SSS",
    "yyyy-MM-dd HH:mm:ss",
    "MM/dd/yyyy HH:mm:ss",
    "MM/dd/yyyy",
    "yyyy-MM-dd",
    "dd/MM/yyyy",
    "dd-MM-yyyy",
    "MM-dd-yyyy",
]

In [18]:
# Create (or reuse) the Spark session for this notebook, with Hive support
# enabled and the Parquet/Avro datetime rebase-on-write modes set to CORRECTED.
spark = (
    SparkSession.builder
    .appName("Test_read")
    .enableHiveSupport()
    .config("spark.sql.parquet.datetimeRebaseModeInWrite", "CORRECTED")
    .config("spark.sql.avro.datetimeRebaseModeInWrite", "CORRECTED")
    .getOrCreate()
)

# Ensure the target namespace exists; safe to re-run thanks to IF NOT EXISTS.
# Kept as the cell's last expression so its result is still displayed.
spark.sql(f"CREATE NAMESPACE IF NOT EXISTS {TARGET_NAMESPACE}")

DataFrame[]

In [20]:
# Load the source table and inspect its schema plus a small sample.
# NOTE(review): the variable name says "bronze ... gold" but the table read is
# `silver.vn_10_year_bond`; the name is kept because later cells reference it.
bronze_table_gold = spark.table("datalake.silver.vn_10_year_bond")

# printSchema() writes the schema tree to stdout itself and returns None, so
# it must not be wrapped in print() - doing so emits a stray "None" line.
bronze_table_gold.printSchema()
bronze_table_gold.show(5)

root
 |-- Date: date (nullable = true)
 |-- Price: float (nullable = true)
 |-- Open: float (nullable = true)
 |-- High: float (nullable = true)
 |-- Low: float (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)

None
+----------+-----+-----+-----+-----+----+-----+
|      Date|Price| Open| High|  Low|year|month|
+----------+-----+-----+-----+-----+----+-----+
|2015-05-30|6.538|6.538|6.538|6.538|2015|    5|
|2015-05-29|6.538|6.538|6.538|6.538|2015|    5|
|2015-05-28|6.538|6.538|6.538|6.538|2015|    5|
|2015-05-27|6.535|6.535|6.535|6.535|2015|    5|
|2015-05-26|6.535|6.535|6.535|6.535|2015|    5|
+----------+-----+-----+-----+-----+----+-----+
only showing top 5 rows



In [4]:
# Project the five columns of interest and normalise their types in a single
# select: Date becomes a `date`, the four price columns become `float`.
# Column order (Date, Price, Open, High, Low) matches the original projection.
# NOTE(review): if the source Date column is already of type `date`, the
# "MM/dd/yyyy" parse is redundant - confirm the source schema.
df_temp = bronze_table_gold.select(
    to_timestamp(col("Date"), "MM/dd/yyyy").cast("date").alias("Date"),
    *[
        col(price_col).cast("float").alias(price_col)
        for price_col in ("Price", "Open", "High", "Low")
    ],
)

# printSchema() prints the tree and returns None; wrapping it in print()
# would add a stray "None" line to the cell output.
df_temp.printSchema()
df_temp.show(5)

root
 |-- Date: date (nullable = true)
 |-- Price: float (nullable = true)
 |-- Open: float (nullable = true)
 |-- High: float (nullable = true)
 |-- Low: float (nullable = true)

None
+----------+-----+-----+-----+-----+
|      Date|Price| Open| High|  Low|
+----------+-----+-----+-----+-----+
|2002-10-31|318.4|317.2|318.7|316.0|
|2002-10-30|316.9|318.2|318.2|316.4|
|2002-10-29|318.1|315.7|318.8|315.7|
|2002-10-28|315.6|313.8|315.9|313.5|
|2002-10-25|313.9|311.8|314.8|311.7|
+----------+-----+-----+-----+-----+
only showing top 5 rows



In [6]:
# Sort rows by Date (ascending is the default) and preview the first five.
df_temp = df_temp.sort("Date")
df_temp.show(n=5)

                                                                                

+----------+-----+-----+-----+-----+
|      Date|Price| Open| High|  Low|
+----------+-----+-----+-----+-----+
|1975-01-03|174.7|173.0|175.5|170.5|
|1975-01-06|174.4|172.0|174.5|167.5|
|1975-01-07|173.4|171.0|174.0|168.5|
|1975-01-08|177.4|176.0|180.0|176.0|
|1975-01-09|178.3|179.0|179.1|177.0|
+----------+-----+-----+-----+-----+
only showing top 5 rows



In [47]:
# Convert Date to a `date` column using the "MM/dd/yyyy" pattern.
# NOTE(review): this repeats the Date conversion already applied when df_temp
# was first built; the schema shown for this cell (string price columns)
# suggests it was run against a different df_temp - confirm it is still needed.
df_temp = df_temp.withColumn("Date", to_timestamp("Date", "MM/dd/yyyy").cast("date"))

# printSchema() prints the tree and returns None; wrapping it in print()
# would add a stray "None" line to the cell output.
df_temp.printSchema()
df_temp.show(5)

root
 |-- Date: date (nullable = true)
 |-- Price: string (nullable = true)
 |-- Open: string (nullable = true)
 |-- High: string (nullable = true)
 |-- Low: string (nullable = true)

None
+----------+-----+-----+-----+-----+
|      Date|Price| Open| High|  Low|
+----------+-----+-----+-----+-----+
|1993-12-30|391.9|393.3|393.9|391.0|
|1993-12-29|391.8|386.6|392.5|384.8|
|1993-12-28|388.5|389.7|390.0|387.9|
|1993-12-27|389.1|388.1|389.4|387.7|
|1993-12-23|387.0|387.0|387.3|385.6|
+----------+-----+-----+-----+-----+
only showing top 5 rows



In [7]:
# Shut down the Spark session created at the top of the notebook; any cell
# that uses `spark` after this point needs the session cell re-run first.
spark.stop()