In [26]:
from datetime import datetime
from typing import List, Optional, Tuple

from pyspark.sql.types import StructType, StructField, StringType, IntegerType
import pandas as pd
from pyspark.sql import SparkSession, DataFrame # type: ignore
from pyspark.sql.functions import col, to_timestamp, year, month, col, count, when, lit # type: ignore
from functools import reduce

# Name of the catalog; used only to build SOURCE_NAMESPACE below.
SOURCE_CATALOG = "datalake"
# Fully qualified "<catalog>.gold" namespace that the notebook creates (if missing)
# right after the Spark session is built.
SOURCE_NAMESPACE = f"{SOURCE_CATALOG}.gold"

In [3]:
# Build (or reuse) the Hive-enabled Spark session used by every cell below.
# Both rebase-mode settings are set to "CORRECTED" for Parquet and Avro writes.
spark = (
    SparkSession.builder
    .appName("Test_read")
    .enableHiveSupport()
    .config("spark.sql.parquet.datetimeRebaseModeInWrite", "CORRECTED")
    .config("spark.sql.avro.datetimeRebaseModeInWrite", "CORRECTED")
    .getOrCreate()
)

# Ensure the source namespace exists before any table is read from it.
spark.sql(f"CREATE NAMESPACE IF NOT EXISTS {SOURCE_NAMESPACE}")

DataFrame[]

In [4]:
# Load the three source tables from SOURCE_NAMESPACE ("datalake.gold").
# NOTE(review): the variables are named silver_* although the tables are read
# from the gold namespace -- confirm which layer is intended.
try:
    silver_indices = spark.table(f"{SOURCE_NAMESPACE}.indices")
    silver_macro = spark.table(f"{SOURCE_NAMESPACE}.macro")
    silver_usbonds = spark.table(f"{SOURCE_NAMESPACE}.USbonds")
except Exception as e:
    print(f"Error loading silver tables: {e}")
    # Re-raise so the notebook stops here instead of failing later with a
    # NameError on one of the silver_* variables.
    raise
else:
    print("Tables loaded successfully")

Tables loaded successfully


In [8]:
# For each loaded table, print its heading, schema, row count and a 5-row sample.
for heading, table in (
    ("indices schema:", silver_indices),
    ("macro schema:", silver_macro),
    ("USbonds schema:", silver_usbonds),
):
    print(heading)
    table.printSchema()
    print(table.count())
    table.show(5)

indices schema:
root
 |-- date: date (nullable = true)
 |-- russell2000: float (nullable = true)
 |-- dow_jones: float (nullable = true)
 |-- msci_world: float (nullable = true)
 |-- nasdaq100: float (nullable = true)
 |-- s_p500: float (nullable = true)
 |-- gold: float (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)

11022
+----------+-----------+---------+----------+---------+------+-----+----+-----+
|      date|russell2000|dow_jones|msci_world|nasdaq100|s_p500| gold|year|month|
+----------+-----------+---------+----------+---------+------+-----+----+-----+
|2002-10-01|     368.09|  7938.79|     755.4|   870.63| 994.8|994.4|2002|   10|
|2002-10-02|     360.22|  7755.61|     750.8|   849.55| 994.8|994.4|2002|   10|
|2002-10-03|     356.85|  7717.19|     742.3|   833.21| 994.8|994.4|2002|   10|
|2002-10-04|     347.98|   7528.4|     727.0|    815.4| 994.8|994.4|2002|   10|
|2002-10-05|     347.98|   7528.4|     727.0|    815.4| 994.8|994.4|

In [15]:
# Display the list of column names of the indices table.
silver_indices.columns

['date',
 'russell2000',
 'dow_jones',
 'msci_world',
 'nasdaq100',
 's_p500',
 'gold',
 'year',
 'month']

In [27]:
# Count NULL values per column of `silver_indices`.
#
# All columns are counted in ONE aggregation (count() ignores the NULLs that
# when() yields for non-matching rows) instead of one filter().count() job per
# column. The comprehension variable is deliberately not called `col`, so the
# imported pyspark.sql.functions.col is not shadowed.
null_count_row = silver_indices.select(
    [
        count(when(silver_indices[column_name].isNull(), 1)).alias(column_name)
        for column_name in silver_indices.columns
    ]
).first()
null_totals = null_count_row.asDict() if null_count_row is not None else {}

# Keep only columns that contain at least one NULL, in table column order.
null_counts_data = [
    (name, int(total)) for name, total in null_totals.items() if total > 0
]
list_col_null = [name for name, _ in null_counts_data]

# StructType / StructField / StringType / IntegerType come from the notebook's
# import cell; an explicit schema is needed because the data list may be empty.
schema = StructType([
    StructField("column_name", StringType(), False),
    StructField("null_count", IntegerType(), False)
])

null_df = spark.createDataFrame(null_counts_data, schema)

print("Các cột có giá trị null:", list_col_null)
null_df.show()

Các cột có giá trị null: []


                                                                                

+-----------+----------+
|column_name|null_count|
+-----------+----------+
+-----------+----------+



In [28]:
# Count NULL values per column of `silver_macro`.
#
# All columns are counted in ONE aggregation (count() ignores the NULLs that
# when() yields for non-matching rows) instead of one filter().count() job per
# column. The comprehension variable is deliberately not called `col`, so the
# imported pyspark.sql.functions.col is not shadowed.
null_count_row = silver_macro.select(
    [
        count(when(silver_macro[column_name].isNull(), 1)).alias(column_name)
        for column_name in silver_macro.columns
    ]
).first()
null_totals = null_count_row.asDict() if null_count_row is not None else {}

# Keep only columns that contain at least one NULL, in table column order.
null_counts_data = [
    (name, int(total)) for name, total in null_totals.items() if total > 0
]
list_col_null = [name for name, _ in null_counts_data]

# StructType / StructField / StringType / IntegerType come from the notebook's
# import cell; an explicit schema is needed because the data list may be empty.
schema = StructType([
    StructField("column_name", StringType(), False),
    StructField("null_count", IntegerType(), False)
])

null_df = spark.createDataFrame(null_counts_data, schema)

print("Các cột có giá trị null:", list_col_null)
null_df.show()

Các cột có giá trị null: ['gold', 'oil', 'us_dollar', 'usd_vnd']
+-----------+----------+
|column_name|null_count|
+-----------+----------+
|       gold|         2|
|        oil|         2|
|  us_dollar|         2|
|    usd_vnd|         1|
+-----------+----------+



In [29]:
# Count NULL values per column of `silver_usbonds`.
#
# All columns are counted in ONE aggregation (count() ignores the NULLs that
# when() yields for non-matching rows) instead of one filter().count() job per
# column. The comprehension variable is deliberately not called `col`, so the
# imported pyspark.sql.functions.col is not shadowed.
null_count_row = silver_usbonds.select(
    [
        count(when(silver_usbonds[column_name].isNull(), 1)).alias(column_name)
        for column_name in silver_usbonds.columns
    ]
).first()
null_totals = null_count_row.asDict() if null_count_row is not None else {}

# Keep only columns that contain at least one NULL, in table column order.
null_counts_data = [
    (name, int(total)) for name, total in null_totals.items() if total > 0
]
list_col_null = [name for name, _ in null_counts_data]

# StructType / StructField / StringType / IntegerType come from the notebook's
# import cell; an explicit schema is needed because the data list may be empty.
schema = StructType([
    StructField("column_name", StringType(), False),
    StructField("null_count", IntegerType(), False)
])

null_df = spark.createDataFrame(null_counts_data, schema)

print("Các cột có giá trị null:", list_col_null)
null_df.show()

Các cột có giá trị null: ['gold', 'us_2_year_bond', 'us_5_year_bond']
+--------------+----------+
|   column_name|null_count|
+--------------+----------+
|          gold|         1|
|us_2_year_bond|         1|
|us_5_year_bond|         1|
+--------------+----------+



In [30]:
# For every column of `silver_indices` that contains NULLs, print the rows
# where that column is NULL.
# The loop variable is not named `col`, so the imported
# pyspark.sql.functions.col stays usable after this cell runs.
for column_name in silver_indices.columns:
    null_rows = silver_indices.filter(silver_indices[column_name].isNull())
    # head(1) returns a list with at most one row: a cheap existence check
    # that avoids counting every matching row just to test for emptiness.
    if null_rows.head(1):
        print(f"\nDòng có giá trị null trong cột {column_name}:")
        null_rows.show(truncate=False)

In [31]:
# For every column of `silver_macro` that contains NULLs, print the rows
# where that column is NULL.
# The loop variable is not named `col`, so the imported
# pyspark.sql.functions.col stays usable after this cell runs.
for column_name in silver_macro.columns:
    null_rows = silver_macro.filter(silver_macro[column_name].isNull())
    # head(1) returns a list with at most one row: a cheap existence check
    # that avoids counting every matching row just to test for emptiness.
    if null_rows.head(1):
        print(f"\nDòng có giá trị null trong cột {column_name}:")
        null_rows.show(truncate=False)


Dòng có giá trị null trong cột gold:
+----------+----+----+---------+-------+-----+--------------+-------------+----+-----+
|date      |gold|oil |us_dollar|usd_vnd|cpi  |inflation_rate|interest_rate|year|month|
+----------+----+----+---------+-------+-----+--------------+-------------+----+-----+
|1995-01-01|NULL|NULL|NULL     |NULL   |150.5|0.3           |5.53         |1995|1    |
|1995-01-02|NULL|NULL|NULL     |11042.0|150.5|0.3           |5.53         |1995|1    |
+----------+----+----+---------+-------+-----+--------------+-------------+----+-----+


Dòng có giá trị null trong cột oil:
+----------+----+----+---------+-------+-----+--------------+-------------+----+-----+
|date      |gold|oil |us_dollar|usd_vnd|cpi  |inflation_rate|interest_rate|year|month|
+----------+----+----+---------+-------+-----+--------------+-------------+----+-----+
|1995-01-01|NULL|NULL|NULL     |NULL   |150.5|0.3           |5.53         |1995|1    |
|1995-01-02|NULL|NULL|NULL     |11042.0|150.5|0.3     

In [9]:
# Stop the Spark session created earlier in the notebook.
spark.stop()