In [None]:
import sys

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

sys.path.append("../../..")
from utils.get_date_today_iso_string import get_date_today_iso_string

# Start (or reuse) the Spark session used by every check in this notebook.
spark = SparkSession.builder.appName("ReadParquetFile").getOrCreate()

# The bronze parquet files live in a folder named after the extraction date
# (presumably today's date as an ISO string -- confirm against the helper).
extraction_date = get_date_today_iso_string()

bronze_to_silver_df = spark.read.parquet(f"../../../data/bronze/coin_markets/parquet/{extraction_date}/")

# schema_order.txt holds the expected column order as a comma-separated list.
# Each entry is stripped and blank entries are skipped, so a trailing newline
# or a space after a comma cannot turn into a column name that select() fails
# to resolve.
with open("../../bronze/schema_order.txt") as f:
    bronze_to_silver_df_column_order = [
        column_name.strip()
        for column_name in f.read().split(",")
        if column_name.strip()
    ]

# Reorder the columns so every downstream cell sees the same layout.
bronze_to_silver_df = bronze_to_silver_df.select(*bronze_to_silver_df_column_order)


root
 |-- id: string (nullable = true)
 |-- symbol: string (nullable = true)
 |-- name: string (nullable = true)
 |-- image: string (nullable = true)
 |-- current_price: double (nullable = true)
 |-- market_cap: long (nullable = true)
 |-- market_cap_rank: long (nullable = true)
 |-- fully_diluted_valuation: long (nullable = true)
 |-- total_volume: double (nullable = true)
 |-- high_24h: double (nullable = true)
 |-- low_24h: double (nullable = true)
 |-- price_change_24h: double (nullable = true)
 |-- price_change_percentage_24h: double (nullable = true)
 |-- market_cap_change_24h: double (nullable = true)
 |-- market_cap_change_percentage_24h: double (nullable = true)
 |-- circulating_supply: double (nullable = true)
 |-- total_supply: double (nullable = true)
 |-- max_supply: double (nullable = true)
 |-- ath: double (nullable = true)
 |-- ath_change_percentage: double (nullable = true)
 |-- ath_date: string (nullable = true)
 |-- atl: double (nullable = true)
 |-- atl_change_perce

In [40]:
# CHECK FOR NULL VALUES OF ID COLUMN
# Keeps only the rows whose id is null; an empty table means no id is missing.

null_id_column_check = (
    bronze_to_silver_df
    .where(F.col("id").isNull())
    .select(F.col("id").alias("null_id"))
)

null_id_column_check.show()

+-------+
|null_id|
+-------+
+-------+



In [39]:
# CHECK FOR WHITE SPACES OF ID COLUMN
# Reports ids that start or end with any whitespace character. A regex is used
# instead of comparing the value with F.trim(), because trim() only removes the
# space character and would let ids padded with tabs or line breaks through.
# Null ids are not reported here: rlike() evaluates to null for them, so the
# filter drops those rows (they are covered by the null check).

whitespaces_id_column_check = (
    bronze_to_silver_df
    .filter(F.col("id").rlike(r"^\s|\s$"))
    .select(F.col("id").alias("whitespaced_id"))
)

whitespaces_id_column_check.show()

+--------------+
|whitespaced_id|
+--------------+
+--------------+



In [30]:
# CHECK FOR DUPLICATES OF ID COLUMN
# Ids are compared after lower-casing and trimming, so values that differ only
# by letter case or surrounding spaces count as the same id. Any normalised id
# that occurs on more than one row is listed together with its row count.

duplicate_id_column_check = (
    bronze_to_silver_df
    .groupBy(F.trim(F.lower(F.col("id"))).alias("cleaned_id"))
    .agg(F.count("*").alias("cleaned_id_distinct_count"))
    .where(F.col("cleaned_id_distinct_count") > 1)
)

duplicate_id_column_check.show(truncate=False)

+----------+-------------------------+
|cleaned_id|cleaned_id_distinct_count|
+----------+-------------------------+
+----------+-------------------------+



In [57]:
# CHECK FOR NULL VALUES OF SYMBOL COLUMN
# Keeps only the rows whose symbol is null; an empty table means none is missing.

null_symbol_column_check = (
    bronze_to_silver_df
    .where(F.col("symbol").isNull())
    .select(F.col("symbol").alias("null_symbol"))
)

null_symbol_column_check.show()

+-----------+
|null_symbol|
+-----------+
+-----------+



In [41]:
# CHECK FOR WHITE SPACES OF SYMBOL COLUMN
# Reports symbols that start or end with any whitespace character. A regex is
# used instead of comparing the value with F.trim(), because trim() only removes
# the space character and would let symbols padded with tabs or line breaks
# through. Null symbols are not reported here: rlike() evaluates to null for
# them, so the filter drops those rows (they are covered by the null check).

whitespaces_symbol_column_check = (
    bronze_to_silver_df
    .filter(F.col("symbol").rlike(r"^\s|\s$"))
    .select(F.col("symbol").alias("whitespaced_symbol"))
)

whitespaces_symbol_column_check.show()

+------------------+
|whitespaced_symbol|
+------------------+
+------------------+



In [58]:
# CHECK FOR NULL VALUES OF NAME COLUMN
# Keeps only the rows whose name is null; an empty table means none is missing.

null_name_column_check = (
    bronze_to_silver_df
    .where(F.col("name").isNull())
    .select(F.col("name").alias("null_name"))
)

null_name_column_check.show()

+---------+
|null_name|
+---------+
+---------+



In [42]:
# CHECK FOR WHITE SPACES OF NAME COLUMN
# Reports names that start or end with any whitespace character. A regex is
# used instead of comparing the value with F.trim(), because trim() only removes
# the space character and would let names padded with tabs or line breaks
# through. Whitespace inside a name (e.g. between two words) is not flagged.
# Null names are not reported here: rlike() evaluates to null for them, so the
# filter drops those rows (they are covered by the null check).

whitespaces_name_column_check = (
    bronze_to_silver_df
    .filter(F.col("name").rlike(r"^\s|\s$"))
    .select(F.col("name").alias("whitespaced_name"))
)

whitespaces_name_column_check.show()

+----------------+
|whitespaced_name|
+----------------+
+----------------+



In [56]:
# Print a preview of the reordered bronze rows to eyeball the values before
# writing the remaining column checks.
bronze_to_silver_df.show()

+------------------+----------+------------------+--------------------+-------------+-------------+---------------+-----------------------+----------------+--------+--------+--------------------+---------------------------+---------------------+--------------------------------+--------------------+--------------------+---------------+--------+---------------------+--------------------+----------+---------------------+--------------------+--------------------+--------------------+
|                id|    symbol|              name|               image|current_price|   market_cap|market_cap_rank|fully_diluted_valuation|    total_volume|high_24h| low_24h|    price_change_24h|price_change_percentage_24h|market_cap_change_24h|market_cap_change_percentage_24h|  circulating_supply|        total_supply|     max_supply|     ath|ath_change_percentage|            ath_date|       atl|atl_change_percentage|            atl_date|                 roi|        last_updated|
+------------------+----------

In [55]:
# Print each column's name, data type and nullability so the type-specific
# checks below (URL format, numeric ranges) target the right columns.
bronze_to_silver_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- symbol: string (nullable = true)
 |-- name: string (nullable = true)
 |-- image: string (nullable = true)
 |-- current_price: double (nullable = true)
 |-- market_cap: long (nullable = true)
 |-- market_cap_rank: long (nullable = true)
 |-- fully_diluted_valuation: long (nullable = true)
 |-- total_volume: double (nullable = true)
 |-- high_24h: double (nullable = true)
 |-- low_24h: double (nullable = true)
 |-- price_change_24h: double (nullable = true)
 |-- price_change_percentage_24h: double (nullable = true)
 |-- market_cap_change_24h: double (nullable = true)
 |-- market_cap_change_percentage_24h: double (nullable = true)
 |-- circulating_supply: double (nullable = true)
 |-- total_supply: double (nullable = true)
 |-- max_supply: double (nullable = true)
 |-- ath: double (nullable = true)
 |-- ath_change_percentage: double (nullable = true)
 |-- ath_date: string (nullable = true)
 |-- atl: double (nullable = true)
 |-- atl_change_perce

In [54]:
# CHECK URL FORMAT AND NULL VALUES FOR IMAGE COLUMN
# An image value is accepted only when it is an http:// or https:// URL made of
# a host (letters, digits, dots, hyphens), an optional :port and an optional
# path starting with "/". Rows with a null image or a value that does not match
# the pattern are listed.

url_pattern = r'^(https?://)([A-Za-z0-9.-]+)(:[0-9]+)?(/[A-Za-z0-9._~:/?#\[\]@!$&\'()*+,;=%-]*)?$'

invalid_image_url_column_check = (
    bronze_to_silver_df
    .where(F.col("image").isNull() | ~F.col("image").rlike(url_pattern))
    .select(F.col("image").alias("invalid_image_url"))
)

invalid_image_url_column_check.show(truncate=False)


+-----------------+
|invalid_image_url|
+-----------------+
+-----------------+



In [61]:
# CHECK FOR NEGATIVE OR NULL VALUES OF CURRENT PRICE COLUMN
# A price is treated as invalid when it is missing or below zero; zero itself
# is accepted.

invalid_current_price_column_check = (
    bronze_to_silver_df
    .where(F.col("current_price").isNull() | (F.col("current_price") < 0))
    .select(F.col("current_price").alias("invalid_current_price"))
)

invalid_current_price_column_check.show()


+---------------------+
|invalid_current_price|
+---------------------+
+---------------------+



In [None]:
# CHECK FOR NEGATIVE OR NULL VALUES OF MARKET CAP COLUMN
# A market cap is treated as invalid when it is missing or below zero; zero
# itself is accepted.

invalid_market_cap_column_check = (
    bronze_to_silver_df
    .where(F.col("market_cap").isNull() | (F.col("market_cap") < 0))
    .select(F.col("market_cap").alias("invalid_market_cap_price"))
)

invalid_market_cap_column_check.show()

+------------------------+
|invalid_market_cap_price|
+------------------------+
+------------------------+



In [18]:
# NOTE(review): disabled draft of the silver-layer write. It refers to
# `bronze_df` and to "price" / "timestamp" columns, none of which appear in
# this notebook (the DataFrame here is `bronze_to_silver_df`, and the displayed
# columns include `current_price` and `last_updated`) -- adapt before enabling.
# silver_df = (
#     bronze_df
#     .dropDuplicates()
#     .filter(bronze_df["price"].isNotNull())
#     .withColumnRenamed("timestamp", "event_time")
# )

# silver_df.write.mode("overwrite").parquet("silver/crypto_data.parquet")