In [51]:
import sys

from pyspark.sql.functions import current_timestamp

sys.path.append("../../..")

from readers.bronze.coin_markets.bronze_reading import bronze_to_silver_df

from pipelines.cleaning_functions import (
    datestring_to_timestamp_clean,
    null_value_clean,
    whitespaces_clean,
    whitespaces_and_to_lowercase_clean,
    negative_value_clean,
    duplicates_clean,
    rank_clean_with_flag,
    standardize_roi_struct_clean
)

from utils.get_date_today_iso_string import get_date_today_iso_string

In [52]:
# Column "id": run null_value_clean, whitespaces_clean and duplicates_clean,
# in that order, each step feeding its result to the next.

id_cleaning_functions = [
    null_value_clean,
    whitespaces_clean,
    duplicates_clean,
]

for clean_step in id_cleaning_functions:
    bronze_to_silver_df = clean_step(bronze_to_silver_df, "id")

# Preview the first five rows after the id cleaning steps.
bronze_to_silver_df.show(5)

+--------+------+--------+--------------------+-------------+----------+---------------+-----------------------+------------+--------+--------+--------------------+---------------------------+---------------------+--------------------------------+--------------------+-------------------+----------+------+---------------------+--------------------+--------+---------------------+--------------------+--------------------+--------------------+--------------------+
|      id|symbol|    name|               image|current_price|market_cap|market_cap_rank|fully_diluted_valuation|total_volume|high_24h| low_24h|    price_change_24h|price_change_percentage_24h|market_cap_change_24h|market_cap_change_percentage_24h|  circulating_supply|       total_supply|max_supply|   ath|ath_change_percentage|            ath_date|     atl|atl_change_percentage|            atl_date|                 roi|        last_updated|      ingestion_date|
+--------+------+--------+--------------------+-------------+---------

In [53]:
# Column "symbol": run null_value_clean, then
# whitespaces_and_to_lowercase_clean (whitespace handling plus lowercasing).

symbol_cleaning_functions = [
    null_value_clean,
    whitespaces_and_to_lowercase_clean,
]

for clean_step in symbol_cleaning_functions:
    bronze_to_silver_df = clean_step(bronze_to_silver_df, "symbol")

# Preview the first five rows after the symbol cleaning steps.
bronze_to_silver_df.show(5)

+--------+------+--------+--------------------+-------------+----------+---------------+-----------------------+------------+--------+--------+--------------------+---------------------------+---------------------+--------------------------------+--------------------+-------------------+----------+------+---------------------+--------------------+--------+---------------------+--------------------+--------------------+--------------------+--------------------+
|      id|symbol|    name|               image|current_price|market_cap|market_cap_rank|fully_diluted_valuation|total_volume|high_24h| low_24h|    price_change_24h|price_change_percentage_24h|market_cap_change_24h|market_cap_change_percentage_24h|  circulating_supply|       total_supply|max_supply|   ath|ath_change_percentage|            ath_date|     atl|atl_change_percentage|            atl_date|                 roi|        last_updated|      ingestion_date|
+--------+------+--------+--------------------+-------------+---------

In [54]:
# CLEAN FOR NULL VALUES and UNWANTED WHITESPACES OF NAME COLUMN

name_cleaning_functions = [null_value_clean, whitespaces_clean]

# The target column must be "name": passing "symbol" here would re-clean the
# symbol column and leave the name column untouched.
for function in name_cleaning_functions:
    bronze_to_silver_df = function(bronze_to_silver_df, "name")

bronze_to_silver_df.show(5)

+--------+------+--------+--------------------+-------------+----------+---------------+-----------------------+------------+--------+--------+--------------------+---------------------------+---------------------+--------------------------------+--------------------+-------------------+----------+------+---------------------+--------------------+--------+---------------------+--------------------+--------------------+--------------------+--------------------+
|      id|symbol|    name|               image|current_price|market_cap|market_cap_rank|fully_diluted_valuation|total_volume|high_24h| low_24h|    price_change_24h|price_change_percentage_24h|market_cap_change_24h|market_cap_change_percentage_24h|  circulating_supply|       total_supply|max_supply|   ath|ath_change_percentage|            ath_date|     atl|atl_change_percentage|            atl_date|                 roi|        last_updated|      ingestion_date|
+--------+------+--------+--------------------+-------------+---------

In [55]:
# Column "image": run null_value_clean.
# No URL-format validation is applied to image yet; per the author's note,
# no malformed URLs have been observed in the data so far.

bronze_to_silver_df = null_value_clean(
    bronze_to_silver_df,
    "image",
)

# Preview the first five rows after the image cleaning step.
bronze_to_silver_df.show(5)

+--------+------+--------+--------------------+-------------+----------+---------------+-----------------------+------------+--------+--------+--------------------+---------------------------+---------------------+--------------------------------+--------------------+-------------------+----------+------+---------------------+--------------------+--------+---------------------+--------------------+--------------------+--------------------+--------------------+
|      id|symbol|    name|               image|current_price|market_cap|market_cap_rank|fully_diluted_valuation|total_volume|high_24h| low_24h|    price_change_24h|price_change_percentage_24h|market_cap_change_24h|market_cap_change_percentage_24h|  circulating_supply|       total_supply|max_supply|   ath|ath_change_percentage|            ath_date|     atl|atl_change_percentage|            atl_date|                 roi|        last_updated|      ingestion_date|
+--------+------+--------+--------------------+-------------+---------

In [56]:
# Column "current_price": run negative_value_clean
# (how negatives are handled is defined in pipelines.cleaning_functions).

bronze_to_silver_df = negative_value_clean(
    bronze_to_silver_df,
    "current_price",
)

# Preview the first five rows after this step.
bronze_to_silver_df.show(5)


+--------+------+--------+--------------------+-------------+----------+---------------+-----------------------+------------+--------+--------+--------------------+---------------------------+---------------------+--------------------------------+--------------------+-------------------+----------+------+---------------------+--------------------+--------+---------------------+--------------------+--------------------+--------------------+--------------------+
|      id|symbol|    name|               image|current_price|market_cap|market_cap_rank|fully_diluted_valuation|total_volume|high_24h| low_24h|    price_change_24h|price_change_percentage_24h|market_cap_change_24h|market_cap_change_percentage_24h|  circulating_supply|       total_supply|max_supply|   ath|ath_change_percentage|            ath_date|     atl|atl_change_percentage|            atl_date|                 roi|        last_updated|      ingestion_date|
+--------+------+--------+--------------------+-------------+---------

In [57]:
# Column "market_cap": run negative_value_clean
# (how negatives are handled is defined in pipelines.cleaning_functions).

bronze_to_silver_df = negative_value_clean(
    bronze_to_silver_df,
    "market_cap",
)

# Preview the first five rows after this step.
bronze_to_silver_df.show(5)


+--------+------+--------+--------------------+-------------+----------+---------------+-----------------------+------------+--------+--------+--------------------+---------------------------+---------------------+--------------------------------+--------------------+-------------------+----------+------+---------------------+--------------------+--------+---------------------+--------------------+--------------------+--------------------+--------------------+
|      id|symbol|    name|               image|current_price|market_cap|market_cap_rank|fully_diluted_valuation|total_volume|high_24h| low_24h|    price_change_24h|price_change_percentage_24h|market_cap_change_24h|market_cap_change_percentage_24h|  circulating_supply|       total_supply|max_supply|   ath|ath_change_percentage|            ath_date|     atl|atl_change_percentage|            atl_date|                 roi|        last_updated|      ingestion_date|
+--------+------+--------+--------------------+-------------+---------

In [58]:
# Column "market_cap_rank": run negative_value_clean
# (how negatives are handled is defined in pipelines.cleaning_functions).

bronze_to_silver_df = negative_value_clean(
    bronze_to_silver_df,
    "market_cap_rank",
)

# Preview the first five rows after this step.
bronze_to_silver_df.show(5)

+--------+------+--------+--------------------+-------------+----------+---------------+-----------------------+------------+--------+--------+--------------------+---------------------------+---------------------+--------------------------------+--------------------+-------------------+----------+------+---------------------+--------------------+--------+---------------------+--------------------+--------------------+--------------------+--------------------+
|      id|symbol|    name|               image|current_price|market_cap|market_cap_rank|fully_diluted_valuation|total_volume|high_24h| low_24h|    price_change_24h|price_change_percentage_24h|market_cap_change_24h|market_cap_change_percentage_24h|  circulating_supply|       total_supply|max_supply|   ath|ath_change_percentage|            ath_date|     atl|atl_change_percentage|            atl_date|                 roi|        last_updated|      ingestion_date|
+--------+------+--------+--------------------+-------------+---------

In [59]:
# Flag rows whose market_cap_rank does not match the ordering of market_cap,
# using rank_clean_with_flag(df, value_column, rank_column).
# NOTE(review): the header printed by show() lists no additional flag column;
# confirm how rank_clean_with_flag surfaces the mismatch flag.

bronze_to_silver_df = rank_clean_with_flag(
    bronze_to_silver_df,
    "market_cap",
    "market_cap_rank",
)

# Preview the first five rows after this step.
bronze_to_silver_df.show(5)

+--------+------+--------+--------------------+-------------+----------+---------------+-----------------------+------------+--------+--------+--------------------+---------------------------+---------------------+--------------------------------+--------------------+-------------------+----------+------+---------------------+--------------------+--------+---------------------+--------------------+--------------------+--------------------+--------------------+
|      id|symbol|    name|               image|current_price|market_cap|market_cap_rank|fully_diluted_valuation|total_volume|high_24h| low_24h|    price_change_24h|price_change_percentage_24h|market_cap_change_24h|market_cap_change_percentage_24h|  circulating_supply|       total_supply|max_supply|   ath|ath_change_percentage|            ath_date|     atl|atl_change_percentage|            atl_date|                 roi|        last_updated|      ingestion_date|
+--------+------+--------+--------------------+-------------+---------

In [60]:
# Column "fully_diluted_valuation": run negative_value_clean
# (how negatives are handled is defined in pipelines.cleaning_functions).

bronze_to_silver_df = negative_value_clean(
    bronze_to_silver_df,
    "fully_diluted_valuation",
)

# Preview the first five rows after this step.
bronze_to_silver_df.show(5)

+--------+------+--------+--------------------+-------------+----------+---------------+-----------------------+------------+--------+--------+--------------------+---------------------------+---------------------+--------------------------------+--------------------+-------------------+----------+------+---------------------+--------------------+--------+---------------------+--------------------+--------------------+--------------------+--------------------+
|      id|symbol|    name|               image|current_price|market_cap|market_cap_rank|fully_diluted_valuation|total_volume|high_24h| low_24h|    price_change_24h|price_change_percentage_24h|market_cap_change_24h|market_cap_change_percentage_24h|  circulating_supply|       total_supply|max_supply|   ath|ath_change_percentage|            ath_date|     atl|atl_change_percentage|            atl_date|                 roi|        last_updated|      ingestion_date|
+--------+------+--------+--------------------+-------------+---------

In [61]:
# Column "total_volume": run negative_value_clean
# (how negatives are handled is defined in pipelines.cleaning_functions).

bronze_to_silver_df = negative_value_clean(
    bronze_to_silver_df,
    "total_volume",
)

# Preview the first five rows after this step.
bronze_to_silver_df.show(5)

+--------+------+--------+--------------------+-------------+----------+---------------+-----------------------+------------+--------+--------+--------------------+---------------------------+---------------------+--------------------------------+--------------------+-------------------+----------+------+---------------------+--------------------+--------+---------------------+--------------------+--------------------+--------------------+--------------------+
|      id|symbol|    name|               image|current_price|market_cap|market_cap_rank|fully_diluted_valuation|total_volume|high_24h| low_24h|    price_change_24h|price_change_percentage_24h|market_cap_change_24h|market_cap_change_percentage_24h|  circulating_supply|       total_supply|max_supply|   ath|ath_change_percentage|            ath_date|     atl|atl_change_percentage|            atl_date|                 roi|        last_updated|      ingestion_date|
+--------+------+--------+--------------------+-------------+---------

In [62]:
# Column "high_24h": run negative_value_clean
# (how negatives are handled is defined in pipelines.cleaning_functions).

bronze_to_silver_df = negative_value_clean(
    bronze_to_silver_df,
    "high_24h",
)

# Preview the first five rows after this step.
bronze_to_silver_df.show(5)

+--------+------+--------+--------------------+-------------+----------+---------------+-----------------------+------------+--------+--------+--------------------+---------------------------+---------------------+--------------------------------+--------------------+-------------------+----------+------+---------------------+--------------------+--------+---------------------+--------------------+--------------------+--------------------+--------------------+
|      id|symbol|    name|               image|current_price|market_cap|market_cap_rank|fully_diluted_valuation|total_volume|high_24h| low_24h|    price_change_24h|price_change_percentage_24h|market_cap_change_24h|market_cap_change_percentage_24h|  circulating_supply|       total_supply|max_supply|   ath|ath_change_percentage|            ath_date|     atl|atl_change_percentage|            atl_date|                 roi|        last_updated|      ingestion_date|
+--------+------+--------+--------------------+-------------+---------

In [63]:
# Column "low_24h": run negative_value_clean
# (how negatives are handled is defined in pipelines.cleaning_functions).

bronze_to_silver_df = negative_value_clean(
    bronze_to_silver_df,
    "low_24h",
)

# Preview the first five rows after this step.
bronze_to_silver_df.show(5)

+--------+------+--------+--------------------+-------------+----------+---------------+-----------------------+------------+--------+--------+--------------------+---------------------------+---------------------+--------------------------------+--------------------+-------------------+----------+------+---------------------+--------------------+--------+---------------------+--------------------+--------------------+--------------------+--------------------+
|      id|symbol|    name|               image|current_price|market_cap|market_cap_rank|fully_diluted_valuation|total_volume|high_24h| low_24h|    price_change_24h|price_change_percentage_24h|market_cap_change_24h|market_cap_change_percentage_24h|  circulating_supply|       total_supply|max_supply|   ath|ath_change_percentage|            ath_date|     atl|atl_change_percentage|            atl_date|                 roi|        last_updated|      ingestion_date|
+--------+------+--------+--------------------+-------------+---------

In [64]:
# Column "circulating_supply": run negative_value_clean
# (how negatives are handled is defined in pipelines.cleaning_functions).

bronze_to_silver_df = negative_value_clean(
    bronze_to_silver_df,
    "circulating_supply",
)

# Preview the first five rows after this step.
bronze_to_silver_df.show(5)

+--------+------+--------+--------------------+-------------+----------+---------------+-----------------------+------------+--------+--------+--------------------+---------------------------+---------------------+--------------------------------+--------------------+-------------------+----------+------+---------------------+--------------------+--------+---------------------+--------------------+--------------------+--------------------+--------------------+
|      id|symbol|    name|               image|current_price|market_cap|market_cap_rank|fully_diluted_valuation|total_volume|high_24h| low_24h|    price_change_24h|price_change_percentage_24h|market_cap_change_24h|market_cap_change_percentage_24h|  circulating_supply|       total_supply|max_supply|   ath|ath_change_percentage|            ath_date|     atl|atl_change_percentage|            atl_date|                 roi|        last_updated|      ingestion_date|
+--------+------+--------+--------------------+-------------+---------

In [65]:
# Column "total_supply": run negative_value_clean
# (how negatives are handled is defined in pipelines.cleaning_functions).

bronze_to_silver_df = negative_value_clean(
    bronze_to_silver_df,
    "total_supply",
)

# Preview the first five rows after this step.
bronze_to_silver_df.show(5)

+--------+------+--------+--------------------+-------------+----------+---------------+-----------------------+------------+--------+--------+--------------------+---------------------------+---------------------+--------------------------------+--------------------+-------------------+----------+------+---------------------+--------------------+--------+---------------------+--------------------+--------------------+--------------------+--------------------+
|      id|symbol|    name|               image|current_price|market_cap|market_cap_rank|fully_diluted_valuation|total_volume|high_24h| low_24h|    price_change_24h|price_change_percentage_24h|market_cap_change_24h|market_cap_change_percentage_24h|  circulating_supply|       total_supply|max_supply|   ath|ath_change_percentage|            ath_date|     atl|atl_change_percentage|            atl_date|                 roi|        last_updated|      ingestion_date|
+--------+------+--------+--------------------+-------------+---------

In [66]:
# Column "max_supply": run negative_value_clean
# (how negatives are handled is defined in pipelines.cleaning_functions).

bronze_to_silver_df = negative_value_clean(
    bronze_to_silver_df,
    "max_supply",
)

# Preview the first five rows after this step.
bronze_to_silver_df.show(5)

+--------+------+--------+--------------------+-------------+----------+---------------+-----------------------+------------+--------+--------+--------------------+---------------------------+---------------------+--------------------------------+--------------------+-------------------+----------+------+---------------------+--------------------+--------+---------------------+--------------------+--------------------+--------------------+--------------------+
|      id|symbol|    name|               image|current_price|market_cap|market_cap_rank|fully_diluted_valuation|total_volume|high_24h| low_24h|    price_change_24h|price_change_percentage_24h|market_cap_change_24h|market_cap_change_percentage_24h|  circulating_supply|       total_supply|max_supply|   ath|ath_change_percentage|            ath_date|     atl|atl_change_percentage|            atl_date|                 roi|        last_updated|      ingestion_date|
+--------+------+--------+--------------------+-------------+---------

In [67]:
# Column "ath": run negative_value_clean
# (how negatives are handled is defined in pipelines.cleaning_functions).

bronze_to_silver_df = negative_value_clean(
    bronze_to_silver_df,
    "ath",
)

# Preview the first five rows after this step.
bronze_to_silver_df.show(5)

+--------+------+--------+--------------------+-------------+----------+---------------+-----------------------+------------+--------+--------+--------------------+---------------------------+---------------------+--------------------------------+--------------------+-------------------+----------+------+---------------------+--------------------+--------+---------------------+--------------------+--------------------+--------------------+--------------------+
|      id|symbol|    name|               image|current_price|market_cap|market_cap_rank|fully_diluted_valuation|total_volume|high_24h| low_24h|    price_change_24h|price_change_percentage_24h|market_cap_change_24h|market_cap_change_percentage_24h|  circulating_supply|       total_supply|max_supply|   ath|ath_change_percentage|            ath_date|     atl|atl_change_percentage|            atl_date|                 roi|        last_updated|      ingestion_date|
+--------+------+--------+--------------------+-------------+---------

In [68]:
# Column "ath_date": convert the date strings to timestamps with
# datestring_to_timestamp_clean.

bronze_to_silver_df = datestring_to_timestamp_clean(
    bronze_to_silver_df,
    "ath_date",
)

# Preview the first five rows after this step.
bronze_to_silver_df.show(5)

+--------+------+--------+--------------------+-------------+----------+---------------+-----------------------+------------+--------+--------+--------------------+---------------------------+---------------------+--------------------------------+--------------------+-------------------+----------+------+---------------------+--------------------+--------+---------------------+--------------------+--------------------+--------------------+--------------------+
|      id|symbol|    name|               image|current_price|market_cap|market_cap_rank|fully_diluted_valuation|total_volume|high_24h| low_24h|    price_change_24h|price_change_percentage_24h|market_cap_change_24h|market_cap_change_percentage_24h|  circulating_supply|       total_supply|max_supply|   ath|ath_change_percentage|            ath_date|     atl|atl_change_percentage|            atl_date|                 roi|        last_updated|      ingestion_date|
+--------+------+--------+--------------------+-------------+---------

In [69]:
# Column "atl": run negative_value_clean
# (how negatives are handled is defined in pipelines.cleaning_functions).

bronze_to_silver_df = negative_value_clean(
    bronze_to_silver_df,
    "atl",
)

# Preview the first five rows after this step.
bronze_to_silver_df.show(5)

+--------+------+--------+--------------------+-------------+----------+---------------+-----------------------+------------+--------+--------+--------------------+---------------------------+---------------------+--------------------------------+--------------------+-------------------+----------+------+---------------------+--------------------+--------+---------------------+--------------------+--------------------+--------------------+--------------------+
|      id|symbol|    name|               image|current_price|market_cap|market_cap_rank|fully_diluted_valuation|total_volume|high_24h| low_24h|    price_change_24h|price_change_percentage_24h|market_cap_change_24h|market_cap_change_percentage_24h|  circulating_supply|       total_supply|max_supply|   ath|ath_change_percentage|            ath_date|     atl|atl_change_percentage|            atl_date|                 roi|        last_updated|      ingestion_date|
+--------+------+--------+--------------------+-------------+---------

In [70]:
# Column "atl_date": convert the date strings to timestamps with
# datestring_to_timestamp_clean.

bronze_to_silver_df = datestring_to_timestamp_clean(
    bronze_to_silver_df,
    "atl_date",
)

# Preview the first five rows after this step.
bronze_to_silver_df.show(5)

+--------+------+--------+--------------------+-------------+----------+---------------+-----------------------+------------+--------+--------+--------------------+---------------------------+---------------------+--------------------------------+--------------------+-------------------+----------+------+---------------------+--------------------+--------+---------------------+--------------------+--------------------+--------------------+--------------------+
|      id|symbol|    name|               image|current_price|market_cap|market_cap_rank|fully_diluted_valuation|total_volume|high_24h| low_24h|    price_change_24h|price_change_percentage_24h|market_cap_change_24h|market_cap_change_percentage_24h|  circulating_supply|       total_supply|max_supply|   ath|ath_change_percentage|            ath_date|     atl|atl_change_percentage|            atl_date|                 roi|        last_updated|      ingestion_date|
+--------+------+--------+--------------------+-------------+---------

In [71]:
# Column "roi": standardize the currency value inside the roi struct with
# standardize_roi_struct_clean (the helper takes only the DataFrame).

bronze_to_silver_df = standardize_roi_struct_clean(
    bronze_to_silver_df,
)

# Preview the first five rows after this step.
bronze_to_silver_df.show(5)

+--------+------+--------+--------------------+-------------+----------+---------------+-----------------------+------------+--------+--------+--------------------+---------------------------+---------------------+--------------------------------+--------------------+-------------------+----------+------+---------------------+--------------------+--------+---------------------+--------------------+--------------------+--------------------+--------------------+
|      id|symbol|    name|               image|current_price|market_cap|market_cap_rank|fully_diluted_valuation|total_volume|high_24h| low_24h|    price_change_24h|price_change_percentage_24h|market_cap_change_24h|market_cap_change_percentage_24h|  circulating_supply|       total_supply|max_supply|   ath|ath_change_percentage|            ath_date|     atl|atl_change_percentage|            atl_date|                 roi|        last_updated|      ingestion_date|
+--------+------+--------+--------------------+-------------+---------

In [72]:
# Column "last_updated": run datestring_to_timestamp_clean.
# NOTE(review): the original note says this step checks for invalid date
# formats, future dates and nulls; only the timestamp conversion helper is
# called here, so confirm that the helper covers all three cases.

bronze_to_silver_df = datestring_to_timestamp_clean(
    bronze_to_silver_df,
    "last_updated",
)

# Preview the first five rows after this step.
bronze_to_silver_df.show(5)

+--------+------+--------+--------------------+-------------+----------+---------------+-----------------------+------------+--------+--------+--------------------+---------------------------+---------------------+--------------------------------+--------------------+-------------------+----------+------+---------------------+--------------------+--------+---------------------+--------------------+--------------------+--------------------+--------------------+
|      id|symbol|    name|               image|current_price|market_cap|market_cap_rank|fully_diluted_valuation|total_volume|high_24h| low_24h|    price_change_24h|price_change_percentage_24h|market_cap_change_24h|market_cap_change_percentage_24h|  circulating_supply|       total_supply|max_supply|   ath|ath_change_percentage|            ath_date|     atl|atl_change_percentage|            atl_date|                 roi|        last_updated|      ingestion_date|
+--------+------+--------+--------------------+-------------+---------

In [73]:
# Build the silver-layer DataFrame: the cleaned bronze data plus a
# "cleaning_date" column holding the current timestamp.

silver_df = bronze_to_silver_df.withColumn(
    "cleaning_date",
    current_timestamp(),
)

# Preview the first five rows, including the new cleaning_date column.
silver_df.show(5)

+--------+------+--------+--------------------+-------------+----------+---------------+-----------------------+------------+--------+--------+--------------------+---------------------------+---------------------+--------------------------------+--------------------+-------------------+----------+------+---------------------+--------------------+--------+---------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|      id|symbol|    name|               image|current_price|market_cap|market_cap_rank|fully_diluted_valuation|total_volume|high_24h| low_24h|    price_change_24h|price_change_percentage_24h|market_cap_change_24h|market_cap_change_percentage_24h|  circulating_supply|       total_supply|max_supply|   ath|ath_change_percentage|            ath_date|     atl|atl_change_percentage|            atl_date|                 roi|        last_updated|      ingestion_date|       cleaning_date|
+--------+------+--------+--

In [74]:
# Print the silver DataFrame's schema (column names, types, nullability) as a final check before writing.
silver_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- symbol: string (nullable = true)
 |-- name: string (nullable = true)
 |-- image: string (nullable = true)
 |-- current_price: double (nullable = true)
 |-- market_cap: long (nullable = true)
 |-- market_cap_rank: long (nullable = true)
 |-- fully_diluted_valuation: long (nullable = true)
 |-- total_volume: double (nullable = true)
 |-- high_24h: double (nullable = true)
 |-- low_24h: double (nullable = true)
 |-- price_change_24h: double (nullable = true)
 |-- price_change_percentage_24h: double (nullable = true)
 |-- market_cap_change_24h: double (nullable = true)
 |-- market_cap_change_percentage_24h: double (nullable = true)
 |-- circulating_supply: double (nullable = true)
 |-- total_supply: double (nullable = true)
 |-- max_supply: double (nullable = true)
 |-- ath: double (nullable = true)
 |-- ath_change_percentage: double (nullable = true)
 |-- ath_date: timestamp (nullable = true)
 |-- atl: double (nullable = true)
 |-- atl_change_pe

In [75]:
# Persist the silver DataFrame as Parquet in a folder named after the value
# returned by get_date_today_iso_string(); "overwrite" mode replaces any
# existing output at that path.
silver_output_path = f"../../../data/silver/coin_markets/parquet/{get_date_today_iso_string()}/"
silver_df.write.mode("overwrite").parquet(silver_output_path)