In [1]:
import os
import sys
import json
import requests

from dotenv import load_dotenv
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_timestamp, lit
from pyspark.sql.types import *


sys.path.append("../..")
from utils.get_date_today_iso_string import get_date_today_iso_string


In [2]:
# Load settings from a local .env file (python-dotenv) so that the
# os.getenv lookups for API_KEY and API_URL in the next cell can resolve.
load_dotenv()

True

In [3]:
# API credentials and base URL come from the environment, never from the notebook.
# Either value is None when the corresponding variable is not set.
API_KEY = os.environ.get("API_KEY")
API_URL = os.environ.get("API_URL")

#### CALL API ####

In [4]:
# Coin markets endpoint, with prices quoted against USD (vs_currency=usd).
coin_markets_url = f"{API_URL}/coins/markets?vs_currency=usd"

# The API key travels in a request header rather than in the URL.
headers = {"x-cg-demo-api-key": API_KEY}

In [5]:
# Fetch the coin markets payload.
# - timeout: requests waits forever by default, which would hang the notebook
#   on a stalled connection.
# - raise_for_status(): stop on 4xx/5xx responses instead of letting an error
#   payload be parsed and persisted by the following cells.
REQUEST_TIMEOUT_SECONDS = 30

coin_markets_response = requests.get(
    coin_markets_url,
    headers=headers,
    timeout=REQUEST_TIMEOUT_SECONDS,
)
coin_markets_response.raise_for_status()

# Decoded JSON body; this is the object written to the raw bronze file later on.
coin_markets_list = coin_markets_response.json()

# Pretty-printed copy of the payload, kept for visual inspection.
formatted_coin_markets_response = json.dumps(coin_markets_list, indent=4)


In [6]:
# Show only the beginning of the pretty-printed payload: printing all of it
# floods the notebook output and bloats the saved .ipynb file.
PREVIEW_LINE_COUNT = 60
print("\n".join(formatted_coin_markets_response.splitlines()[:PREVIEW_LINE_COUNT]))

[
    {
        "id": "bitcoin",
        "symbol": "btc",
        "name": "Bitcoin",
        "image": "https://coin-images.coingecko.com/coins/images/1/large/bitcoin.png?1696501400",
        "current_price": 102137,
        "market_cap": 2037445236754,
        "market_cap_rank": 1,
        "fully_diluted_valuation": 2037445236754,
        "total_volume": 88079198983,
        "high_24h": 103956,
        "low_24h": 99377,
        "price_change_24h": 231.81,
        "price_change_percentage_24h": 0.22748,
        "market_cap_change_24h": 4915656387,
        "market_cap_change_percentage_24h": 0.24185,
        "circulating_supply": 19945909.0,
        "total_supply": 19945909.0,
        "max_supply": 21000000.0,
        "ath": 126080,
        "ath_change_percentage": -18.96531,
        "ath_date": "2025-10-06T18:57:42.558Z",
        "atl": 67.81,
        "atl_change_percentage": 150570.9204,
        "atl_date": "2013-07-06T00:00:00.000Z",
        "roi": null,
        "last_updated": "2025-

In [7]:
# Persist the untouched API payload in the bronze layer, in a folder named
# after the value returned by get_date_today_iso_string().
extraction_date = get_date_today_iso_string()
raw_json_dir = f"../../data/bronze/coin_markets/raw_json/{extraction_date}"
raw_json_path = f"{raw_json_dir}/coin_markets.json"

# exist_ok=True lets the cell be re-run for the same extraction date.
os.makedirs(raw_json_dir, exist_ok=True)
with open(raw_json_path, "w") as raw_json_file:
    raw_json_file.write(json.dumps(coin_markets_list))

In [8]:
# Spark session for the JSON -> Parquet conversion; getOrCreate() reuses an
# already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("ConvertJSONtoParquet")
    .getOrCreate()
)

# Column order of the bronze table. It follows the field order of the sample
# API response printed earlier in this notebook.
bronze_df_column_order = [
    # identity
    "id", "symbol", "name", "image",
    # price and market size
    "current_price", "market_cap", "market_cap_rank",
    "fully_diluted_valuation", "total_volume",
    # 24-hour movement
    "high_24h", "low_24h",
    "price_change_24h", "price_change_percentage_24h",
    "market_cap_change_24h", "market_cap_change_percentage_24h",
    # supply
    "circulating_supply", "total_supply", "max_supply",
    # all-time high / all-time low
    "ath", "ath_change_percentage", "ath_date",
    "atl", "atl_change_percentage", "atl_date",
    # remaining fields
    "roi", "last_updated",
]


In [9]:
# Build the bronze DataFrame from the raw JSON snapshot.
#
# Order matters here: the columns are first arranged with
# bronze_df_column_order and only then stamped with ingestion_date.
# Adding ingestion_date before the select silently drops it, because
# the column is not part of bronze_df_column_order.
bronze_df = (
    spark.read.option("inferSchema", True)
    .json(f"../../data/bronze/coin_markets/raw_json/{extraction_date}/coin_markets.json")
    .select(*bronze_df_column_order)
    # Timestamp of this load, appended as the last column.
    .withColumn("ingestion_date", current_timestamp())
)


In [10]:
# Record the bronze column order as one comma-separated line in schema_order.txt.
schema_order_line = ",".join(bronze_df.columns)
with open("schema_order.txt", "w") as schema_order_file:
    schema_order_file.write(schema_order_line)

# Display the column names and types of the bronze DataFrame.
bronze_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- symbol: string (nullable = true)
 |-- name: string (nullable = true)
 |-- image: string (nullable = true)
 |-- current_price: double (nullable = true)
 |-- market_cap: long (nullable = true)
 |-- market_cap_rank: long (nullable = true)
 |-- fully_diluted_valuation: long (nullable = true)
 |-- total_volume: double (nullable = true)
 |-- high_24h: double (nullable = true)
 |-- low_24h: double (nullable = true)
 |-- price_change_24h: double (nullable = true)
 |-- price_change_percentage_24h: double (nullable = true)
 |-- market_cap_change_24h: double (nullable = true)
 |-- market_cap_change_percentage_24h: double (nullable = true)
 |-- circulating_supply: double (nullable = true)
 |-- total_supply: double (nullable = true)
 |-- max_supply: double (nullable = true)
 |-- ath: double (nullable = true)
 |-- ath_change_percentage: double (nullable = true)
 |-- ath_date: string (nullable = true)
 |-- atl: double (nullable = true)
 |-- atl_change_perce

In [11]:
# Write the bronze DataFrame as Parquet into the folder for this extraction
# date; mode "overwrite" replaces whatever a previous run left at that path.
bronze_parquet_dir = f"../../data/bronze/coin_markets/parquet/{extraction_date}/"
bronze_df.write.mode("overwrite").parquet(bronze_parquet_dir)