In [1]:
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null


0% [Working]            Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Ign:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy Release [5,713 B]
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy Release.gpg [793 B]
Get:8 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:12 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:13 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x8

In [2]:
!pip install pyspark==3.5.1 delta-spark==3.2.0

Collecting pyspark==3.5.1
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting delta-spark==3.2.0
  Downloading delta_spark-3.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading delta_spark-3.2.0-py3-none-any.whl (21 kB)
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488492 sha256=f0c55b4cce5d38a46fb7a3855e9015881bc4805b305b093d09a1809b82b22522
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark, delta-spark
  Attempting uninstall: pyspark
    Found existing installation: pyspark 3.5.3
    Uninstalling pyspark-3.5.3:
      Successfully uninstalle

In [3]:
import os

# Export JAVA_HOME for this process so that child processes started from the
# notebook inherit the location of the Java 8 JDK.
os.environ.update({"JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64"})


In [4]:
from pyspark.sql import SparkSession

# Initialize a local Spark session with Delta Lake support.
#
# The Delta JAR pulled in through `spark.jars.packages` must match both the
# installed Spark version and the installed `delta-spark` Python package.
# This notebook installs pyspark 3.5.1 + delta-spark 3.2.0, so the matching
# Maven artifact is `io.delta:delta-spark_2.12:3.2.0` (the artifact was renamed
# from `delta-core` to `delta-spark` in Delta 3.0; `delta-core` 2.3.0 targets
# Spark 3.3.x and is not compatible with Spark 3.5).
spark = (
    SparkSession.builder
    .appName("Price Data Cleaning")
    .master("local[*]")  # run locally, using all available cores
    .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.2.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # NOTE(review): an S3 log store is configured although the cells shown here
    # only touch local paths under /content — confirm whether this is needed.
    .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore")
    .config("spark.sql.crossJoin.enabled", "true")
    .getOrCreate()
)


In [6]:
# Read the raw price data. The `multiline` option lets Spark parse a JSON
# document that spans several lines instead of expecting one record per line.
json_reader = spark.read.option("multiline", "true")
price_data_df = json_reader.json("/content/price_data.json")

# Quick sanity check: record count followed by the untruncated contents.
initial_count = price_data_df.count()
print("Initial DataFrame count:", initial_count)
price_data_df.show(truncate=False)

Initial DataFrame count: 1
+----------+---------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|deprecated|license_info                                                                     |price                                                                                                                                                                            |unit               |unix_seconds                                                                                       

In [12]:
from pyspark.sql.functions import col, from_unixtime, explode, posexplode

# --- 1. Record-level cleaning -------------------------------------------------
# Keep only non-deprecated records whose 'price', 'unit' and 'unix_seconds'
# columns are present. At this stage 'price' and 'unix_seconds' are still
# whole arrays, so this only rejects records where an entire column is missing.
price_cleaned = price_data_df.filter(
    (col("deprecated") == False) &
    (col("price").isNotNull()) &
    (col("unit").isNotNull()) &
    (col("unix_seconds").isNotNull())
)

# --- 2. One row per timestamp -------------------------------------------------
# posexplode emits each element of 'unix_seconds' together with its array
# index ('pos'), which is what allows the matching price to be looked up below.
price_indexed = price_cleaned.select(
    "license_info",
    posexplode(col("unix_seconds")).alias("pos", "unix_seconds"),
    col("price"),
    "unit",
    "deprecated"
)

# Pick the price that sits at the same array position as the timestamp.
price_paired = price_indexed.select(
    "license_info",
    col("unix_seconds"),
    col("price")[col("pos")].alias("price"),  # Match price with the same position
    "unit",
    "deprecated"
)

# --- 3. Element-level cleaning ------------------------------------------------
# The record-level filter above cannot see nulls *inside* the arrays, and a
# 'price' array shorter than 'unix_seconds' yields a null price for the missing
# index. Drop those rows here so the output really contains no null
# price/timestamp values, as the cleaning step intends.
price_paired = price_paired.filter(
    col("unix_seconds").isNotNull() & col("price").isNotNull()
)

# --- 4. Human-readable timestamp ----------------------------------------------
# from_unixtime renders the epoch seconds as a timestamp string.
price_exploded = price_paired.withColumn("timestamp", from_unixtime(col("unix_seconds")))

# Select relevant columns (the raw epoch value is dropped in favour of 'timestamp').
price_cleaned_final = price_exploded.select(
    "license_info",
    "price",
    "unit",
    "deprecated",
    "timestamp"
)

# --- 5. Persist ---------------------------------------------------------------
# Output location for the cleaned data. Note that Spark's DataFrame writer
# creates a *directory* at this path containing one or more part-*.csv files.
csv_file_path = "/content/price_cleaned.csv"

# Write as CSV with a header row, replacing any previous output at that path.
price_cleaned_final.write.format("csv").mode("overwrite").option("header", "true").save(csv_file_path)

print(f"Cleaned price data CSV file saved at: {csv_file_path}")

Cleaned price data CSV file saved at: /content/price_cleaned.csv
