In [1]:
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null


Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Ign:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:8 https://r2u.stat.illinois.edu/ubuntu jammy Release
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


In [2]:
!pip install pyspark==3.5.1 delta-spark==3.2.0



In [6]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"


In [7]:
from pyspark.sql import SparkSession

# Initialize Spark session with Delta Lake support and required configurations
spark = (
    SparkSession.builder
    .appName("Public Power Data Processing with Data Quality Checks")
    .master("local[*]")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.3.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore")
    .config("spark.sql.crossJoin.enabled", "true")
    .getOrCreate()
)


In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, from_unixtime, expr
from pyspark.sql.types import StructType, StructField, ArrayType, LongType, StringType, FloatType, BooleanType

# Load public power data from JSON file without schema
public_power_df = spark.read.option("multiline", "true").json("/content/public_power_data_24.json")

# Check the initial DataFrame schema and content
print("Initial DataFrame count:", public_power_df.count())
public_power_df.show(truncate=False)

# Filter out deprecated records
public_power_filtered = public_power_df.filter(~col("deprecated"))

# Check after filtering
print("Filtered DataFrame count (not deprecated):", public_power_filtered.count())
public_power_filtered.show(truncate=False)

# Explode the production_types and combine with unix_seconds
public_power_exploded = public_power_filtered.select(
    explode("production_types").alias("production_type"),
    "unix_seconds"
)

# Check exploded DataFrame
print("Exploded DataFrame count:", public_power_exploded.count())
public_power_exploded.show(truncate=False)

# Pair unix_seconds with production data correctly using zip_with
public_power_transformed = public_power_exploded.select(
    col("production_type.name").alias("production_type_name"),
    expr("zip_with(unix_seconds, production_type.data, (x, y) -> struct(x, y))").alias("unix_and_data")
).select(
    explode("unix_and_data").alias("unix_data"),
    "production_type_name"
).select(
    from_unixtime(col("unix_data.x")).alias("timestamp"),
    "production_type_name",
    col("unix_data.y").alias("production_value")
)

# Check transformed DataFrame
print("Transformed DataFrame count:", public_power_transformed.count())
public_power_transformed.show(truncate=False)

# Additional data quality checks: Filter out rows where `production_value` is null
public_power_cleaned = public_power_transformed.filter(col("production_value").isNotNull())

# Final DataFrame check before saving
print("Cleaned DataFrame count (not null values):", public_power_cleaned.count())
public_power_cleaned.show(truncate=False)


Initial DataFrame count: 96
+----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------+
|deprecated|production_types                                                                                                                                                                           

In [9]:
# Specify the path where the CSV file will be stored
csv_file_path = "/content/public_power_cleaned.csv"

# Save the cleaned DataFrame as a CSV file
public_power_cleaned.write.format("csv").mode("overwrite").option("header", "true").save(csv_file_path)

print(f"CSV file saved at: {csv_file_path}")


CSV file saved at: /content/public_power_cleaned.csv
