In [1]:
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Connecting to security.ubuntu.com (185.125.190.81)] [Connected to cloud.r-project.org (108.138.1                                                                                                    Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
0% [2 InRelease 15.6 kB/128 kB 12%] [Connecting to security.ubuntu.com (185.125.190.81)] [Waiting fo                                                                                                    Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
0% [2 InRelease 63.4 kB/128 kB 50%] [Waiting for headers] [3 InRelease 3,626 B/3,626 B 100%] [Connec0% [2 InRelease 66.3 kB/128 kB 52%] [Waiting for headers] [Connected to r2u.stat.illinois.edu (192.10% [Waiting for headers] [Waiting for headers] [Connected to r2u.stat.illinois.edu (192.17.190.167)]                                                   

In [2]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

In [4]:
from pyspark.sql import SparkSession

# Initialize Spark session with Delta Lake support and required configurations
spark = (
    SparkSession.builder
    .appName("Installed Power Data Cleaning")
    .master("local[*]")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.3.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore")
    .config("spark.sql.crossJoin.enabled", "true")
    .getOrCreate()
)

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, posexplode


# Load the installed power data from a JSON file
installed_power_data_df = spark.read.option("multiline", "true").json("/content/installed_power_data_monthly.json")

# Display the initial DataFrame count and data
print("Initial DataFrame count:", installed_power_data_df.count())
installed_power_data_df.show(truncate=False)

# Clean the data
# Filter out deprecated entries
installed_power_cleaned = installed_power_data_df.filter(
    col("deprecated") == False
)

# Explode the production_types to get individual rows for each production type
production_exploded = installed_power_cleaned.select(
    posexplode(col("time")).alias("pos", "time"),  # Explode 'time' and capture index 'pos'
    explode(col("production_types")).alias("production_type")
)

# Use the 'pos' index to align each 'data' entry with the correct 'time'
final_cleaned_data = production_exploded.select(
    col("time"),
    col("production_type.name").alias("production_name"),
    col("production_type.data").getItem(col("pos")).alias("production_value")
)

# Filter out null values in production_value
final_cleaned_data = final_cleaned_data.filter(
    col("production_value").isNotNull()
)

# Specify the path where the cleaned CSV file will be stored
csv_file_path = "/content/installed_power_cleaned.csv"

# Save the cleaned DataFrame as a CSV file
final_cleaned_data.write.format("csv").mode("overwrite").option("header", "true").save(csv_file_path)

print(f"Cleaned installed power data CSV file saved at: {csv_file_path}")


Initial DataFrame count: 1
+----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------