In [None]:
#Apache Spark Libraries
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.functions import input_file_name, regexp_extract, regexp_replace, split, col




# Spark configuration: standalone master, application name, and the memory /
# core settings for the driver and the executors.
sparkConf = SparkConf()
sparkConf.setMaster("spark://spark-master:7077")
sparkConf.setAppName("batch_pipeline")
sparkConf.set("spark.driver.memory", "4g")
sparkConf.set("spark.executor.memory", "4g")
sparkConf.set("spark.executor.cores", "1")
sparkConf.set("spark.driver.cores", "1")

# Start a session with that configuration, or reuse one that already exists.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()


# Register the Hadoop file-system implementations for the gs:// scheme.
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

# Location of the input files.
# NOTE(review): the glob below is built by plain concatenation, so this value
# presumably has to end with a path separator -- confirm.
gcs_bucket_path = "bucket"

# Glob matching every CSV file under the bucket path
csv_files_path = gcs_bucket_path + "*.csv"

# Load all matching CSV files into one DataFrame (first row is the header,
# column types are inferred from the data).
df = spark.read.csv(csv_files_path, header=True, inferSchema=True)

print(f"Number of rows: {df.count()}")

# Add a column with the bare file name (last path segment) of the file each
# row was read from. Taking the last segment avoids repeating the bucket path
# as a second hard-coded literal that would be interpreted as a regex.
df = df.withColumn("filename", regexp_extract(input_file_name(), r"([^/]+)$", 1))

# Extract the year from the file name. Two suffix layouts are handled:
#   ..._YYYY.csv      (e.g. coteq_electricity_2013.csv)
#   ..._DDMMYYYY.csv  (e.g. endinet_electricity_01012011.csv)
# The leading four digits are optional and the year is always the last four
# digits before ".csv". The previous pattern required eight digits, which left
# the year empty (NULL after the cast) for every ..._YYYY.csv file.
df = df.withColumn("year", regexp_extract("filename", r"_(?:\d{4})?(\d{4})\.csv$", 1).cast('int'))

# The supplier ("leverancier") is the part of the file name before the first underscore
df = df.withColumn('leverancier', split(col("filename"), '_').getItem(0))

# Check the resulting schema and preview the data
df.printSchema()
df.show()

DataFrame 'coteq_electricity_2013' ingeladen, aantal rijen: 2503
DataFrame 'coteq_electricity_2014' ingeladen, aantal rijen: 2503
DataFrame 'coteq_electricity_2015' ingeladen, aantal rijen: 2503
DataFrame 'coteq_electricity_2016' ingeladen, aantal rijen: 2503
DataFrame 'coteq_electricity_2017' ingeladen, aantal rijen: 2503
DataFrame 'coteq_electricity_2018' ingeladen, aantal rijen: 2503
DataFrame 'coteq_electricity_2019' ingeladen, aantal rijen: 2563
DataFrame 'coteq_electricity_2020' ingeladen, aantal rijen: 2575
DataFrame 'endinet_electricity_01012011' ingeladen, aantal rijen: 5743
DataFrame 'endinet_electricity_01012012' ingeladen, aantal rijen: 5795
DataFrame 'endinet_electricity_01012013' ingeladen, aantal rijen: 5883
DataFrame 'endinet_electricity_01012014' ingeladen, aantal rijen: 4546
DataFrame 'endinet_electricity_01012015' ingeladen, aantal rijen: 4550
DataFrame 'endinet_electricity_01012016' ingeladen, aantal rijen: 4565
DataFrame 'enduriselectricity_01012013' ingeladen, aan

In [None]:
from pyspark.sql.functions import sum as _sum, col

# For every (zipcode_from, zipcode_to, leverancier, year) group, total the
# annual consumption and the number of connections, then divide the first
# total by the second to get the consumption per connection.
aggregated_df = (
    df.groupBy("zipcode_from", "zipcode_to", "leverancier", "year")
    .agg(
        _sum("annual_consume").alias("total_annual_consume"),
        _sum("num_connections").alias("total_num_connections"),
    )
    .withColumn(
        "average_consumption",
        col("total_annual_consume") / col("total_num_connections"),
    )
)

# Preview the aggregate and report how many groups it contains
aggregated_df.show()
print(f"Number of rows: {aggregated_df.count()}")


DataFrame 'coteq_electricity_2013' heeft de volgende kolommen:
['net_manager', 'purchase_area', 'street', 'zipcode_from', 'zipcode_to', 'city', 'num_connections', 'delivery_perc', 'perc_of_active_connections', 'type_conn_perc', 'type_of_connection', 'annual_consume', 'annual_consume_lowtarif_perc', 'smartmeter_perc']
----------------------------------------
DataFrame 'coteq_electricity_2014' heeft de volgende kolommen:
['net_manager', 'purchase_area', 'street', 'zipcode_from', 'zipcode_to', 'city', 'num_connections', 'delivery_perc', 'perc_of_active_connections', 'type_conn_perc', 'type_of_connection', 'annual_consume', 'annual_consume_lowtarif_perc', 'smartmeter_perc']
----------------------------------------
DataFrame 'coteq_electricity_2015' heeft de volgende kolommen:
['net_manager', 'purchase_area', 'street', 'zipcode_from', 'zipcode_to', 'city', 'num_connections', 'delivery_perc', 'perc_of_active_connections', 'type_conn_perc', 'type_of_connection', 'annual_consume', 'annual_cons

In [None]:
from pyspark.sql.functions import col, expr, lit, percentile_approx

# Approximate first and third quartile of the per-connection consumption,
# computed in a single aggregation (accuracy argument 100 for both).
percentiles = aggregated_df.select(
    percentile_approx(col("average_consumption"), 0.25, lit(100)).alias("Q1"),
    percentile_approx(col("average_consumption"), 0.75, lit(100)).alias("Q3"),
).first()

Q1 = percentiles["Q1"]
Q3 = percentiles["Q3"]
IQR = Q3 - Q1

# Outlier fences: 1.5 * IQR below Q1 and above Q3
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only the rows whose average consumption lies within the fences
# (both ends inclusive).
filtered_df = aggregated_df.filter(
    col("average_consumption").between(lower_bound, upper_bound)
)

# Report how many rows remain after filtering
print(f"Number of rows: {filtered_df.count()}")


In [None]:
# Load the GeoJSON file and flatten it to one row per feature (the join with the aggregate table is not performed in this cell)
from pyspark.sql.functions import col, explode


# Read the file as multi-line JSON, turn each element of its "features" array
# into its own row, and keep the geometry type, the coordinates and the
# 'pc4_code' property (cast to integer) of every feature.
geojson_df = (
    spark.read.option("multiline", "true")
    .json("file")
    .select(explode(col("features")).alias("feature"))
    .select(
        col("feature.geometry.type").alias("geometry_type"),
        col("feature.geometry.coordinates").alias("coordinates"),
        col("feature.properties.pc4_code").cast("int").alias("pc4_code"),
    )
)

# Print the schema and an untruncated preview of the rows
geojson_df.printSchema()
geojson_df.show(truncate=False)

Unnamed: 0,net_manager,purchase_area,street,zipcode_from,zipcode_to,city,num_connections,delivery_perc,perc_of_active_connections,type_conn_perc,type_of_connection,annual_consume,annual_consume_lowtarif_perc,smartmeter_perc,leverancier,type,jaar,ï»¿NETBEHEERDER,STANDAARDDEVIATIE,%Defintieve aansl (NRM)
0,Coteq Netbeheer BV,Netbeheerder Centraal Overijssel B.V.,Dorpsstraat,7468CP,7471AA,ENTER,19.0,89.47,94.74,89,1x35,4122.0,89.47,0.0,coteq,electricity,2013,,,
1,Coteq Netbeheer BV,Netbeheerder Centraal Overijssel B.V.,De Stoevelaar,7471AB,7471AB,GOOR,37.0,100.0,100.0,86,1x35,1800.0,94.59,0.0,coteq,electricity,2013,,,
2,Coteq Netbeheer BV,Netbeheerder Centraal Overijssel B.V.,De Stoevelaar,7471AC,7471AC,GOOR,16.0,100.0,100.0,100,1x35,1315.0,100.0,0.0,coteq,electricity,2013,,,
3,Coteq Netbeheer BV,Netbeheerder Centraal Overijssel B.V.,De Stoevelaar,7471AD,7471AE,GOOR,25.0,92.0,84.0,44,1x35,6379.0,92.0,0.0,coteq,electricity,2013,,,
4,Coteq Netbeheer BV,Netbeheerder Centraal Overijssel B.V.,Kerkstraat,7471AG,7471AG,GOOR,14.0,85.71,100.0,36,1x35,4404.0,92.86,0.0,coteq,electricity,2013,,,
