In [0]:
import geopandas as gpd

In [0]:
# Load the "none" shapefile into a GeoDataFrame.
none_shp_path = '/dbfs/FileStore/milesclement/dgl_none/dgl_none.shp'
none_in = gpd.read_file(none_shp_path)

In [0]:
# Write the GeoDataFrame back out as a Parquet file alongside the shapefile.
none_parquet_path = '/dbfs/FileStore/milesclement/dgl_none/dgl_none.parquet'
none_in.to_parquet(none_parquet_path)

In [0]:
from sedona.spark import *

# Build the Sedona context on top of the notebook's Spark session.
sedona = SedonaContext.create(spark)

# Drop anything cached by earlier runs before loading the layers.
sqlContext.clearCache()

# Resolve the current user's name from the Databricks notebook context.
notebook_context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
username = notebook_context.userName().get()

In [0]:
# Focal layer: GeoParquet file read through Sedona's geoparquet reader.
focal_path = "dbfs:/mnt/lab-res-a1001005/esd_project/miles.clement@defra.gov.uk/LE/LE2223.parquet"
geoparquet_reader = sedona.read.format("geoparquet")
focal_layer = geoparquet_reader.load(focal_path)

In [0]:
# "None" layer: read the GeoParquet file through Sedona's geoparquet reader.
none_layer_path = "dbfs:/FileStore/milesclement/dgl_none/dgl_none.parquet"
none_layer = sedona.read.format("geoparquet").load(none_layer_path)

In [0]:
# Expose both layers to Spark SQL under fixed temp-view names.
for view_name, layer in (("focal_layer", focal_layer), ("none_layer", none_layer)):
    layer.createOrReplaceTempView(view_name)

# Keep every focal row whose geometry intersects a geometry in none_layer.
# NOTE(review): this is an intersect *filter* - geometries are returned whole
# (no ST_Intersection), and a focal feature that intersects several none_layer
# rows is returned once per match. Confirm this is the intended "clip".
clip_query = """
    SELECT f.*
    FROM focal_layer f, none_layer n
    WHERE ST_Intersects(f.geometry, n.geometry)
"""
clipped_layer = spark.sql(clip_query)
clipped_layer.show()


In [0]:
# List the distinct Prmry_H values present in the selected features.
unique_values = (
    clipped_layer
    .select("Prmry_H")
    .distinct()
)
unique_values.show()

In [0]:
from pyspark.sql.functions import col, expr, sum

# Add a new column with each feature's area in hectares.
# ST_Area is a Sedona SQL function, so it has to be evaluated with expr();
# col("ST_Area(geometry)") would instead look up a column literally named
# "ST_Area(geometry)", which does not exist, and fail at analysis time.
# NOTE(review): dividing by 10000 assumes the geometries are in a projected
# CRS with metre units - confirm. The area is that of each geometry as stored
# in clipped_layer.
clipped_with_area = clipped_layer.withColumn(
    "area_ha",
    expr("ST_Area(geometry)") / 10000,  # square metres -> hectares
)

# Summarize the total area (ha) for each unique value of Prmry_H.
summary = clipped_with_area.groupBy("Prmry_H").agg(
    sum("area_ha").alias("total_area_ha")
)

summary.show()