### Calculate the overlaps between each habitat and the exploded DGL polygons

In [0]:
from pathlib import Path
import geopandas as gpd
from geopandas import read_file
import pandas as pd

In [0]:
from sedona.spark import *
from sedona.sql import st_constructors as cn
from pyspark.sql.functions import lit, expr, col, like, sum
from sedona.sql import st_functions as fn

# Build the Sedona context on top of the notebook's existing Spark session (`spark`).
sedona = SedonaContext.create(spark)
# Clear Spark's in-memory cache before the run starts.
sqlContext.clearCache()

In [0]:
# Location of the DGL polygons already split by organisation and tenure (GeoParquet).
dgl_parquet_uri = 'dbfs:/mnt/lab-res-a1001005/esd_project/Defra_Land/polygon_split_defra_by_organisation_tenure.parquet'

# Read the polygons through Sedona's GeoParquet reader.
dgl_reader = sedona.read.format("geoparquet")
dgl_exploded = dgl_reader.load(dgl_parquet_uri)

# Expose the polygons to SQL under the name "dgl".
dgl_exploded.createOrReplaceTempView("dgl")

In [0]:
from pyspark.sql.functions import monotonically_increasing_id

# Give every exploded polygon a unique "id"; the per-habitat join results are
# written out with this key and later joined back onto the polygons by it.
#
# monotonically_increasing_id() derives its values from the partition layout and
# is evaluated lazily, so a re-execution of the plan with a different layout
# could hand out different ids. Persisting keeps the ids assigned on first
# materialisation available to every later action in this session.
dgl_exploded = dgl_exploded.withColumn("id", monotonically_increasing_id()).persist()

# Re-register the view so SQL queries against "dgl" see the new "id" column.
dgl_exploded.createOrReplaceTempView("dgl")

In [0]:
# Preview the DGL polygons in the notebook output.
dgl_exploded.display()

In [0]:
# Number of exploded DGL polygons.
dgl_exploded.count()

In [0]:
def _to_dbfs_uri(local_path):
    """Return *local_path* as text with "/dbfs" replaced by "dbfs:"."""
    return str(local_path).replace("/dbfs", "dbfs:")


# Every location is kept in two spellings: a Path in the "/dbfs/..." form and
# an "alt_" string in the "dbfs:/..." form.

# Habitat layers (one parquet file per habitat).
hab_path = Path("/dbfs/mnt/lab-res-a1001005/esd_project/Defra_Land/Final/DGL_vector")
alt_hab_path = _to_dbfs_uri(hab_path)

# Model grids.
grid_path = Path("/dbfs/mnt/lab-res-a1001005/esd_project/Defra_Land/Model_Grids")
alt_grid_path = _to_dbfs_uri(grid_path)

# Outputs of this notebook.
output_path = Path("/dbfs/mnt/lab-res-a1001005/esd_project/Defra_Land/Final/Estate_Split_Attr")
alt_output_path = _to_dbfs_uri(output_path)

In [0]:
# List the habitat folder and keep the bare file name (last path component)
# of every entry whose path ends in ".parquet".
files = dbutils.fs.ls(alt_hab_path)
parquet_files = [
    entry.path.rstrip("/").split("/")[-1]
    for entry in files
    if entry.path.endswith(".parquet")
]
# NOTE(review): the suffix test runs before the trailing "/" is stripped, so an
# entry whose path ends in ".parquet/" is not selected - confirm that is intended.

In [0]:
# Show the habitat file names that will be processed.
parquet_files

In [0]:
# For every habitat layer: intersect it with the DGL polygons and save one row
# per intersecting (DGL polygon, habitat polygon) pair, carrying the DGL
# attributes and the area of the overlap.
for hab in parquet_files:

    print(hab)

    # Habitat name = file name without the extension; it is embedded in the
    # output column name and the output file name.
    hab_split = hab.replace('.parquet', '')

    # The geometry column is decoded from WKB, tagged with SRID 27700 and
    # repaired with ST_MakeValid before it is used in the join.
    hab_data = sedona.read.format("parquet").load(f"{alt_hab_path}/{hab}").withColumn(
        "geometry",
        expr("st_makevalid(st_setsrid(ST_GeomFromWKB(geometry),27700))"),
    )

    hab_data.createOrReplaceTempView("hab_data")

    # The alias is backtick-quoted so habitat names containing characters that
    # are not valid in a bare SQL identifier still produce a parseable query.
    query = f"""
    SELECT dgl.id,
           dgl.current_organisation AS current_organisation,
           dgl.land_management_organisation AS land_management_organisation,
           dgl.tenure AS tenure,
           dgl.area_m2 AS area_m2,
           dgl.area_ha AS area_ha,
           dgl.geometry AS geometry,
           ST_Area(ST_Intersection(dgl.geometry, hab.geometry)) AS `area_{hab_split}_m2`
    FROM dgl AS dgl
    JOIN hab_data AS hab
    ON ST_Intersects(dgl.geometry, hab.geometry)
    """

    # Pass the constructed query string to spark.sql
    spatial_join = spark.sql(query)

    # Write under the shared output location instead of repeating the literal
    # path; any previous result for this habitat is overwritten.
    spatial_join.write.format("geoparquet").mode("overwrite").save(
        f"{alt_output_path}/dgl_join_{hab_split}.parquet"
    )




In [0]:
# Start the output table from the DGL polygons; per-habitat columns are added to it afterwards.
dgl_out = dgl_exploded

In [0]:
from pyspark.sql.functions import sum as _sum, col, round as _round

data_directory = 'dbfs:/mnt/lab-res-a1001005/esd_project/Defra_Land/Final/Estate_Split_Attr/'

# For every habitat: total the overlap area per DGL polygon and add it to
# dgl_out as a percentage of the polygon's own area.
for hab in parquet_files:

    hab_split = hab.replace('.parquet', '')

    area_column = f"area_{hab_split}_m2"
    area_sum_column = f"areaSum_{hab_split}_m2"
    percentage_column = f"{hab_split}_perc"

    # data_directory already ends with "/"; strip it so the joined path does
    # not contain an empty "//" segment.
    file_path = f"{data_directory.rstrip('/')}/dgl_join_{hab_split}.parquet"
    hab_data = spark.read.parquet(file_path)

    # A DGL polygon can intersect several habitat polygons, so sum per id.
    result_df = (
        hab_data.select("id", area_column)
        .groupBy("id")
        .agg(_sum(area_column).alias(area_sum_column))
    )

    # Left join keeps every DGL polygon; ids with no row in result_df get a
    # NULL sum and therefore a NULL percentage. The temporary sum column is
    # dropped once the percentage (rounded to 1 decimal place) is computed.
    dgl_out = (
        dgl_out.join(result_df, on='id', how='left')
        .withColumn(
            percentage_column,
            _round((col(area_sum_column) / col("area_m2")) * 100, 1),
        )
        .drop(area_sum_column)
    )

In [0]:
# Preview the DGL polygons with their per-habitat percentage columns.
dgl_out.display()

In [0]:
# Row count of the output table.
dgl_out.count()

In [0]:
# Collect the Spark result into a pandas DataFrame.
dgl_out_pd = dgl_out.toPandas()

In [0]:
# Promote the pandas table to a GeoDataFrame on its "geometry" column and
# label the coordinates as EPSG:27700 in the same step.
dgl_out_pd = gpd.GeoDataFrame(
    dgl_out_pd,
    geometry='geometry',
    crs='epsg:27700',
)

In [0]:
# Save the GeoDataFrame as Parquet, addressing the output folder through its /dbfs/ path.
dgl_out_pd.to_parquet("/dbfs/mnt/lab-res-a1001005/esd_project/Defra_Land/Final/Estate_Split_Attr/dgl_updated_hab_splited.parquet")

In [0]:
def download_link(filepath):
    """Copy a DBFS file into FileStore and return an HTML download link for it.

    Args:
        filepath: Source file location in the ``dbfs:/`` form (not ``/dbfs/``).

    Returns:
        An HTML ``<a>`` snippet whose target is the copied file's ``/files/``
        URL on the current workspace.
    """
    # Keep only the last path component, without the separator. Slicing from
    # rfind("/") kept the leading "/" (producing "FileStore//name" and
    # "/files//name") and returned just the final character when the path
    # contained no "/" at all.
    filename = filepath.rstrip("/").rsplit("/", 1)[-1]
    # Copy the file into FileStore (the source is left in place)
    dbutils.fs.cp(filepath, f"dbfs:/FileStore/{filename}")
    # Construct download url from the workspace host and organisation id
    workspace_url = spark.conf.get('spark.databricks.workspaceUrl')
    org_id = spark.conf.get('spark.databricks.clusterUsageTags.orgId')
    url = f"https://{workspace_url}/files/{filename}?o={org_id}"
    # Return html snippet; the href is quoted so the attribute stays well-formed
    return f"<a href='{url}' target='_blank'>Download file: {filename}</a>"

In [0]:
# Copy the exported file into FileStore and render its download link in the notebook.
displayHTML(download_link('dbfs:/mnt/lab-res-a1001005/esd_project/Defra_Land/Final/Estate_Split_Attr/dgl_updated_hab_splited.parquet'))