## Notebook for getting summary breakdowns of habitat occurrence on Defra Group Land

Miles Clement, Feb 2025

**NOTE:** Area calculations are based on model outputs, and not the full vector geometries. The values calculated for Defra Land will vary from those previously reported in Phase 1 due to this generalised representation of the spatial extent.

**NOTE 2:** There is overlap between the extent of freehold and leasehold land parcels. The habitat extent of these will be reported separately.

**Habitat Notes**
- Saltmarsh is a subset of Coastal Margins, and included in the extent of the latter (beware of double counting)
- Upland Bog is a subset of Moorland & Heath, and included in the extent of the latter (beware of double counting)
- Dense and Sparse Woodland will also be reported as combined/mixed woodland

In [0]:
import pandas as pd
import os
from pathlib import Path
from functools import reduce
from sds_dash_download import download_file

In [0]:
from sedona.spark import *
from pyspark.sql.functions import expr, when, col, lit, sum
from pyspark.sql import functions as F

# Create the Sedona context on top of the notebook's existing Spark session
sedona = SedonaContext.create(spark)
# Clear Spark's in-memory table cache before starting
sqlContext.clearCache()

# Look up the name of the user running the notebook from the Databricks notebook context
notebook_context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
username = notebook_context.userName().get()

In [0]:
# Asset tables as seen through the local /dbfs mount (used for plain Python file access)
table_dir = Path('/dbfs/mnt/lab-res-a1001005/esd_project/Defra_Land/Final/Asset_Tables')
# The same directory written with the dbfs: scheme, which is the form passed to the Spark readers
local_dir_text = str(table_dir)
alt_table_dir = local_dir_text.replace("/dbfs", "dbfs:")

In [0]:
# List the entries in the asset table directory; the trailing bare expression
# makes the notebook display the list
par_files = os.listdir(table_dir)
par_files

#### Constants & High-Level Stats
- Overall Extent
- Difference between model representation of DGL and the vector input

In [0]:
# Load one habitat table as an example dataset for the Leasehold/Freehold counts
par = '10m_x_assets_combined_moorland.parquet'
example_path = f"{alt_table_dir}/{par}"
data_in = sedona.read.parquet(example_path)
# Expose the table to Spark SQL under the name "data_in"
data_in.createOrReplaceTempView("data_in")

In [0]:
# Get row counts for each split of data (by tenure).
# All four counts are computed in a single aggregation so the table is scanned once
# rather than once per count.
fh_flag = data_in.dgl_fh
lh_flag = data_in.dgl_lh

# when() without otherwise() yields null for rows that do not match, and count()
# ignores nulls, so each expression counts exactly the rows meeting its condition
tenure_counts = data_in.agg(
    F.count(F.when(fh_flag.isNull() & (lh_flag == 1), 1)).alias("leasehold"),
    F.count(F.when((fh_flag == 1) & lh_flag.isNull(), 1)).alias("freehold"),
    F.count(F.when((fh_flag == 1) & (lh_flag == 1), 1)).alias("both"),
    F.count(F.when((fh_flag == 1) | (lh_flag == 1), 1)).alias("total"),
).first()

leasehold_count = tenure_counts["leasehold"]
freehold_count = tenure_counts["freehold"]
both_count = tenure_counts["both"]
total_count = tenure_counts["total"]

# Rows are assumed to be 10 m x 10 m grid cells (per the table naming), i.e. 100 m2 each:
# multiply the count by 100 to get m2, then divide by 10000 to convert to ha.
# Simplified to divide by 100 to convert count to ha
leasehold_ha = leasehold_count / 100
freehold_ha = freehold_count / 100
both_ha = both_count / 100
total_ha = total_count / 100

In [0]:
# Report the extent (ha) of each tenure split and the overall total
tenure_extents = (
    ("Leasehold", leasehold_ha),
    ("Freehold", freehold_ha),
    ("Both holding types", both_ha),
    ("Total", total_ha),
)
for tenure_label, extent_ha in tenure_extents:
    print(f"{tenure_label}: {extent_ha} ha")
# Optional sanity check: the three exclusive splits should add up to the total
#print(f"Total Check: {leasehold_ha+freehold_ha+both_ha} ha")

In [0]:
# Load in Phase 1 DGL outputs to calc overall difference in extent
dgl_vect_path = "dbfs:/mnt/lab-res-a1001005/esd_project/jasmine.elliott@defra.gov.uk/gov_land_analysis/phase_one_final_report_outputs/polygon_ccod_defra_by_organisation_tenure.parquet"
dgl_vect = sedona.read.parquet(dgl_vect_path)

# Expose the table to Spark SQL under the name "dgl_vect"
dgl_vect.createOrReplaceTempView("dgl_vect")

In [0]:
# Calculate the total area by summing the area_ha column.
# F.sum is used explicitly so the call cannot be confused with Python's builtin sum.
area_totals = dgl_vect.agg(F.sum("area_ha").alias("total_area"))
vect_area = area_totals.first()["total_area"]

In [0]:
# Compare the model (grid cell) extent with the Phase 1 vector extent
print(f"Model Area: {total_ha} ha")
print(f"Vector Area: {round(vect_area,1)} ha")
# Positive when the vector dataset covers more area than the model representation
area_gap = vect_area - total_ha
print(f"Difference: {round(area_gap,1)} ha more in the vector dataset")
gap_percent = (area_gap / vect_area) * 100
print(f"This equates to {round(gap_percent,2)}% of the vector dataset area")

#### Table 1: Habitat broken down by Leasehold/Freehold/Both

In [0]:
# Define asset columns that represent each habitat.
# Mixed woodland is reported as dense + sparse combined, so its indicator list is
# built from the two woodland lists instead of being written out a second time.
_dense_woodland_cols = ["le_comb", "lcm_comb", "nfi_dense", "phi_deciduous_woodland"]
_sparse_woodland_cols = ["le_scrub", "nfi_sparse", "wood_pasture_park", "phi_traditional_orchard", "fr_tow"]

habitats_cols = {
    'mixed_woodland': _dense_woodland_cols + _sparse_woodland_cols,
    'dense_woodland': _dense_woodland_cols,
    'sparse_woodland': _sparse_woodland_cols,
    'moorland': ["le_comb", "phi_comb", "lcm_comb"],
    'upland_bog': ["le_bog", "phi_blanket_bog", "lcm_bog"],
    'grassland': ["le_unimproved_grass", "phi_comb", "lcm_comb"],
    'coastal': ["le_comb", "phi_comb", "lcm_comb", "ne_marine"],
    'saltmarsh': ["le_saltmarsh", "phi_saltmarsh", "lcm_saltmarsh", "ne_marine_saltmarsh"],
    'arable': ["le_comb", "phi_comb", "lcm_comb", "crome"],
    'water': ["le_comb", "lcm_comb", "phi_comb", "os_ngd_water"],
    'urban': ["le_urban", "lcm_comb", "ons_urban"],
}

In [0]:
# Accumulator for Table 1: the habitat loop in the next cell appends one dict per habitat
results = []

In [0]:
# Build Table 1: for every habitat, the extent (ha) of cells flagged by any of the
# habitat's indicator columns, split into freehold-only, leasehold-only and mixed tenure.

# Start from an empty list so that re-running this cell cannot append duplicate rows
results = []

# Iterate habitat parquet files
for habitat, indicators in habitats_cols.items():

  print(habitat)

  # If mixed woodland, load in dense and sparse and combine
  if habitat == 'mixed_woodland':
    dense = sedona.read.format("parquet").load(f"{alt_table_dir}/10m_x_assets_combined_dense_woodland.parquet")
    sparse = sedona.read.format("parquet").load(f"{alt_table_dir}/10m_x_assets_combined_sparse_woodland.parquet")
    # Geometry and tenure flags are taken from the dense table only
    sparse = sparse.drop('geometry', 'dgl_fh', 'dgl_lh')

    dense.createOrReplaceTempView("dense")
    sparse.createOrReplaceTempView("sparse")

    # NOTE(review): a left join keeps only ids present in the dense table - confirm both
    # tables cover the same set of cells, otherwise sparse-only cells are not counted
    data_in = dense.join(sparse, on="id", how="left")

  else:
    data_in = sedona.read.format("parquet").load(f"{alt_table_dir}/10m_x_assets_combined_{habitat}.parquet")

  data_in.createOrReplaceTempView("data_in")

  # Tenure splits are mutually exclusive: a flag is tested as 1 (held) or null (not held).
  # Keys are the output column names, in reporting order.
  fh_flag = data_in['dgl_fh']
  lh_flag = data_in['dgl_lh']
  tenure_conditions = {
    "freehold_ha": (fh_flag == 1) & lh_flag.isNull(),
    "leasehold_ha": (lh_flag == 1) & fh_flag.isNull(),
    "mixed_tenure_ha": (fh_flag == 1) & (lh_flag == 1),
  }

  # A cell counts towards the habitat when any indicator column equals 1.
  # Built once per habitat because it does not depend on tenure.
  indicator_condition = reduce(
    lambda acc, indicator: acc | (data_in[indicator] == 1),
    indicators,
    lit(False),
  )

  # Additional condition for upland bog (above moorland line)
  if habitat == "upland_bog":
    moorland_line_condition = col("moorland_line") == 1
    indicator_condition = moorland_line_condition & indicator_condition

  # Single pass over the habitat table: when() yields null for non-matching rows and
  # count() ignores nulls, so each aggregate equals filter(tenure & indicator).count()
  tenure_counts = data_in.filter(indicator_condition).agg(
    *[
      F.count(F.when(tenure_condition, 1)).alias(output_col)
      for output_col, tenure_condition in tenure_conditions.items()
    ]
  ).first()

  row = {"habitat": habitat}
  for output_col in tenure_conditions:
    # Divide the cell count by 100 to convert to ha
    row[output_col] = tenure_counts[output_col] / 100

  results.append(row)

In [0]:
# Turn the per-habitat result dicts into a pandas DataFrame (one row per habitat)
# and display it via the trailing bare expression
summary_df = pd.DataFrame(results)
summary_df

In [0]:
# Calc % of DGL
# Total habitat extent across the three exclusive tenure splits
summary_df['total_ha'] = summary_df['freehold_ha'] + summary_df['leasehold_ha'] + summary_df['mixed_tenure_ha']

# (output column, source area column, denominator in ha), in output column order
percentage_specs = [
    ('fh_%_dgl_all', 'freehold_ha', total_ha),
    ('fh_%_dgl_fh', 'freehold_ha', freehold_ha),
    ('lh_%_dgl_all', 'leasehold_ha', total_ha),
    ('lh_%_dgl_lh', 'leasehold_ha', leasehold_ha),
    ('mix_%_dgl_all', 'mixed_tenure_ha', total_ha),
    ('all_%_dgl_all', 'total_ha', total_ha),
]
for pct_col, area_col, denominator_ha in percentage_specs:
    summary_df[pct_col] = ((summary_df[area_col] / denominator_ha) * 100).round(2)

In [0]:
# Display the summary table including the percentage columns
summary_df

In [0]:
# Calc % of DGL as a proxy for double counting
# Habitats in this list are excluded as double counted habitats
excluded_habitats = ["saltmarsh", "upland_bog","dense_woodland","sparse_woodland"]
is_excluded = summary_df['habitat'].isin(excluded_habitats)
summary_df_subsets_rm = summary_df.loc[~is_excluded]

# Summed share and summed extent over the remaining habitats
tabulated_perc = summary_df_subsets_rm['all_%_dgl_all'].sum().round(2)
tabulated_ha = summary_df_subsets_rm['total_ha'].sum().round(2)
print(f'% of DGL covered by habitat extents: {tabulated_perc}')
# Extent summed over habitats minus the overall DGL extent
overlap_ha = (tabulated_ha-total_ha).round(2)
print(f'This means {overlap_ha} ha of land with overlap across multiple habitats')

In [0]:
# Write the Table 1 summary to CSV in the project Stats folder
summary_df.to_csv(
    path_or_buf='/dbfs/mnt/lab-res-a1001005/esd_project/Defra_Land/Final/Stats/dgl_overall_summary.csv'
)

In [0]:
# Pass the saved CSV to download_file and render the HTML it returns
# (presumably a download link - see sds_dash_download); move=False as in the original call
overall_summary_html = download_file('/dbfs/mnt/lab-res-a1001005/esd_project/Defra_Land/Final/Stats/dgl_overall_summary.csv', move=False)
displayHTML(overall_summary_html)

#### Table 2: Habitat broken down by Leasehold/Freehold & Owner Organisation

In [0]:
# Load the DGL organisation table (joined to the habitat tables on "id" further down)
ownership_path = "dbfs:/mnt/lab-res-a1001005/esd_project/Defra_Land/Assets/10m_x_dgl_organisation.parquet"
ownership = sedona.read.parquet(ownership_path)
# Expose the table to Spark SQL under the name "ownership", then preview it
ownership.createOrReplaceTempView("ownership")
ownership.display()

In [0]:
# Collect the distinct values of the organisation column into a plain Python list
org_rows = ownership.select("organisation").distinct().collect()
unique_orgs = [org_row["organisation"] for org_row in org_rows]
unique_orgs

In [0]:
# Accumulator for Table 2: the loop in the next cell appends one dict per
# habitat-organisation pair
results_org = []

In [0]:
# Build Table 2: for every habitat and organisation, the extent (ha) of cells flagged by
# any of the habitat's indicator columns, split into freehold-only, leasehold-only and
# mixed tenure.

# Start from an empty list so that re-running this cell cannot append duplicate rows
results_org = []

# Iterate habitat parquet files
for habitat, indicators in habitats_cols.items():

  print(habitat)

  # If mixed woodland, load in dense and sparse and combine
  if habitat == 'mixed_woodland':
    dense = sedona.read.format("parquet").load(f"{alt_table_dir}/10m_x_assets_combined_dense_woodland.parquet")
    sparse = sedona.read.format("parquet").load(f"{alt_table_dir}/10m_x_assets_combined_sparse_woodland.parquet")
    # Geometry and tenure flags are taken from the dense table only
    sparse = sparse.drop('geometry', 'dgl_fh', 'dgl_lh')

    dense.createOrReplaceTempView("dense")
    sparse.createOrReplaceTempView("sparse")

    # NOTE(review): a left join keeps only ids present in the dense table - confirm both
    # tables cover the same set of cells, otherwise sparse-only cells are not counted
    data_in = dense.join(sparse, on="id", how="left")

  else:
    data_in = sedona.read.format("parquet").load(f"{alt_table_dir}/10m_x_assets_combined_{habitat}.parquet")

  data_in.createOrReplaceTempView("data_in")

  # Attach the organisation to each row of the habitat table
  data_org = data_in.join(ownership, on="id", how="left")
  data_org.createOrReplaceTempView("data_org")

  # Tenure splits are mutually exclusive: a flag is tested as 1 (held) or null (not held).
  # Keys are the output column names, in reporting order.
  fh_flag = data_org['dgl_fh']
  lh_flag = data_org['dgl_lh']
  tenure_conditions = {
    "freehold_ha": (fh_flag == 1) & lh_flag.isNull(),
    "leasehold_ha": (lh_flag == 1) & fh_flag.isNull(),
    "mixed_tenure_ha": (fh_flag == 1) & (lh_flag == 1),
  }

  # A cell counts towards the habitat when any indicator column equals 1.
  # Built once per habitat because it depends on neither organisation nor tenure.
  indicator_condition = reduce(
    lambda acc, indicator: acc | (data_org[indicator] == 1),
    indicators,
    lit(False),
  )

  # Additional condition for upland bog (above moorland line)
  if habitat == "upland_bog":
    moorland_line_condition = col("moorland_line") == 1
    indicator_condition = moorland_line_condition & indicator_condition

  # One grouped aggregation per habitat instead of one count per organisation and tenure.
  # when() yields null for non-matching rows and count() ignores nulls, so each aggregate
  # equals filter(org & tenure & indicator).count() for that organisation.
  org_count_rows = (
    data_org.filter(indicator_condition)
    .groupBy(data_org['organisation'])
    .agg(
      *[
        F.count(F.when(tenure_condition, 1)).alias(output_col)
        for output_col, tenure_condition in tenure_conditions.items()
      ]
    )
    .collect()
  )
  counts_by_org = {count_row["organisation"]: count_row for count_row in org_count_rows}

  # Emit a row for every known organisation, including those with no matching cells
  for org in unique_orgs:

    # A null organisation never satisfies an equality test, so it always reports zero
    org_counts = counts_by_org.get(org) if org is not None else None

    row = {"habitat": habitat}
    row["organisation"] = org
    for output_col in tenure_conditions:
      # Divide the cell count by 100 to convert to ha
      row[output_col] = org_counts[output_col] / 100 if org_counts is not None else 0.0

    results_org.append(row)

In [0]:
summary_org_df = pd.DataFrame(results_org)
# Keep only habitat-organisation pairs with a non-zero extent in at least one tenure column
tenure_area_cols = ['freehold_ha', 'leasehold_ha', 'mixed_tenure_ha']
has_any_land = (summary_org_df[tenure_area_cols] != 0).any(axis=1)
summary_org_df = summary_org_df[has_any_land]
summary_org_df

In [0]:
# Write the Table 2 summary to CSV in the project Stats folder
summary_org_df.to_csv(
    path_or_buf='/dbfs/mnt/lab-res-a1001005/esd_project/Defra_Land/Final/Stats/dgl_organisational_summary.csv'
)

In [0]:
# Pass the saved CSV to download_file and render the HTML it returns
# (presumably a download link - see sds_dash_download); move=False as in the original call
org_summary_html = download_file('/dbfs/mnt/lab-res-a1001005/esd_project/Defra_Land/Final/Stats/dgl_organisational_summary.csv', move=False)
displayHTML(org_summary_html)