In [0]:
%python
from pyspark.sql.functions import col, when, lit, regexp_replace, substring, create_map
from pyspark.sql import functions as F
from itertools import chain

# Source locations for the three reference datasets used by this cell.
trusts_path = 'abfss://unrestricted@udalstdatacuratedprod.dfs.core.windows.net/reference/UKHD/ODS/NHS_Trusts_SCD/Published/1/UKHD_ODS_NHS_Trusts_SCD_00000.parquet'
sites_path = 'abfss://unrestricted@udalstdatacuratedprod.dfs.core.windows.net/reference/UKHD/ODS/NHS_Trust_Sites_Assets_And_Units_SCD/Published/1/UKHD_ODS_NHS_Trust_Sites_Assets_And_Units_SCD_00000.parquet'
parquet_directory = "abfss://unrestricted@udalstdatacuratedprod.dfs.core.windows.net/reference/UKHD/ODS/Postcode_Grid_Refs_Eng_Wal_Sco_And_NI_SCD/Published/1/"


def _read_reference_parquet(path):
    """Read a Parquet file or folder with the reader options shared by every source."""
    # NOTE(review): "header" is a CSV reader option; presumably it has no
    # effect on a Parquet read -- confirm before dropping it.
    reader = (
        spark.read
        .option("header", "true")
        .option("recursiveFileLookup", "true")  # pick up every file under a folder path
    )
    return reader.parquet(path)


def _without_spaces(column_name):
    """Return the named column with every space character removed."""
    return regexp_replace(col(column_name), " ", "")


# Only rows flagged Is_Latest == 1 are kept from each source.
is_latest = col("Is_Latest") == 1

postcode_grid_refs_Unf = _read_reference_parquet(parquet_directory).filter(is_latest)
df_b = _read_reference_parquet(trusts_path).filter(is_latest)
df_a = _read_reference_parquet(sites_path).filter(is_latest)

# Strip spaces from the postcode columns so both sides of the postcode joins
# compare values in the same format.
df_c = postcode_grid_refs_Unf.withColumn("Postcode_8_chars", _without_spaces("Postcode_8_chars"))
df_d = df_c.alias("d")  # second handle on the same lookup, for the site-postcode join

df_b = df_b.withColumn("Postcode", _without_spaces("Postcode"))
df_a = df_a.withColumn("Postcode", _without_spaces("Postcode"))

# Join operations
# One output row per (trust, site) pair: trusts ("b") are left-joined to their
# sites ("a") on the parent organisation code, then the postcode lookup is
# joined twice -- as "c" for the trust postcode and as "d" for the site postcode.
trusts_sites_df = (
    df_b.alias("b")
    .join(df_a.alias("a"), col("b.Organisation_Code") == col("a.Parent_Organisation_Code"), "left")
    .join(df_c.alias("c"), col("c.Postcode_8_chars") == col("b.Postcode"), "left")
    .join(df_d, col("d.Postcode_8_chars") == col("a.Postcode"), "left")
    .select(
        # Take the trust code from "b": every "a.*" column is NULL on rows where
        # the left join found no site, which would leave the trust name and
        # postcode populated but its code (and the MH flag below) empty.
        # On matched rows the two columns are equal by the join condition.
        col("b.Organisation_Code").alias("Parent_Organisation_Code"),
        col("b.Organisation_Name").alias("Parent_Organisation_Name"),
        col("b.Postcode").alias("Parent_Organisation_Postcode"),
        # NOTE(review): this is the first three characters of the space-stripped
        # postcode, which matches the outward code only when that code is three
        # characters long (e.g. "B74BN" yields "B74") -- confirm this is intended.
        substring(col("b.Postcode"), 1, 3).alias("Parent_Organisation_Postcode_District"),
        col("c.yr2011_LSOA").alias("Parent_Organisation_yr2011_LSOA"),
        # 1 for the listed trust codes, NULL otherwise. Keep this list in sync
        # with the keys of mh_mapping.
        when(
            col("b.Organisation_Code").isin(['RAT','RKL','RPG','RQY','RRP','RV3','RV5','RWK','TAF']),
            lit(1),
        ).otherwise(lit(None)).alias("MH_Trust_Flag"),
        # Typed NULL placeholder that fixes the column's position and type;
        # the value is filled in afterwards with withColumn.
        lit(None).cast("string").alias("MH_Provider_Abbrev"),
        col("a.Organisation_Code").alias("Site_Organisation_Code"),
        col("a.Organisation_Name").alias("Site_Name"),
        col("a.Postcode").alias("Site_Postcode"),
        substring(col("a.Postcode"), 1, 3).alias("Site_Postcode_District"),
        col("d.yr2011_LSOA").alias("Site_yr2011_LSOA")
    )
)


# Trust-level table: the distinct (code, name, postcode) combinations found in
# the joined trust/site table.
distinct_parent_orgs = trusts_sites_df.select(
    "Parent_Organisation_Code",
    "Parent_Organisation_Name",
    "Parent_Organisation_Postcode",
).distinct()

# Update MH Provider Abbrev
# Trust code -> provider abbreviation.
mh_mapping = {
    "RAT": "NELFT", "RKL": "WLT", "RV3": "CNWL", "RPG": "OXLEAS",
    "RWK": "ELFT", "RRP": "BEH", "RQY": "SWLStG", "RV5": "SLAM", "TAF": "CANDI"
}

# Flatten the dict into key, value, key, value, ... literals to build a Spark
# map column from it.
mapping_expr = create_map(
    [lit(token) for code_and_abbrev in mh_mapping.items() for token in code_and_abbrev]
)

# Look the abbreviation up only for codes present in the mapping; every other
# row gets NULL. The same expression is applied to both tables.
parent_code = col("Parent_Organisation_Code")
mh_provider_abbrev = when(
    parent_code.isin(list(mh_mapping)),
    mapping_expr[parent_code],
).otherwise(lit(None))

distinct_parent_orgs = distinct_parent_orgs.withColumn("MH_Provider_Abbrev", mh_provider_abbrev)
trusts_sites_df = trusts_sites_df.withColumn("MH_Provider_Abbrev", mh_provider_abbrev)

# Register the trust-level table as the temporary view "TrustOnly".
distinct_parent_orgs.createOrReplaceTempView("TrustOnly")

# Write both tables as Delta, overwriting whatever is already at each target.
trusts_only_target = "abfss://analytics-projects@udalstdataanalysisprod.dfs.core.windows.net/PATLondon/MHUEC_Reference_Files/Trusts_Only/"
trusts_and_sites_target = "abfss://analytics-projects@udalstdataanalysisprod.dfs.core.windows.net/PATLondon/MHUEC_Reference_Files/Trusts_and_Sites/"

for output_df, target in (
    (distinct_parent_orgs, trusts_only_target),
    (trusts_sites_df, trusts_and_sites_target),
):
    output_df.write.format("delta").mode("overwrite").save(target)



Parent_Organisation_Code,Parent_Organisation_Name,Parent_Organisation_Postcode,Parent_Organisation_Postcode_District,Parent_Organisation_yr2011_LSOA,MH_Trust_Flag,MH_Provider_Abbrev,Site_Organisation_Code,Site_Name,Site_Postcode,Site_Postcode_District,Site_yr2011_LSOA
RY4,HERTFORDSHIRE COMMUNITY NHS TRUST,AL71BW,AL7,E01023954,,,RY459,HARPENDEN MEMORIAL HOSPITAL,AL54TA,AL5,E01023689
RYX,CENTRAL LONDON COMMUNITY HEALTHCARE NHS TRUST,NW15JD,NW1,E01004717,,,Q9E3Y,HARPENDEN MEMORIAL HOSPITAL,AL54TA,AL5,E01023689
RYW,BIRMINGHAM COMMUNITY HEALTHCARE NHS FOUNDATION TRUST,B74BN,B74,E01033561,,,RYWL4,WAND MEDICAL CENTRE,B120UF,B12,E01009362
RYW,BIRMINGHAM COMMUNITY HEALTHCARE NHS FOUNDATION TRUST,B74BN,B74,E01033561,,,RYW08,GREEN ROAD,B288DD,B28,E01009046
RYW,BIRMINGHAM COMMUNITY HEALTHCARE NHS FOUNDATION TRUST,B74BN,B74,E01033561,,,RYWC5,ELLIOT ROAD STORES - UNITS 7 & 8,B296LR,B29,E01009284
RYA,WEST MIDLANDS AMBULANCE SERVICE UNIVERSITY NHS FOUNDATION TRUST,DY51LX,DY5,E01009746,,,RYAEA,SHELDON CAS-CP,B330SE,B33,E01009512
RYA,WEST MIDLANDS AMBULANCE SERVICE UNIVERSITY NHS FOUNDATION TRUST,DY51LX,DY5,E01009746,,,RYAE0,SHELDON CAS-CP,B330SE,B33,E01009512
RYG,COVENTRY AND WARWICKSHIRE PARTNERSHIP NHS TRUST,CV66NY,CV6,E01009589,,,RYG23,WATER ORTON CLINIC,B461RD,B46,E01031042
RYA,WEST MIDLANDS AMBULANCE SERVICE UNIVERSITY NHS FOUNDATION TRUST,DY51LX,DY5,E01009746,,,RYACY,REDDITCH CHURCHILL CAS-CP,B980RE,B98,E01032275
RBA,TAUNTON AND SOMERSET NHS FOUNDATION TRUST,TA15DA,TA1,E01029302,,,RBAH5,BATH FERTILITY CENTRE,BA28SG,BA2,E01014442
