In [0]:
%python
# Import necessary libraries
from pyspark.sql.functions import col, when, row_number, regexp_replace, lit, count
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, StringType
# Load the source tables and reference files into DataFrames


def _read_parquet_tree(path):
    """Read the parquet files under *path*, descending into sub-folders."""
    # NOTE(review): "header" is normally a CSV reader option and is presumably
    # ignored by the parquet reader - confirm before removing it.
    reader = spark.read.option("header", "true").option("recursiveFileLookup", "true")
    return reader.parquet(path)


# Metastore tables
gp_hierarchies_all_df = spark.read.table('hive_metastore.reporting_ukhd_ods.gp_hierarchies_all')
national_statistics_postcode_lookup_df = spark.table("ukhd_ods.national_statistics_postcode_lookup_scd")

# Parquet reference folders
rightcare_practice_ccg_pcn_quarter_lookup_df = _read_parquet_tree(
    'abfss://unrestricted@udalstdatacuratedprod.dfs.core.windows.net/reference/Internal/Reference/RightCare_practice_CCG_pcn_quarter_lookup/Published/'
)
commissioner_hierarchies_df = _read_parquet_tree(
    'abfss://reporting@udalstdatacuratedprod.dfs.core.windows.net/unrestricted/reference/UKHD/ODS/Commissioner_Hierarchies_ICB/'
)

parquet_directory = "abfss://unrestricted@udalstdatacuratedprod.dfs.core.windows.net/reference/UKHD/ODS/Postcode_Grid_Refs_Eng_Wal_Sco_And_NI_SCD/Published/1/"

# Postcode grid references, plus a copy of Postcode_8_chars with the spaces removed.
# NOTE(review): this DataFrame is not referenced again in the visible part of
# the notebook - confirm whether it is still needed.
postcode_grid_refs_Unf = _read_parquet_tree(parquet_directory).withColumn(
    "PCDS_NoGaps",
    regexp_replace(col("Postcode_8_chars"), " ", ""),
)
 

# 1. TempGP1: GP hierarchy rows (B) left-joined to the practice/CCG lookup (C)
#    on the practice code, then to the commissioner hierarchy (D) on the
#    2019/20 Q4 CCG code.
_gp1_practice_lookup = rightcare_practice_ccg_pcn_quarter_lookup_df.alias("C")
_gp1_commissioners = commissioner_hierarchies_df.alias("D")

_gp1_joined = (
    gp_hierarchies_all_df.alias("B")
    .join(_gp1_practice_lookup, on=col("B.GP_Code") == col("C.Practice"), how="left")
    .join(_gp1_commissioners, on=col("D.Organisation_Code") == col("C.CCG2019_20_Q4"), how="left")
)

temp_gp1 = _gp1_joined.select(
    col("B.GP_Code"),
    col("B.GP_PCN_Code"),
    col("B.GP_PCN_Name"),
    col("B.GP_STP_Code"),
    # Organisation-type wording is removed from the display names
    regexp_replace(col("B.GP_STP_Name"), " INTEGRATED CARE BOARD", "").alias("GP_STP_Name"),
    col("B.GP_Region_Code"),
    regexp_replace(col("B.GP_Region_Name"), " COMMISSIONING REGION", "").alias("GP_Region_Name"),
    col("C.PRACTICE"),
    col("C.CCG2019_20_Q4"),
    regexp_replace(col("D.Organisation_Name"), " CCG", "").alias("New_CCG"),
)

# 2. TempGP2: for every (PCN code, PCN name, CCG) combination in TempGP1,
#    GPS is the number of rows with a non-null GP_PCN_Name.
_gp2_group_keys = ["GP_PCN_Code", "GP_PCN_Name", "New_CCG"]
temp_gp2 = (
    temp_gp1
    .groupBy(*_gp2_group_keys)
    .agg(count(col("GP_PCN_Name")).alias("GPS"))
)

# 3. TempGP3: rank the CCGs of each PCN by how many GP rows they have.
#    LA_ORDER == 1 marks the CCG with the highest GPS count for the PCN.
#    row_number() needs a total ordering to be repeatable: ordering by GPS
#    alone leaves the winner arbitrary when two CCGs tie, so New_CCG is used
#    as a tie-breaker (a named CCG is preferred over a missing one).
window_spec = Window.partitionBy("GP_PCN_Code", "GP_PCN_Name").orderBy(
    col("GPS").desc(),
    col("New_CCG").asc_nulls_last(),
)
temp_gp3 = temp_gp2.withColumn("LA_ORDER", row_number().over(window_spec))

# 4. TempGP4: GP hierarchy rows (B) joined to the practice/CCG lookup (C) and
#    to the top-ranked CCG of the matching PCN (Z, LA_ORDER == 1), with
#    postcode prefixes, a per-GP row rank and NULL placeholder columns.

# GP postcode with the spaces removed; PCDS_7 .. PCDS_4 are prefixes of it.
_gp4_postcode_no_gaps = regexp_replace("B.GP_Postcode", " ", "")

# Row rank inside each GP_Code: rows with a non-null GP_PCN_Rel_End_Date come
# first (latest end date first); rows with a null end date come last.
# NOTE(review): if a null end date means "relationship still active", this
# ranks ended relationships ahead of the current one - confirm the intent.
_gp4_order_window = Window.partitionBy("B.GP_Code").orderBy(
    when(col("B.GP_PCN_Rel_End_Date").isNull(), 1).otherwise(0),
    col("B.GP_PCN_Rel_End_Date").desc(),
)

# Typed NULL used for the columns that are only placeholders here.
_gp4_null_string = lit(None).cast("string")

temp_gp4 = (
    gp_hierarchies_all_df.alias("B")
    .join(
        rightcare_practice_ccg_pcn_quarter_lookup_df.alias("C"),
        on=col("B.GP_Code") == col("C.Practice"),
        how="left",
    )
    .join(
        temp_gp3.alias("Z"),
        on=col("Z.GP_PCN_Code") == col("B.GP_PCN_Code"),
        how="left",
    )
    # NOTE(review): filtering on a column of the right-hand side discards the
    # unmatched rows of the left join, so rows of B without a matching PCN in
    # TempGP3 are dropped (the join behaves as an inner join) - confirm intent.
    .filter(col("Z.LA_ORDER") == 1)
    .select(
        "B.GP_Code",
        "B.GP_Name",
        "B.GP_PCN_Code",
        "B.GP_PCN_Name",
        "B.GP_STP_Code",
        regexp_replace("B.GP_STP_Name", " INTEGRATED CARE BOARD", "").alias("GP_STP_Name"),
        "B.GP_Region_Code",
        regexp_replace("B.GP_Region_Name", " COMMISSIONING REGION", "").alias("GP_Region_Name"),
        col("C.PRACTICE").alias("Practice_code"),
        "C.CCG2019_20_Q4",
        _gp4_postcode_no_gaps.alias("PCDS_NoGaps"),
        # NOTE(review): Spark's substr is 1-based; a start position of 0
        # appears to be treated like 1, so these take the leading n
        # characters - confirm.
        *[
            _gp4_postcode_no_gaps.substr(0, n).alias(f"PCDS_{n}")
            for n in (7, 6, 5, 4)
        ],
        # NOTE(review): unlike PCDS_4..PCDS_7, PCDS_3 is cut from the raw
        # postcode (spaces kept) - confirm this difference is intended.
        col("B.GP_Postcode").substr(0, 3).alias("PCDS_3"),
        _gp4_null_string.alias("2019_CCG_Name"),
        "Z.New_CCG",
        row_number().over(_gp4_order_window).alias("GP_ORDER"),
        *[
            _gp4_null_string.alias(placeholder)
            for placeholder in (
                "Lower_Super_Output_Area_Code",
                "Lower_Super_Output_Area_Name",
                "Middle_Super_Output_Area_Code",
                "Middle_Super_Output_Area_Name",
                "Longitude",
                "Latitude",
                "Spatial_Accuracy",
            )
        ],
    )
)

# 5. TempGP6: distinct postcode -> local authority / LSOA / MSOA / coordinates
#    rows taken from the national statistics postcode lookup.
_gp6_columns = [
    "PCDS_NoGaps",
    "Postcode_1",
    "Postcode_2",
    "Postcode_3",
    "Local_Authority_Code",
    "Local_Authority_Name",
    "Lower_Super_Output_Area_Code",
    "Lower_Super_Output_Area_Name",
    "Middle_Super_Output_Area_Code",
    "Middle_Super_Output_Area_Name",
    "Longitude",
    "Latitude",
]
temp_gp6 = national_statistics_postcode_lookup_df.select(*_gp6_columns).distinct()

# 6. TempGP7: the GP_ORDER == 1 row of each GP from TempGP4 (b), enriched with
#    the geography of its postcode from TempGP6 (la) via a left join on the
#    space-free postcode.
_gp7_columns = [
    # GP, PCN, ICB (STP) and region attributes
    "b.GP_Code",
    "b.Practice_code",
    "b.GP_Name",
    "b.GP_PCN_Code",
    "b.GP_PCN_Name",
    "b.GP_STP_Code",
    "b.GP_STP_Name",
    "b.GP_Region_Code",
    "b.GP_Region_Name",
    "b.CCG2019_20_Q4",
    "b.PCDS_NoGaps",
    "b.2019_CCG_Name",
    # Geography looked up from the postcode
    "la.Local_Authority_Name",
    "la.Lower_Super_Output_Area_Code",
    "la.Lower_Super_Output_Area_Name",
    "la.Middle_Super_Output_Area_Code",
    "la.Middle_Super_Output_Area_Name",
    "la.Longitude",
    "la.Latitude",
    "b.Spatial_Accuracy",
]
temp_gp7 = (
    temp_gp4.alias("b")
    .join(
        temp_gp6.alias("la"),
        on=col("la.PCDS_NoGaps") == col("b.PCDS_NoGaps"),
        how="left",
    )
    .filter(col("b.GP_ORDER") == 1)
    .select(*_gp7_columns)
)
# Target schema of the GP reference output: every column is a string, in the
# same order as the TempGP7 projection.
schema = StructType([
    StructField("GP_Code", StringType()),
    StructField("Practice_code", StringType()),
    StructField("GP_Name", StringType()),
    StructField("GP_PCN_Code", StringType()),
    StructField("GP_PCN_Name", StringType()),
    StructField("GP_STP_Code", StringType()),
    StructField("GP_STP_Name", StringType()),
    StructField("GP_Region_Code", StringType()),
    StructField("GP_Region_Name", StringType()),
    StructField("CCG2019_20_Q4", StringType()),
    StructField("PCDS_NoGaps", StringType()),
    StructField("2019_CCG_Name", StringType()),
    StructField("Local_Authority_Name", StringType()),
    StructField("Lower_Super_Output_Area_Code", StringType()),
    StructField("Lower_Super_Output_Area_Name", StringType()),
    StructField("Middle_Super_Output_Area_Code", StringType()),
    StructField("Middle_Super_Output_Area_Name", StringType()),
    StructField("Longitude", StringType()),
    StructField("Latitude", StringType()),
    StructField("Spatial_Accuracy", StringType()),
])

# Apply the schema with column-wise casts instead of rebuilding the DataFrame
# from its RDD: createDataFrame(rdd, schema) ships every row through Python
# and does not convert values, so it fails schema verification as soon as a
# source column (e.g. a numeric Longitude/Latitude) is not already a string.
# Names are back-quoted so that "2019_CCG_Name" is always read as one column.
temp_gp7 = temp_gp7.select(
    [
        col(f"`{field.name}`").cast(field.dataType).alias(field.name)
        for field in schema.fields
    ]
)
display(temp_gp7)

# Publish the postcode -> local authority lookup and the GP reference data as
# Delta tables, replacing the previous contents.
temp_gp6.write.format("delta").mode("overwrite").option("mergeSchema", "true").save("abfss://analytics-projects@udalstdataanalysisprod.dfs.core.windows.net/PATLondon/MHUEC_Reference_Files/PostCode_to_LA/")
temp_gp7.write.format("delta").mode("overwrite").option("mergeSchema", "true").save("abfss://analytics-projects@udalstdataanalysisprod.dfs.core.windows.net/PATLondon/MHUEC_Reference_Files/GP_Data/")


GP_Code,Practice_code,GP_Name,GP_PCN_Code,GP_PCN_Name,GP_STP_Code,GP_STP_Name,GP_Region_Code,GP_Region_Name,CCG2019_20_Q4,PCDS_NoGaps,2019_CCG_Name,Local_Authority_Name,Lower_Super_Output_Area_Code,Lower_Super_Output_Area_Name,Middle_Super_Output_Area_Code,Middle_Super_Output_Area_Name,Longitude,Latitude,Spatial_Accuracy
E82077,E82077,DAVENPORT HOUSE SURGERY,U08579,HARPENDEN HEALTH PCN,QM7,NHS HERTFORDSHIRE AND WEST ESSEX,Y61,EAST OF ENGLAND,06N,AL54HX,,St Albans,E01023701,St Albans 003C,E02004926,St Albans 003,-0.354549,51.816177,
E82019,E82019,BRIDGE COTTAGE SURGERY,U15138,HERTFORD AND RURALS PCN,QM7,NHS HERTFORDSHIRE AND WEST ESSEX,Y61,EAST OF ENGLAND,06K,AL69EF,,Welwyn Hatfield,E01023969,Welwyn Hatfield 002C,E02004981,Welwyn Hatfield 002,-0.215995,51.830002,
M85756,M85756,SPRINGFIELD MEDICAL PRACT,U54948,"BALSALL HEATH, SPARKHILL & MOSELEY PCN",QHL,NHS BIRMINGHAM AND SOLIHULL,Y60,MIDLANDS,15E,B114DG,,Birmingham,E01009393,Birmingham 143D,E02007040,Birmingham 143,-1.859629,52.444953,
M85084,M85084,THE WAND MEDICAL CENTRE,U27129,EDGBASTON PCN,QHL,NHS BIRMINGHAM AND SOLIHULL,Y60,MIDLANDS,15E,B120UF,,Birmingham,E01009362,Birmingham 071A,E02001897,Birmingham 071,-1.888682,52.464187,
M85164,M85164,NEWPORT MEDICAL GROUP,U48923,SWB URBAN HEALTH PCN,QHL,NHS BIRMINGHAM AND SOLIHULL,Y60,MIDLANDS,05L,B128QE,,Birmingham,E01033648,Birmingham 084F,E02001910,Birmingham 084,-1.879633,52.45223,
M85783,M85783,STRENSHAM ROAD SURGERY,U22471,SMARTCARE CENTRAL PCN,QHL,NHS BIRMINGHAM AND SOLIHULL,Y60,MIDLANDS,15E,B129RR,,Birmingham,E01009182,Birmingham 092A,E02001918,Birmingham 092,-1.892563,52.453843,
M85753,M85753,VICARAGE ROAD SURGERY,U22471,SMARTCARE CENTRAL PCN,QHL,NHS BIRMINGHAM AND SOLIHULL,Y60,MIDLANDS,15E,B147NH,,Birmingham,E01008963,Birmingham 107B,E02001933,Birmingham 107,-1.906686,52.427595,
M85177,M85177,SWANSWELL MEDICAL CENTRE,U25587,COMMUNITY CARE HALL GREEN PCN,QHL,NHS BIRMINGHAM AND SOLIHULL,Y60,MIDLANDS,15E,B277AL,,Solihull,E01010174,Solihull 013A,E02002093,Solihull 013,-1.816529,52.432515,
M81025,M81025,NEW ROAD SURGERY,U84529,REDDITCH & BROMSGROVE & DISTRICT PCN,QGH,NHS HEREFORDSHIRE AND WORCESTERSHIRE,Y60,MIDLANDS,05J,B459HY,,Bromsgrove,E01032172,Bromsgrove 003D,E02006698,Bromsgrove 003,-2.022092,52.392321,
M88009,M88009,NORVIC FAMILY PRACTICE,U98291,SWB TOGETHER4HEALTHCARE PCN,QUA,NHS BLACK COUNTRY,Y60,MIDLANDS,05L,B663PZ,,Sandwell,E01010064,Sandwell 040D,E02006943,Sandwell 040,-1.960133,52.489728,
