In [0]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window
import json
from delta import DeltaTable
from datetime import *
from pyspark.storagelevel import *


In [0]:
spark.conf.set("spark.sql.adaptive.enabled", "true")

In [0]:
spark.conf.set("spark.shuffle.partion", "200")
spark.catalog.clearCache()

In [0]:
spark.sql("""DROP TABLE IF EXISTS correlation_lookup""")
dbutils.fs.rm("/mnt/ci-patronage/delta_tables/correlation_lookup", True)

In [0]:
# psa = "/mnt/ci-mvi/Processed/SVeteran.SMVIPersonSiteAssociation/"
# person = "dbfs:/mnt/ci-mvi/Processed/SVeteran.SMVIPerson"
# mvi_institution = "/mnt/ci-mvi/Raw/NDim.MVIInstitution/"

data_sources = {
    "psa": {
        "path": "/mnt/ci-mvi/Processed/SVeteran.SMVIPersonSiteAssociation/",
        "columns": [
        "MVIPersonICN",
        "MVITreatingFacilityInstitutionSID",
        "TreatingFacilityPersonIdentifier",
        "ActiveMergedIdentifier",
        "CorrelationModifiedDateTime",
        ],
        "filter_condition": ((col("MVIPersonICN").isNotNull()) &
            (col("TreatingFacilityPersonIdentifier").rlike("^[0-9]+$")) & 
            # (col("MVITreatingFacilityInstitutionSID").isin([5667, 6061, 6722])) &
            ((col("ActiveMergedIdentifier") == "Active") | col("ActiveMergedIdentifier").isNull())),
        "WindowSpec": Window.partitionBy('MVIPersonICN', 'MVITreatingFacilityInstitutionSID', 'TreatingFacilityPersonIdentifier').orderBy(desc('CorrelationModifiedDateTime'))
    },
    "person": {
        "path":"/mnt/ci-mvi/Processed/SVeteran.SMVIPerson/",
        "columns" :["MVIPersonICN", "ICNStatus"],
        "filter_condition": (col("rnk") == 1),
        "WindowSpec": Window.partitionBy('MVIPersonICN').orderBy(desc('calc_IngestionTimestamp'))
        },
    "mvi_institution":{
        "path":"/mnt/ci-mvi/Raw/NDim.MVIInstitution/",
        "columns": ["MVIInstitutionSID", "InstitutionCode"],
        "filter_condition":(col("MVIInstitutionSID").rlike("^[0-9]+$")),
        "WindowSpec":""
    }
}

def read_parquet(source_key):
    source_info = data_sources[source_key]
    path = source_info["path"]
    return spark.read.parquet(path)


In [0]:
# Read and filter raw PSA DataFrame
raw_psa_df = (read_parquet("psa")
    .filter(data_sources["psa"]["filter_condition"])
    .withColumn("rnk", rank().over(data_sources["psa"]["WindowSpec"]))
    .filter(col("rnk") == 1)
    .select(data_sources["psa"]["columns"])
    .repartition(200, "MVIPersonICN")
)

# Read and filter Person DataFrame
person_df = (
    read_parquet("person")
    .withColumn(
        "rnk",
        rank().over(data_sources["person"]["WindowSpec"]),
    )
    .filter(data_sources["person"]["filter_condition"])
    .select(data_sources["person"]["columns"])
    .repartition(200, "MVIPersonICN")
)

# Read and filter Institutions DataFrame

institutions_df = (
    read_parquet("mvi_institution")
    .filter(data_sources["mvi_institution"]["filter_condition"])
    .select(data_sources["mvi_institution"]["columns"])
    .distinct()
)

# Join DataFrames 
joined_df = (
    raw_psa_df
    .join(
        broadcast(institutions_df),
        raw_psa_df["MVITreatingFacilityInstitutionSID"]
        == institutions_df["MVIInstitutionSID"],
        "left",
    )
    .select(
        raw_psa_df["MVIPersonICN"],
        institutions_df["InstitutionCode"],
        institutions_df["MVIInstitutionSID"],
        raw_psa_df["TreatingFacilityPersonIdentifier"],
        raw_psa_df["CorrelationModifiedDateTime"]
    )
    .distinct()
).repartition(200, "MVIPersonICN").cache()

# Find Duplicates
person_institution_dups = (
    joined_df.groupBy("MVIPersonICN", "InstitutionCode")
    .count()
    .filter(col("count") > 1)
    .withColumnRenamed("count", "count_iens")
)
institution_tfpi_dups = (
    joined_df.groupBy("InstitutionCode", "TreatingFacilityPersonIdentifier")
    .count()
    .filter(col("count") > 1)
    .withColumnRenamed("count", "count_icns")
)
# Clean correlations, filtering out duplicates

correlations_df = (
    joined_df
    .join(broadcast(person_institution_dups), ["MVIPersonICN", "InstitutionCode"], "left")
    .join(broadcast(institution_tfpi_dups), ["InstitutionCode", "TreatingFacilityPersonIdentifier"], "left", )
).repartition(200, "MVIPersonICN").cache()

duplicate_correlations_df = correlations_df.filter(col("count_iens").isNotNull() | col("count_icns").isNotNull())

clean_correlations_df = (correlations_df
                      .filter(col("count_iens").isNull())
                      .filter(col("count_icns").isNull())
                      .drop("count_iens", "count_icns")
                      .select("MVIPersonICN",  "CorrelationModifiedDateTime", "InstitutionCode", "TreatingFacilityPersonIdentifier")
                      .repartition(200, "MVIPersonICN").cache()
)
# Get max date for each MVIPersonICN
max_date_df = (
    joined_df
    .groupBy(col("MVIPersonICN"))
    .agg(max("CorrelationModifiedDateTime").alias("Last_Modified_Date"))
).repartition(200, "MVIPersonICN").cache()

In [0]:
start = datetime.now()
correlation_lookup_df = (
    max_date_df.join(clean_correlations_df, ['MVIPersonICN'], 'left').join(person_df, ['MVIPersonICN'], "left")
    .groupBy("MVIPersonICN","ICNStatus", "Last_Modified_Date")
    .pivot("InstitutionCode")
    .agg(first("TreatingFacilityPersonIdentifier"))
    .withColumnRenamed("200CORP", "participant_id")
    .withColumnRenamed("200DOD", "edipi")
    .withColumnRenamed("200VETS", "va_profile_id")
).select("MVIPersonICN", "participant_id", "edipi", "va_profile_id", "ICNStatus", "Last_Modified_Date").cache()
end = datetime.now()

print(f"Time taken: {end - start}")



In [0]:
#  0:02:46.173652

In [0]:
json_format = (
    max_date_df.join(clean_correlations_df, ['MVIPersonICN'], 'left').join(person_df, ['MVIPersonICN'], "left")
    .groupBy("MVIPersonICN", "Last_Modified_Date", "ICNStatus")
    .agg(
        collect_list(
                struct(
                    col("InstitutionCode").alias("Institution"),
                    col("TreatingFacilityPersonIdentifier").alias("Identifier")
                )
        ).alias("InstitutionIDs")
    )
).cache()

json_format.createOrReplaceTempView("temp_json")

# Usage:
# SELECT MVIPersonICN, Last_Modified_Date, ICNStatus, Inst FROM temp_json lateral view explode(InstitutionIDS) as Inst where Inst.Institution = '200DOD'


In [0]:
correlation_lookup_df.write.option("path", "/mnt/ci-patronage/delta_tables/correlation_lookup").saveAsTable("correlation_lookup")
# duplicate_correlations_df.write.option("path","/mnt/ci-patronage/delta_tables/duplicate_correlations").saveAsTable("duplicate_correlations")
# json_format.write.option("path", "/mnt/ci-patronage/delta_tables/CorrelationsForAllInstitutions").saveAsTable("CorrelationsForAllInstitutions")



In [0]:
%sql
SELECT MVIPersonICN, Last_Modified_Date, ICNStatus, Inst.Institution, Inst.Identifier FROM temp_json lateral view explode(InstitutionIDS) as Inst where Inst.Institution = '200DOD'

In [0]:
%sql
SELECT MVIPersonICN, Last_Modified_Date, ICNStatus, Inst.Identifier as 200CORP FROM temp_json lateral view explode(InstitutionIDS) as Inst where Inst.Institution = '200CORP'


In [0]:
json_format.filter(json_format.MVIPersonICN == 1006433939).display()

In [0]:
%sql
SELECT *
FROM correlation_lookup

In [0]:
duplicate_correlations.createOrReplaceTempView("duplicate_correlations")

In [0]:
%sql
SELECT count(*) 
FROM 
-- correlation_lookup a
delta.`/mnt/Patronage/identity_correlations` a
where a.MVIPersonICN not in (SELECT b.MVIPersonICN FROM delta.`/mnt/ci-patronage/delta_tables/correlation_lookup` b)
-- NOT EXISTS (SELECT 1 FROM delta.`/mnt/ci-patronage/delta_tables/correlation_lookup` a 
                --   where a.participant_id = b.participant_id and a.participant_id is not null and b.participant_id is not null)


In [0]:
%sql
SELECT * 
-- FROM duplicate_correlations
FROM delta.`/mnt/Patronage/duplicate_identity_correlations` 
where MVIPersonICN in (1019740818, 1081694406,1083817600, 1038815991, 1083491719)

In [0]:
%sql
SELECT count(MVIPersonICN), count(participant_id), count(edipi), count(va_profile_id), 'his' FROM delta.`/mnt/Patronage/identity_correlations` 
union
SELECT count(MVIPersonICN), count(participant_id), count(edipi), count(va_profile_id), 'my' FROM correlation_lookup
-- delta.`/mnt/ci-patronage/delta_tables/correlation_lookup` 

In [0]:
%sql
SELECT count(MVIPersonICN), count(participant_id), count(edipi), count(va_profile_id), 'his' FROM delta.`/mnt/Patronage/identity_correlations` 
union
SELECT count(MVIPersonICN), count(participant_id), count(edipi), count(va_profile_id), 'my' FROM correlation_lookup
-- delta.`/mnt/ci-patronage/delta_tables/correlation_lookup` 

In [0]:
%sql
SELECT * 
-- FROM correlation_lookup a
FROM delta.`/mnt/ci-patronage/delta_tables/correlation_lookup` a
where NOT EXISTS (SELECT 1 FROM delta.`/mnt/Patronage/identity_correlations` b where a.va_profile_id = b.va_profile_id  and b.va_profile_id is not null)
and  a.va_profile_id is not null

In [0]:
%sql
SELECT * FROM delta.`/mnt/Patronage/identity_correlations` WHERE MVIPersonICN in (1061990009,1058733289,1039175664,1045235898,1004772523,1058437895)

In [0]:
%sql
SELECT * FROM delta.`/mnt/Patronage/duplicate_identity_correlations`
-- WHERE TreatingFacilityPersonIdentifier  in (31131419,32405689,31083987,5583865,14705650,32252045)
WHERE MVIPersonICN in (1061990009,1058733289,1039175664,1045235898,1004772523,1058437895)

In [0]:
# Import required libraries
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from delta import DeltaTable
from datetime import datetime
from pyspark.storagelevel import StorageLevel

In [0]:
# Set Spark configurations for performance
spark.conf.set("spark.sql.adaptive.enabled", "true")
spark.conf.set("spark.shuffle.partitions", "200")
spark.catalog.clearCache()

In [0]:
# Utility: Data source configuration
DATA_SOURCES = {
    "psa": {
        "path": "/mnt/ci-mvi/Processed/SVeteran.SMVIPersonSiteAssociation/",
        "columns": [
            "MVIPersonICN",
            "MVITreatingFacilityInstitutionSID",
            "TreatingFacilityPersonIdentifier",
            "ActiveMergedIdentifier",
            "CorrelationModifiedDateTime",
        ],
        "filter_condition": (
            (col("MVIPersonICN").isNotNull()) &
            (col("TreatingFacilityPersonIdentifier").rlike("^[0-9]+$")) &
            ((col("ActiveMergedIdentifier") == "Active") | col("ActiveMergedIdentifier").isNull())
        ),
        "window_spec": Window.partitionBy('MVIPersonICN', 'MVITreatingFacilityInstitutionSID', 'TreatingFacilityPersonIdentifier').orderBy(desc('CorrelationModifiedDateTime'))
    },
    "person": {
        "path": "/mnt/ci-mvi/Processed/SVeteran.SMVIPerson/",
        "columns": ["MVIPersonICN", "ICNStatus"],
        "filter_condition": (col("rnk") == 1),
        "window_spec": Window.partitionBy('MVIPersonICN').orderBy(desc('calc_IngestionTimestamp'))
    },
    "mvi_institution": {
        "path": "/mnt/ci-mvi/Raw/NDim.MVIInstitution/",
        "columns": ["MVIInstitutionSID", "InstitutionCode"],
        "filter_condition": (col("MVIInstitutionSID").rlike("^[0-9]+$")),
        "window_spec": None
    }
}

In [0]:
# Function: Read parquet data for a given source
# Returns a DataFrame

def read_parquet(source_key: str) -> DataFrame:
    """Read parquet data for the given source key."""
    source = DATA_SOURCES[source_key]
    return spark.read.parquet(source["path"])

In [0]:
# Function: Prepare PSA DataFrame (optimized: no repartition/cache here)
def prepare_psa_df() -> DataFrame:
    """Read and filter the PSA DataFrame."""
    src = DATA_SOURCES["psa"]
    df = (
        read_parquet("psa")
        .filter(src["filter_condition"])
        .withColumn("rnk", rank().over(src["window_spec"]))
        .filter(col("rnk") == 1)
        .select(src["columns"])
    )
    return df

In [0]:
# Function: Prepare Person DataFrame (optimized: no repartition/cache here)
def prepare_person_df() -> DataFrame:
    """Read and filter the Person DataFrame."""
    src = DATA_SOURCES["person"]
    df = (
        read_parquet("person")
        .withColumn("rnk", rank().over(src["window_spec"]))
        .filter(src["filter_condition"])
        .select(src["columns"])
    )
    return df

In [0]:
# Function: Prepare Institutions DataFrame (optimized: no distinct unless needed)
def prepare_institutions_df() -> DataFrame:
    """Read and filter the Institutions DataFrame."""
    src = DATA_SOURCES["mvi_institution"]
    df = (
        read_parquet("mvi_institution")
        .filter(src["filter_condition"])
        .select(src["columns"])
    )
    return df

In [0]:
# Function: Join DataFrames (optimized: prune columns, no cache/repartition)
def join_dataframes(psa_df: DataFrame, institutions_df: DataFrame) -> DataFrame:
    """Join PSA and Institutions DataFrames."""
    joined = (
        psa_df.join(
            broadcast(institutions_df),
            psa_df["MVITreatingFacilityInstitutionSID"] == institutions_df["MVIInstitutionSID"],
            "left"
        )
        .select(
            psa_df["MVIPersonICN"],
            institutions_df["InstitutionCode"],
            institutions_df["MVIInstitutionSID"],
            psa_df["TreatingFacilityPersonIdentifier"],
            psa_df["CorrelationModifiedDateTime"]
        )
        .distinct()
    )
    return joined

In [0]:
# Function: Find Duplicates (optimized: no cache/repartition)
def find_duplicates(joined_df: DataFrame):
    """Find duplicate person-institution and institution-identifier pairs."""
    person_institution_dups = (
        joined_df.groupBy("MVIPersonICN", "InstitutionCode")
        .count()
        .filter(col("count") > 1)
        .withColumnRenamed("count", "count_iens")
    )
    institution_tfpi_dups = (
        joined_df.groupBy("InstitutionCode", "TreatingFacilityPersonIdentifier")
        .count()
        .filter(col("count") > 1)
        .withColumnRenamed("count", "count_icns")
    )
    return person_institution_dups, institution_tfpi_dups

In [0]:
# Function: Clean Correlations (optimized: no cache/repartition)
def clean_correlations(joined_df: DataFrame, person_institution_dups: DataFrame, institution_tfpi_dups: DataFrame) -> (DataFrame, DataFrame):
    """Remove duplicate correlations."""
    correlations = (
        joined_df
        .join(broadcast(person_institution_dups), ["MVIPersonICN", "InstitutionCode"], "left")
        .join(broadcast(institution_tfpi_dups), ["InstitutionCode", "TreatingFacilityPersonIdentifier"], "left")
    )
    duplicate_correlations = correlations.filter(col("count_iens").isNotNull() | col("count_icns").isNotNull())
    clean_correlations = (
        correlations
        .filter(col("count_iens").isNull())
        .filter(col("count_icns").isNull())
        .drop("count_iens", "count_icns")
        .select("MVIPersonICN", "CorrelationModifiedDateTime", "InstitutionCode", "TreatingFacilityPersonIdentifier")
    )
    return clean_correlations, duplicate_correlations

In [0]:
# Function: Get Max Date for Each Person (optimized: no cache/repartition)
def get_max_date_df(joined_df: DataFrame) -> DataFrame:
    """Get the max CorrelationModifiedDateTime for each MVIPersonICN."""
    return (
        joined_df
        .groupBy(col("MVIPersonICN"))
        .agg(max("CorrelationModifiedDateTime").alias("Last_Modified_Date"))
    )

In [0]:
# Function: Build Correlation Lookup Table (optimized: no cache/repartition)
def build_correlation_lookup(max_date_df: DataFrame, clean_correlations_df: DataFrame, person_df: DataFrame) -> DataFrame:
    """Build the final correlation lookup DataFrame."""
    return (
        max_date_df.join(clean_correlations_df, ['MVIPersonICN'], 'left')
        .join(person_df, ['MVIPersonICN'], "left")
        .groupBy("MVIPersonICN", "ICNStatus", "Last_Modified_Date")
        .pivot("InstitutionCode")
        .agg(first("TreatingFacilityPersonIdentifier"))
        .withColumnRenamed("200CORP", "participant_id")
        .withColumnRenamed("200DOD", "edipi")
        .withColumnRenamed("200VETS", "va_profile_id")
        .select("MVIPersonICN", "participant_id", "edipi", "va_profile_id", "ICNStatus", "Last_Modified_Date")
    )

In [0]:
# Function: Build JSON Format Table (optimized: no cache/repartition)
def build_json_format(max_date_df: DataFrame, clean_correlations_df: DataFrame, person_df: DataFrame) -> DataFrame:
    """Build the JSON format DataFrame for all institutions."""
    return (
        max_date_df.join(clean_correlations_df, ['MVIPersonICN'], 'left')
        .join(person_df, ['MVIPersonICN'], "left")
        .groupBy("MVIPersonICN", "Last_Modified_Date", "ICNStatus")
        .agg(
            collect_list(
                struct(
                    col("InstitutionCode").alias("Institution"),
                    col("TreatingFacilityPersonIdentifier").alias("Identifier")
                )
            ).alias("InstitutionIDs")
        )
    )

In [0]:
# Main pipeline execution (optimized: only repartition/cache before write if needed)
spark.sql("""DROP TABLE IF EXISTS correlation_lookup""")
dbutils.fs.rm("/mnt/ci-patronage/delta_tables/correlation_lookup", True)

psa_df = prepare_psa_df()
person_df = prepare_person_df()
institutions_df = prepare_institutions_df()

# Only distinct after join, before heavy groupBy
joined_df = join_dataframes(psa_df, institutions_df)

person_institution_dups, institution_tfpi_dups = find_duplicates(joined_df)
clean_correlations_df, duplicate_correlations_df = clean_correlations(joined_df, person_institution_dups, institution_tfpi_dups)
max_date_df = get_max_date_df(joined_df)

start = datetime.now()
correlation_lookup_df = build_correlation_lookup(max_date_df, clean_correlations_df, person_df).cache()
end = datetime.now()
print(f"Time taken: {end - start}")

json_format = build_json_format(max_date_df, clean_correlations_df, person_df)
json_format.createOrReplaceTempView("temp_json")

In [0]:
# Save results (optimized: repartition before write for parallelism)
def save_results():
    correlation_lookup_df.repartition(200, "MVIPersonICN").write.option("path", "/mnt/ci-patronage/delta_tables/correlation_lookup").mode("overwrite").saveAsTable("correlation_lookup")
    # duplicate_correlations_df.repartition(200, "MVIPersonICN").write.option("path", "/mnt/ci-patronage/delta_tables/duplicate_correlations").mode("overwrite").saveAsTable("duplicate_correlations")
    # json_format.repartition(200, "MVIPersonICN").write.option("path", "/mnt/ci-patronage/delta_tables/CorrelationsForAllInstitutions").mode("overwrite").saveAsTable("CorrelationsForAllInstitutions")

save_results()