# Patronage Relationships: Production Pipeline Example
This notebook demonstrates how to use the `patronage_relationships` package in a real pipeline. Update paths and configs as needed for your environment.

In [0]:
# Import required libraries and the reusable package
from pyspark.sql import SparkSession
from patronage_relationships.core import *
from datetime import datetime

# spark = SparkSession.builder.getOrCreate()

In [0]:
# Production parquet locations for the three inputs (update as needed for your environment).
PSA_PATH = "/mnt/ci-mvi/Processed/SVeteran.SMVIPersonSiteAssociation/"
PERSON_PATH = "/mnt/ci-mvi/Processed/SVeteran.SMVIPerson/"
INSTITUTION_PATH = "/mnt/ci-mvi/Raw/NDim.MVIInstitution/"

# Load each source as a Spark DataFrame; later cells consume these three names.
psa_df = spark.read.parquet(PSA_PATH)
person_df = spark.read.parquet(PERSON_PATH)
institution_df = spark.read.parquet(INSTITUTION_PATH)

In [0]:
# Apply the reusable pipeline functions and build the lookup / JSON tables.
#
# Spark is lazy: cache() only marks a DataFrame for caching, and nothing is
# stored until an action runs. The intermediates are therefore kept cached
# until the lookup table has been materialized, and released afterwards.
# Releasing them before the first action (as this cell used to do) drops the
# caches before they can be reused.
# NOTE(review): this assumes the package helpers are pure transformations that
# do not trigger actions themselves - confirm against patronage_relationships.core.

# Filter the raw inputs.
filtered_psa = filter_psa(psa_df).cache()
filtered_person = filter_person(person_df).cache()
filtered_institution = filter_institution(institution_df).cache()

# `joined` feeds duplicate detection, duplicate removal and the latest-date lookup.
joined = join_psa_institution(filtered_psa, filtered_institution).cache()

# Split the joined relationships into clean and duplicate sets.
person_institution_dups, institution_tfpi_dups = find_duplicate_relationships(joined)
person_institution_dups = person_institution_dups.cache()
institution_tfpi_dups = institution_tfpi_dups.cache()
clean_corr, dup_corr = remove_duplicate_relationships(joined, person_institution_dups, institution_tfpi_dups)
clean_corr = clean_corr.cache()

latest_date = get_latest_correlation_date(joined)
latest_date.cache()

# Time the actual build: count() is an action, so it forces the whole plan to
# execute and populates lookup_table's cache, which the save cell then reuses.
start = datetime.now()
lookup_table = build_correlation_lookup_table(latest_date, clean_corr, filtered_person)
lookup_table.cache()
lookup_table.count()
end = datetime.now()
print(f"Time taken: {end - start}")

# The JSON table stays lazy; it is only exposed as a temp view for ad-hoc queries.
json_table = build_json_correlation_table(latest_date, clean_corr, filtered_person)
json_table.createOrReplaceTempView("temp_json")

# lookup_table is now materialized, so the intermediates can be released.
# filtered_institution and person_institution_dups were cached but never
# released before; they are included here. No cache() on dup_corr is visible in
# this notebook - its unpersist() is kept in case the helper persists it.
# lookup_table itself stays cached for the save cell.
for intermediate in (
    filtered_psa,
    filtered_institution,
    filtered_person,
    joined,
    person_institution_dups,
    institution_tfpi_dups,
    clean_corr,
    dup_corr,
    latest_date,
):
    intermediate.unpersist()

In [0]:
# Persist the lookup table as the `correlation_lookup` table at an explicit
# storage path, replacing any previous contents.
(
    lookup_table.write
    .option("path", "/mnt/ci-patronage/delta_tables/correlation_lookup")
    .mode("overwrite")
    .saveAsTable("correlation_lookup")
)

# Optional outputs, currently disabled (repartition for parallelism if needed):
# dup_corr.repartition(200, "MVIPersonICN").write.option("path", "/mnt/ci-patronage/delta_tables/duplicate_correlations").mode("overwrite").saveAsTable("duplicate_correlations")
# json_table.repartition(200, "MVIPersonICN").write.option("path", "/mnt/ci-patronage/delta_tables/CorrelationsForAllInstitutions").mode("overwrite").saveAsTable("CorrelationsForAllInstitutions")

In [0]:
%sql
-- Sanity check: compare the non-null counts of each identifier column between
-- the existing identity_correlations data (row tagged 'his') and the
-- correlation_lookup table written by the save cell (row tagged 'my').
SELECT
  count(MVIPersonICN),
  count(participant_id),
  count(edipi),
  count(va_profile_id),
  'his'
FROM delta.`/mnt/Patronage/identity_correlations`
UNION
SELECT
  count(MVIPersonICN),
  count(participant_id),
  count(edipi),
  count(va_profile_id),
  'my'
FROM correlation_lookup
-- Path-based alternative to the table name above:
-- delta.`/mnt/ci-patronage/delta_tables/correlation_lookup`