In [0]:
%run ./00_functions_and_libraries

In [0]:
%run ./01_params

In [0]:
# Session-level Spark settings: Arrow execution flags and the session time zone.
spark.conf.set('spark.sql.execution.arrow.enabled', True)
spark.conf.set('spark.sql.execution.arrow.fallback.enabled', False)
spark.conf.set("spark.sql.session.timeZone", "America/New_York")
# The config key comes from params and its value is fetched from the secret scope at run time,
# so the credential itself never appears in the notebook.
spark.conf.set(
  params["AzureSASLocation"],
  dbutils.secrets.get(scope=params["AzureSASScope"],key=params["AzureSASKey"])
)


# Locations and names used throughout the rest of the notebook.
db = params["Database"]
checkpoint = params["sparkCheckpointDir"]
cfgfile = params['erConfigFile']
intDataDir = params["intermediateDataDir"]
tempDataDir = params["tempdir"]
outboundDir = params["AzureET3Mount"]+"prod/outbound"
archiveDir = params["AzureET3Mount"]+"prod/archive/outbound"
snowflakeDir = params["AzureET3Mount"]+"prod/snowflake/"

spark.sql(f"create database if not exists {db}")
spark.sql(f"use {db}")
spark.sparkContext.setCheckpointDir(checkpoint)

# Echo the resolved parameters. The archive and snowflake directories get their own labels
# (all three were previously printed as "Outbound Dir").
print(
  "Input Parameters:\n"
  f"   Database: {db}\n"
  f"   Spark Checkpoint Dir: {checkpoint}\n"
  f"   Weights Config File:{cfgfile}\n"
  f"   Intermediate Dir:{intDataDir}\n"
  f"   Temp Dir:{tempDataDir}\n"
  f"   Outbound Dir:{outboundDir}\n"
  f"   Archive Dir:{archiveDir}\n"
  f"   Snowflake Dir:{snowflakeDir}"
)

In [0]:
# Read the "RF_Write" setting from params; it is not referenced again in this part of the notebook.
rf_write = params["RF_Write"]

In [0]:
# Per-field weights, read from the notebook parameters.
w_LName = params["WEIGHT: Last Name"]
w_FName = params["WEIGHT: First Name"]
w_MName = params["WEIGHT: Middle Initial/Name"]
w_Street = params["WEIGHT: Patient's Home Address"]
w_City = params["WEIGHT: Patient's Home City"]
w_County = params["WEIGHT: Patient's Home County"]
w_State = params["WEIGHT: Patient's Home State"]
w_Zip = params["WEIGHT: Patient's Home ZIP Code"]
w_SSN = params["WEIGHT: Social Security Number"]
w_Gender = params["WEIGHT: Gender"]
w_Race = params["WEIGHT: Race"]
w_Age = params["WEIGHT: Age"]
w_Dob = params["WEIGHT: Date of Birth"]
w_Lic = params["WEIGHT: Driver's License Number"]
w_MBI = params["WEIGHT: MBI"]
w_StateDL = params["WEIGHT: State Issuing Driver's License"]

# Total of all field weights.
# The previous loop used `=+` (assign the positive value) rather than `+=`, so the "total" was
# overwritten on every iteration and ended up equal to the last weight only.
w_total_active = 0
for x in [w_LName, w_FName, w_MName,w_Street, w_City, w_County, w_State, w_Zip,w_Gender, w_Race, w_Dob, w_Age,  w_Lic, w_StateDL, w_SSN,w_MBI]:
  w_total_active += float(x)

# Kept in the form it was supplied; later cells coerce it with float() where needed.
match_threshold = params["Match Threshold"]

In [0]:
#######################
# PARSE CONFIG FILE
#######################
config = spark.read.option("multiline", True).json(cfgfile)

# "Mappings": how source columns feed each logical matching context.
#    e.g. concat f_name and l_name from source1 into PersonNames,
#         and take full_name from source2 into PersonNames.
# One output row per (context, source mapping): contextName, deltaTable, sourceCols, targetAlias.
mappings = (
  config
  .select(explode("contextConfig").alias("contextConfig"))
  .select(
    col("contextConfig.contextName").alias("contextName"),
    explode("contextConfig.sourceMappings").alias("sourceMappings"),
  )
  .select(
    "contextName",
    col("sourceMappings.deltaTable").alias("deltaTable"),
    col("sourceMappings.sourceCols").alias("sourceCols"),
    col("sourceMappings.targetAlias").alias("targetAlias"),
  )
)

# "Sources": the delta tables holding the records to be matched.
#   Each row gives the table name, its existing primary key, and whether duplicates should also
#   be searched for within that table (selfDedup = True/False).
# Column order matters: later cells read these rows positionally.
sources = (
  config
  .select(explode("sourceConfig").alias("sourceConfig"))
  .select(
    col("sourceConfig.deltaTable").alias("deltaTable"),
    col("sourceConfig.primaryKey").alias("primaryKey"),
    col("sourceConfig.selfDedup").alias("selfDedup"),
  )
)

# "Tokenizers": the matching method per context (by spaces/words, or by ngram), together with
#    the MinHash/binning settings for that context. A context that should be matched needs its
#    own entry here.
# Column order matters: later cells read these rows positionally (0=contextName ... 5=termFreq).
tokenizers = (
  config
  .select(explode("contextConfig").alias("contextConfig"))
  .select(
    col("contextConfig.contextName").alias("contextName"),
    explode("contextConfig.tokenizerConfig").alias("tokenizerConfig"),
  )
  .select(
    "contextName",
    col("tokenizerConfig.mode").alias("mode"),
    col("tokenizerConfig.binningConfig.jaccardIndexThreshold").alias("jaccardIndexThreshold"),
    col("tokenizerConfig.binningConfig.numHashTables").alias("numHashTables"),
    col("tokenizerConfig.binningConfig.idfCutoff").alias("idfCutoff"),
    col("tokenizerConfig.tf").alias("termFreq"),
  )
)

# Quick visual check of the three parsed configuration tables.
sources.show(5)
tokenizers.show(20)
mappings.show(100, False)

In [0]:
#########################
# CONFIGURING SOURCES
#########################
sourceList = []  # Each element of this list will be a Dataframe of a source Delta table
sourceCount = len(sources.select("deltaTable").collect())  # Loop once for each source Delta table in config file

# Let's get a list of all of the columns needed from each delta table.
# If a column isn't listed in the mappings, then we don't explicitly need it.
# The result is ordered by table name so that every later positional lookup against allColumns
# (here and in the context cell) sees the tables in the same order as sourceList.
allColumns = (mappings
              .withColumn("sourceCols", explode("sourceCols"))
              .groupBy("deltaTable")
              .agg(collect_set("sourceCols").alias("columns"))
              .orderBy("deltaTable")
             )

# Collect the (table, columns) rows once. Previously the table name and its column list were
# fetched by two separate collects on every iteration, which re-ran the aggregation each time
# and relied on both collects returning rows in the same order.
sourceColumnRows = allColumns.select("deltaTable", "columns").collect()

print("\nReading source delta tables into dataframes...")
for sNum in range(sourceCount):
  # There will be 1 iteration of this loop per source table.
  # sName/sKey/sCol ==> Name/PrimaryKey/ColumnsNamesAsString
  sName = sourceColumnRows[sNum][0]
  sKey = sources.filter(col("deltaTable")==sName).select("primaryKey").collect()[0][0]
  sCol = ", ".join(sKey + sourceColumnRows[sNum][1])
  # Only records flagged for entity resolution are read; nulls become empty strings.
  source = spark.sql(f"select {sCol} from {sName} where er_flag = 1").fillna("")
  assert source.select(sKey[0]).distinct().count() == source.count(), f"Error: Specified primaryKey \"{sKey[0]}\" for source \"{sName}\" is not unique"
  # Prefix the key with the table name ("<table>__<key>") and expose it as sourceId.
  source = source.withColumn(sKey[0], concat_ws("__", *[lit(sName), col(sKey[0])])) # sKey must be a single column
  source = source.withColumnRenamed(sKey[0], "sourceId")
  source = normCols(source)
  sourceList.append(source)
  print(f"   Source '{sName}' is ready")

In [0]:
##########################
# CONFIGURING CONTEXTS
##########################

# There will be 1 entry per context, keyed on the context name in the following two dictionaries
# The dictionary values will be dataframes containing the context values and the source names & record ids they originated from
contextDict = {}     # Dataframes in this dict are deduped so there is one row per value in each context.
contextDictRaw = {}  # Dataframes in this dict are not deduped
# Q: Why dedup in the step above?
# A: If we have 1000 records that all contain the value "some cool words" and 50 records with the value "some cooler words",
#    we shouldn't compare "some cool words" to "some cooler words" 50,000 times. Instead, we just do it once and keep track of all
#    the other records that share the same value

contextNames = mappings.select("contextName").distinct()
# Materialize the context names once. Previously contextNames.collect() was re-run and indexed
# positionally on every iteration; distinct() guarantees no row order, so a context could be
# skipped or processed twice if the order changed between runs of the query.
contextNameList = [row[0] for row in contextNames.collect()]
contextCount = len(contextNameList)  # We will be looping over all of the contexts in the contextConfig section of the config file
contextColumns = (mappings  # For each context, let's only keep the source columns that we need.
                  .withColumn("sourceCols", explode("sourceCols"))
                  .groupBy("contextName", "deltaTable")
                  .agg(collect_set("sourceCols").alias("columns"))
                 )

# Source table names, collected once and read by position; sourceList[sNum] is expected to be
# the dataframe for sourceNameList[sNum].
sourceNameList = [row[0] for row in allColumns.select("deltaTable").rdd.collect()]

# We are doing a lot of up-front deduplication so as to minimize the work done during MinHashLSH.
# We need a method for undoing the deduplication afterward -
#    e.g. if 2 records are identical, keep one and MinHash it, then for every candidate pair it generates,
#         create a second candidate pair using the record we dropped earlier
# So, we store the full list of ALL records and what values they contained
# We'll join back on this table after doing the MinHash approach.
schema = StructType([
  StructField("sourceId", StringType()), 
  StructField("featureId", StringType()), 
  StructField("valueId", StringType()),
  StructField("contextName", StringType())
])
source_to_feature = spark.createDataFrame([], schema)

# 1 loop per context
for cNum in range(contextCount):
  cName = contextNameList[cNum]
  thisContext = []
  thisContextRaw = []
  for sNum in range(sourceCount): # 1 inner loop per source delta table
    sName = sourceNameList[sNum]   # Get the delta table name
    cCol = (contextColumns
            .filter((col("contextName")==cName) & (col("deltaTable")==sName))
            .select("columns")
           ).collect()[0][0]  # Get the list of the required columns
    _context = sourceList[sNum].select(["sourceId"] + cCol)  # keep only the required columns
    _context = map_to_targets(_context, mappings, cName, sName)  # map the source columns to our target columns
    _context = melt(_context, ["sourceId"], _context.columns[1:])  # pivot the data so we get 1 row per context value
    # Note, in the above step, this pivot is necessary, because 1 source could have multiple representations of the same context
    #    e.g. full_name, known_alias, also_known_as, etc.  OR home_address, work_address, alt_address, etc.
    #    So 1 source row could produce more than 1 row in our context table... 1 row per context value.
    
    _context = drop_bad_values(add_hashes(_context)).withColumn("contextName", lit(cName))
    thisContextRaw.append(_context)
    thisContext.append(dedup_values(_context).checkpoint())  # Here we dedup on the values to minimize work during MinHash
    # Note, in the above step we are checkpointing here because otherwise we hit an NPE at the tokenize() step
    
  combinedContext = reduce(DataFrame.unionAll, thisContext)  # Combine this context's data from all of the sources
  combinedContextRaw = reduce(DataFrame.unionAll, thisContextRaw)  # Same thing as above but for non-deduped data
  contextDict[cName] = combinedContext
  contextDictRaw[cName] = combinedContextRaw
  # Persist the raw context and read it back, so the lookup table below is built from the stored copy.
  combinedContextRaw.write.format("delta").mode("overwrite").save(f"{intDataDir}/delta/SILVER/context_{cName}")
  combinedContextRaw = spark.read.format("delta").load(f"{intDataDir}/delta/SILVER/context_{cName}")
  
  # This is our lookup table to get back to the original record granularity
  source_to_feature = source_to_feature.union(combinedContextRaw.select("sourceId", "featureId", "valueId", "contextName"))

source_to_feature.createOrReplaceTempView("source_to_feature")

# Print a short summary (row count and one example row) for every deduped context.
for k,v in contextDict.items():
  print("context info")
  print(f"  name: {k}")
  print(f"  record count: {v.count()}")
  print(". example: ")
  try:
    pprint.pprint(v.take(1)[0].asDict())
  except IndexError:
    # take(1) returned nothing: warn but keep going.
    print(Exception(f"\n Warning: The Context Table {k} is empty. It will not contribute to probabilistic matching."))
  print("\n")

In [0]:
#######################
# CONFIGURING PAIRS
#######################

# featuresList is a list of dictionaries; 1 dict per tokenization strategy
#    each dictionary will contain the contextName, the tokenizer method, and a mapping of sourceIds to vectorized/weighted features
featuresList = []

# pairsList is a list of Dataframes containing the candidate pairs generated from each tokenization/binning strategy
#    each dataframe in this list contains just 2 columns: id1, id2 (where these ids each identify a unique value for the context)
#      e.g. The DF may say that featureId 55 forms a candidate pair with featureId 145.
#           We can go back to the source_to_feature table to see that featureId 55 == "some cool words" and featureId 145 == "some cooler words"
pairsList = []

# ident_pairs accumulates pairs of records that share an identical value within a context.
ip_schema = StructType([StructField("sourceId1", StringType()), StructField("sourceId2", StringType())])
ident_pairs = spark.createDataFrame([], ip_schema)

for TCFG in tokenizers.collect():
  # iterating over each tokenization strategy.
  #   TCFG has the following schema
  #      element 0: contextName (string)
  #      element 1: mode (string)
  #      element 2: jaccardIndexThreshold (double)
  #      element 3: numHashTables (long)
  #      element 4: idfCutoff (double)
  #      element 5: termFreq (boolean)
  # Term frequency defaults to True when the config leaves it unset; otherwise the configured
  # value is honoured. (Previously any configured value, including an explicit true, produced False.)
  _termFreq = True if TCFG[5] is None else bool(TCFG[5])
  
  # Here we finally tokenize our data so, for instance, "some cool words" becomes ["some", "cool", "words"]
  # Note we are dropping rows where the tokens column is empty
  _tokens = tokenize(contextDict.get(TCFG[0]), mode=TCFG[1], sid="sourceId").filter(size(col("tokens"))>0)
  if _tokens.count() == 0:
    continue
  
  # Here we convert the token arrays into sparse vectors with IDF weights per token
  #   Note, the featurize function returns 2 Dataframes, the first has been filtered to remove insignificant tokens (below the idfThreshold)
  #         the second contains vectors with ALL tokens (we need these for accurate similarity functions during scoring later)
  #   Also Note, the full dataframe is not presently needed, so we discard it
  _sigFeatures, junk = featurize(_tokens, idf_threshold=TCFG[4], sid="sourceId", tf=_termFreq)
  
  # We are tokenizing again because we also need a version here where we did NOT dedup all of our values.
  #   We need this so we can properly reconstruct all of the candidate pairs later.
  _tokensALL = tokenize(contextDictRaw.get(TCFG[0]), mode=TCFG[1], sid="sourceId").filter(size(col("tokens"))>0)
  
  # This time, all we care about is the DataFrame with all tokens
  # Similarly, the filtered version of this dataframe is not needed at this time, so we discard it
  if TCFG[1]=="numeric":
    _allFeatures = numeric_featurize(_tokensALL, sid="sourceId")
  else:
    junk, _allFeatures = featurize(_tokensALL, idf_threshold=TCFG[4], sid="sourceId", tf=_termFreq)
  
  # Accumulate our results into featuresList
  featuresList.append({"name":TCFG[0], "tokenizer":TCFG[1], "fullFeatures":_allFeatures})
  
  # It is possible that a tokenizer is set up, but no binning is desired.
  # So, if a jaccardIndexThreshold has been set, let's assume user wants to do binning.
  if TCFG[2] is not None:
    _pairs = binning(_sigFeatures, threshold=TCFG[2], numHashes=TCFG[3]).drop("minHashJaccardDistance")
    #print(f"size of pairs for {TCFG[0]} is {_pairs.count()}")
    pairsList.append(_pairs)
    
    # We eliminated exact matches earlier and all we are getting from binning now are fuzzy matches.
    # In other words... if:
    #   record1.text = "some cool words"
    #   record2.text = "some cool words"
    #   record3.text = "some cooler words"
    # Binning will generate one candidate pair between 2:3 (or 1:3 but not both).  We also need 1:2, which we are doing here.
    # We self-join source_to_feature to find exact matches within each context
    thisContextData = source_to_feature.filter(col("contextName")==TCFG[0]).select("sourceId", "valueId")
    thisContextIdentPairs1 = thisContextData.withColumnRenamed("sourceId","sourceId1")
    thisContextIdentPairs2 = thisContextData.withColumnRenamed("sourceId","sourceId2")

    thisContextIdentPairs = (thisContextIdentPairs1.join(thisContextIdentPairs2, "valueId", "inner")
                             .filter(col("sourceId1")!=col("sourceId2"))
                             .drop("valueId")
                            )
    ident_pairs = ident_pairs.union(thisContextIdentPairs)
    #print(f"ident_pairs size after {TCFG[0]} is {ident_pairs.count()}")

# Let's accumulate all of the candidate pairs from all of the binning here into one big dataframe
# We are still just looking at 2 columns: featureId1, featureId2
if not pairsList:
  # reduce() on an empty list fails with an unhelpful TypeError; say what actually went wrong.
  raise ValueError("No candidate pairs were generated: no tokenizer with a jaccardIndexThreshold produced any tokens.")
all_pairs = reduce(DataFrame.unionAll, pairsList)

# So, we have featureId1:featureId2, but we need to get back to sourceIds.  It will require 2 hops.
# Let's get to valueId1:valueId2 (remember source_to_feature maps all featureIds back to all of their original valueIds and sourceIds)
all_pairs = (all_pairs
             .join(source_to_feature, all_pairs.featureId1==source_to_feature.featureId)
             .withColumnRenamed("valueId", "valueId1")
             .withColumnRenamed("contextName", "contextName1")
             .drop("sourceId", "featureId1", "featureId")
             .join(source_to_feature, all_pairs.featureId2==source_to_feature.featureId)
             .withColumnRenamed("valueId", "valueId2")
             .withColumnRenamed("contextName", "contextName2")
             .drop("sourceId", "featureId2", "featureId")
            )

# Now we can join with source_to_feature again to get all sourceId::sourceId
# But we need to make sure we are always using matching Contexts.
#   e.g. an Address value and Address_street value could match, but we wouldn't want to generate a pair here
# Finally, after doing this series of joins, we should be back to our original record granularity, having undone all of the earlier
#    optimization dedups.
all_pairs = (all_pairs
             .join(
               source_to_feature,
               (all_pairs.valueId1==source_to_feature.valueId) & (all_pairs.contextName1==source_to_feature.contextName) 
             )
             .withColumnRenamed("sourceId", "sourceId1")
             .drop("valueId", "valueId1", "featureId", "contextName", "contextName1")
             .join(
               source_to_feature, 
               (all_pairs.valueId2==source_to_feature.valueId) & (all_pairs.contextName2==source_to_feature.contextName)
             )
             .withColumnRenamed("sourceId", "sourceId2")
             .drop("valueId", "valueId2", "featureId", "contextName", "contextName2")
            )

# Combine all fuzzy pairs with all exact pairs
all_pairs = all_pairs.union(ident_pairs)

# Rearrange IDs so they are in deterministic (lexical order), for deduping
# If we don't do this step then it's possible we have ID1:ID2 AND ID2:ID1.  We need to sort them so that a dedup will catch them.
# This would be harder if all_pairs had more columns... but it's just 2 ID columns.  We'll join back in the actual features later.
all_pairs = (all_pairs
             .withColumn("sourceID1_temp", string_first(col("sourceId1"),col("sourceId2")))
             .withColumn("sourceID2_temp", string_last(col("sourceId1"),col("sourceId2")))
             .withColumn("sourceId1", col("sourceID1_temp"))
             .withColumn("sourceId2", col("sourceID2_temp"))
             .drop("sourceID1_temp","sourceID2_temp")
            ).dropDuplicates(["sourceId1", "sourceId2"]).filter(col("sourceId1")!=col("sourceId2"))

# Drop pairs from within the same source unless selfDedup is configured
# If we are looking for matches across more than 1 dataset, we have the option of searching for dupes
#   across AND within datasets, or just across.
# We have to loop over each source here because it can be configured differently for each source.
for SRC in sources.collect():
  if not SRC[2]: #If selfDedup is False
    # sourceIds are built as "<deltaTable>__<key>", so test for that exact prefix. A plain
    # substring test also fired when one table name was contained in another table's name (or
    # in a key value), which wrongly dropped cross-source pairs.
    # NOTE(review): assumes normCols leaves the "<deltaTable>__" prefix of sourceId intact — confirm.
    _srcPrefix = f"{SRC[0]}__"
    all_pairs = all_pairs.filter(~((col("sourceId1").startswith(_srcPrefix)) & (col("sourceId2").startswith(_srcPrefix))))
    
# Persist the candidate pairs and continue from the stored copy.
all_pairs.write.format("delta").mode("overwrite").save(f"{intDataDir}/delta/SILVER/all_pairs")
all_pairs = spark.read.format("delta").load(f"{intDataDir}/delta/SILVER/all_pairs")

In [0]:
# Count the stored candidate pairs, then report the total.
_candidate_pair_count = all_pairs.count()
print(f"Total pairs to be considered: {_candidate_pair_count}")

In [0]:
#################
# SCORE PAIRS
#################

# Join the original features back onto the candidate pairs so they can actually be scored.
scored_pairs = all_pairs.select("sourceId1","sourceId2")
scoreColumnsList = []
rcols = []  # similarity columns produced so far; carried through every later iteration

# One iteration per entry of featuresList.
# Each entry's "fullFeatures" dataframe is reduced to (sourceId, features), attached to both sides
# of every candidate pair, and turned into one similarity column.
for featureInfo in featuresList:
  _thisName = featureInfo.get("name")
  _thisTokenizer = featureInfo.get("tokenizer")
  _thisFeature = featureInfo.get("fullFeatures").select("sourceId","features")
  scoreColumn = f"{_thisName}__{_thisTokenizer}"
  feature1 = f"{scoreColumn}__features1"
  feature2 = f"{scoreColumn}__features2"
  cosCol = f"{scoreColumn}__cosine_sim"
  numCol = f"{scoreColumn}_sim"

  # Attach the feature vector for the left id, then for the right id. Both join conditions refer
  # to the pair dataframe as it was at the start of this iteration.
  _pairsAtStart = scored_pairs
  _withLeftFeature = (_pairsAtStart
                      .join(broadcast(_thisFeature), _pairsAtStart.sourceId1 == _thisFeature.sourceId, how="left")
                      .withColumnRenamed("features", feature1)
                      .select("sourceId1", "sourceId2", *rcols, feature1)
                     )
  scored_pairs = (_withLeftFeature
                  .join(broadcast(_thisFeature), _pairsAtStart.sourceId2 == _thisFeature.sourceId, how="left")
                  .withColumnRenamed("features", feature2)
                  .select("sourceId1", "sourceId2", *rcols, feature1, feature2)
                 )

  # Pick the similarity column name and expression for this tokenizer.
  if _thisTokenizer == "numeric":
    _simCol = numCol
    _simExpr = round(relative_numeric_sim(feature1, feature2), 3)
  else:
    _simCol = cosCol
    _simExpr = round(cos_sim(col(feature1), col(feature2)), 3)

  # Why the groupBy/max: the joins above can multiply a pair again.
  #   e.g. there may be only one candidate pair ID1:ID2, but ID1 can carry two feature vectors for
  #   the same context (more than one column may be mapped to a single context): ID1 has the names
  #   "Lucas" AND "Luke" while ID2 only has "Luke". That gives two scored rows for the one pair
  #   (0% and 100%), and we keep the higher score.
  scored_pairs = (scored_pairs
                  .withColumn(_simCol, _simExpr)
                  .groupBy("sourceId1","sourceId2")
                  .max(*rcols, _simCol)
                 )
  rcols += [_simCol]

  # The aggregation names its outputs "Max(<col>)"; restore the plain column names.
  for _col in rcols:
    scored_pairs = scored_pairs.withColumnRenamed(f"Max({_col})", _col)
    
# Persist the scored pairs and continue from the stored copy.
scored_pairs.write.format("delta").mode("overwrite").option("overwriteSchema", "true").save(f"{intDataDir}/delta/GOLD/scored_pairs")
scored_pairs = spark.read.format("delta").load(f"{intDataDir}/delta/GOLD/scored_pairs")

In [0]:
# Similarity columns fed to the weighted score. The order of this list is the order in which the
# columns are passed to weighted_score below.
keepers = [
  "FirstName__trigram__cosine_sim",
  "MiddleName__trigram__cosine_sim",
  "LastName__trigram__cosine_sim",
  "Address_street__trigram__cosine_sim",
  "Address_city__trigram__cosine_sim",
  "Address_county__trigram__cosine_sim",
  "Address_state__trigram__cosine_sim",
  "Address_zip__default__cosine_sim",
  "Gender__default__cosine_sim",
  "DOB__default__cosine_sim",
  "Age__numeric_sim",
  "Race__default__cosine_sim",
  "SSN__default__cosine_sim",
  "MBI__default__cosine_sim",
  "DriverLicNum__default__cosine_sim",
  "StateIssDriverLic__default__cosine_sim"
]

# Any similarity column that scoring did not produce is added as a null column, so the
# projection and the weighted_score call below always find all of them.
_existing_columns = scored_pairs.columns
for keeper in [name for name in keepers if name not in _existing_columns]:
  scored_pairs = scored_pairs.withColumn(keeper, lit(None))

# For the weighted score we only need the similarities and the two ids.
weighted_input = scored_pairs.select(*keepers, "sourceID1", "sourceID2")

MATCH_THRESHOLD = float(match_threshold)

# pm_score is the weighted score over all similarity columns; prediction is 1 when the score
# reaches the threshold and 0 otherwise.
_pm_score = weighted_score(*[col(name) for name in keepers])
_is_match = when(col("pm_score")>=MATCH_THRESHOLD, lit(1)).otherwise(lit(0))
predictions = (weighted_input
               .withColumn("pm_score", _pm_score)
               .withColumn("prediction", _is_match)
              )

# Persist the predictions and continue from the stored copy.
predictions.write.format("delta").mode("overwrite").option("overwriteSchema", "true").save(f"{intDataDir}/delta/GOLD/predictions")
predictions=spark.read.format("delta").load(f"{intDataDir}/delta/GOLD/predictions")

In [0]:
# Diagnostic only: how many scored pairs fall on each side of the match threshold.
# Nothing is dropped here; the filter that actually removes low-scoring pairs is applied when the
# graph edges are built.
# The cut-off is the configured MATCH_THRESHOLD rather than a hard-coded 85, so these counts
# always describe the same split the edge filter uses.
pm_thresh_below = predictions.filter(col("pm_score") < MATCH_THRESHOLD)
pm_above = predictions.filter(col("pm_score") >= MATCH_THRESHOLD)

# The labels state ">=" because that is the comparison actually applied above.
print(f"Total rows: {predictions.count()}") 
print(f"Total rows with PM score >= {MATCH_THRESHOLD}: {pm_above.count()}")
print(f"Total rows with pm score < {MATCH_THRESHOLD}: {pm_thresh_below.count()}")

In [0]:
"""
To use GraphFrames.connectedComponents, we need to provide two pieces:
  - nodes: A Dataframe containing an ID column that uniquely identifies every source record
           These will be the nodes points in our graph
  - edges: A Dataframe containing 3 columns - 2 ID columns and a match probability
           The IDs should represent the left and right side of a pair
           The IDs should match with IDs in the nodes Dataframe 
           The match probability should be between 0-1
"""

# To get our edges, we'll start with our predictions
# Also for readability, I update the names
# For now, I only want to consider very likely matches in my groups, so I keep only labels with >92% confidence
# I scale the confidences so that they range from 0-1 instead of 0.92 to 1.00 (Not sure if this is necessary)
edges = (predictions
         .withColumn('src', regexp_extract(col('sourceID1'), '(.)(__)(\w+)', 3))
         .withColumn('dst', regexp_extract(col('sourceID2'), '(.)(__)(\w+)', 3))
         .filter(col("pm_score")>=match_threshold)
         .withColumn("tf_adjusted_match_prob", col("pm_score"))
         .select("src", "dst", "tf_adjusted_match_prob")
        )

# To get all of the nodes, combine all of the source IDs from the primary and delta batch patient tables
all_data = spark.sql("Select * from PCR_Master")
nodes = all_data.selectExpr("patient_id as id")

from graphframes import GraphFrame

# Build the GraphFrame
g = GraphFrame(nodes, edges)

# Identify linked nodes
cc = g.connectedComponents().select("id","component")

# Join results with original data for human readability
results = cc.join(all_data, cc.id==all_data.patient_id, "right").drop("id").orderBy("component","patient_id")
#display(results)

In [0]:
# Purpose of this cell: records that a reviewer has flagged as "must not be linked to X" are moved
# into their own component whenever their current component still contains one of the forbidden
# partner records.
from pyspark.sql.functions import arrays_overlap, col, collect_set, first, lit,  when

# One row per (flagged record, record it must NOT be linked to), read from the delinked_pairs table.
delinked_pairs = spark.sql("select flagged_id, delinked_partner_id from delinked_pairs")

# Same information, with the forbidden partners of each flagged record gathered into an array
# called "delinked_partner_ids".
delinked_records = delinked_pairs.groupBy("flagged_id").agg(collect_set("delinked_partner_id").alias("delinked_partner_ids"))

# Map each flagged id to the component it currently belongs to (from the connected-components
# results). The component column is renamed so it can be told apart in the self-join below.
component_ids_to_review = (results
                           .select("patient_id", "component")
                           .join(delinked_records, results.patient_id==delinked_records.flagged_id)
                           .select("flagged_id", "component")
                           .withColumnRenamed("component", "component2")
                          )

# For each flagged id, collect every record id in its component into an array called "member_ids".
components_to_review = (component_ids_to_review
                        .join(results.select("patient_id", "component"), component_ids_to_review.component2 == results.component)
                        .drop("component2")
                        .groupBy("component", "flagged_id")
                        .agg(collect_set("patient_id").alias("member_ids"))
                       )

# Keep only the flagged ids whose component overlaps with their forbidden partners, i.e. the
# flagged record currently shares a group with at least one record it must not be linked to.
# Those are the records that have to be isolated; "indicator" marks them for the join below.
ids_to_isolate = (delinked_records
                  .join(components_to_review, "flagged_id")
                  .filter(arrays_overlap(col("delinked_partner_ids"), col("member_ids")))
                  .select("flagged_id")
                  .withColumn("indicator", lit(1))
                 )

# The isolated records need brand new component values.
# New value = largest existing component + the record's patient_id.
# NOTE(review): this is only collision-free if patient_id values are unique positive integers — confirm.
# NOTE(review): max_component is None when results has no non-null component — confirm this cannot happen.
#max_component = results.select(max("component")).collect()[0][0]
results.createOrReplaceTempView('results_temp')
max_component=spark.sql('select max(component) from results_temp').collect()[0][0]

# Left-join the indicator onto the results; rows with indicator == 1 get the new component value,
# every other row keeps its component. The helper columns are dropped afterwards.
results = (results
           .join(ids_to_isolate, results.patient_id == ids_to_isolate.flagged_id, "left")
           .drop("flagged_id")
           .withColumn("component", 
                       when(col("indicator")==1, lit(max_component) + col("patient_id"))
                       .otherwise(col("component"))
                      )
           .drop("indicator")
          )
#display(results)

In [0]:
# Columns passed to nonnull_count for every row; the order is the argument order of the call.
_profile_columns = [
  "first_name", "last_name", "middle_name", "address", 'city', 'state', 'zip_code', 'ssn',
  'county', 'gender', 'race', 'dob', 'mbi', 'age', 'drivers_license_number',
  'state_issuing_drivers_license',
]
results_with_nr = results.withColumn("nonnull_count", nonnull_count(*[col(name) for name in _profile_columns]))

# Within each component, order records by: highest nonnull_count first, then source, then the
# most recent claims/pcr/dispatch timestamps, and finally patient_id as the last tie-breaker.
grouping_field = ['component']
_ordering = [
  col('nonnull_count').cast(IntegerType()).desc(),
  col("source").asc(),
  col('claims_update_timestamp').desc(),
  col('pcr_received_timestamp').desc(),
  col('dispatch_timestamp').desc(),
  col('patient_id').asc(),
]
window_spec = Window.partitionBy(*grouping_field).orderBy(*_ordering)

# Rank the records inside each component, then remove every "M" from mp_id; the prefix is added
# back when results_pc is built below.
results_with_rank_index = (results_with_nr
                           .withColumn("rank", rank().over(window_spec))
                           .withColumn('mp_id', regexp_replace('mp_id','M',''))
                          )

# The rank-1 record of each component is flagged 'P', all others 'C'; every record then takes the
# mp_id of the first record in its window, prefixed with 'M'.
_pc_flag = when(col('rank')==1,'P').otherwise('C')
_group_mp_id = concat(lit('M'),first(col("mp_id")).over(window_spec))
results_pc = (results_with_rank_index
              .withColumn('pc_flag', _pc_flag)
              .withColumn('mp_id', _group_mp_id)
             )
#results_pc.display()

In [0]:
# We will do a self join on the grouped results to generate all possible pairs from each group
t1 = results_pc.select(col("patient_id").alias("patient_id1"), col("mp_id").alias("mp_id1"), col("pc_flag").alias("pc_flag1"))
t2 = results_pc.select(col("patient_id").alias("patient_id2"), col("mp_id").alias("mp_id2"), col("pc_flag").alias("pc_flag2"))

# Reduce both prediction ids to the word characters that follow the "<any char>__" marker
# (capture group 3). The pattern is a raw string because "\w" in a normal literal is an
# invalid escape sequence; the column names are spelled consistently so the lookup does not
# depend on case-insensitive column resolution.
pred_lkp = (predictions.select("sourceId1", "sourceId2", "pm_score")
            .withColumn("sourceId1", regexp_extract(col('sourceId1'), r'(.)(__)(\w+)', 3))
            .withColumn("sourceId2", regexp_extract(col('sourceId2'), r'(.)(__)(\w+)', 3))
           )

# However, within each group, we only care about pairs where the right side is the parent
# Also, we have to rearrange the pair labels as the smaller id is always on the left side
#   but we want to make it easy to tell which one was the child later, so keep the child's
#   patient_id around (patient_id1 is the non-parent side of the pair)
pairs_with_parent = (t1.join(t2, t1.mp_id1==t2.mp_id2)
                     .filter(col("pc_flag2")=="P")
                     .filter(col("patient_id1")!=col("patient_id2"))
                     .withColumn("sourceId1", when(col("patient_id1")<col("patient_id2"), col("patient_id1")).otherwise(col("patient_id2")))
                     .withColumn("sourceId2", when(col("patient_id1")<col("patient_id2"), col("patient_id2")).otherwise(col("patient_id1")))
                     .select("sourceId1", "sourceId2", col("patient_id1").alias("patient_id"))
                    )

# Next, let's check if we already have a pm_score from the earlier weighted scoring step
prev_scored = pairs_with_parent.join(pred_lkp, ["sourceId1", "sourceId2"], how="left")  # pm_score is null where no prediction matched

# For those without a pm_score, we'll need to get the column similarities, then use the same weighted model as before
# mp_lkp is a shortcut back to the patient_id that represents the child record, regardless if sourceId1 and sourceId2 were rearranged
mp_lkp = prev_scored.filter(col("pm_score").isNull()).select("sourceId1", "sourceId2", "patient_id")
unscored = mp_lkp.select("sourceId1", "sourceId2")
# Get the features and similarities for those in the unscored group
# This code is essentially a duplicate of a cell from above... this can definitely be generalized and cleaned up
scoreColumnsList = []
rcols = []  # similarity columns accumulated so far, carried through every later join/aggregation
for featureInfo in featuresList:
  _thisName = featureInfo.get("name")
  _thisTokenizer = featureInfo.get("tokenizer")
  # Reduce sourceId to the word characters after the "<any char>__" marker (capture group 3).
  # Raw string: "\w" in a normal literal is an invalid escape sequence.
  _thisFeature = (featureInfo.get("fullFeatures").select("sourceId","features")
                  .withColumn("sourceId", regexp_extract(col('sourceId'), r'(.)(__)(\w+)', 3))
                 ).checkpoint()
  scoreColumn = f"{_thisName}__{_thisTokenizer}"
  feature1 = f"{scoreColumn}__features1"
  feature2 = f"{scoreColumn}__features2"
  cosCol = f"{scoreColumn}__cosine_sim"
  numCol = f"{scoreColumn}_sim"

  # Attach this feature's vector for both sides of every pair
  unscored = (unscored
              .join(broadcast(_thisFeature), unscored.sourceId1 == _thisFeature.sourceId, how="left")
              .withColumnRenamed("features", feature1)
              .select("sourceId1", "sourceId2", *rcols, feature1)
              .join(broadcast(_thisFeature), unscored.sourceId2 == _thisFeature.sourceId, how="left")
              .withColumnRenamed("features", feature2)
              .select("sourceId1", "sourceId2", *rcols, feature1, feature2)
             )

  # Numeric features use the relative numeric similarity, everything else cosine similarity;
  # the rest of the pipeline is identical for both, so only the column name and metric differ.
  if _thisTokenizer == "numeric":
    simCol, similarity = numCol, relative_numeric_sim(col(feature1), col(feature2))
  else:
    simCol, similarity = cosCol, cos_sim(col(feature1), col(feature2))

  # Keep the best (max) similarity per pair, for this feature and for all earlier ones
  unscored = (unscored
              .withColumn(simCol, round(similarity, 3))
              .groupBy("sourceId1","sourceId2")
              .max(*rcols, simCol)
             )
  rcols += [simCol]

  # The aggregation names its outputs after the aggregate function; restore the plain names
  for _col in rcols:
    unscored = unscored.withColumnRenamed(f"Max({_col})", _col)

# Any column listed in keepers that the feature loop did not produce is filled with 0.0
for keeper in keepers:
  if keeper not in unscored.columns:
    unscored = unscored.withColumn(keeper, lit(0.0))

# Join back to mp_lkp to recover the child's patient_id for each pair
_scoring_cols = keepers + ["sourceID1", "sourceID2", "patient_id"]
unscored = unscored.join(mp_lkp, ["sourceId1","sourceId2"], how="left").select(_scoring_cols)

# Similarity columns in the positional order weighted_score is called with
_weighted_inputs = [
  "FirstName__trigram__cosine_sim",
  "MiddleName__trigram__cosine_sim",
  "LastName__trigram__cosine_sim",
  "Address_street__trigram__cosine_sim",
  "Address_city__trigram__cosine_sim",
  "Address_county__trigram__cosine_sim",
  "Address_state__trigram__cosine_sim",
  "Address_zip__default__cosine_sim",
  "Gender__default__cosine_sim",
  "DOB__default__cosine_sim",
  "Age__numeric_sim",
  "Race__default__cosine_sim",
  "SSN__default__cosine_sim",
  "MBI__default__cosine_sim",
  "DriverLicNum__default__cosine_sim",
  "StateIssDriverLic__default__cosine_sim",
]

# Score every previously unscored child and keep only its id and pm_score
PM_scores = (unscored
             .withColumn("pm_score", weighted_score(*[col(c) for c in _weighted_inputs]))
             .select("patient_id", "pm_score"))

# The pm_scores for the children now live in two distinct DataFrames: the freshly
# computed ones (PM_scores) and the ones that already had a score. Union them here.
_already_scored = prev_scored.filter(col("pm_score").isNotNull()).select("patient_id", "pm_score")
PM_scores = PM_scores.union(_already_scored)

# Enrich the grouping results (with parent identified) with the pm_scores, persist them
# to Delta, and read them back from the saved location.
_results_pc_pm_path = f"{intDataDir}/delta/GOLD/results_pc_pm"
(results_pc
 .drop("pm_score")
 .join(PM_scores, "patient_id", how="left")
 .write.format("delta").mode("overwrite").option("overwriteSchema", "true")
 .save(_results_pc_pm_path))
results_pc_pm = spark.read.format("delta").load(_results_pc_pm_path)

In [0]:
from pyspark.sql.functions import monotonically_increasing_id, row_number
from pyspark.sql import Window

# Rows whose pm_score falls below this value are split out of their group into singletons.
PM_SCORE_FLOOR = 85

under_threshold = results_pc_pm.filter(col('pm_score') < PM_SCORE_FLOOR)

# Only rework the results if at least one row is below the floor.
# head(1) can stop at the first such row instead of counting all of them.
if under_threshold.head(1):
  # Rows at/above the floor, plus rows with no pm_score at all, are left untouched
  over_threshold = results_pc_pm.filter((col('pm_score') >= PM_SCORE_FLOOR) | (col("pm_score").isNull()))

  # New singleton components are numbered max_comp + 1, max_comp + 2, ... so they cannot
  # collide with an existing component. row_number over an un-partitioned window yields the
  # dense 1..N sequence directly, so no separate id table / row-index join is needed.
  max_comp = results_pc_pm.agg({"component": "max"}).collect()[0][0]
  _singleton_component = (
    lit(max_comp) + row_number().over(Window.orderBy(monotonically_increasing_id()))
  ).cast("long")

  # Turn every under-threshold row into a singleton:
  #   - it becomes its own parent ('P')
  #   - mp_id becomes "M" + its own patient_id
  #   - pm_score is cleared (typed null so the union below lines up with the scored rows)
  #   - component is replaced by a fresh, unique id
  under_threshold = (under_threshold
                     .withColumn('pc_flag', lit('P'))
                     .withColumn('mp_id', concat(lit('M'), col("patient_id")))
                     .withColumn('pm_score', lit(None).cast(results_pc_pm.schema["pm_score"].dataType))
                     .withColumn("component", _singleton_component)
                     # union is positional, so match the column order of over_threshold
                     .select(over_threshold.columns))

  # Combine data back into 1 df
  results_pc_pm = under_threshold.union(over_threshold)

In [0]:
# Upper-case the names, persist the ordered frame to Delta and read it back
_df_aka_path = f"{intDataDir}/delta/GOLD/df_AKA_order"
(results_pc_pm
 .orderBy(['pc_flag', 'dispatch_timestamp'], ascending=[0,0])
 .withColumn('first_name', upper(col('first_name')))
 .withColumn('last_name', upper(col('last_name')))
 .write.format("delta").mode("overwrite").option("overwriteSchema", "true")
 .save(_df_aka_path))
df_AKA_order = spark.read.format("delta").load(_df_aka_path)

# Per (component, first_name): how often the name occurs, a pc_flag, the mean pm_score
# and a dispatch timestamp.
# NOTE(review): first() after a groupBy is presumably not tied to the orderBy used before
#   the Delta write, so `flag` / `dis_time` may be arbitrary within a group - confirm.
first_name_AKA = (df_AKA_order
                  .groupBy('component','first_name')
                  .agg(count('first_name').alias('cnt'),
                       first('pc_flag').alias('flag'),
                       avg('pm_score').alias('score_avg'),
                       first('dispatch_timestamp').alias('dis_time'))
                  .orderBy(['component','cnt','flag', 'score_avg', 'dis_time'], ascending=[1,0,0,0,0]))

# NOTE(review): this window sorts flag and dis_time ascending, while the orderBy above
#   sorts them descending - verify which direction is intended.
window_spec = Window.partitionBy('component').orderBy(col('cnt').desc(),col('flag'), col('score_avg').desc(), col('dis_time'))

# Keep the names ranked 1-5 per component and pivot each rank into its own column
first_name_rank = (first_name_AKA
                   .withColumn('rank', rank().over(window_spec))
                   .filter(col('rank') <= 5)
                   .groupBy('component')
                   .pivot('rank')
                   .agg(first('first_name')))

# Guarantee all five rank columns exist, adding null string columns where a rank never occurred
for _rank_label in map(str, range(1, 6)):
  if _rank_label not in first_name_rank.columns:
    first_name_rank = first_name_rank.withColumn(_rank_label, lit(None).cast(StringType()))
    continue
  print(_rank_label,'exists')

# Rename rank columns "1".."5" to aka_first_name_1..aka_first_name_5
for _rank_no in range(1, 6):
  first_name_rank = first_name_rank.withColumnRenamed(str(_rank_no), f'aka_first_name_{_rank_no}')

In [0]:
# Per (component, last_name): how often the name occurs, a pc_flag, the mean pm_score
# and a dispatch timestamp.
# NOTE(review): first() after a groupBy presumably returns an arbitrary row's value,
#   so `flag` / `dis_time` may not be stable between runs - confirm.
last_name_AKA = (df_AKA_order
                 .groupBy('component','last_name')
                 .agg(count('last_name').alias('cnt'),
                      first('pc_flag').alias('flag'),
                      avg('pm_score').alias('score_avg'),
                      first('dispatch_timestamp').alias('dis_time'))
                 .orderBy(['component','cnt','flag', 'score_avg', 'dis_time'], ascending=[1,0,0,0,0]))

# NOTE(review): this window sorts flag and dis_time ascending, while the orderBy above
#   sorts them descending - verify which direction is intended.
window_spec = Window.partitionBy('component').orderBy(col('cnt').desc(),col('flag'), col('score_avg').desc(), col('dis_time'))

# Keep the names ranked 1-5 per component and pivot each rank into its own column
last_name_rank = (last_name_AKA
                  .withColumn('rank', rank().over(window_spec))
                  .filter(col('rank') <= 5)
                  .groupBy('component')
                  .pivot('rank')
                  .agg(first('last_name')))

# Guarantee all five rank columns exist, adding null string columns where a rank never occurred
for _rank_label in map(str, range(1, 6)):
  if _rank_label not in last_name_rank.columns:
    last_name_rank = last_name_rank.withColumn(_rank_label, lit(None).cast(StringType()))
    continue
  print(_rank_label,'exists')

# Rename rank columns "1".."5" to aka_last_name_1..aka_last_name_5
for _rank_no in range(1, 6):
  last_name_rank = last_name_rank.withColumnRenamed(str(_rank_no), f'aka_last_name_{_rank_no}')

In [0]:
# Attach the first-name and last-name AKA columns to the scored results, one component at a time

result_cols = [
  'component', 'patient_id', 'pcr_number', 'last_name', 'first_name', 'middle_name',
  'address', 'city', 'county', 'state', 'zip_code', 'ssn', 'gender', 'race', 'age',
  'age_units', 'dob', 'state_issuing_drivers_license', 'drivers_license_number',
  'alternative_address', 'mbi', 'agency_unique_state_id', 'agency_id', 'agency_state',
  'uuid', 'source', 'official_name_flag', 'pm_overwrite_flag', 'dispatch_timestamp',
  'pcr_received_timestamp', 'claims_update_timestamp', 'action', 'mp_id', 'pc_flag',
  'pm_score', 'er_flag',
]

# Left joins: a component without any ranked names keeps its rows and gets null AKA columns
_with_first_names = (results_pc_pm.select(result_cols).alias('a')
                     .join((first_name_rank.alias('b')), on=['component'], how='left')
                     .select('a.*','b.*'))

results_pc_pm = (_with_first_names.alias('a')
                 .join((last_name_rank.alias('b')), on=['component'], how='left')
                 .select('a.*','b.*'))

In [0]:
# Columns written to PCR_master, in the order they are selected.
# The INSERT below uses SELECT *, so this order is what lines up with the table's columns.
_pcr_master_cols = [
  'patient_id', 'pcr_number', 'last_name', 'first_name', 'middle_name', 'address', 'city',
  'county', 'state', 'zip_code', 'ssn', 'gender', 'race', 'age', 'age_units', 'dob',
  'state_issuing_drivers_license', 'drivers_license_number', 'alternative_address', 'mbi',
  'agency_unique_state_id', 'agency_id', 'agency_state', 'uuid', 'source',
  'official_name_flag', 'pm_overwrite_flag', 'dispatch_timestamp', 'pcr_received_timestamp',
  'claims_update_timestamp', 'action', 'mp_id', 'pc_flag', 'pm_score',
  'aka_first_name_1', 'aka_first_name_2', 'aka_first_name_3', 'aka_first_name_4', 'aka_first_name_5',
  'aka_last_name_1', 'aka_last_name_2', 'aka_last_name_3', 'aka_last_name_4', 'aka_last_name_5',
  'er_flag',
]

# Expose the final result as a temp view and replace the contents of PCR_master with it
results_pc_pm.select(*_pcr_master_cols).createOrReplaceTempView("PCR_updated")
spark.sql("INSERT OVERWRITE TABLE PCR_master SELECT * from PCR_updated")

In [0]:
# Empty the weights table, then load the threshold and per-field weights used by this run
spark.sql('truncate table weights')

w_schema = StructType([
  StructField('param_name', StringType(), False),
  StructField('param_weight', FloatType(), False),
])

# (param_name, raw value) pairs; every value is converted to float for the FloatType column
_run_weights = [
  ('Threshold_Percentage', match_threshold),
  ('Last_Name_WP', w_LName),
  ('First_Name_WP', w_FName),
  ('Middle_Name_WP', w_MName),
  ("Address_WP", w_Street),
  ('City_WP', w_City),
  ('County_WP', w_County),
  ('State_WP', w_State),
  ('ZIP_Code_WP', w_Zip),
  ('SSN_WP', w_SSN),
  ('Gender_WP', w_Gender),
  ('Race_WP', w_Race),
  ('Age_WP', w_Age),
  ('DOB_WP', w_Dob),
  ('Drivers_License_Number_WP', w_Lic),
  ('MBI_WP', w_MBI),
  ('State_Issuing_Drivers_License_WP', w_StateDL),
]
weights = [(_param_name, float(_param_value)) for _param_name, _param_value in _run_weights]

spark.createDataFrame(weights, w_schema).createOrReplaceTempView('weights_curr')

# Upsert by param_name: update rows that already exist, insert the ones that do not
spark.sql('''MERGE INTO weights a using weights_curr b  ON a.param_name==b.param_name
  WHEN MATCHED 
    THEN UPDATE SET *
  WHEN NOT MATCHED 
    THEN INSERT *''')

In [0]:
# pcr_master column -> export column
_PM_EXPORT_RENAMES = [
  ('address', 'home_address'),
  ('city', 'home_city'),
  ('county', 'home_county'),
  ('state', 'home_state'),
  ('zip_code', 'home_zip_code'),
  ('dob', 'date_of_birth'),
  ('alternative_address', 'alternate_home_address'),
  ('mp_id', 'mpid'),
  ('pc_flag', 'pm_record_type'),
]

# Export columns, in output order
_PM_EXPORT_COLUMNS = [
  'patient_id', 'pcr_number', 'last_name', 'first_name', 'middle_name', 'home_address',
  'home_city', 'home_county', 'home_state', 'home_zip_code', 'ssn', 'gender', 'race', 'age',
  'age_units', 'date_of_birth', 'state_issuing_drivers_license', 'drivers_license_number',
  'alternate_home_address', 'mbi', 'agency_unique_state_id', 'agency_id', 'agency_state',
  'uuid', 'source', 'official_name_flag', 'pm_overwrite_flag', 'dispatch_timestamp',
  'pcr_received_timestamp', 'claims_update_timestamp', 'action', 'mpid', 'pm_score',
  'pm_status', 'pm_record_type', 'pm_date', 'pm_timestamp',
  'aka_first_name_1', 'aka_first_name_2', 'aka_first_name_3', 'aka_first_name_4', 'aka_first_name_5',
  'aka_last_name_1', 'aka_last_name_2', 'aka_last_name_3', 'aka_last_name_4', 'aka_last_name_5',
]

# Timestamp columns rendered as text in the export, and the pattern they are rendered with
_PM_EXPORT_TIMESTAMPS = ['dispatch_timestamp', 'pcr_received_timestamp', 'claims_update_timestamp', 'pm_timestamp']
_PM_TIMESTAMP_FORMAT = 'dd-MMM-yy hh.mm.ss.SSSSSSSSS a'

# mp_ids that are shared by more than one pcr_master row
_MULTI_RECORD_MPIDS = 'select mp_id from pcr_master group by mp_id having count(*)>1'


def _format_pm_export(pcr_rows, pm_status, blank_record_type=False):
  """Shape pcr_master rows into the patient-matching export layout.

  Args:
    pcr_rows: DataFrame with the pcr_master columns.
    pm_status: value written to the pm_status column ('M' or 'U' in this notebook).
    blank_record_type: when True, pm_record_type is replaced by a null string.

  Returns:
    DataFrame with exactly _PM_EXPORT_COLUMNS, pm_score cast to integer and the
    timestamp columns formatted as strings.
  """
  export = pcr_rows
  for _old_name, _new_name in _PM_EXPORT_RENAMES:
    export = export.withColumnRenamed(_old_name, _new_name)

  export = (export
            .withColumn('pm_status', lit(pm_status))
            .withColumn('pm_date', current_date())
            .withColumn('pm_timestamp', current_timestamp())
            .select(*_PM_EXPORT_COLUMNS)
            .withColumn('pm_score', col('pm_score').cast(IntegerType())))

  for _ts_col in _PM_EXPORT_TIMESTAMPS:
    export = export.withColumn(_ts_col, date_format(_ts_col, _PM_TIMESTAMP_FORMAT))

  if blank_record_type:
    export = export.withColumn('pm_record_type', lit(None).cast(StringType()))
  return export


#updated_matched_results: rows whose mp_id is shared with at least one other row
df_matched = _format_pm_export(
  spark.sql(f'select * from pcr_master where mp_id in ({_MULTI_RECORD_MPIDS})'),
  'M')

# #updated_unmatched_results: every row that is not part of a multi-record mp_id.
# NOT EXISTS is used instead of NOT IN: with NOT IN, a NULL mp_id coming out of the
# subquery makes the predicate unknown for every row (empty result), and rows whose own
# mp_id is NULL are always dropped. With NOT EXISTS such rows are exported as unmatched.
df_unmatched = _format_pm_export(
  spark.sql(f'''select m.* from pcr_master m
                where not exists (select 1 from ({_MULTI_RECORD_MPIDS}) g where g.mp_id = m.mp_id)'''),
  'U',
  blank_record_type=True)  # change 'P' to null for unmatched records

In [0]:
# Sanity-check counts for the matched export; each entry triggers one count() when printed
print('***MATCHED COUNTS***')
_matched_checks = [
  ('Matched count:', df_matched),
  ('Matched distinct count:', df_matched.distinct()),
  ('Count MPIDs:', df_matched.select('mpid')),
  ('Unique MPIDs:', df_matched.select('mpid').distinct()),
  ('Unique Patient IDs:', df_matched.select('patient_id').distinct()),
  ('pm_status count:', df_matched.select('pm_status')),
]
for _label, _frame in _matched_checks:
  print(_label, _frame.count())

In [0]:
# Sanity-check counts for the unmatched export; each entry triggers one count() when printed
print('***UNMATCHED COUNTS***')
_unmatched_checks = [
  ('Unmatched count:', df_unmatched),
  ('Unmatched distinct count:', df_unmatched.distinct()),
  ('Count MPIDs:', df_unmatched.select('mpid')),
  ('Unique MPIDs:', df_unmatched.select('mpid').distinct()),
  ('Unique Patient IDs:', df_unmatched.select('patient_id').distinct()),
  ('pm_status count:', df_unmatched.select('pm_status')),
]
for _label, _frame in _unmatched_checks:
  print(_label, _frame.count())

In [0]:
w_schema = StructType([
  StructField('param_name', StringType(), False),
  StructField('param_weight', FloatType(), False),
])

# (param_name, raw value) pairs; every value is converted to float for the FloatType column
_run_weights = [
  ('Threshold_Percentage', match_threshold),
  ('Last_Name_WP', w_LName),
  ('First_Name_WP', w_FName),
  ('Middle_Name_WP', w_MName),
  ("Address_WP", w_Street),
  ('City_WP', w_City),
  ('County_WP', w_County),
  ('State_WP', w_State),
  ('ZIP_Code_WP', w_Zip),
  ('SSN_WP', w_SSN),
  ('Gender_WP', w_Gender),
  ('Race_WP', w_Race),
  ('Age_WP', w_Age),
  ('DOB_WP', w_Dob),
  ('Drivers_License_Number_WP', w_Lic),
  ('MBI_WP', w_MBI),
  ('State_Issuing_Drivers_License_WP', w_StateDL),
]
weights = [(_param_name, float(_param_value)) for _param_name, _param_value in _run_weights]
df_weights = spark.createDataFrame(weights, w_schema)

# Pivot the (name, weight) rows into a single wide row: one column per parameter
df_weights_reformat = df_weights.withColumn('groupCol', lit('g1'))
df_weights_reformat = (df_weights_reformat
                       .groupBy('groupCol')
                       .pivot('param_name')
                       .mean('param_weight')
                       .drop('groupCol'))
# Fix the column order of the weights file
df_weights_reformat = df_weights_reformat.select(
  'Threshold_Percentage', 'Last_Name_WP', 'First_Name_WP', 'Middle_Name_WP', 'Address_WP',
  'City_WP', 'County_WP', 'State_WP', 'Zip_Code_WP', 'SSN_WP', 'Gender_WP', 'Race_WP',
  'Age_WP', 'DOB_WP', 'State_Issuing_Drivers_License_WP', 'Drivers_License_Number_WP', 'MBI_WP')


def _export_pipe_csv(frame, folder, header):
  """Clear {tempDataDir}/<folder>, then write `frame` there as a single pipe-delimited CSV part."""
  _target = f"{tempDataDir}/{folder}"
  dbutils.fs.rm(_target, True)
  (frame.coalesce(1)
   .write.mode('overwrite').format('csv')
   .option('header', header).option('delimiter', '|').option('emptyValue','')
   .save(_target))


_export_pipe_csv(df_weights_reformat, "weights", 'true')

#updated_matched_results
_export_pipe_csv(df_matched, "updated_matched", 'true')

# #updated_unmatched_results
# NOTE(review): this is the only export written without a header row - confirm that is intended.
_export_pipe_csv(df_unmatched, "updated_unmatched", 'false')

In [0]:
# End the notebook run here, handing this status message to dbutils.notebook.exit
dbutils.notebook.exit("Step 3: PCR Batch Dedup with Sample Data completed successfully")