In [0]:
%run ./00_functions_and_libraries

In [0]:
%run ./01_params

In [0]:
# Session-level Spark settings: Arrow conversion with fallback disabled,
# the Databricks IO cache, and a fixed session time zone.
session_settings = {
    'spark.sql.execution.arrow.enabled': True,
    'spark.sql.execution.arrow.fallback.enabled': False,
    "spark.databricks.io.cache.enabled": True,
    "spark.sql.session.timeZone": "America/New_York",
}
for setting_name, setting_value in session_settings.items():
    spark.conf.set(setting_name, setting_value)

# Register the storage credential under the config key named in params;
# the value itself is fetched from the secret scope at run time.
sas_config_key = params["AzureSASLocation"]
sas_secret = dbutils.secrets.get(scope=params["AzureSASScope"], key=params["AzureSASKey"])
spark.conf.set(sas_config_key, sas_secret)

# Make the configured database the default for subsequent SQL.
spark.sql(f"use {params['Database']}")

In [0]:
# Read the inbound claims extract (comma-delimited CSV with a header row),
# add an empty patient_id column, and persist the result as claims_master.
claims_master = (
    spark.read.format('csv')
    .option('header', True)
    .option('delimiter', ',')
    .load(f"{params['AzureET3Mount']}prod/inbound/inbound_claims/pm_clm_match_yj_2021.csv")
)
# A bare lit(None) is a NullType column, which Delta does not write to its
# Parquet data files. Give the placeholder an explicit string type, the same
# type patient_id is cast to when claims_pm is written.
claims_master = claims_master.withColumn('patient_id', lit(None).cast('string'))
claims_master.write.format("delta").mode("overwrite").save(f"{params['AzureET3Mount']}prod/MasterData/claims_master")

In [0]:
%scala
// From Census.gov, read in the zip code tabulation area (ZCTA) to county
// relationship file (pipe-delimited text) to get counties by zip code, and
// register it as the temp view "zcta" so SQL run from other cells can query it.
import org.apache.commons.io.IOUtils
import java.net.URL

val urlfile = new URL("https://www2.census.gov/geo/docs/maps-data/data/rel2020/zcta520/tab20_zcta520_county20_natl.txt")

// Download the whole file as one string and split it into a Dataset[String],
// one element per line. linesIterator is used rather than .lines: on JDK 11+
// `.lines` resolves to java.lang.String#lines (a java.util.stream.Stream),
// which cannot be turned into a Scala List / Dataset this way.
val testDummyCSV = IOUtils.toString(urlfile, "UTF-8").linesIterator.toList.toDS()

// Parse the downloaded lines as CSV: first line is the header, column types are inferred.
val zcta = spark
  .read
  .option("header", true)
  .option("inferSchema", true)
  .option("delimiter", "|")
  .csv(testDummyCSV)

zcta.createOrReplaceTempView("zcta")

In [0]:
# Refine the zcta dataset (queried through the "zcta" temp view) so that it only
# keeps zip codes that appear on exactly one row of the view, i.e. zip codes
# contained within one sole county (assumes one row per zip/county pair - TODO confirm).
# The zip code is cast to string and left-padded with '0' to 5 characters;
# presumably this restores leading zeros lost when the column was read as a number.
# Output columns: zcta (5-character zip) and County (NAMELSAD_COUNTY_20).
zcta=spark.sql('''select lpad(cast(geoid_zcta5_20 as string),5,'0') as zcta, NAMELSAD_COUNTY_20 as County
                from zcta 
                where GEOID_ZCTA5_20 in (select geoid_zcta5_20 from zcta group by 1 having count(*) = 1)
                ''')

In [0]:
# Start the claims_pm dataframe by attaching the county name from zcta to each claim.
# Because zcta was already reduced to zip codes with a single county, the left join
# can take the county name as-is; claims without a matching zip get a null County.
zip_matches_zcta = claims_master['SRC_ZIP5_CD'] == zcta['zcta']
claims_pm = (
    claims_master
    .join(zcta, on=zip_matches_zcta, how='left')
    .drop('zcta')
)

In [0]:
# Load the zip-to-city helper file (pipe-delimited, with header) used to look up
# the preferred USPS city name for each claim.
# Source: https://www.huduser.gov/portal/datasets/usps_crosswalk.html#data
zip_town_path = f"{params['AzureET3Mount']}prod/HelperData/ZIP_COUNTY_122021.csv"
zip_town_raw = (
    spark.read.format('csv')
    .option('header', True)
    .option('delimiter', '|')
    .load(zip_town_path)
)
zip_town_raw.createOrReplaceTempView('ZipTown')

# Keep only zip codes that occur on exactly one row of the helper file.
ZipTown = spark.sql("""select * from ZipTown
                      where zip in (select zip from ZipTown group by 1 having count(zip)=1)""")

In [0]:
# Reduce the helper table to (zip, City), where City is the preferred USPS city name.
zip_with_city = ZipTown.withColumnRenamed('usps_zip_pref_city', 'City')
ZipTown = zip_with_city.select('zip', 'City')

# Left-join the city onto the claims by zip code; the helper's zip key is dropped afterwards.
zip_matches_claim = claims_pm['SRC_ZIP5_CD'] == ZipTown['zip']
claims_pm = (
    claims_pm
    .join(ZipTown, on=zip_matches_claim, how='left')
    .drop('zip')
)

In [0]:
# Convert DOB, gender and race so their formats match pcr_master:
# race and sex codes are recoded through the lookup dictionaries, and the
# birth date is parsed from its ddMMMyyyy text form into a date.
claims_pm = (
    claims_pm
    .replace(to_replace=race_dict, subset=['BENE_RACE_CD'])
    .replace(to_replace=sex_dict, subset=['BENE_SEX_CD'])
    .withColumn('bene_BRTH_DT', to_date(col('bene_BRTH_DT'), 'ddMMMyyyy'))
)

# Age in whole years as of today, stored as a string.
age_in_years = months_between(lit(current_date()), col('bene_BRTH_DT')) / lit(12)

claims_pm = (
    claims_pm
    .withColumn('Age', age_in_years.cast("integer").cast("string"))
    # pm_score starts out as a null placeholder; er_flag is set to 1 on every row.
    .withColumn('pm_score', lit(None))
    .withColumn('er_flag', lit(1))
)

In [0]:
# Columns that make up the GOLD claims_pm dataset, in output order.
gold_columns = [
    'CLM_UNIQ_ID',
    'BENE_MBI_ID',
    'bene_BRTH_DT',
    'BENE_LAST_NAME',
    'BENE_1ST_NAME',
    'BENE_MIDL_NAME',
    'BENE_LINE_1_ADR',
    'SRC_USPS_STATE_CD',
    'SRC_ZIP5_CD',
    'BENE_SSN_NUM',
    'BENE_SEX_CD',
    'BENE_RACE_CD',
    'City',
    'County',
    'Age',
    'patient_id',
    'pm_score',
    'er_flag',
]

# Give the placeholder columns concrete types before writing.
claims_pm_gold = (
    claims_pm
    .select(*gold_columns)
    .withColumn('pm_score', col('pm_score').cast('double'))
    .withColumn('patient_id', col('patient_id').cast('string'))
)

# Overwrite the GOLD Delta dataset with the refreshed claims.
(
    claims_pm_gold.write
    .format("delta")
    .mode("overwrite")
    .save("/mnt/edfr/et3/prod/HelperData_claims/delta/GOLD/claims_pm")
)