## Initiate session and imports

In [None]:
# Echo the `spark` object so the cell output shows it; the cells below use it via spark.sql(...).
# NOTE(review): presumably the SparkSession injected by the notebook kernel — confirm.
spark

In [None]:
# Notebook display setup: keep pandas from truncating wide or long frames.
import pandas as pd
pd.set_option('display.max_colwidth', 250)  # cap cell text at 250 chars; use None for no limit (-1 is deprecated)
pd.set_option('display.max_columns', None)  # show every column (or e.g. 500)
pd.set_option('display.max_rows', None)  # show every row (or e.g. 500)

# Widen the notebook container to 92% of the screen width.
# display/HTML come from the public IPython.display module; importing them
# from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:92% !important; }</style>"))

In [None]:
from pyspark.sql.functions import rank
from pyspark.sql.functions import col
from pyspark.sql.types import FloatType
from pyspark.sql.functions import col, to_date, datediff, current_date

## Control Cohort Selection

In [None]:
spark.sql("use CUA_db")

In [None]:
#Call in All CUAs
CUA_pop= spark.sql("""
    select personid, stdid, display
    from CUA_concat
""")
CUA_pop.cache()

In [None]:
#CUA person ID list
personid_list = list(CUA_pop.select('personid').distinct().toPandas()['personid'])
len(personid_list)

In [None]:
spark.sql("use real_world_data_jun_2023")

In [None]:
#All female query
all_female = spark.sql("""
    select personid, gender.standard.primaryDisplay as gender
    from demographics
    where gender.standard.primaryDisplay== "Female"
    """)
all_female.cache()

print(all_female.select('personid').distinct().count())

In [None]:
demographics_sdf=spark.sql("""
    SELECT personid, birthdate, deceased, dateofdeath, races.standard.id[0] as race_id, 
    races.standard.primaryDisplay[0] as race_display, 
    ethnicities.standard.id[0] as ethnic_id, 
    ethnicities.standard.codingSystemId[0] as ethnic_id1,
    ethnicities.standard.primaryDisplay[0] as ethnic_display, testpatientflag, 
    zipcodes[0], gender.standard.primaryDisplay as gender
    FROM demographics
    WHERE gender.standard.primaryDisplay== "Female"
    """)
demographics_sdf

In [None]:
#Get age of patients (as of Sept 25, 2023)
dem_age=demo_nonCUA.select('personid','birthdate', 'deceased', 'dateofdeath', 'race_display', 'ethnic_display', 'testpatientflag','zipcodes', 'gender')\
            .withColumn('age',datediff(current_date(),to_date(col('birthdate')))/365.25)\
            .drop('birthdate')
dem_age

In [None]:
#Filter test patients
true_patients=dem_age.where(col('testpatientflag')=="False")
true_patients.select('personid').distinct().count()
true_patients.limit(5).toPandas()

In [None]:
demo_non=true_patients.drop("dateofdeath")

In [None]:
##Query preferred demographics table

pdem_pull=spark.sql("""
    SELECT  
    personid,
    prefrace,
    prefracereason,
    prefethnicity,
    prefethnicityreason,
    prefzip,
    prefurban,
    prefmetropolitan
    FROM preferred_demographics
    """)
pdem_pull

In [None]:
#Filter to remove CUA patients
pdem_non=pdem_pull.filter(col("personid").isin(personid_list)==False)

In [None]:
pdem_non.select('personid').distinct().count()

In [None]:
nonCUA_dem = demo_non.join(pdem_non, 'personid', 'left')
print(nonCUA_dem.select('personid').distinct().count())
nonCUA_dem.limit(10).toPandas()

In [None]:
nonCUA_personid=true_patients.drop("deceased", "race_display", "ethnic_display", "zipcodes", "age", "testpatientflag", "dateofdeath", "gender", "race_id", "race_id1", "ethnic_id")

In [None]:
nonCUA_personid.limit(5).toPandas()

In [None]:
non_personid=nonCUA_personid.dropDuplicates(['personid'])

In [None]:
non_personid.count()

In [None]:
non_personid.write.saveAsTable('non_CUA_db.personid_table')

## Include BMI data

In [None]:
BMI_pull= spark.sql("""
    SELECT personid, measurementcode.standard.id as id, measurementcode.standard.primaryDisplay as display, typedvalue.numericValue.value as BMI_ratio, servicedate, typedvalue.dateValue.date 
    FROM measurement
    WHERE measurementcode.standard.id=='39156-5'
""")
BMI_pull

In [None]:
BMI=BMI_pull.drop("id", "display", "servicedate", "date")
BMI.limit(10).toPandas()

In [None]:
BMI_numeric = BMI.withColumn("BMI_ratio", col("BMI_ratio").cast(FloatType()))

In [None]:
BMI_non=BMI_numeric.filter(col("personid").isin(personid_list)==False)
print(BMI_non.select('personid').distinct().count())

In [None]:
BMI_max=BMI_numeric.groupBy('personid').max('BMI_ratio')

In [None]:
#Join BMI data to demographic data
nonCUA_demo = nonCUA_dem.join(BMI_max, 'personid', 'left')

print(nonCUA_demo.select('personid').distinct().count())
nonCUA_demo.limit(10).toPandas()

In [None]:
demo_BMI = nonCUA_demo.withColumnRenamed("max(BMI_ratio)", "max_BMI_ratio")

demo_BMI.limit(5).toPandas()

In [None]:
demo_BMI.write.saveAsTable('non_CUA_db.demo_BMI')

## Pull Condition Table

In [None]:
spark.sql("use real_world_data_jun_2023")

In [None]:
condition= spark.sql("""
    select personid, conditioncode.standard.id as standardid, conditioncode.standard.primaryDisplay as display
    from condition
""")
condition.cache()

In [None]:
problem=spark.sql("""
    select personid, problemlistcode.standard.id as standardid,
    problemlistcode.standard.primaryDisplay as display
    from problem_list
""")
problem.cache()

In [None]:
nonCUA_prob=problem.filter(col("personid").isin(personid_list)==False)
print(nonCUA_prob.select('personid').distinct().count())

In [None]:
nonCUA_cond=condition.filter(col("personid").isin(personid_list)==False)
print(nonCUA_cond.select('personid').distinct().count())

In [None]:
nonCUA_concat=nonCUA_cond.union(nonCUA_prob)
nonCUA_concat_count=nonCUA_concat.select('personid').distinct().count()
print(nonCUA_concat_count)
nonCUA_concat.limit(20).toPandas()

In [None]:
dem_con=nonpersonid.join(nonCUA_concat,['personid'],how='left')
dem_con.limit(15).toPandas()
print(dem_con.select('personid').distinct().count())

In [None]:
dem_con_left.write.saveAsTable('non_CUA_db.dem_con')

## Procedure Table

In [None]:
procedure= spark.sql("""
    SELECT 
    personid,
    procedurecode.standard.id as procedure_code,
    procedurecode.standard.primaryDisplay as procedure_display,
    servicestartdate as startdate,
    serviceenddate as enddate
    FROM procedure
""")
procedure

In [None]:
Non_procedure=procedure.filter(col("personid").isin(personid_list)==False)

In [None]:
Non_procedure.limit(5).toPandas()

In [None]:
Non_procedure.select('personid').distinct().count()

In [None]:
Non_procedure.write.saveAsTable('non_CUA_db.procedure_table')