In [None]:
# Display the active Spark session object. `spark` is not created in the visible
# cells — presumably it is injected by the notebook environment (confirm).
spark

In [None]:
# Notebook setup: all imports first, then display configuration.
import pandas as pd
# `display` and `HTML` are imported from the public IPython.display module;
# importing them from IPython.core.display is deprecated.
from IPython.display import HTML, display
from pyspark.sql.functions import avg, col, when
# Aliased so Python's built-in sum() is not shadowed.
from pyspark.sql.functions import sum as spark_sum

# Set max columns, rows and column width in pandas so output is not truncated.
pd.set_option('display.max_colwidth', 250)  # or None for unlimited
pd.set_option('display.max_columns', None)  # or 500
pd.set_option('display.max_rows', None)  # or 500

# Widen the notebook container to 92% of the browser window.
display(HTML("<style>.container { width:92% !important; }</style>"))

## Clean CUA tables

In [None]:
# Make CUA_db the current database so the unqualified table names queried
# below resolve against it.
spark.sql("use CUA_db")

In [None]:
# Load every column of semi_final_matrix3 from the current database into `sfm`;
# the trailing bare name displays the DataFrame summary as the cell output.
sfm= spark.sql("""
    SELECT *
    FROM semi_final_matrix3
""")
sfm

### Remove rows with missing age

In [None]:
# Count rows with a null 'age': turn the null test into a 0/1 integer flag
# and sum the flag over the whole frame.
age_is_null_flag = col("age").isNull().cast("integer")
age_null_count = sfm.select(spark_sum(age_is_null_flag)).first()[0]
print("Number of null values in 'age':", age_null_count)

In [None]:
# Keep only the rows without an age and bring them to pandas for inspection.
no_age = sfm.where(col("age").isNull())
no_age.toPandas()

In [None]:
# Drop every row whose 'age' is null, then report how many rows remain.
sfm_clean_age = sfm.dropna(subset=["age"])
sfm_clean_age.count()

In [None]:
# Re-check the cleaned frame: the number of null 'age' values should now be 0.
cleaned_age_is_null_flag = col("age").isNull().cast("integer")
age_null_count2 = sfm_clean_age.select(spark_sum(cleaned_age_is_null_flag)).first()[0]
print("Number of null values in 'age':", age_null_count2)
## note resulted in 0 null

In [None]:
# Count rows whose 'max_BMI_ratio' is null. Note this runs on the original
# `sfm` frame, i.e. before rows with a missing age were dropped.
BMI_null_count = sfm.select(spark_sum(col("max_BMI_ratio").isNull().cast("integer"))).collect()[0][0]
# The label names the column that was actually counted (it previously said 'age').
print("Number of null values in 'max_BMI_ratio':", BMI_null_count)
## note resulted in 2522 null

In [None]:
# Mean of 'age' over the original (uncleaned) frame.
mean_age = sfm.select(avg("age")).first()[0]
print("Mean of 'age':", mean_age)

In [None]:
# Approximate median of 'age': the 0.5 quantile with a relative error of 0.001.
quantiles = sfm.approxQuantile(
    col="age",
    probabilities=[0.5],
    relativeError=0.001,
)
median_value = quantiles[0]
print("Median of 'age':", median_value)

##Median of 'age': 35.84394250513347

### Impute missing BMI values

In [None]:
# Mean of 'max_BMI_ratio' over the original (uncleaned) frame.
mean_BMI = sfm.select(avg("max_BMI_ratio")).first()[0]
print("Mean of 'max_BMI_ratio':", mean_BMI)

##Mean of 'max_BMI_ratio': 33.90729062899

In [None]:
# Approximate median of 'max_BMI_ratio': the 0.5 quantile with a relative error of 0.001.
quantiles_BMI = sfm.approxQuantile(
    col="max_BMI_ratio",
    probabilities=[0.5],
    relativeError=0.001,
)
median_BMI = quantiles_BMI[0]
print("Median of 'BMI':", median_BMI)

##Median of 'BMI': 30.850000381469727

In [None]:
# Impute missing BMI with the rounded median (~30.85 -> 31) as a NUMBER.
# The replacement used to be the string '31'; mixing a string literal with a
# numeric column inside when/otherwise makes Spark coerce the whole column to
# string, so every downstream consumer would receive text instead of numbers.
bmi_replacement_med = 31.0
bmi_column = sfm_clean_age['max_BMI_ratio']
# Cast back to the column's original type so the schema is unchanged by imputation.
bmi_original_type = sfm_clean_age.schema['max_BMI_ratio'].dataType
sfm_bmi_imp = sfm_clean_age.withColumn(
    'max_BMI_ratio',
    when(bmi_column.isNull(), bmi_replacement_med)
    .otherwise(bmi_column)
    .cast(bmi_original_type),
)

In [None]:
# Verify the imputation: count the remaining null 'max_BMI_ratio' values.
BMI_null_count2 = sfm_bmi_imp.select(spark_sum(col("max_BMI_ratio").isNull().cast("integer"))).collect()[0][0]
# The label names the column that was actually counted (it previously said 'age').
print("Number of null values in 'max_BMI_ratio':", BMI_null_count2)

In [None]:
# Preview five rows of the imputed frame as a pandas DataFrame.
sfm_bmi_imp.limit(5).toPandas()

### Find persons with no race/ethnicity flag set and mark them in the Unknown column

In [None]:
# Count distinct persons having at least one race/ethnicity flag
# (including Unknown) equal to "1".
race_flag_columns = [
    "AIAN", "NHPI", "Asian", "Black", "White", "MENA",
    "Hisp_Latino", "Other", "API_ethn", "Mixed", "Unknown",
]
# OR the per-column tests together, in the listed order.
any_race_flag_set = col(race_flag_columns[0]) == "1"
for race_flag_name in race_flag_columns[1:]:
    any_race_flag_set = any_race_flag_set | (col(race_flag_name) == "1")

persons_with_race_flag = sfm_bmi_imp.filter(any_race_flag_set).select("personid").distinct()
filtered_race_count = persons_with_race_flag.count()
filtered_race_count

In [None]:
# Count distinct persons having a row where every race/ethnicity flag
# (including Unknown) equals "0".
race_flag_columns = [
    "AIAN", "NHPI", "Asian", "Black", "White", "MENA",
    "Hisp_Latino", "Other", "API_ethn", "Mixed", "Unknown",
]
# AND the per-column tests together, in the listed order.
no_race_flag_set = col(race_flag_columns[0]) == "0"
for race_flag_name in race_flag_columns[1:]:
    no_race_flag_set = no_race_flag_set & (col(race_flag_name) == "0")

persons_without_race_flag = sfm_bmi_imp.filter(no_race_flag_set).select("personid").distinct()
filtered_race_count2 = persons_without_race_flag.count()
filtered_race_count2

In [None]:
# NOTE(review): hard-coded figures, presumably copied from earlier cell outputs
# (a total person count minus the count with a race flag) — confirm their
# source; they will go stale if the input table changes.
28462-27638

In [None]:
# Person ids having a row where every race/ethnicity flag (including Unknown)
# equals "0". The ids are de-duplicated in Spark before being collected so the
# driver-side list holds each person once; membership tests on the list are
# unaffected by removing duplicates.
filtered_personids = (
    sfm_bmi_imp.filter((col("AIAN") == "0") & (col("NHPI") == "0") & (col("Asian") == "0")& (col("Black") == "0")& (col("White") == "0")& (col("MENA") == "0")& (col("Hisp_Latino") == "0")& (col("Other") == "0")& (col("API_ethn") == "0")& (col("Mixed") == "0")& (col("Unknown") == "0"))
    .select("personid")
    .distinct()
)

# Materialise the ids on the driver as a plain Python list.
filtered_personid_list = [row.personid for row in filtered_personids.collect()]

In [None]:
# Set Unknown to 1 on every row whose personid is in filtered_personid_list;
# all other rows keep their existing Unknown value.
person_has_no_race_flag = col("personid").isin(filtered_personid_list)
sfm_with_unknown = sfm_bmi_imp.withColumn(
    "Unknown",
    when(person_has_no_race_flag, 1).otherwise(col("Unknown")),
)

In [None]:
# Count the distinct rows flagged Unknown == "1" after the update.
unknown_rows = sfm_with_unknown.where(col("Unknown") == "1").distinct()
filtered_unknown_count2 = unknown_rows.count()
filtered_unknown_count2

In [None]:
# Persist the cleaned and imputed CUA matrix as a table.
# NOTE(review): no save mode is specified, so this presumably fails if the
# table already exists — confirm before re-running the notebook.
sfm_with_unknown.write.saveAsTable('CUA_db.semi_final_matrix_clean_imputed')

## Clean Control Tables

In [None]:
# Switch the current database to non_CUA_db for the control-table cells below.
spark.sql('use non_CUA_db')

In [None]:
# Load every column of semi_final_matrix_update from the current database
# into `nsfm`; the trailing bare name displays the DataFrame summary.
nsfm= spark.sql("""
    SELECT *
    FROM semi_final_matrix_update
""")
nsfm

In [None]:
# Number of distinct personid values in the control matrix.
nsfm.select('personid').distinct().count()

In [None]:
# Count control rows with a null 'age' by summing a 0/1 null flag.
control_age_is_null_flag = col("age").isNull().cast("integer")
non_age_null_count = nsfm.select(spark_sum(control_age_is_null_flag)).first()[0]
print("Number of null values in 'age':", non_age_null_count)

##nulls= 0

In [None]:
# Count control rows whose 'max_BMI_ratio' is null.
non_BMI_null_count = nsfm.select(spark_sum(col("max_BMI_ratio").isNull().cast("integer"))).collect()[0][0]
# The label names the column that was actually counted (it previously said 'age').
print("Number of null values in 'max_BMI_ratio':", non_BMI_null_count)
##nulls = 0

In [None]:
# Count distinct control persons having at least one race/ethnicity flag
# (including Unknown) equal to "1".
control_race_flag_columns = [
    "AIAN", "NHPI", "Asian", "Black", "White", "MENA",
    "Hisp_Latino", "Other", "API_ethn", "Mixed", "Unknown",
]
# OR the per-column tests together, in the listed order.
control_any_flag_set = col(control_race_flag_columns[0]) == "1"
for control_flag_name in control_race_flag_columns[1:]:
    control_any_flag_set = control_any_flag_set | (col(control_flag_name) == "1")

control_persons_with_flag = nsfm.filter(control_any_flag_set).select("personid").distinct()
nfiltered_race_count = control_persons_with_flag.count()
nfiltered_race_count

In [None]:
# Count distinct control persons having a row where every race/ethnicity
# flag (including Unknown) equals "0".
control_race_flag_columns = [
    "AIAN", "NHPI", "Asian", "Black", "White", "MENA",
    "Hisp_Latino", "Other", "API_ethn", "Mixed", "Unknown",
]
# AND the per-column tests together, in the listed order.
control_no_flag_set = col(control_race_flag_columns[0]) == "0"
for control_flag_name in control_race_flag_columns[1:]:
    control_no_flag_set = control_no_flag_set & (col(control_flag_name) == "0")

control_persons_without_flag = nsfm.filter(control_no_flag_set).select("personid").distinct()
nfiltered_race_count2 = control_persons_without_flag.count()
nfiltered_race_count2

In [None]:
# Control person ids having a row where every race/ethnicity flag (including
# Unknown) equals "0". The ids are de-duplicated in Spark before being
# collected so the driver-side list holds each person once; membership tests
# on the list are unaffected by removing duplicates.
nfiltered_personids = (
    nsfm.filter((col("AIAN") == "0") & (col("NHPI") == "0") & (col("Asian") == "0")& (col("Black") == "0")& (col("White") == "0")& (col("MENA") == "0")& (col("Hisp_Latino") == "0")& (col("Other") == "0")& (col("API_ethn") == "0")& (col("Mixed") == "0")& (col("Unknown") == "0"))
    .select("personid")
    .distinct()
)

# Materialise the ids on the driver as a plain Python list.
nfiltered_personid_list = [row.personid for row in nfiltered_personids.collect()]


In [None]:
# Set Unknown to 1 on every control row whose personid is in
# nfiltered_personid_list; all other rows keep their existing Unknown value.
control_person_has_no_race_flag = col("personid").isin(nfiltered_personid_list)
nsfm_with_unknown = nsfm.withColumn(
    "Unknown",
    when(control_person_has_no_race_flag, 1).otherwise(col("Unknown")),
)

In [None]:
# Preview five rows of the updated control frame as a pandas DataFrame.
nsfm_with_unknown.limit(5).toPandas()

In [None]:
# Baseline: distinct rows flagged Unknown == "1" in the ORIGINAL control frame
# (nsfm), i.e. before the Unknown column was updated.
# NOTE(review): this reuses the name filtered_unknown_count2 from the CUA
# section and overwrites that earlier value.
control_unknown_rows_before = nsfm.where(col("Unknown") == "1").distinct()
filtered_unknown_count2 = control_unknown_rows_before.count()
filtered_unknown_count2

##no nulls

In [None]:
# Distinct rows flagged Unknown == "1" in the updated control frame.
control_unknown_rows_after = nsfm_with_unknown.where(col("Unknown") == "1").distinct()
nfiltered_unknown_count3 = control_unknown_rows_after.count()
nfiltered_unknown_count3
## no nulls

In [None]:
# Total number of rows in the updated control frame.
nsfm_with_unknown.count()
## 23805007

In [None]:
# Persist the updated control matrix as a table.
# NOTE(review): no save mode is specified, so this presumably fails if the
# table already exists — confirm before re-running the notebook.
nsfm_with_unknown.write.saveAsTable('non_CUA_db.semi_final_matrix_clean2')

## Concatenate with the CUA database for PSM

In [None]:
# Select non_CUA_db so the control table can be read by its unqualified name.
spark.sql("use non_CUA_db")

In [None]:
# Reload the saved control table (semi_final_matrix_clean2) into `nSFM`.
nSFM = spark.sql("""
    select *
    from semi_final_matrix_clean2
""")
nSFM

In [None]:
# Switch back to CUA_db so the CUA table can be read by its unqualified name.
spark.sql("use CUA_db")

In [None]:
# Reload the saved CUA table (semi_final_matrix_clean_imputed) into `SFM`.
SFM = spark.sql("""
    select *
    from semi_final_matrix_clean_imputed
""")
SFM

In [None]:
# Stack the CUA rows and the control rows into a single frame.
# unionByName aligns columns by NAME; plain union() aligns purely by position
# and would silently mix up columns if the two saved tables differ in column
# order. A genuine schema mismatch now raises instead of corrupting the data.
concat_cua_non = SFM.unionByName(nSFM)

In [None]:
# Print the total row count of the combined frame, then preview five rows.
print(concat_cua_non.count())
concat_cua_non.limit(5).toPandas()

In [None]:
# Persist the combined CUA + control frame as a table.
# NOTE(review): no save mode is specified, so this presumably fails if the
# table already exists — confirm before re-running the notebook.
concat_cua_non.write.saveAsTable('CUA_db.concat_cua_non_table')