# CCU056 Covariates
 
**Description** This notebook creates the covariates, which are defined for each individual from the latest records before the date of operation, as follows:
* Prior history of outcomes;
* Prior history of comorbidities;
 
**Authors**

**Reviewers** ⚠ UNREVIEWED

**Acknowledgements** Based on previous work by Tom Bolton, Alexia Sampri for CCU018_01, earlier CCU002 sub-projects and subsequently CCU003_05-D10-covariates

**Notes**

**Data Output**
- **`ccu056_out_covariates_icd_only`** : covariates for the cohort (ICD-10 codes matched against HES APC only)

In [0]:
# Drop anything cached in this Spark session so the notebook starts from a clean state.
spark.sql('CLEAR CACHE')
# Legacy flag: lets a managed table be created even if its storage location is not empty
# (presumably needed by the save_table helper when overwriting outputs - TODO confirm).
spark.conf.set('spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation', 'true')

In [0]:
import pyspark.sql.functions as f
import pyspark.sql.types as t
from pyspark.sql import Window

from functools import reduce

import databricks.koalas as ks
import pandas as pd
import numpy as np

import re
import io
import datetime

import matplotlib
import matplotlib.pyplot as plt
from matplotlib import dates as mdates
import seaborn as sns

# Record the plotting library versions and the run timestamp in the notebook output for provenance.
print("Matplotlib version: ", matplotlib.__version__)
print("Seaborn version: ", sns.__version__)
_datetimenow = datetime.datetime.now() # kept as a datetime object; .strftime("%Y%m%d") would give a date stamp
print(f"_datetimenow:  {_datetimenow}")

In [0]:
%run "../SHDS/common/functions"

# 0. Parameters

In [0]:
%run "./CCU056-01-parameters"

# 1. Data

In [0]:
# Input tables. `dsa`, `proj` and `path_cur_hes_apc_long` are not defined in this notebook;
# presumably they come from the parameters notebook run above - verify there.

# covariate codelist (columns used below: codelist, name, terminology, code)
codelist = spark.table(f'{dsa}.{proj}_out_codelist_covariates')

# study cohort (columns used below: PERSON_ID, DOB, OPERATION_DATE)
cohort = spark.table(f'{dsa}.{proj}_tmp_main_cohort_final2')

# HES APC diagnoses in long format (columns used below: PERSON_ID, EPISTART, CODE, DIAG_POSITION, DIAG_DIGITS)
hes_apc_long = spark.table(path_cur_hes_apc_long)

In [0]:
# Visual check of the full covariate codelist before it is split and filtered.
display(codelist)

In [0]:
# Split the codelist into thematic subsets by substring match on the `codelist` column;
# the `codelist` column itself is dropped from each subset.
# NOTE(review): none of these three subsets is referenced in the visible remainder of this
# notebook - confirm whether they are still needed.
codelist_cardiovascular_risk_factors = (
  codelist
  .filter(f.col("codelist").contains("Cardiovascular Risk Factors"))
  .drop("codelist")
)

codelist_comorbidities = (
  codelist
  .filter(f.col("codelist").contains("Comorbidities"))
  .drop("codelist")
)

codelist_cardiovascular_post_intervention_outcomes = (
  codelist
  .filter(f.col("codelist").contains("Post Intervention"))
  .drop("codelist")
)

# 2. Prepare

In [0]:
print('--------------------------------------------------------------------------------------')
print('individual_censor_dates')
print('--------------------------------------------------------------------------------------')

# Per-individual look-back window for covariate ascertainment:
#   CENSOR_DATE_START = date of birth (DOB)
#   CENSOR_DATE_END   = date of operation (OPERATION_DATE)
# A DOB recorded as the string "Unknown" is replaced by the sentinel 1800-01-01 so that the
# start of the window never excludes records for those individuals.
# All other cohort columns are carried through unchanged.
individual_censor_dates = (
  cohort
  .withColumnRenamed('DOB', 'CENSOR_DATE_START')
  .withColumnRenamed('OPERATION_DATE', 'CENSOR_DATE_END')
  .withColumn(
    'CENSOR_DATE_START',
    f.to_date(
      f.when(f.col('CENSOR_DATE_START') == "Unknown", "1800-01-01")
       .otherwise(f.col('CENSOR_DATE_START'))
    )
  )
  .withColumn('CENSOR_DATE_END', f.to_date(f.col('CENSOR_DATE_END')))
)

# check: PERSON_ID counts, then a small sample of rows
count_var(individual_censor_dates, 'PERSON_ID')
print()
print(individual_censor_dates.limit(10).toPandas().to_string())
print()

In [0]:
print('--------------------------------------------------------------------------------------')
print('hes_apc')
print('--------------------------------------------------------------------------------------')

# Reduce to the columns needed for codelist matching; EPISTART becomes the record DATE.
hes_apc_diag = hes_apc_long.select(
  'PERSON_ID',
  f.col('EPISTART').alias('DATE'),
  'CODE',
  'DIAG_POSITION',
  'DIAG_DIGITS',
)

# Attach each individual's censor window. The inner join keeps only records for
# individuals present in individual_censor_dates (i.e. the cohort).
hes_apc_censored = hes_apc_diag.join(individual_censor_dates, on='PERSON_ID', how='inner')

# Record-level inclusion rules (a row is kept only when every rule evaluates to true,
# so a null on either side of a comparison excludes the row):
#   - DATE is not null
#     note: checked in curated_data for potential columns to use in the case of null
#     DATE (EPISTART) - no substantial gain from other columns
#   - DATE <= CENSOR_DATE_END
#   - DATE >= CENSOR_DATE_START ("Unknown" DOBs were replaced by a sentinel in the previous step)
# NOTE(review): the upper bound is inclusive, so records dated on the day of operation are
# kept, whereas the notebook description says "before the date of operation" - confirm
# which is intended.
has_date = f.col('DATE').isNotNull()
on_or_before_end = f.col('DATE') <= f.col('CENSOR_DATE_END')
on_or_after_start = f.col('DATE') >= f.col('CENSOR_DATE_START')

hes_apc_long_prepared = (
  hes_apc_censored
  .where(has_date & on_or_before_end)
  .where(on_or_after_start)
)

# temp save (checkpoint)
hes_apc_long_prepared = save_table(df=hes_apc_long_prepared, out_name=f'{proj}_tmp_covariates_hes_apc')

In [0]:
# Re-read the checkpointed table so this section can be resumed without recomputing the cell above.
hes_apc_long_prepared = spark.table(f'{dsa}.{proj}_tmp_covariates_hes_apc')

In [0]:
# Visual check of the prepared HES APC records restricted to each individual's censor window.
display(hes_apc_long_prepared)

# 3. Codelists

In [0]:
# List each distinct (codelist, name) pair, ordered by name and then codelist.
codelist_names = (
  codelist
  .select("codelist", "name")
  .distinct()
  .orderBy("name", "codelist")
)
display(codelist_names)

In [0]:
print('codelist_icd\n')

# Keep only the ICD-10 entries of the codelist.
is_icd10 = f.col('terminology') == 'ICD10'
codelist_icd = codelist.where(is_icd10)

# check: tabulate name by terminology, then print the codelist ordered by name and code
tmpt = tab(codelist_icd, 'name', 'terminology')
print()
codelist_icd_sorted = codelist_icd.orderBy('name', 'code')
print(codelist_icd_sorted.toPandas().to_string())
print()

In [0]:
# Visual check of the ICD-10 subset of the codelist.
display(codelist_icd)

# 4. Codelist match

In [0]:

# Drop the diagnosis position/digits columns before matching.
# The variable name is passed as a string in dict_hx_out in the next cell, so it must not be renamed.
hes_apc_long_prepared_test = hes_apc_long_prepared.drop("DIAG_POSITION", "DIAG_DIGITS")

In [0]:
# Matching specification: source label -> [dataset variable name, codelist variable name,
# ordering in the event of tied records].
# Datasets and codelists are given by name as strings; presumably the helper looks them up
# itself - verify in SHDS/common/functions.
dict_hx_out = {
  'hes_apc': ['hes_apc_long_prepared_test', 'codelist_icd', 1],
}

# Run the codelist match. Output columns are prefixed with 'cov_'.
# default _last_event=0: filter to 1st event
# NOTE(review): the notebook description says covariates come from the "latest records";
# confirm that keeping the 1st event is intended.
hx_out, hx_out_1st, hx_out_1st_wide = codelist_match(dict_hx_out, _name_prefix='cov_')
print()

# Summarise the matches (by name, and by name and code).
hx_out_summ_name, hx_out_summ_name_code = codelist_match_summ(dict_hx_out, hx_out)
print()

In [0]:
# Checkpoint every codelist-match output as a temporary table.
# Each name is rebound to whatever save_table returns.
hx_out_all = save_table(
  df=hx_out['all'],
  out_name=f'{proj}_tmp_covariates_all_icd_only',
)
hx_out_1st = save_table(
  df=hx_out_1st,
  out_name=f'{proj}_tmp_covariates_1st_icd_only',
)
hx_out_1st_wide = save_table(
  df=hx_out_1st_wide,
  out_name=f'{proj}_tmp_covariates_1st_wide_icd_only',
)
hx_out_summ_name = save_table(
  df=hx_out_summ_name,
  out_name=f'{proj}_tmp_covariates_summ_name_icd_only',
)
hx_out_summ_name_code = save_table(
  df=hx_out_summ_name_code,
  out_name=f'{proj}_tmp_covariates_summ_name_code_icd_only',
)

# 5. Check and Save

In [0]:
# Re-read the checkpointed match outputs so the checks below can be run without repeating the match.
hx_out_all = spark.table(f'{dsa}.{proj}_tmp_covariates_all_icd_only')  # all matched records
hx_out_1st = spark.table(f'{dsa}.{proj}_tmp_covariates_1st_icd_only')
hx_out_1st_wide = spark.table(f'{dsa}.{proj}_tmp_covariates_1st_wide_icd_only')
hx_out_summ_name = spark.table(f'{dsa}.{proj}_tmp_covariates_summ_name_icd_only')
hx_out_summ_name_code = spark.table(f'{dsa}.{proj}_tmp_covariates_summ_name_code_icd_only')

In [0]:
# Spot check: for hypertension matches from HES APC, count distinct individuals per code.
hypertension_code_counts = (
  hx_out_all
  .filter(f.col("name") == "hypertension")
  .filter(f.col("source") == "hes_apc")
  .select("CODE", "PERSON_ID")
  .distinct()
  .groupBy("CODE")
  .count()
)
display(hypertension_code_counts)

In [0]:
# check PERSON_ID counts in the wide table (it is merged 1:1 on PERSON_ID with the cohort in section 6)
count_var(hx_out_1st_wide, 'PERSON_ID')

In [0]:
# check result: wide-format output of the codelist match
display(hx_out_1st_wide)

In [0]:
# check codelist match summary by name and source
# (here the only source is 'hes_apc', as configured in dict_hx_out)
display(hx_out_summ_name)

In [0]:
# check codelist match summary by name, source, and code
# (useful for spotting codes that dominate, or never match, within a covariate)
display(hx_out_summ_name_code)

# 6. Save

In [0]:
# Merge the wide covariates with the cohort's PERSON_IDs.
# validate='1:1' requests a one-row-per-PERSON_ID check on both sides; assert_results lists
# the merge outcomes that are allowed: 'both' (matched) and 'right_only' (in the cohort but
# no covariate record). Presumably any other outcome raises - verify against the merge
# helper in SHDS/common/functions.
tmp1 = merge(
  hx_out_1st_wide,
  cohort.select('PERSON_ID'),
  ['PERSON_ID'],
  validate='1:1',
  assert_results=['both', 'right_only'],
  indicator=0,
)
print()

# check: PERSON_ID counts, number of columns, and the column names
count_var(tmp1, 'PERSON_ID')
print()
print(len(tmp1.columns))
print()
column_listing = pd.DataFrame({'_cols': tmp1.columns})
print(column_listing.to_string())
print()

In [0]:
# check final: the merged covariates table that is saved as the notebook output in the next cell
display(tmp1)

In [0]:
# Save the final covariates table as the notebook output.
# save_previous=True presumably keeps a copy of the existing output before overwriting it -
# verify against the save_table helper in SHDS/common/functions.
save_table(df=tmp1, out_name=f'{proj}_out_covariates_icd_only', save_previous=True)