
# Curated Data - Data for R Pipeline
 
**Description** This notebook creates the curated data for Aortic Stenosis, SAVR and TAVI.
 
**Authors** Fionna Chalmers, Anna Stevenson (Health Data Science Team, BHF Data Science Centre)

**Reviewers** ⚠ UNREVIEWED

**Notes** TAVI code combinations contain SAVR codes; therefore, to derive a SAVR case, it must first be established that no TAVI combination exists.

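**Data Input**
- **`ccu056_out_codelists_inclusions`** : codelist for AS, SAVR and TAVI


**Data Output**
- **`ccu056_tmp_cases_as_codes`** : HES APC records matching an aortic stenosis code
- **`ccu056_tmp_cases_procedure_codes`** : HES APC operation records matching a SAVR or TAVI code
- **`ccu056_tmp_cases_procedure_codes_operation_dates`** : HES APC operation-date records matching a SAVR or TAVI code
- **`ccu056_tmp_cases_acs`** : extract of the `nacsa` audit dataset
- **`ccu056_tmp_cases_tavi`** : extract of the `tavi` audit dataset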


# 0. Setup

In [0]:
# pyspark libraries
import pyspark.sql.functions as f
import pyspark.sql.types as t
from pyspark.sql import Window

from functools import reduce

import databricks.koalas as ks
import pandas as pd
import numpy as np

import re
import io
import datetime

# plotting libraries
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import dates as mdates
import seaborn as sns


# Record the plotting-library versions and the run timestamp in the cell output
# so that a saved run documents the environment it was produced in.
for _label, _version in (
    ("Matplotlib version: ", matplotlib.__version__),
    ("Seaborn version: ", sns.__version__),
):
    print(_label, _version)

# kept as a datetime object; the formatted variant is left here for reference
_datetimenow = datetime.datetime.now() # .strftime("%Y%m%d")
print(f"_datetimenow:  {_datetimenow}")


## 0.1 Helpers

### Common Functions

In [0]:
%run "/Repos/shds/common/functions"

### Help Functions

In [0]:
%run "/Repos/shds/Fionna/help_functions"

## 0.2 Parameters

In [0]:
%run "./CCU056-01-parameters"

## 0.3 Data

In [0]:
# Project-curated HES APC tables. `dsa` and `proj` are not defined in this notebook;
# presumably they come from the parameters notebook run above — confirm.
#   hes_apc_long        : matched against the aortic stenosis codelist (section 1)
#   hes_apc_op_long     : matched against the SAVR/TAVI codelists (section 2)
#   hes_apc_op_otr_long : matched against the SAVR/TAVI codelists after renaming OPERTN -> CODE (section 2.1)
hes_apc_long = spark.table(f'{dsa}.{proj}_cur_hes_apc_all_years_archive_long')
hes_apc_op_long = spark.table(f'{dsa}.{proj}_cur_hes_apc_all_years_archive_op_long')
hes_apc_op_otr_long = spark.table(f'{dsa}.{proj}_cur_hes_apc_all_years_archive_op_otr_long')
# inclusion codelists; filtered by `name` ("aortic_stenosis", "savr", "tavi") below
codelists_inclusions = spark.table(f'{dsa}.{proj}_out_codelists_inclusions')

# Skinny (patient characteristics) tables. Only `skinny_unassembled` is referenced by
# name later in this notebook (Appendix); `skinny_assembled` and `lsoa` are loaded but
# not referenced by name in any later visible cell.
skinny_unassembled = spark.table(f'{dsa}.{proj}_tmp_kpc_harmonised')
skinny_assembled = spark.table(f'{dsa}.{proj}_tmp_skinny')
lsoa = spark.table(f'{dsa}.{proj}_lsoa')

# Audit datasets, pulled with a helper that is not defined in this notebook.
# Note that `acs` holds the 'nacsa' dataset; both frames are saved unchanged in section 3.
acs   = extract_batch_from_archive(parameters_df_datasets, 'nacsa')
tavi = extract_batch_from_archive(parameters_df_datasets, 'tavi')

In [0]:
# eyeball the inclusion codelists (columns `name` and `code` are used by the joins below)
display(codelists_inclusions)

# 1. HES APC - Aortic Stenosis

## Check

Check that there are no instances in which a 3-digit code is present but the 4-digit code is missing, because we use the 4-digit codes when filtering for our codelist.

Only 9 cases found and they do not apply to our project codelists.

In [0]:
# hes_apc_op_long = spark.table(f'{dsa}.{proj}_cur_hes_apc_all_years_archive_op_long')

# hes_op_wide = (hes_apc_op_long.groupBy("PERSON_ID","EPIKEY","EPISTART","ADMIDATE","OPERTN_POSITION").pivot("OPERTN_DIGITS").agg(f.first("CODE")))

# tmp1 = (
#     hes_op_wide
#     .withColumnRenamed('3','OPERTRN_3').withColumnRenamed('4','OPERTRN_4')
#     .withColumn("OPERTRN_4_3", f.col("OPERTRN_4").substr(1, 3))
#     .withColumn("check", f.when(f.col("OPERTRN_4_3")==f.col("OPERTRN_3"), 1).otherwise(0))
#     )

In [0]:
# display(tmp1.groupBy("check").count())

In [0]:
# display(tmp1.filter(f.col("check")==0))

In [0]:
 # Keep every HES APC row whose CODE appears on the aortic stenosis codelist.
 # The codelist side is de-duplicated before joining (as the SAVR/TAVI joins in this
 # notebook already do) so that a code listed more than once in the codelist cannot
 # multiply HES rows in the inner join.
 as_codes = (
     hes_apc_long
     .join((codelists_inclusions.filter(f.col("name")=="aortic_stenosis").select(f.col("code").alias("CODE")).distinct()),
           on="CODE",how="inner")
 )

In [0]:
# number of HES APC rows matched to an aortic stenosis code (count() executes the join)
display(as_codes.count())

In [0]:
# inspect the matched aortic stenosis rows
display(as_codes)

In [0]:
# Persist the aortic stenosis matches. `save_table` is defined outside this notebook;
# save_previous=False presumably skips keeping a copy of the prior table — confirm against the helper.
save_table(df=as_codes,out_name=f'{proj}_tmp_cases_as_codes', save_previous=False)

# 2. HES APC - Operations

In [0]:
# SAVR and TAVI codes as a single de-duplicated CODE column, so a code that sits
# on both lists is only joined once.
_procedure_codelist = (
    codelists_inclusions
    .filter(f.col("name").isin("savr","tavi"))
    .select(f.col("code").alias("CODE"))
    .distinct()
)

# HES APC operation rows whose CODE is a SAVR or TAVI code
procedure_codes = hes_apc_op_long.join(_procedure_codelist, on="CODE", how="inner")

In [0]:
# number of HES APC operation rows matched to a SAVR/TAVI code (count() executes the join)
display(procedure_codes.count())

In [0]:
# inspect the matched SAVR/TAVI operation rows
display(procedure_codes)

In [0]:
# Persist the SAVR/TAVI operation matches. `save_table` is defined outside this notebook;
# save_previous=False presumably skips keeping a copy of the prior table — confirm against the helper.
save_table(df=procedure_codes,out_name=f'{proj}_tmp_cases_procedure_codes', save_previous=False)

## 2.1 Operation Dates

In [0]:
# SAVR and TAVI codes as a single de-duplicated CODE column
_operation_date_codelist = (
    codelists_inclusions
    .filter(f.col("name").isin("savr","tavi"))
    .select(f.col("code").alias("CODE"))
    .distinct()
)

# The operation-date table carries its code in OPERTN; rename it so the join key
# lines up with the codelist, then keep only SAVR/TAVI rows.
_operations_by_code = hes_apc_op_otr_long.withColumnRenamed('OPERTN','CODE')
hes_apc_op_otr_long_cohort = _operations_by_code.join(
    _operation_date_codelist, on="CODE", how="inner"
)

In [0]:
# inspect the SAVR/TAVI rows from the operation-date table
display(hes_apc_op_otr_long_cohort)

In [0]:
# Persist the SAVR/TAVI operation-date rows. `save_table` is defined outside this notebook;
# save_previous=False presumably skips keeping a copy of the prior table — confirm against the helper.
save_table(df=hes_apc_op_otr_long_cohort,out_name=f'{proj}_tmp_cases_procedure_codes_operation_dates', save_previous=False)

# 3. Audits

In [0]:
# Persist the two audit extracts unchanged, one output table per audit.
for _audit_df, _audit_out_name in (
    (acs, f'{proj}_tmp_cases_acs'),
    (tavi, f'{proj}_tmp_cases_tavi'),
):
    save_table(df=_audit_df, out_name=_audit_out_name, save_previous=False)

# Appendix

In [0]:
# Skinny selection

In [0]:
# Unassembled skinny records for two hard-coded PERSON_IDs, pulled for manual inspection.
# NOTE(review): the original comment calls these "dummy data" — confirm the IDs are not
# real identifiers before this notebook or its output is shared.

dummy = skinny_unassembled.filter(f.col("PERSON_ID").isin(["000K1P6P6B5FXHH","00GSM26FLPY83DM"]))

In [0]:
# Assembled skinny records for the same two PERSON_IDs, for comparison with the
# unassembled rows. The frame loaded in the Data section is `skinny_assembled`;
# no variable named `skinny` exists in this notebook.
display(skinny_assembled.filter(f.col("PERSON_ID").isin(["000K1P6P6B5FXHH","00GSM26FLPY83DM"])))

In [0]:
# inspect the unassembled records selected above
display(dummy)

In [0]:


  """
  Description:
    Function to produce the project-specific patient skinny assembled table, as per DATA_CURATION\\curr302_patient_skinny_record:
      **Description** Making a single record for each patient in primary and secondary care.
      **Author(s)** Sam Hollings
      **Reviewer(s)** Angela Wood

  Args:
    _unassembled (DataFrame): A dataframe containing the patient skinny unassembled table produced by the skinny_unassembled function.
    _overall_censor_date (str): A date string (e.g., '2020-01-01') containing the overall censor date used to exclude records before assembling. 
    _individual_censor_dates (DataFrame): A dataframe containing individual censor dates used to exclude records before assembling. 
    _prioritise_primary_care (bool): A boolean indicating whether to prioritise primary care records when assembling (default = True).
    
  Returns:
    pyspark.sql.DataFrame: A dataframe containing the patient skinny record.
    
  Example:
    ...    
  """   
  
  
  # ------------------------------------------------------------------------------------
  # dss_corporate
  # ------------------------------------------------------------------------------------    
  ethnic_hes = spark.table(f'dss_corporate.hesf_ethnicity')
  ethnic_gdppr = spark.table(f'dss_corporate.gdppr_ethnicity')
  
  
  
  varlist = ['DOB', 'SEX', 'ETHNIC']
      
  # remove those with missing ID, missing record date, and record date in the future  
  _harmonised = (
    harmonised
    .where(f.col('PERSON_ID').isNotNull())
    .where(f.col('RECORD_DATE').isNotNull())
    .where(f.col('RECORD_DATE') <= f.col('archived_on')))
  
  # ------------------------------------------------------------------------------------
  # _prioritise_primary_care
  # ------------------------------------------------------------------------------------       
  assert prioritise_primary_care in [0, 1]
  if(prioritise_primary_care == 0):
    # if zero (no) then null the RECORD_PRIMARY column
    # note: this edit avoids having to redefine windows later
    print(f'** NOT prioritising primary care records **')
    _harmonised = (_harmonised.withColumn('RECORD_PRIMARY', f.lit(None))) 
  else: print(f'** prioritising primary care records **')
  
  # ------------------------------------------------------------------------------------
  # characteristic selection preparation
  # ------------------------------------------------------------------------------------ 
  # define RECORD_PRIMARY_DOB, ensure HES APC is prioritised above HES AE and HES OP (which are based on an estimate using age)
  # define RECORD_SOURCEn
  _harmonised = (
    _harmonised
    .withColumn('RECORD_PRIMARY_DOB',
      f.when(f.col('RECORD_SOURCE').isin(['hes_ae', 'hes_op']), -1)
      .otherwise(f.col('RECORD_PRIMARY'))
    )
    .withColumn('RECORD_SOURCEn',
      f.when(f.col('RECORD_SOURCE') == 'gdppr', 1)
      .when(f.col('RECORD_SOURCE') == 'gdppr_snomed', 2)
      .when(f.col('RECORD_SOURCE') == 'hes_apc', 3)
      .when(f.col('RECORD_SOURCE') == 'hes_op', 4)
      .when(f.col('RECORD_SOURCE') == 'hes_ae', 5)
    ))

  # checks turned off for runtime
  # assert _unassembled.count() == _unassembled.where(f.col('RECORD_SOURCEn').isNotNull()).count()
  
  # define windows for row numbers
  _win_rownum_DOB = (
    Window
    .partitionBy('PERSON_ID')\
    .orderBy(['DOB_null', f.desc('RECORD_PRIMARY_DOB'), f.desc('RECORD_DATE'), 'RECORD_SOURCEn', 'RECORD_ID']))
  _win_rownum_SEX = (
    Window
    .partitionBy('PERSON_ID')
    .orderBy(['SEX_null', f.desc('RECORD_PRIMARY'), f.desc('RECORD_DATE'), 'RECORD_SOURCEn', 'RECORD_ID']))
  _win_rownum_ETHNIC = (
    Window
    .partitionBy('PERSON_ID')
    .orderBy(['ETHNIC_null', f.desc('RECORD_PRIMARY'), f.desc('RECORD_DATE'), 'RECORD_SOURCEn', 'RECORD_ID']))
  
  # create null indicators
  # add row numbers
  _harmonised = (
    _harmonised
    .withColumn('DOB_null',
      f.when(
        (f.col('DOB').isNull())
        | (f.trim(f.col('DOB')).isin(['']))
        | (f.col('DOB') < '1900-01-01')
        | (f.col('DOB') > f.col('archived_on'))
        | (f.col('DOB') > f.col('RECORD_DATE'))
      , 1).otherwise(0)
    )
    .withColumn('SEX_null',
      f.when( (f.col('SEX').isNull()) | (f.trim(f.col('SEX')).isin(['', '9', '0'])), 1).otherwise(0)
    )
    .withColumn('ETHNIC_null',
      f.when( (f.col('ETHNIC').isNull()) | (f.trim(f.col('ETHNIC')).isin(['', '9', '99', 'X', 'Z'])), 1).otherwise(0)
    )
    .withColumn('_rownum_DOB', f.row_number().over(_win_rownum_DOB))
    .withColumn('_rownum_SEX', f.row_number().over(_win_rownum_SEX))
    .withColumn('_rownum_ETHNIC', f.row_number().over(_win_rownum_ETHNIC)))
  
  # ------------------------------------------------------------------------------------
  # ties
  # ------------------------------------------------------------------------------------   
  # create indicators for tied records that have different values for a given variable  
  for ind, var in enumerate(varlist):
    # define window for tied records
    record_primary = 'RECORD_PRIMARY'
    if(var == 'DOB'): record_primary = 'RECORD_PRIMARY_DOB'
    _win_tie = Window\
      .partitionBy('PERSON_ID')\
      .orderBy(f'{var}_null', f.desc(record_primary), f.desc('RECORD_DATE'))
      
    # count distinct values of var (including null) within tied records
    _tie = (
      _harmonised
      .withColumn(f'_tie_{var}', f.dense_rank().over(_win_tie))
      .where(f.col(f'_tie_{var}') == 1)
      .groupBy('PERSON_ID')
      .agg(
        f.countDistinct(f.col(f'{var}')).alias(f'_n_distinct_{var}')
        , f.countDistinct(f.when(f.col(f'{var}').isNull(), 1)).alias(f'_null_{var}')
      )
      .withColumn(f'_tie_{var}', f.when((f.col(f'_n_distinct_{var}') + f.col(f'_null_{var}')) > 1, 1).otherwise(0))
      .select('PERSON_ID', f'_tie_{var}'))
  
    if(ind == 0): _tmp_ties = _tie
    else: _tmp_ties = (_tmp_ties.join(_tie, on=['PERSON_ID'], how='outer'))
    
  # ------------------------------------------------------------------------------------
  # characteristic selection
  # ------------------------------------------------------------------------------------       
  # take information from the first row identified above
  _tmp_selected = {}
  for var in varlist:
    _tmp = (
      _harmonised
      .select('PERSON_ID', 'RECORD_DATE', 'RECORD_SOURCE', f'{var}', f'_rownum_{var}')
      .where(f.col(f'_rownum_{var}') == 1)
      .withColumnRenamed('RECORD_DATE', f'_date_{var}')
      .withColumnRenamed('RECORD_SOURCE', f'_source_{var}')
      .select('PERSON_ID', f'{var}', f'_date_{var}', f'_source_{var}'))
    _tmp_selected[f'{var}'] = _tmp
  
  # ------------------------------------------------------------------------------------  
  # gdppr presence
  # ------------------------------------------------------------------------------------  
  # in_gdppr is extracted from the unfiltered table
  #   so in_gdppr may relate to records after date exclusions
  _tmp_in_gdppr = (
    harmonised
    .where(f.col('RECORD_SOURCE') == 'gdppr')
    .select('PERSON_ID')
    .distinct()
    .where(f.col('PERSON_ID').isNotNull())
    .withColumn('in_gdppr', f.lit(1)))  
  
  # ------------------------------------------------------------------------------------ 
  # ethnic lookup
  # ------------------------------------------------------------------------------------ 
  # ethnic_hes
  ethnic_hes = (
    ethnic_hes
    .select('ETHNICITY_CODE', 'ETHNICITY_DESCRIPTION')
    .withColumnRenamed('ETHNICITY_CODE', 'ETHNIC')
    .withColumnRenamed('ETHNICITY_DESCRIPTION', 'ETHNIC_DESC_HES'))

  #  ethnic_gdppr
  ethnic_gdppr = (
    ethnic_gdppr
    .select('Value', 'Label')
    .withColumnRenamed('Value', 'ETHNIC')
    .withColumnRenamed('Label', 'ETHNIC_DESC_GDPPR'))

  # checks
  assert ethnic_hes.count() == ethnic_hes.select('ETHNIC').distinct().count()\
    , "ethnic_hes is not well-defined"
  vallist = list(range(0, 10)) + list('ABCDEFGHJKLMNPRSXZ')
  tmp = (ethnic_hes.withColumn('chk', f.when(f.col('ETHNIC').isin(vallist), 1).otherwise(0)))
  assert tmp.count() == tmp.where(f.col('chk') == 1).count()\
    , "ethnic_hes does not have all required values defined"
  assert ethnic_gdppr.count() == ethnic_gdppr.select('ETHNIC').distinct().count()\
    , "ethnic_gdppr is not well-defined"
  vallist = list('ABCDEFGHJKLMNPRSTWZ')
  tmp = (ethnic_gdppr.withColumn('chk', f.when(f.col('ETHNIC').isin(vallist), 1).otherwise(0)))
  assert tmp.count() == tmp.where(f.col('chk') == 1).count()\
    , "ethnic_gdppr does not have all required values defined"

  # combine
  ethnic = (
    ethnic_hes
    .join(ethnic_gdppr, on='ETHNIC', how='outer')
    .withColumn('ETHNIC_DESC', f.coalesce(f.col('ETHNIC_DESC_HES'), f.col('ETHNIC_DESC_GDPPR')))
    .orderBy('ETHNIC'))

  # add ethnic cat
  _tmp_ethnic = (
    ethnic
    .select('ETHNIC', 'ETHNIC_DESC')
    .withColumn('ETHNIC_CAT',
     f.when(f.col('ETHNIC').isin(['0','A','B','C']), f.lit('White'))
      .when(f.col('ETHNIC').isin(['1','2','3','N','M','P']), f.lit('Black or Black British'))
      .when(f.col('ETHNIC').isin(['4','5','6','L','K','J','H']), f.lit('Asian or Asian British'))
      .when(f.col('ETHNIC').isin(['D','E','F','G']), f.lit('Mixed'))
      .when(f.col('ETHNIC').isin(['7','8','W','T','S','R']), f.lit('Other'))
      .when(f.col('ETHNIC').isin(['9','Z','X']), f.lit('Unknown'))
      .otherwise('Unknown')
    ))

  # check
  # tmpt = tab(ethnic_cat, 'ETHNIC_DESC', 'ETHNIC_CAT', var2_unstyled=1); print()
    
  # ------------------------------------------------------------------------------------  
  # _selected
  # ------------------------------------------------------------------------------------  
  _selected = (
    _tmp_selected['DOB']
    .join(_tmp_selected['SEX'], on=['PERSON_ID'], how='outer')
    .join(_tmp_selected['ETHNIC'], on=['PERSON_ID'], how='outer')
    .join(_tmp_in_gdppr, on=['PERSON_ID'], how='outer')
    .join(_tmp_ties, on=['PERSON_ID'], how='outer')
    .join(_tmp_ethnic, on=['ETHNIC'], how='left')
    .select('PERSON_ID', 'DOB', 'SEX', 'ETHNIC', 'ETHNIC_DESC', 'ETHNIC_CAT'
            , '_date_DOB', '_source_DOB', '_tie_DOB'
            , '_date_SEX', '_source_SEX', '_tie_SEX'
            , '_date_ETHNIC', '_source_ETHNIC', '_tie_ETHNIC'
            , 'in_gdppr'))

  return _selected