In [0]:
# Drop everything currently held in the Spark cache before the notebook starts.
spark.sql('CLEAR CACHE')

# Legacy switch: lets managed tables be created over a location that already
# contains files (the name of the flag says as much).
spark.conf.set(
    'spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation',
    'true',
)

In [0]:
import pyspark.sql.functions as f
import pyspark.sql.types as t
from pyspark.sql import Window

from functools import reduce

import databricks.koalas as ks
import pandas as pd
import numpy as np

import re
import io
import datetime

import matplotlib
import matplotlib.pyplot as plt
from matplotlib import dates as mdates
import seaborn as sns

# Timestamp of this run, kept as a datetime object (not formatted to a string).
_datetimenow = datetime.datetime.now()

# Record plotting-library versions and the run time in the cell output.
print("Matplotlib version: ", matplotlib.__version__)
print("Seaborn version: ", sns.__version__)
print(f"_datetimenow:  {_datetimenow}")

In [0]:
%run "../SHDS/common/functions"

# 0. Parameters

In [0]:
%run "./CCU056-01-parameters"

# 1. Data

In [0]:
# Cohort table for this project (table suffix: tmp_main_cohort_final2).
cohort = spark.table(
    f'{dsa}.{proj}_tmp_main_cohort_final2'
)

# Long-format HES APC table; `path_cur_hes_apc_long` is presumably set by the
# parameters notebook — confirm.
hes_apc_long = spark.table(path_cur_hes_apc_long)

# HES APC batch taken from the archive via the shared helper.
hes_apc = extract_batch_from_archive(parameters_df_datasets, "hes_apc")

# Procedure codes and operation dates for the cases.
hes_apc_op_otr_long_cohort = spark.table(
    f'{dsa}.{proj}_tmp_cases_procedure_codes_operation_dates'
)

In [0]:
# Align the person identifier with the name used by the cohort tables so the
# joins below can key on "PERSON_ID".
hes_apc = hes_apc.withColumnRenamed(
    "PERSON_ID_DEID",
    "PERSON_ID",
)

In [0]:
# Attach the HES episode key and admission method (ADMIMETH) to every cohort
# operation. Both joins are left joins, so cohort rows without a match are
# kept with null EPIKEY / ADMIMETH.
hes_full_cohort = (
    cohort
    # The procedure table keys the operation date as OPDATE.
    .select("PERSON_ID", f.col("OPERATION_DATE").alias("OPDATE"))
    .join(hes_apc_op_otr_long_cohort, how="left", on=["PERSON_ID", "OPDATE"])
    # Restore the cohort's column name and keep only the episode key.
    .select("PERSON_ID", f.col("OPDATE").alias("OPERATION_DATE"), "EPIKEY")
    .distinct()
    .join(hes_apc, how="left", on=["PERSON_ID", "EPIKEY"])
    .select("PERSON_ID", "OPERATION_DATE", "EPIKEY", "ADMIMETH")
)

In [0]:
# Persist the result under the project prefix via the shared helper.
save_table(
    hes_full_cohort,
    f'{proj}_hes_full_cohort',
)

In [0]:
# Re-read the saved table so later cells work from the stored copy.
hes_full_cohort = spark.table(
    f'{dsa}.{proj}_hes_full_cohort'
)

In [0]:
# Visual check of the reloaded cohort / episode / admission-method table.
display(hes_full_cohort)

In [0]:
%%script echo skipping
# NOTE: the `%%script echo skipping` magic on the first line hands this cell to
# `echo`, so none of the Python below is executed. The cell is kept as a record
# of how the `{proj}_outcomes_tmp_hes_apc_long` table (loaded further down) was
# presumably built — confirm before relying on it.
hes_apc = extract_batch_from_archive(parameters_df_datasets, 'hes_apc')

# Keep the identifiers, the episode dates and every column whose name matches
# DIAG_3_NN or DIAG_4_NN; rename the person id and sort by person and episode.
tmp_hes_apc = (
  hes_apc  
  .select(['PERSON_ID_DEID', 'EPIKEY', 'EPISTART', 'ADMIDATE', 'DISDATE'] 
          + [col for col in list(hes_apc.columns) if re.match(r'^DIAG_(3|4)_\d\d$', col)])
  .withColumnRenamed('PERSON_ID_DEID', 'PERSON_ID')
  .orderBy('PERSON_ID', 'EPIKEY')
)

# First reshape: the DIAG_4_NN / DIAG_3_NN columns are turned into rows, with
# the NN suffix going to POSITION (behaviour of the shared helper
# `reshape_wide_to_long_multi` is assumed from its arguments — TODO confirm).
# The `_tmp`, `_chk`, `_DIAG_4_len` and `_chk2` columns are consistency checks
# (3-character code equals the first 3 characters of the 4-character code;
# 4-character code is null or at most 4 long); they are not part of the final
# select at the bottom of this cell.
tmp_hes_apc_long = (
  reshape_wide_to_long_multi(tmp_hes_apc, i=['PERSON_ID', 'EPIKEY', 'EPISTART', 'ADMIDATE', 'DISDATE'], j='POSITION', stubnames=['DIAG_4_', 'DIAG_3_'])
  .withColumn('_tmp', f.substring(f.col('DIAG_4_'), 1, 3))
  .withColumn('_chk', udf_null_safe_equality('DIAG_3_', '_tmp').cast(t.IntegerType()))
  .withColumn('_DIAG_4_len', f.length(f.col('DIAG_4_')))
  .withColumn('_chk2', f.when((f.col('_DIAG_4_len').isNull()) | (f.col('_DIAG_4_len') <= 4), 1).otherwise(0))
)

# Second reshape on the 'DIAG_' stub, with the remaining suffix going to
# DIAG_DIGITS. Clean-up applied afterwards, in order:
#   - POSITION is renamed to DIAG_POSITION and one leading '0' is stripped;
#   - underscores are removed from DIAG_DIGITS;
#   - a trailing 'X' is removed from the code, then dots, commas, hyphens and
#     whitespace are removed;
#   - the code column is renamed to CODE and null / empty codes are dropped.
# (Comments are kept outside the chain because it uses backslash continuations.)
tmp_hes_apc_long = reshape_wide_to_long_multi(tmp_hes_apc_long, i=['PERSON_ID', 'EPIKEY', 'EPISTART', 'ADMIDATE', 'DISDATE', 'POSITION'], j='DIAG_DIGITS', stubnames=['DIAG_'])\
  .withColumnRenamed('POSITION', 'DIAG_POSITION')\
  .withColumn('DIAG_POSITION', f.regexp_replace('DIAG_POSITION', r'^[0]', ''))\
  .withColumn('DIAG_DIGITS', f.regexp_replace('DIAG_DIGITS', r'[_]', ''))\
  .withColumn('DIAG_', f.regexp_replace('DIAG_', r'X$', ''))\
  .withColumn('DIAG_', f.regexp_replace('DIAG_', r'[.,\-\s]', ''))\
  .withColumnRenamed('DIAG_', 'CODE')\
  .where((f.col('CODE').isNotNull()) & (f.col('CODE') != ''))\
  .orderBy(['PERSON_ID', 'EPIKEY', 'DIAG_DIGITS', 'DIAG_POSITION'])

# adding in cohort to make table smaller and quicker to save
# NOTE(review): this is a left join from the cohort, so cohort members with no
# HES rows are kept with null values in the HES columns.
tmp_hes_apc_long = (
  cohort.select("PERSON_ID").join(tmp_hes_apc_long,on="PERSON_ID",how="left")
  .select('PERSON_ID', f.col('EPISTART'), 'ADMIDATE','DISDATE','CODE', 'DIAG_POSITION', 'DIAG_DIGITS','EPIKEY')
)

In [0]:
%%script echo skipping
# Disabled by the magic above: would save `tmp_hes_apc_long` as
# `{proj}_outcomes_tmp_hes_apc_long` with save_previous=False.
save_table(df=tmp_hes_apc_long, out_name=f'{proj}_outcomes_tmp_hes_apc_long',save_previous=False)

In [0]:
# Load the previously saved long-format HES APC diagnoses for the cohort.
tmp_hes_apc_long = spark.table(
    f'{dsa}.{proj}_outcomes_tmp_hes_apc_long'
)

In [0]:
%%script echo skipping
# Disabled by the magic above: would save `cohort_discharge_dates` as
# `{proj}_cohort_discharge_dates` with save_previous=False.
# NOTE(review): `cohort_discharge_dates` is not built anywhere in the visible
# cells — presumably the cell that derived it was removed; confirm.
save_table(df=cohort_discharge_dates, out_name=f'{proj}_cohort_discharge_dates',save_previous=False)

In [0]:
# Load the stored discharge dates for the cohort.
cohort_discharge_dates = spark.table(
    f'{dsa}.{proj}_cohort_discharge_dates'
)

In [0]:
# Prepare the hospitalisation records used for outcomes, in one pass:
#   1. reduce to the needed columns, exposing EPISTART as DATE;
#   2. merge in each person's censor dates (inner join, so people without
#      censor dates are dropped);
#   3. keep admissions on or after CENSOR_DATE_START (post operation) and on
#      or before CENSOR_DATE_END (study end).
# NOTE(review): `individual_censor_dates_discharge` is not defined in the
# visible cells — presumably it comes from a %run notebook or a removed cell;
# confirm it exists on a fresh run.
hes_apc_long_prepared = (
    hes_apc_long
    .select('PERSON_ID', f.col('EPISTART').alias('DATE'), 'CODE', 'DIAG_POSITION', 'DIAG_DIGITS')
    .join(individual_censor_dates_discharge, how='inner', on='PERSON_ID')
    .where(
        (f.col('DATE') >= f.col('CENSOR_DATE_START'))
        & (f.col('DATE') <= f.col('CENSOR_DATE_END'))
    )
)

In [0]:


# Visual check of the filtered, censor-windowed hospitalisation records.
display(hes_apc_long_prepared)

In [0]:
# Wide view: one row per person / date / diagnosis position, with the codes
# for DIAG_DIGITS "3" and "4" placed side by side as DIAG_3 and DIAG_4.
# f.first picks a single CODE per group; if a group holds several codes the
# one chosen is not guaranteed.
hes_wide = (
    hes_apc_long_prepared
    .select("PERSON_ID", "DATE", "CODE", "DIAG_POSITION", "DIAG_DIGITS")
    .groupBy("PERSON_ID", "DATE", "DIAG_POSITION")
    .pivot("DIAG_DIGITS")
    .agg(f.first("CODE"))
    .withColumnRenamed("3", "DIAG_3")
    .withColumnRenamed("4", "DIAG_4")
)

In [0]:
# Label every first-position-diagnosis hospitalisation as "cvd_hosp" when its
# CODE appears in `codelist_outcomes_hf`, otherwise "non_cvd_hosp", then
# attach the cohort columns.
#
# The codelist is reduced to its distinct codes before the join: without this,
# a code listed more than once in the codelist would duplicate the matching
# hospitalisation rows in the left join.
#
# NOTE(review): the codelist variable is named "..._hf" but matches are
# labelled "cvd_hosp" — confirm the label is the intended one.
outcomes_hospitalisations_raw = (
    hes_apc_long_prepared
    .filter(f.col("DIAG_POSITION") == 1)  # first position only
    .join(
        codelist_outcomes_hf
        .select(f.col("code").alias("CODE"))
        .distinct()
        .withColumn("outcomes_hosp_description", f.lit("cvd_hosp")),
        on="CODE",
        how="left",
    )
    # Rows with no codelist match come out of the left join as null.
    .withColumn(
        "outcomes_hosp_description",
        f.coalesce(f.col("outcomes_hosp_description"), f.lit("non_cvd_hosp")),
    )
    .join(cohort, on="PERSON_ID", how="left")
)

In [0]:
# Persist the labelled hospitalisations under the project prefix.
save_table(
    df=outcomes_hospitalisations_raw,
    out_name=f'{proj}_outcomes_hospitalisations_raw',
    save_previous=False,
)

In [0]:
# NICOR ACS combined archive table. The database name is hard-coded here
# rather than built from `dsa` like the other table reads.
nacsa = spark.table(
    'dars_nic_391419_j3w9t_collab.nicor_acs_combined_dars_nic_391419_j3w9t_archive'
)

In [0]:
# Show the column names of the NICOR ACS table as the cell output.
nacsa.columns