# Curated Data - Skinny
 
**Description** This notebook creates the skinny patient table, which includes key patient characteristics.
 
**Authors** Tom Bolton, Fionna Chalmers, Anna Stevenson (Health Data Science Team, BHF Data Science Centre)

**Reviewers** ⚠ UNREVIEWED

**Acknowledgements** Based on CCU002_07 and subsequently CCU003_05-D04-skinny

**Notes**

**Data Output**
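- **`ccu056_tmp_kpc_harmonised`** : Skinny unassembled
- **`ccu056_tmp_all_cases_unassembled`** : Audit records combined with the skinny unassembled table
- **`ccu056_tmp_skinny`** : Skinny assembled (selected)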

# 0. Setup

In [0]:
# Clear everything cached in this Spark session before the notebook starts work.
spark.sql('CLEAR CACHE')
# Legacy Spark setting: allow a managed table to be created at a non-empty location
# (presumably required by save_table when overwriting outputs — TODO confirm).
spark.conf.set('spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation', 'true')

In [0]:
import pyspark.sql.functions as f
import pyspark.sql.types as t
from pyspark.sql import Window

from functools import reduce

import databricks.koalas as ks
import pandas as pd
import numpy as np

import re
import io
import datetime

import matplotlib
import matplotlib.pyplot as plt
from matplotlib import dates as mdates
import seaborn as sns

# Record the plotting-library versions in the cell output for provenance.
print("Matplotlib version: ", matplotlib.__version__)
print("Seaborn version: ", sns.__version__)
# Run timestamp, kept as a datetime object (not formatted to a yyyymmdd string).
_datetimenow = datetime.datetime.now() # .strftime("%Y%m%d")
print(f"_datetimenow:  {_datetimenow}")

In [0]:
%run "/Repos/shds/common/functions"

In [0]:
%run "/Repos/shds/common/skinny_20221113"

# 1. Parameters

In [0]:
%run "./CCU056-01-parameters"

# 2. Data

In [0]:
# Pull the GDPPR and HES (APC, A&E, OP) datasets via extract_batch_from_archive,
# driven by parameters_df_datasets. Calls are made in the order listed.
# NOTE(review): extract_batch_from_archive is defined outside this notebook
# (presumably in /Repos/shds/common/functions) — confirm its batch-selection rules there.
gdppr, hes_apc, hes_ae, hes_op = (
    extract_batch_from_archive(parameters_df_datasets, dataset_key)
    for dataset_key in ('gdppr', 'hes_apc', 'hes_ae', 'hes_op')
)

In [0]:
# Pull the two audit datasets ('nacsa' and 'tavi') using the same archive helper
# and parameter table as the GDPPR/HES extracts.
acs, tavi = (
    extract_batch_from_archive(parameters_df_datasets, audit_key)
    for audit_key in ('nacsa', 'tavi')
)

# 3. Create Unassembled

## 3.1 GDPPR & HES

In [0]:
# Harmonise key patient characteristics across GDPPR and the three HES datasets.
kpc_harmonised = key_patient_characteristics_harmonise(
    gdppr=gdppr,
    hes_apc=hes_apc,
    hes_ae=hes_ae,
    hes_op=hes_op,
)

# Checkpoint the harmonised table (temporary save, takes roughly 15 minutes).
save_table(
    df=kpc_harmonised,
    out_name=f'{proj}_tmp_kpc_harmonised',
    save_previous=True,
    data_base=dsa,
)

In [0]:
# Re-read the checkpointed table so later cells work from the saved copy
# rather than the in-memory DataFrame built in the previous cell.
kpc_harmonised = spark.table(f'{dsa}.{proj}_tmp_kpc_harmonised')

## 3.2 Audits

In [0]:
# Reshape each audit extract to the common harmonised column set:
#   PERSON_ID, archived_on, RECORD_SOURCE, RECORD_PRIMARY, RECORD_ID,
#   RECORD_DATE, DOB, SEX, ETHNIC, CODE
# Columns an audit does not supply are filled with null literals.

# nacsa: no ethnicity column is selected, so ETHNIC is set to null.
_acs = acs.select(
    'archived_on',
    f.col('PERSON_ID_DEID').alias('PERSON_ID'),
    f.col('DATE_AND_TIME_OF_OPERATION').alias('RECORD_DATE'),
    f.col('MONTH_YEAR_OF_BIRTH').alias('DOB'),
    'SEX',
).distinct()
_acs = (
    _acs
    # DOB is parsed from the month/year-of-birth field using the "yyyy-MM" pattern.
    .withColumn('DOB', f.to_date(f.col('DOB').cast(t.StringType()), "yyyy-MM"))
    .withColumn('RECORD_ID', f.lit(None))
    .withColumn('CODE', f.lit(None))
    .withColumn('ETHNIC', f.lit(None))
    .withColumn('RECORD_SOURCE', f.lit('nacsa'))
)

# tavi: sex and ethnicity come from the numbered source fields.
_tavi = tavi.select(
    'archived_on',
    f.col('PERSON_ID_DEID').alias('PERSON_ID'),
    f.col('7_01_DATE_AND_TIME_OF_OPERATION').alias('RECORD_DATE'),
    f.col('MONTH_YEAR_OF_BIRTH').alias('DOB'),
    f.col('1_07_SEX').alias('SEX'),
    f.col('1_08_ETHNIC_ORIGIN').alias('ETHNIC'),
).distinct()
_tavi = (
    _tavi
    .withColumn('DOB', f.to_date(f.col('DOB').cast(t.StringType()), "yyyy-MM"))
    .withColumn('RECORD_ID', f.lit(None))
    .withColumn('CODE', f.lit(None))
    .withColumn('RECORD_SOURCE', f.lit('tavi'))
)

# Stack the two audits by column name, mark every row RECORD_PRIMARY = 0, and
# fix the column order.
# NOTE(review): date_format turns RECORD_DATE into a "yyyy-MM-dd" string — confirm this
# matches the RECORD_DATE type in kpc_harmonised before the two are unioned.
audits_harmonised = (
    _acs.unionByName(_tavi)
    .withColumn('RECORD_PRIMARY', f.lit(0))
    .withColumn("RECORD_DATE", f.date_format(f.col("RECORD_DATE"), "yyyy-MM-dd"))
    .select(
        'PERSON_ID',
        'archived_on',
        'RECORD_SOURCE',
        'RECORD_PRIMARY',
        'RECORD_ID',
        'RECORD_DATE',
        'DOB',
        'SEX',
        'ETHNIC',
        'CODE',
    )
)

In [0]:
# Visual check of the harmonised audit records.
# NOTE(review): this renders row-level records — consider clearing the cell output
# before the notebook is shared or committed.
display(audits_harmonised)

## 3.3 Combine

In [0]:
# Append the GDPPR/HES harmonised records to the audit records, matching columns by name.
all_unassembled = audits_harmonised.unionByName(kpc_harmonised)

# Persist the combined unassembled table.
save_table(
    df=all_unassembled,
    out_name=f'{proj}_tmp_all_cases_unassembled',
    save_previous=True,
    data_base=dsa,
)