In [0]:
%run ./project_config

In [0]:
%run ./parameters

In [0]:
from pyspark.sql import functions as f
from functions.table_management import load_table, save_table
from functions.functions import tabstat, count_var

# 1 Load tables

In [0]:
# Curated lookup describing which pseudo ID label is used/assigned for each person across the available datasets
token_pseudo_id_lookup = load_table('token_pseudo_id_lookup')

# Curated demographic information, harmonised from multiple sources.
# Some attributes (e.g. address, i.e. LSOA) can vary over time, and different projects may want them curated differently.
# Please familiarise yourself with how these assets are curated in the HDS Documentation pages.
demographics = load_table('demographics')

# Preview up to 50 rows of each loaded table, lookup first and demographics second
for loaded_table in (token_pseudo_id_lookup, demographics):
    display(loaded_table.limit(50))

# 2 Source person ID types

In [0]:
# Keep only the person identifier (pseudo_id, exposed as person_id) and the valid_nhs_number flag.
# NOTE(review): no de-duplication happens here; confirm the lookup holds one row per pseudo_id.
person_id_column = f.col('pseudo_id').alias('person_id')
cohort_person_id = token_pseudo_id_lookup.select(person_id_column, 'valid_nhs_number')


# 3 Study dates

In [0]:
# Attach the study dates defined in the parameters notebook as constant date columns on every cohort row.
# These columns are needed later in the pipeline when joining time-sensitive eligibility criteria or measurements.
study_date_values = {
    'cohort_entry_start_date': cohort_entry_start_date,
    'cohort_entry_end_date': cohort_entry_end_date,
    'follow_up_end_date': follow_up_end_date,
    'production_date': production_date,
}

# Start from the person-level cohort and add one date column per entry, in the order listed above
cohort_study_dates = cohort_person_id
for date_column, date_value in study_date_values.items():
    cohort_study_dates = cohort_study_dates.withColumn(date_column, f.to_date(f.lit(date_value)))

# 4 Append invariant demographics

In [0]:
# Demographic attributes to carry over; person_id is the join key
demographic_columns = ['person_id', 'sex', 'date_of_birth', 'ethnicity_5_group', 'death_flag', 'date_of_death']
demographics_selected = demographics.select(*demographic_columns)

# Left join so every cohort row is retained; rows with no matching demographics record get nulls in the added columns
cohort_demographics = cohort_study_dates.join(demographics_selected, on='person_id', how='left')



# 5 Save table
Save the table back to the database. It is usually best to do this after the transformations but before any extensive inspection: Spark uses lazy evaluation, so each inspection of an unsaved DataFrame re-runs the transformations that produce it.

In [0]:

# Persist the joined cohort under the name 'cohort_demographics' using the project's save_table helper.
# NOTE(review): presumably this is the point where the lazy transformations above are executed — confirm against save_table.
save_table(cohort_demographics, 'cohort_demographics')

# 6 Display

We read the saved table back in to query and inspect it further. This means the intermediate transformation cells of the notebook do not need to be re-run every time we return to it, if we are only interested in the previously saved data.

In [0]:


# Reload the table saved under 'cohort_demographics' (rebinding the cohort_demographics name to the loaded copy)
# and preview up to 50 rows of it.
cohort_demographics = load_table('cohort_demographics')
display(cohort_demographics.limit(50))