In [1]:
import pandas as pd
import os
import sys

sys.path.append(os.path.abspath(os.path.join('..', '..')))

import pickle as pkl
import src.visualisations.utils as utils
import importlib
import datetime

# reload the module so it picks up any changes
importlib.reload(utils)

from src.visualisations.utils import get_subject_id_filepath_lookup_table, get_subject_datapoint, time_delta_to_str

# Show full column values (no truncation)
pd.set_option("display.max_colwidth", None)

EHR_DATASET_DIR = "/data/scratch/qc25022/liver/tokenised_data/cprd_test/tuning"
CLINICAL_NOTE_DIR = None
EHR_VOCAB_FILEPATH = "/data/scratch/qc25022/liver/tokenised_data/cprd_test/vocab.csv"

In [2]:

subject_id_filepath_lookup_table = get_subject_id_filepath_lookup_table(EHR_DATASET_DIR, CLINICAL_NOTE_DIR)
subject_id_filepath_lookup_table.head()

Calculating patient to ehr shard mappings: 100%|██████████| 6/6 [00:00<00:00, 10.64it/s]


Unnamed: 0,subject_id,ehr_shard_filepath,clinical_note_shard_filepath
0,314437950164,/data/scratch/qc25022/liver/tokenised_data/cprd_test/tuning/shard_1.pkl,
1,314490951317,/data/scratch/qc25022/liver/tokenised_data/cprd_test/tuning/shard_1.pkl,
2,315413451254,/data/scratch/qc25022/liver/tokenised_data/cprd_test/tuning/shard_1.pkl,
3,316020151307,/data/scratch/qc25022/liver/tokenised_data/cprd_test/tuning/shard_1.pkl,
4,316763950330,/data/scratch/qc25022/liver/tokenised_data/cprd_test/tuning/shard_1.pkl,


In [3]:
vocab = pd.read_csv(EHR_VOCAB_FILEPATH, index_col=0)
vocab.head()

Unnamed: 0_level_0,str,count
token,Unnamed: 1_level_1,Unnamed: 2_level_1
0,<unknown>,367
1,<start>,0
2,<end>,0
3,Q6,1967162
4,Q9,1853235


In [4]:
# Pick a random subject id and load its data
random_row = subject_id_filepath_lookup_table.sample(1).iloc[0]
subject_id = random_row['subject_id']
ehr_shard_filepath = random_row['ehr_shard_filepath']
clinical_note_shard_filepath = random_row['clinical_note_shard_filepath']

# Load the data
datapoint = get_subject_datapoint(subject_id, ehr_shard_filepath, clinical_note_shard_filepath)
print("Subject ID: ", subject_id)
print("EHR Token Sequence Length: ", len(datapoint['ehr_data']['token_ids']))
print("Clinical Note Count: ", len(datapoint['clinical_note_data']) if datapoint['clinical_note_data'] is not None else 0)
print("Datapoint: ", datapoint)

Subject ID:  68323150566
EHR Token Sequence Length:  749
Clinical Note Count:  0
Datapoint:  {'subject_id': 68323150566, 'ehr_data': {'token_ids': [1, 192, 428, 165, 147, 71, 60, 6, 46, 10, 82, 6, 15, 9, 32, 9, 14, 11, 167, 731, 266, 958, 13, 71, 118, 87, 15, 6, 14, 8, 13, 87, 14, 7, 13, 15, 5, 22, 214, 25, 118, 18, 50, 41, 3, 39, 7, 49, 42, 7, 44, 7, 40, 9, 55, 9, 30, 6, 53, 6, 27, 3, 37, 12, 128, 9, 36, 3, 24, 3, 38, 8, 47, 4, 61, 5, 33, 6, 35, 6, 34, 12, 31, 7, 52, 5, 62, 43, 5, 29, 8, 45, 10, 26, 10, 74, 75, 105, 60, 6, 89, 32, 10, 46, 10, 205, 14, 11, 81, 13, 15, 11, 71, 179, 22, 86, 50, 33, 11, 41, 7, 55, 5, 44, 3, 49, 68, 11, 34, 7, 26, 7, 24, 12, 42, 11, 28, 8, 53, 11, 47, 4, 29, 3, 54, 11, 58, 5, 35, 7, 36, 4, 45, 8, 40, 6, 39, 11, 52, 8, 27, 300, 30, 10, 61, 4, 31, 11, 62, 43, 3, 128, 5, 37, 7, 38, 8, 18, 15, 10, 46, 10, 75, 32, 10, 82, 6, 14, 11, 13, 59, 219, 21, 1405, 18, 132, 18, 310, 20, 101, 16, 179, 105, 19, 88, 71, 15, 8, 13, 14, 6, 18, 45, 5, 30, 12, 43, 3, 62, 40, 3,

In [5]:
def print_subject_datapoint(subject_id: int):
    subject_row = subject_id_filepath_lookup_table[subject_id_filepath_lookup_table['subject_id'] == subject_id]
    ehr_shard_filepath = subject_row['ehr_shard_filepath'].values[0]
    clinical_note_shard_filepath = subject_row['clinical_note_shard_filepath'].values[0]
    datapoint = get_subject_datapoint(subject_id, ehr_shard_filepath, clinical_note_shard_filepath)

    # Find MEDS_BIRTH timestamp as the reference point
    birth_timestamp = None
    for token_id, timestamp in zip(datapoint['ehr_data']['token_ids'], datapoint['ehr_data']['timestamps']):
        if vocab.iloc[token_id]['str'] == 'MEDS_BIRTH':
            birth_timestamp = timestamp
            break
    
    # Create combined events list
    events = []
    
    # Add EHR events
    for token_id, timestamp in zip(datapoint['ehr_data']['token_ids'], datapoint['ehr_data']['timestamps']):
        events.append(('ehr', timestamp, vocab.iloc[token_id]['str']))
    
    # Add clinical note events
    if datapoint['clinical_note_data'] is not None:
        for note in datapoint['clinical_note_data']:
            note_timestamp = datetime.datetime.strptime(note['charttime'], '%Y-%m-%d %H:%M:%S').timestamp()
            events.append(('note', note_timestamp, note['text'], note['charttime']))
    
    # Sort chronologically
    events.sort(key=lambda x: (x[1] != 0, x[1]))
    
    # Print timeline with time differences between events
    print(f"=== Patient Timeline - Subject ID: {subject_id} ===")
    
    prev_timestamp = None
    for event in events:
        event_type, timestamp = event[0], event[1]
        
        # Determine time display
        if timestamp == 0:
            time_str = "None"  # Metadata events (start, gender, etc.)
        elif event_type == 'ehr' and event[2] == 'MEDS_BIRTH':
            time_str = "T0"    # Birth event - the reference point
        elif prev_timestamp is None or prev_timestamp == 0:
            # First real event after metadata/birth
            time_str = time_delta_to_str(timestamp - birth_timestamp) if birth_timestamp else "T0"
        elif timestamp == prev_timestamp:
            time_str = "-"     # Same timestamp as previous event
        else:
            time_str = time_delta_to_str(timestamp - prev_timestamp)
        
        if event_type == 'ehr':
            print(f"[{time_str:>15}] {event[2]}")
        else:  # note
            print(f"\n[{time_str:>15}] 📝 CLINICAL NOTE")
            print(f"                  {event[2]}")
            print("                  " + "-"*50)
        
        prev_timestamp = timestamp


In [8]:
print_subject_datapoint(699730651329)
# 809853250681 this is buggy, getting rid of many medical events for some reason
# 699730651329
# 587901451070

=== Patient Timeline - Subject ID: 699730651329 ===
[           None] <start>
[           None] GENDER//MALE
[           None] REGION//5
[           None] ETHNICITY//WHITE
[             T0] MEDS_BIRTH
[    +29.7 years] <time_interval_60mt->
[              -] MEDICAL//652..00
[       +28 days] <time_interval_20d-30d>
[              -] MEDICAL//652..00
[       +17 days] <time_interval_12d-20d>
[              -] MEDICAL//65O..00
[    +11.2 years] <time_interval_60mt->
[              -] MEDICAL//G84..00
[        +9 days] <time_interval_7d-12d>
[              -] MEDICAL//138..00
[              -] MEASUREMENT//66C..00
[              -] Q6
[              -] MEDICAL//weight
[              -] Q4
[              -] MEDICAL//ZV7..00
[              -] MEDICAL//current or ex-smoker
[              -] MEDICAL//Drinker - unspecified
[              -] 12.0
[              -] MEDICAL//122..00
[              -] MEDICAL//466..00
[              -] MEDICAL//height
[              -] Q6
[              -] MEDICA

In [20]:
print_subject_datapoint(10534984)

=== Patient Timeline - Subject ID: 10534984 ===
[           None] <start>
[           None] GENDER//M
[             T0] MEDS_BIRTH
[    +68.6 years] <time_interval_6mt->
[              -] LAB//51146//%
[              -] Q5
[              -] LAB//51200//%
[              -] Q3
[              -] LAB//51221//%
[              -] Q9
[              -] LAB//51222//g/dL
[              -] Q9
[              -] LAB//51244//%
[              -] Q2
[              -] LAB//51248//pg
[              -] Q7
[              -] LAB//51249//%
[              -] Q9
[              -] LAB//51250//fL
[              -] Q3
[              -] LAB//51254//%
[              -] Q2
[              -] LAB//51256//%
[              -] Q8
[              -] LAB//51265//K/uL
[              -] Q6
[              -] LAB//51277//%
[              -] Q1
[              -] LAB//51279//m/uL
[              -] Q9
[              -] LAB//51301//K/uL
[              -] Q7
[              -] LAB//50861//IU/L
[              -] Q7
[              -] 

In [27]:
print_subject_datapoint(12888412)


=== Patient Timeline - Subject ID: 12888412 ===
[           None] <start>
[           None] GENDER//F
[             T0] MEDS_BIRTH
[    +81.0 years] <time_interval_6mt->
[              -] LAB//50862//g/dL
[              -] Q6
[              -] LAB//50868//mEq/L
[              -] Q8
[              -] LAB//50882//mEq/L
[              -] Q8
[              -] LAB//50893//mg/dL
[              -] Q6
[              -] LAB//50902//mEq/L
[              -] Q1
[              -] LAB//50912//mg/dL
[              -] Q7
[              -] LAB//50920//UNK
[              -] LAB//50930//g/dL
[              -] Q5
[              -] LAB//50960//mg/dL
[              -] Q5
[              -] LAB//50970//mg/dL
[              -] Q6
[              -] LAB//50971//mEq/L
[              -] Q8
[              -] LAB//50975//UNK
[              -] LAB//50976//g/dL
[              -] Q4
[              -] LAB//50983//mEq/L
[              -] Q3
[              -] LAB//51006//mg/dL
[              -] Q7
[              -] LAB//5