In [1]:
import polars as pl
import yaml
import glob
import os
from pathlib import Path

In [2]:
# Inputs for this notebook: pipeline config and the per-subject table.
config_path = "/data/home/qc25022/cancer-extraction-pipeline/config.yaml"
SUBJECT_FILE = "/data/home/qc25022/cancer-extraction-pipeline/output/liver_study/subject_information.csv"

# strptime pattern used when parsing the raw `obsdate` strings (day/month/year).
DATE_FORMAT = "%d/%m/%Y"

In [3]:
# Load the pipeline configuration. The encoding is passed explicitly so that
# decoding the YAML file does not depend on the machine's locale default.
with open(config_path, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Top-level config sections used by this notebook.
PATHS = config["paths"]      # provides 'observation_data_dir' for the raw extracts
OUTPUTS = config["outputs"]


In [4]:
# Discover every tab-separated observation extract. glob returns matches in
# arbitrary order, so sort them to keep the concatenated row order
# reproducible from run to run.
observation_files = sorted(
    glob.glob(os.path.join(PATHS['observation_data_dir'], "*.txt"))
)
if not observation_files:
    # Fail early with the directory in the message instead of letting
    # pl.concat reject an empty list further down.
    raise FileNotFoundError(
        f"No observation *.txt files found in {PATHS['observation_data_dir']!r}"
    )

# Columns to keep from each extract, with their dtypes pinned up front.
observation_dtypes = {
    "e_patid": pl.Int64,
    "obsdate": pl.String,      # Read as string, parse to date later
    "medcodeid": pl.String,
    "value": pl.Float64,
}

# One lazy scan per file, trimmed to the columns above, stacked vertically.
observations_df = pl.concat(
    [
        pl.scan_csv(
            path,
            separator="\t",
            has_header=True,
            schema_overrides=observation_dtypes,
        ).select(list(observation_dtypes))
        for path in observation_files
    ],
    how="vertical",
)

In [5]:
# Lazily scan the subject table and rename its identifier column to
# "e_patid", the key name used by the observation frames.
subjects_df = (
    pl.scan_csv(SUBJECT_FILE)
    .rename({"subject_id": "e_patid"})
)

In [6]:
# Preview the first 10 subjects. LazyFrame.fetch is deprecated in polars;
# head(n).collect() is the documented replacement.
subjects_df.head(10).collect()

  subjects_df.fetch(10)


e_patid,is_case,cancerdate,site,e_pracid,region,gender,yob,ethnicity,split
i64,i64,str,str,i64,i64,str,i64,str,str
430727151309,0,,"""Control""",51309,1,"""male""",1954,"""White""","""train"""
226559150624,0,,"""Control""",50624,8,"""male""",1947,"""Unknown""","""train"""
1399713050807,0,,"""Control""",50807,9,"""male""",1933,"""White""","""train"""
666794550402,0,,"""Control""",50402,2,"""male""",1963,"""White""","""train"""
518528650618,1,"""2016-04-01T00:00:00.000000000""","""liver""",50618,5,"""male""",1936,"""White""","""train"""
993863750159,0,,"""Control""",50159,2,"""female""",1940,"""White""","""train"""
52637350490,0,,"""Control""",50490,5,"""female""",1937,"""White""","""train"""
316654450396,0,,"""Control""",50396,8,"""female""",1937,"""White""","""train"""
153103150767,0,,"""Control""",50767,5,"""female""",1930,"""White""","""train"""
1189211850680,0,,"""Control""",50680,7,"""male""",1945,"""White""","""train"""


In [7]:
# Reshape raw observations into (e_patid, time, code, numeric_value) rows.
obs_standardized_df = observations_df.select(
    pl.col("e_patid"),
    # strict=False: dates that do not match DATE_FORMAT become null instead of raising
    pl.col("obsdate").str.to_date(DATE_FORMAT, strict=False).alias("time"),
    # Prefix each medcode so the code column records its vocabulary
    (pl.lit("medcodeid//") + pl.col("medcodeid")).alias("code"),
    pl.col("value").alias("numeric_value"),
)

In [24]:
# Per-subject cut-off table: patient id plus cancer diagnosis date.
# cancerdate arrives as a timestamp string (e.g. "2016-04-01T00:00:00.000000000"),
# so it is parsed as a datetime and then narrowed to a plain date.
subject_filter_df = subjects_df.select(
    [
        pl.col("e_patid"),
        pl.col("cancerdate").str.to_datetime().cast(pl.Date),
    ]
)

In [25]:
# Preview the first 5 rows. LazyFrame.fetch is deprecated in polars;
# head(n).collect() is the documented replacement.
subject_filter_df.head(5).collect()

  subject_filter_df.fetch(5)


e_patid,cancerdate
i64,date
430727151309,
226559150624,
1399713050807,
666794550402,
518528650618,2016-04-01


In [38]:
# Keep only events for known subjects, and for subjects with a cancer date
# keep only events on or before that date. Subjects without a cancer date
# (null) keep all of their events.
filtered_medical_events_df = (
    obs_standardized_df
    .join(subject_filter_df, on="e_patid", how="inner")
    .filter(
        pl.col("cancerdate").is_null()
        | (pl.col("time") <= pl.col("cancerdate"))
    )
    # cancerdate was only needed for the cut-off comparison
    .drop("cancerdate")
)

In [39]:
# Inspect the output dtypes. On a LazyFrame this requires resolving the schema
# of the whole query plan, which can be slow; collect_schema() makes that step
# explicit and avoids the warning polars raises for `LazyFrame.dtypes`.
filtered_medical_events_df.collect_schema().dtypes()

  filtered_medical_events_df.dtypes


KeyboardInterrupt: 

# View sorted intermediate data

In [3]:
# Glob pattern for the intermediate parquet shards.
path = "/data/scratch/qc25022/liver/intermediate_sorted/*.parquet"

# Lazy scan over every shard matching the pattern; nothing is read yet.
lazydf = pl.scan_parquet(source=path)

In [4]:
# Preview the first 50 events. LazyFrame.fetch is deprecated in polars;
# head(n).collect() is the documented replacement.
lazydf.head(50).collect()

  lazydf.fetch(50)


subject_id,time,code,numeric_value
i64,date,str,f64
17851000,1945-01-01,"""MEDS_BIRTH""",
17851000,1964-01-01,"""medcodeid//263435019""",
17851000,1965-01-01,"""medcodeid//263435019""",
17851000,1972-01-01,"""medcodeid//263435019""",
17851000,1987-11-27,"""medcodeid//14070451000006117""",
…,…,…,…
17851000,1998-03-12,"""medcodeid//254063019""",
17851000,1998-03-12,"""medcodeid//249721015""",
17851000,1998-03-12,"""medcodeid//787121000006116""",65.3
17851000,1998-03-12,"""medcodeid//253677014""",65.0
