In [45]:
from folder import StandardFolder
from polars_utils import *

import polars as pl
from pathlib import Path

prevprev_hn = pl.read_csv('D:/Prut/Warehouses/output/Dec23/n/Stroke/stroke_n_updatedSep2023_28112023.csv').to_series().to_list()
print(f'{len(prevprev_hn) = }')

PREV_PATH = 'D:/Prut/Warehouses/output/Dec23/n/Stroke/stroke_n_updated_05032024.csv'

prev_hn = pl.read_csv(PREV_PATH).to_series().to_list()
print(f'{len(prev_hn) = }')

len(prevprev_hn) = 27735
len(prev_hn) = 28810


In [46]:
class StrokeIdentify(StandardFolder):
    def __init__(self, folder: str, streaming: bool = True) -> None:
        super().__init__(folder)
        self.streaming = streaming
        self.export_folder = Path('../output/Jun24/wh/complete')
        self.select_dx = [f'I6{i}' for i in range(10)] + ['G45']
        self.select_dx_re = '^' + '|^'.join(self.select_dx)
        # self.prev_hn = pl.read_csv('D:/Prut/Warehouses/output/Dec23/n/Stroke/stroke_n_updatedSep2023_28112023.csv').to_series().to_list()
        self.ran_all = False

    def get_dx(self, select: list = None):
        folder_path = self.dx
        to_concat = []
        for path in folder_path.iterdir():
            file = (
                scan_file(path)
                .select(pl.col(['ENC_HN', 'D001KEY', 'D035KEY']))
                .pipe(parse_dates, 'D001KEY')
            )
            if select is not None:
                file = file.filter(pl.col('D035KEY').str.contains(self.select_dx_re))
            file = file.group_by(pl.col(['ENC_HN', 'D001KEY'])).agg(pl.col('D035KEY')).with_columns(pl.col('D035KEY').list.unique().list.sort().list.join(', '))
            to_concat.append(file.collect(streaming=self.streaming))
        self.dx_df = pl.concat(to_concat).unique()


    def get_demo(self):
        folder_path = self.demo
        # cols = ['ENC_HN', 'D020AT3', 'H2L1KEY', 'H6L1KEY', 'H6L1DES']
        cols = ['ENC_HN', 'D020AT3']
        # new_col_names = ['ENC_HN', 'DOB', 'Sex', 'Province_ID', 'Province_Thai']
        new_col_names = ['ENC_HN', 'DOB']
        to_concat = []
        for path in folder_path.iterdir():
            file = scan_file(path)
            if set(cols).issubset(set(file.columns)):
                file = file.select(cols).collect(streaming=self.streaming).pipe(parse_dates, 'D020AT3') # New bug: only works in dataframes, so must collect first
                to_concat.append(file)
        self.demo_df = pl.concat(to_concat).unique()
        self.demo_df = self.demo_df.rename(dict(zip(cols, new_col_names)))

    def run_all(self):
        self.get_dx(select=self.select_dx)
        print('dx')
        self.get_demo()
        print('demo')
        
        self.ran_all = True

    def merge(self):
        if not self.ran_all:
            raise Exception('Please run all first.')
        
        self.merged_df = (
            self.dx_df
            .join(self.demo_df, on=['ENC_HN'], how='left')
            .unique()
            
        )


        # Remove previous
            # .pipe(self.remove_previous_hn)
            # .pipe(print_n)
            # Clip dates
            # .pipe(clip_dates, date_col='D001KEY', start_month=7, start_year=2023, end_month=12, end_year=2023)
            # .pipe(print_n)



In [47]:
# Cases in our other cohorts
FOLDER_OC = "H:/Shared drives/Datalake/Data/20240630_fu"
# Cases not in any cohorts, i.e. new cases
FOLDER_NC = "H:/Shared drives/Datalake/Loosely_criteria_data/newcase_202406(Jan-June2024)/Data for new case iden 202406"

s1 = StrokeIdentify(folder=FOLDER_OC)
s1.run_all()
s1.merge()

er_emr not included.
summary_discharge not included.
dimension table not included.
dx
demo


In [48]:
s1.merged_df

ENC_HN,D001KEY,D035KEY,DOB
str,date,str,date
"""74CBCEF52D1992…",2014-02-06,"""I693""",1939-07-05
"""5F49688AD67C70…",2014-09-29,"""I693""",1945-01-01
"""717DC3A956DD17…",2014-03-31,"""I694""",1936-03-03
"""77CEFE38B88F9B…",2014-08-23,"""I678""",1945-08-23
"""6FF6F4B3B13210…",2014-09-15,"""I693""",1949-06-26
"""3AFED8E1C41060…",2014-06-14,"""I64""",1940-01-15
"""6BDC5C378327A4…",2014-08-08,"""I694""",1929-01-01
"""59C7F8592B38CF…",2014-12-22,"""I693""",1947-04-15
"""CA44EB0C31B69A…",2014-05-15,"""I694""",1948-01-01
"""FE277D635C2305…",2014-02-05,"""I693""",1941-03-03


In [49]:
# Folder is different in structure again!!! must do separaely.
s2 = pl.read_parquet(r"H:\Shared drives\Datalake\Loosely_criteria_data\newcase_202406(Jan-June2024)\Data for new case iden 202406\Diagnosis\DX_202401_202406_encoded.parquet.gzip")
dob = s2.select(pl.col(['ENC_HN', 'D020AT3'])).pipe(parse_dates, 'D020AT3')
s2 = (
    s2
    .select(pl.col(['ENC_HN', 'D001KEY', 'D035KEY', 'D020AT3']))
    .pipe(parse_dates, 'D001KEY')
    .pipe(parse_dates, 'D020AT3')
)

select = s1.select_dx
select_dx_re = '^' + '|^'.join(select)
s2 = s2.filter(pl.col('D035KEY').str.contains(select_dx_re))
s2 = s2.group_by(pl.col(['ENC_HN', 'D001KEY'])).agg(pl.col('D035KEY')).with_columns(pl.col('D035KEY').list.unique().list.sort().list.join(', '))
s2 = s2.join(dob, on=['ENC_HN'], how='left').unique()
s2 = s2.rename({'D020AT3': 'DOB'})
print('ok')
s2

ok


ENC_HN,D001KEY,D035KEY,DOB
str,date,str,date
"""9C721641945228…",2024-03-14,"""I652""",1955-05-21
"""0E9D652F91BF79…",2024-03-18,"""I64""",1950-01-01
"""76D1BA7E6612F6…",2024-03-21,"""I639, I679""",1962-07-03
"""44E92E20FEFFE7…",2024-01-22,"""G450""",1949-12-03
"""9B48572F991B2B…",2024-01-18,"""I693""",1944-12-08
"""04B0F3CCBF7F8B…",2024-01-27,"""I694""",1951-01-06
"""714668A5EE9F9A…",2024-01-10,"""I694""",1953-01-01
"""F32DC80AC33863…",2024-02-15,"""I694""",1943-01-02
"""0103044B31145C…",2024-02-22,"""I608""",2009-12-02
"""00D41C89613516…",2024-04-29,"""I694""",1929-01-01


In [50]:
# merge s1 s2
merged_df = pl.concat([s1.merged_df, s2]).unique()
merged_df

ENC_HN,D001KEY,D035KEY,DOB
str,date,str,date
"""4C2CEC5CC61AF2…",2020-06-18,"""G459""",1957-12-16
"""F04BB1DD15D6A6…",2023-02-16,"""I693""",1942-04-30
"""21B2BF9CCAD23D…",2022-04-04,"""I610""",1963-06-23
"""DD6F676AFA60DE…",2018-07-12,"""I694""",1944-04-10
"""36AE0BA4A35ACC…",2018-02-09,"""I64""",1976-01-02
"""8AE2BE66E16A1D…",2016-08-31,"""G450""",1951-10-09
"""61614D50F2AD42…",2017-10-09,"""I694""",1949-07-05
"""499733E61033C0…",2013-04-30,"""I694""",1943-01-01
"""D91498A2E8BAE1…",2020-07-15,"""I620""",2004-10-31
"""C11183776C8F40…",2024-01-16,"""I694""",1953-09-12


Due to rules set by others on how to draw the flowchart, the following functions cannot be incorporated above.

In [51]:
def remove_previous_hn(lf: pl.LazyFrame, prev_hn=prev_hn) -> pl.LazyFrame:
    return lf.filter(~pl.col('ENC_HN').is_in(prev_hn))

def print_n(df: pl.DataFrame) -> pl.DataFrame:
        print(df['ENC_HN'].n_unique())
        return df

def remove_previous_stroke(df: pl.DataFrame) -> pl.DataFrame:
    # Remove patients who are diagnosed with stroke before entering the cohort
    return df.sort('D001KEY').group_by('ENC_HN', maintain_order=True).first().filter(~pl.col('D035KEY').map_elements(lambda x: 'I69' in x))


In [52]:
# Flow box 1
merged_df.pipe(clip_dates, date_col='D001KEY', start_month=1, start_year=2024, end_month=6, end_day=30, end_year=2024).pipe(remove_previous_hn).pipe(print_n)
# Flow box 2
s1 = merged_df.pipe(remove_previous_stroke).pipe(clip_dates, date_col='D001KEY', start_month=1, start_year=2024, end_month=6, end_day=30, end_year=2024)
s1.pipe(remove_previous_hn).pipe(print_n)
# Flow box 3
s2 = s1.pipe(remove_previous_hn).filter((pl.col('D001KEY') - pl.col('DOB')) >= pl.duration(days=365*18))
s2.pipe(print_n)
# Save output
prev = pl.read_csv(PREV_PATH).pipe(parse_dates, 'Date')
new = s2[['ENC_HN', 'D001KEY', 'D035KEY']].rename({'D001KEY': 'Date', 'D035KEY': 'ICD10'})
output = pl.concat([new, prev])
output.pipe(print_n)

output.write_csv('D:/Prut/Warehouses/output/Jun24/n/stroke/stroke_n_updated_08092024.csv')

2547
1006
983
29793


Expr.map_elements is significantly slower than the native expressions API.
Only use if you absolutely CANNOT implement your logic otherwise.
Replace this expression...
  - pl.col("D035KEY").map_elements(lambda x: ...)
with this one instead:
  + 'I69'.is_in(pl.col("D035KEY"))

  return df.sort('D001KEY').group_by('ENC_HN', maintain_order=True).first().filter(~pl.col('D035KEY').map_elements(lambda x: 'I69' in x))


In [54]:
983+28810

29793