In [1]:
from folder import StandardFolder
from polars_utils import *

import polars as pl
from pathlib import Path

# NEXT: add 2005-2009 then exclude later?
# Previously identified patients: only the first column of the CSV is kept,
# as a plain Python list (presumably the ENC_HN identifiers -- TODO confirm).
prev_hn = (
    pl.read_csv('D:/Prut/Warehouses/output/Dec23/n/AF/AF_201001_202306.csv')
    .to_series()
    .to_list()
)
# Second input table, kept as a full DataFrame.
ecg_holter_hn = pl.read_csv(
    'D:/Prut/Warehouses/output/Dec23/n/AF/af_for_aj_eak_180124.csv'
)
print(f'{len(prev_hn) = }')

len(prev_hn) = 21989


In [32]:
class AFIdentify(StandardFolder):
    """Build per-visit diagnosis, demographic and medication tables for an AF cohort.

    The folder attributes read here (``self.dx``, ``self.demo``, ``self.bill``) are
    provided by ``StandardFolder``; they are iterated with ``.iterdir()`` so they are
    presumably ``pathlib.Path`` directories -- TODO confirm against ``StandardFolder``.

    Typical use: ``run_all()`` to populate ``dx_df``, ``demo_df`` and ``meds_df``,
    then ``merge()`` to build ``merged_df``.
    """

    def __init__(self, folder: str, streaming: bool = True) -> None:
        """Set up selection criteria and load the medication lookup table.

        Args:
            folder: Passed unchanged to ``StandardFolder.__init__``.
            streaming: Forwarded to every ``collect(streaming=...)`` call.
        """
        super().__init__(folder)
        self.streaming = streaming
        self.export_folder = Path('../output/Dec23/wh/complete')
        # Diagnosis code prefixes that define the cohort.
        self.select_dx = ['I48']
        # Kept as an attribute for backward compatibility with external readers.
        self.select_dx_re = self._prefix_regex(self.select_dx)
        # Lookup table; get_meds() reads its first column plus 'CODE' and 'dosed_name'.
        self.meds_to_select = pl.read_csv('../std/meds_AF.csv')
        # Set by run_all(); merge() refuses to run until it is True.
        self.ran_all = False

    @staticmethod
    def _prefix_regex(codes) -> str:
        """Return a regex matching strings that start with any of ``codes``.

        Codes are inserted verbatim (no escaping), exactly as the original
        ``select_dx_re`` expression did, so regex metacharacters are interpreted.
        """
        return '^' + '|^'.join(codes)

    def get_dx(self, select: list = None):
        """Load diagnoses into ``self.dx_df``, one row per (ENC_HN, D001KEY).

        Args:
            select: Optional list of code prefixes. When given, only rows whose
                ``D035KEY`` starts with one of these prefixes are kept. Previously
                the contents of ``select`` were ignored and ``self.select_dx_re``
                was always used; the pattern is now built from ``select`` itself.
                Passing ``self.select_dx`` yields the same pattern as before.
                ``None`` disables filtering.

        Side effects:
            Sets ``self.dx_df``; ``D035KEY`` becomes a sorted, de-duplicated,
            ``', '``-joined string of the codes for each (ENC_HN, D001KEY) pair.
        """
        pattern = None if select is None else self._prefix_regex(select)
        frames = []
        for path in self.dx.iterdir():
            lf = (
                scan_file(path)
                .select(pl.col(['ENC_HN', 'D001KEY', 'D035KEY']))
                .pipe(parse_dates, 'D001KEY')
            )
            if pattern is not None:
                lf = lf.filter(pl.col('D035KEY').str.contains(pattern))
            # Collapse all codes recorded for the same patient and date into one row.
            lf = (
                lf
                .group_by(pl.col(['ENC_HN', 'D001KEY']))
                .agg(pl.col('D035KEY'))
                .with_columns(pl.col('D035KEY').list.unique().list.sort().list.join(', '))
            )
            frames.append(lf.collect(streaming=self.streaming))
        self.dx_df = pl.concat(frames).unique()

    def get_demo(self):
        """Load demographics into ``self.demo_df`` with readable column names.

        Files that do not contain every expected source column are skipped.
        """
        cols = ['ENC_HN', 'D020AT3', 'H2L1KEY', 'H6L1KEY', 'H6L1DES']
        new_col_names = ['ENC_HN', 'DOB', 'Sex', 'Province_ID', 'Province_Thai']
        frames = []
        for path in self.demo.iterdir():
            lf = scan_file(path)
            if set(cols).issubset(set(lf.columns)):
                # parse_dates is applied after collect(): per the original author's
                # note it only works on DataFrames here.
                frames.append(
                    lf.select(cols)
                    .collect(streaming=self.streaming)
                    .pipe(parse_dates, 'D020AT3')
                )
        self.demo_df = pl.concat(frames).unique()
        self.demo_df = self.demo_df.rename(dict(zip(cols, new_col_names)))

    def run_all(self):
        """Run the three loaders in order (printing progress) and mark completion."""
        self.get_dx(select=self.select_dx)
        print('dx')
        self.get_demo()
        print('demo')
        self.get_meds()
        print('meds')

        self.ran_all = True

    def merge(self):
        """Join the three tables into ``self.merged_df``.

        Diagnoses are left-joined with demographics on ``ENC_HN``, then
        outer-joined (coalescing keys) with medications on ``ENC_HN`` and
        ``D001KEY``; exact duplicate rows are dropped.

        Raises:
            RuntimeError: If ``run_all()`` has not completed. ``RuntimeError`` is a
                subclass of ``Exception``, so existing ``except Exception`` handlers
                still catch it.
        """
        if not self.ran_all:
            raise RuntimeError('Please run all first.')

        self.merged_df = (
            self.dx_df
            .join(self.demo_df, on=['ENC_HN'], how='left')
            .join(self.meds_df, on=['ENC_HN', 'D001KEY'], how='outer_coalesce')
            .unique()
        )

    def get_meds(self):
        """Load selected billing items into ``self.meds_df``, one column per item.

        Rows are restricted to ``D033KEY`` values found in the first column of
        ``self.meds_to_select``; the result is pivoted to one row per
        (ENC_HN, D001KEY) holding the maximum ``M1022`` per item, and item columns
        are renamed from ``CODE`` to ``dosed_name``.
        """
        # First column of the lookup table (presumably 'CODE' -- TODO confirm, the
        # rename step below addresses that column by name).
        select = self.meds_to_select.to_series().to_list()
        frames = []
        for path in self.bill.iterdir():
            lf = scan_file(path)
            # Some files use an alternative column naming; map it to the standard keys.
            if {'PER_DATE_2', 'SERVICE_ID', 'CAL_SER_AMT'}.issubset(lf.columns):
                lf = lf.rename({'PER_DATE_2': 'D001KEY', 'SERVICE_ID': 'D033KEY', 'CAL_SER_AMT': 'M1022'})
            lf = (
                lf
                .select(pl.col(['ENC_HN', 'D001KEY', 'D033KEY', 'M1022']))
                .pipe(parse_dates, 'D001KEY')
                .filter(pl.col('D033KEY').is_in(select))
            )
            frames.append(lf.collect(streaming=self.streaming))

        # pivot
        self.meds_df = (
            pl.concat(frames)
            .pivot(index=['ENC_HN', 'D001KEY'], values='M1022', columns='D033KEY', aggregate_function='max')
            .unique()
        )

        # rename
        for k, v in zip(self.meds_to_select['CODE'], self.meds_to_select['dosed_name']):
            if k in self.meds_df.columns:
                self.meds_df = self.meds_df.rename({k: v})

        # Removing previously known patients and clipping the date window are
        # intentionally not done here; the notebook applies them afterwards.



In [52]:
# Build the extractor on the data folder below, load the diagnosis, demographic
# and medication tables (run_all prints 'dx', 'demo', 'meds' as each finishes),
# then join them into s.merged_df.
s = AFIdentify(folder='D:/Datalake/Data/20231231_fu_nc')
s.run_all()
s.merge()

readme not included.
dx
demo
meds


Because of externally imposed rules on how the flowchart must be drawn, the following functions cannot be incorporated into the class above.

In [4]:
def remove_previous_hn(lf: pl.LazyFrame, prev_hn=prev_hn) -> pl.LazyFrame:
    """Return ``lf`` without the rows whose ``ENC_HN`` is listed in ``prev_hn``.

    The default for ``prev_hn`` is captured when this ``def`` is executed, so
    rebinding the global ``prev_hn`` later does not affect calls that rely on the
    default. Only ``.filter`` is used on ``lf``.
    """
    already_known = pl.col('ENC_HN').is_in(prev_hn)
    return lf.filter(~already_known)

def print_n(df: pl.DataFrame) -> pl.DataFrame:
    """Print the number of distinct ``ENC_HN`` values and return ``df`` unchanged.

    Returning the frame lets this be dropped into a ``.pipe(...)`` chain.
    """
    n_patients = df['ENC_HN'].n_unique()
    print(n_patients)
    return df


In [53]:
# Restrict to the Jul-Dec 2023 window, drop previously known patients,
# and print how many distinct patients remain.
s1 = (
    s.merged_df
    .pipe(clip_dates, date_col='D001KEY', start_month=7, start_year=2023, end_month=12, end_year=2023)
    .pipe(remove_previous_hn)
    .pipe(print_n)
)


2621


In [29]:
# First column (presumably ENC_HN -- TODO confirm) of the rows whose 'leg' is 'icd'.
icd_hn = (
    pl.read_csv('D:/Prut/Warehouses/output/Dec23/n/AF/af_for_aj_eak_180124.csv')
    .filter(pl.col('leg') == 'icd')
    .to_series()
    .to_list()
)

In [67]:
# Keep only the patients listed in icd_hn and convert to pandas for the next cell.
panda = s1.filter(pl.col('ENC_HN').is_in(icd_hn)).to_pandas()

In [75]:
# Distinct patients with at least one non-null value in any column whose name ends in 'mg'.
panda.loc[
    panda.loc[:, panda.columns.str.endswith('mg')].notna().any(axis=1),
    'ENC_HN',
].nunique()

140

In [8]:
# Flow box 1: Jul-Dec 2023 window, minus previously known patients.
s1 = (
    s.merged_df
    .pipe(clip_dates, date_col='D001KEY', start_month=7, start_year=2023, end_month=12, end_year=2023)
    .pipe(remove_previous_hn)
    .pipe(print_n)
)

# Flow box 3: adults only (visit date on or after the 18th birthday).
# A fixed 365*18-day threshold ignores leap days and admits patients who are
# still 4-5 days short of 18, so the birthday is shifted by calendar years instead.
# s1 already has previously known patients removed, so that filter is not repeated.
s2 = s1.filter(pl.col('D001KEY') >= pl.col('DOB').dt.offset_by('18y'))
s2.pipe(print_n)

# Save output
# NOTE(review): the AF cohort is concatenated with, and written next to, a file
# under .../Stroke/ -- confirm this is intended and not a leftover from a stroke
# notebook. Paths are left unchanged here.
prev = pl.read_csv('D:/Prut/Warehouses/output/Dec23/n/Stroke/stroke_n_updatedSep2023_28112023.csv').pipe(parse_dates, 'Date')
new = s2[['ENC_HN', 'D001KEY', 'D035KEY']].rename({'D001KEY': 'Date', 'D035KEY': 'ICD10'})
output = pl.concat([new, prev])
output.pipe(print_n)

output.write_csv('D:/Prut/Warehouses/output/Dec23/n/Stroke/af_n_updated_12032024.csv')

875
873
28511


ignore below

In [4]:
# Reload the same AF CSV as the first cell, but keep the full DataFrame this time.
# This rebinds `prev_hn` from a Python list to a DataFrame; any function default
# captured earlier keeps pointing at the old list.
prev_hn = pl.read_csv('D:/Prut/Warehouses/output/Dec23/n/AF/AF_201001_202306.csv')
# `ecg_holter_hn` is rebound as well, to a different CSV than the first cell used.
ecg_holter_hn = pl.read_csv('D:/Prut/Warehouses/output/Dec23/n/AF/df_af_use_send_CR_210324.csv')

In [14]:
from datetime import datetime
# Scratch check of the date conversion: ISO 'YYYY-MM-DD' -> 'DD/MM/YY'.
d = "2022-01-16"
d = datetime.strptime(d, '%Y-%m-%d').strftime('%d/%m/%y')

'16/01/22'

In [75]:
# Previously known patients with their diagnosis date re-formatted from
# 'YYYY-MM-DD' to 'DD/MM/YY'. Native polars expressions replace the per-row
# Python lambda (same strings out, nulls stay null, malformed dates still raise),
# and no frame is mutated in place.
lt = prev_hn.select(
    pl.col('ENC_HN'),
    pl.col('DiagDate')
    .str.to_date('%Y-%m-%d')
    .dt.strftime('%d/%m/%y')
    .alias('Date'),
)
lt

ENC_HN,Date
str,str
"""1B39D7D247FAF7…","""16/01/22"""
"""F44F7AD3764FD9…","""31/08/22"""
"""4798862B04EA72…","""22/06/21"""
"""FAB13CDF2898F4…","""22/06/21"""
"""D6DCF6F5555960…","""12/10/21"""
"""D5460D16700001…","""04/02/21"""
"""F1F523E2DCCA5F…","""31/05/21"""
"""78A94DE45A1987…","""13/01/21"""
"""D85CEDCFCBA668…","""13/03/21"""
"""F0A2E9BFFFD979…","""08/09/22"""


In [70]:
# Start from the ECG/Holter table and drop every patient already present in `lt`
# (lt.to_series() is lt's first column, ENC_HN).
rt = ecg_holter_hn
rt_drop = rt.filter(~pl.col('ENC_HN').is_in(lt.to_series().to_list()))

In [71]:
# ICD-identified patients in the 2023 window that are not already in `lt`,
# shaped as (ENC_HN, how, Date) with Date as a 'DD/MM/YY' string.
icd = AFIdentify(folder='D:/Datalake/Data/20231231_fu_nc')
icd.get_dx(select=['I48'])
# NOTE(review): start_day=31 with start_month=7 differs from the main flow, which
# passes start_month=7 without a start_day -- confirm 31 July is intended.
rt_icd = (
    clip_dates(icd.dx_df, 'D001KEY', start_month=7, start_day=31, start_year=2023, end_year=2023)
    .filter(~pl.col('ENC_HN').is_in(lt.to_series().to_list()))
    .with_columns(how=pl.lit('icd'))
    .rename({'D001KEY': 'Date'})
    .select(pl.col(['ENC_HN', 'how', 'Date']))
    # Native formatting instead of a per-row Python lambda via map_elements.
    .with_columns(pl.col('Date').dt.strftime('%d/%m/%y'))
)
rt_icd

readme not included.


ENC_HN,how,Date
str,str,str
"""9D130DB828AD02…","""icd""","""27/09/23"""
"""77583B105476FB…","""icd""","""31/10/23"""
"""E25944D42FFCFB…","""icd""","""12/10/23"""
"""FEC8F461342585…","""icd""","""21/10/23"""
"""C8D2D19B630B5C…","""icd""","""22/11/23"""
"""062202AD8E828E…","""icd""","""29/09/23"""
"""786C8E80C82CB8…","""icd""","""24/12/23"""
"""C157693532A7E9…","""icd""","""12/10/23"""
"""C0566817264F1B…","""icd""","""16/08/23"""
"""25A79F62B8270E…","""icd""","""14/09/23"""


In [72]:
# Earliest date per patient across the ECG/Holter and ICD sources.
# 'Date' holds 'DD/MM/YY' strings, so a plain string min() compares the day first
# and is not chronological; parse to real dates, take the minimum, then format back.
# NOTE(review): assumes the dates coming from rt_drop use the same 'DD/MM/YY'
# format as rt_icd -- parsing is strict and will raise if they do not.
rt = (
    pl.concat([rt_drop, rt_icd])
    .group_by('ENC_HN')
    .agg(
        pl.col('Date')
        .str.to_date('%d/%m/%y')
        .min()
        .dt.strftime('%d/%m/%y')
    )
)

In [76]:
# Stack previously known patients (lt) on top of the newly identified ones (rt).
cc = pl.concat([lt, rt])
# Every value in the first column must be distinct: one row per patient.
assert cc.height == cc.to_series().n_unique()
cc

ENC_HN,Date
str,str
"""1B39D7D247FAF7…","""16/01/22"""
"""F44F7AD3764FD9…","""31/08/22"""
"""4798862B04EA72…","""22/06/21"""
"""FAB13CDF2898F4…","""22/06/21"""
"""D6DCF6F5555960…","""12/10/21"""
"""D5460D16700001…","""04/02/21"""
"""F1F523E2DCCA5F…","""31/05/21"""
"""78A94DE45A1987…","""13/01/21"""
"""D85CEDCFCBA668…","""13/03/21"""
"""F0A2E9BFFFD979…","""08/09/22"""


In [28]:
# Persist the combined (ENC_HN, Date) list.
cc.write_csv('D:/Prut/Warehouses/output/Dec23/n/AF/af_n_21032024.csv')