# Clopidogrel-Stroke Cohort - October 
(Not derived from the stroke cohort)
(New criteria)

In [1]:
from folder import StandardFolder
from polars_utils import *
import polars as pl
from pathlib import Path
import matplotlib.pyplot as plt

**Please correct me if I'm wrong / Suggest other ICD10s to add https://icd.who.int/browse10/2016/en#/I60-I69**

- Clopidogrel (P2Y12 inhibitor) inhibits platelet aggregation.

- Clopidogrel is mainly converted into its active metabolite through CYP2C19.

This study aims to look at patients who have a CYP2C19 polymorphism and their possible complications.

- Poor Metabolisers
    - I63: Ischemic stroke

**Exclude OACs (DOACs, Warfarin)**

Ward mapping: https://docs.google.com/spreadsheets/d/1qzxQRRRC0Vs576MvMdiJMfCCxkfvwZlQpzI1Ap9r3fQ/edit#gid=440115548





## UPDATES

**Starting with clopidogrel**
- Recurrence is moved downwards w.r.t. the flowchart

**Clopidogrel only**
- No longer including ticagrelor or cilostazol

**Excluding TIA**
- G45: TIA

**Excluding AFib**
- From AFib cohort




In [2]:
# ICD-10 prefixes that define the cohort.
target_icds = ['I63'] # No more G45

warehouse_folder = StandardFolder(folder='D:/Datalake/Data/20231231_fu_nc')

admission_visit_cols = ['ENC_HN', 'D001KEY', 'D108KEY']
stroke_unit = 'MDJ1' # in admissions
emergency_department = 'OER101' # in visits

dx_cols = ['ENC_HN', 'D001KEY', 'D035KEY', 'D108KEY', 'D195KEY'] # Primary diagnosis is when D195KEY is "1"

# Sanity check: both ward codes must occur (after stripping spaces) in one sample diagnosis file.
sample_file = pl.read_parquet(list(warehouse_folder.dx.iterdir())[-1])
observed_wards = {ward.strip(' ') for ward in sample_file['D108KEY'].unique() if ward}
assert {stroke_unit, emergency_department} <= observed_wards

readme not included.


In [3]:
# oac_df = pl.read_csv('../std/meds_AF.csv')
# oac_df

In [4]:
class ClopidogrelStrokeIdentify(StandardFolder):
    """Load the tables used to identify the clopidogrel-stroke cohort.

    The loaders read every file in a warehouse sub-folder (``self.dx``,
    ``self.bill``, ``self.demo``, ``self.deaths`` -- presumably provided by
    ``StandardFolder``; verify there) and keep the result on the instance:

    * ``dx_df``     -- primary diagnoses matching ``select_dx`` at ``select_dept``
    * ``meds_df``   -- billing rows whose ``D033KEY`` is a clopidogrel code
    * ``has_oac``   -- ``ENC_HN`` values with at least one OAC billing row
    * ``demo_df``   -- date of birth and sex (only if ``get_demo`` is called)
    * ``deaths_df`` -- death dates
    * ``merged_df`` -- ``meds_df`` left-joined with ``dx_df`` and ``deaths_df``
    """

    def __init__(
            self,
            folder: str = 'D:/Datalake/Data/20231231_fu_nc',
            export_folder: str = 'D:/Prut/Warehouses/output/Dec23/n/Clopidogrel',
            select_dx: list[str] = target_icds,
            select_dept: list[str] = [emergency_department, stroke_unit],
            streaming: bool = True
            ) -> None:
        """Store the configuration and read the medication code lists.

        Args:
            folder: Root folder of the data warehouse, forwarded to ``StandardFolder``.
            export_folder: Folder intended for exports (only stored here).
            select_dx: ICD-10 code prefixes; a diagnosis is kept when its code
                starts with any of them.
            select_dept: Values of ``D108KEY`` to keep when loading diagnoses.
            streaming: Forwarded to ``LazyFrame.collect(streaming=...)``.
        """
        super().__init__(folder)

        self.streaming = streaming
        self.folder = Path(folder)
        self.export_folder = Path(export_folder)
        # Copy the list arguments: the defaults are shared objects, so storing them
        # by reference would let a mutation of the attribute leak into later instances.
        self.select_dx = list(select_dx)
        self.select_dx_re = '^' + '|^'.join(self.select_dx)
        self.clopidogrel = pl.read_csv('../std/clopidogrel_med_code.csv').to_series(0).to_list() # ['PLAG-T-', 'CLAP-T-', 'COPL-T-', 'CLOD-T-', 'PLAP-T-', 'PLAP-T1', 'PLAV-T-']
        # Kept for reference; these two lists are not used by any method of this class.
        self.ticagrelor = ['BRIL-T-',]
        self.cilostazol = ['CIBZ-T-', 'CILO-T-', 'PLSR-C-', 'PLET-W-', 'PLET1T-', 'PLET-T-']
        self.oac = pl.read_csv('../std/meds_AF.csv')['CODE'].to_list()
        self.select_dept = list(select_dept)
        self.primary_dx = '1'
        self.has_oac = []
        self.ran_all = False
        self.n_list = []

    def get_dx(self):
        """Load primary diagnoses matching the target ICD-10 prefixes into ``self.dx_df``.

        A row is kept when ``D035KEY`` matches ``self.select_dx_re``, ``D108KEY`` is
        one of ``self.select_dept`` and ``D195KEY`` equals ``self.primary_dx``.
        """
        folder_path = self.dx
        to_concat = []
        for path in folder_path.iterdir():
            file = (
                scan_file(path)
                .select(pl.col(['ENC_HN', 'D001KEY', 'D035KEY', 'D108KEY', 'D195KEY']))
                .filter(pl.col('D035KEY').str.contains(self.select_dx_re))
                # NOTE(review): this is an exact match; the config cell strips spaces from
                # D108KEY before comparing -- confirm the raw values are not padded.
                .filter(pl.col('D108KEY').is_in(self.select_dept))
                .filter(pl.col('D195KEY') == self.primary_dx)
                .pipe(parse_dates, 'D001KEY')
            )
            to_concat.append(file.collect(streaming=self.streaming))
        self.dx_df = pl.concat(to_concat).unique()

    def get_meds(self):
        """Load clopidogrel billing rows into ``self.meds_df`` and record OAC recipients.

        ``self.has_oac`` is rebuilt on every call, so re-running this method does not
        accumulate entries from an earlier run; each ``ENC_HN`` appears once.
        """
        folder_path = self.bill
        to_concat = []
        oac_hn = []
        for path in folder_path.iterdir():
            file = scan_file(path)
            # Deal with alternative file structures
            if {'PER_DATE_2', 'SERVICE_ID', 'CAL_SER_AMT'}.issubset(file.columns):
                file = file.rename({'PER_DATE_2': 'D001KEY', 'SERVICE_ID': 'D033KEY', 'CAL_SER_AMT': 'M1022'})
            file = (
                file
                .select(pl.col(['ENC_HN', 'D001KEY', 'D033KEY', 'M1022']))
                .pipe(parse_dates, 'D001KEY')
            )
            # Store people who have ever received an OAC (before the clopidogrel filter below).
            oac_hn.extend(
                file
                .filter(pl.col('D033KEY').is_in(self.oac))
                .select('ENC_HN')
                .unique()
                .collect(streaming=self.streaming)
                .to_series()
                .to_list()
            )

            # Clopidogrel only (ticagrelor and cilostazol are no longer included).
            file = file.filter(pl.col('D033KEY').is_in(self.clopidogrel))

            to_concat.append(file.collect(streaming=self.streaming))

        # The same patient can appear in several files; keep first-seen order, drop repeats.
        self.has_oac = list(dict.fromkeys(oac_hn))
        self.meds_df = pl.concat(to_concat)

    def get_demo(self):
        """Load date of birth and sex into ``self.demo_df`` (columns ``ENC_HN``, ``DOB``, ``Sex``).

        Files that lack any of the three source columns are skipped.
        """
        folder_path = self.demo
        cols = ['ENC_HN', 'D020AT3', 'H2L1KEY']
        new_col_names = ['ENC_HN', 'DOB', 'Sex']
        to_concat = []
        for path in folder_path.iterdir():
            file = scan_file(path)
            if set(cols).issubset(set(file.columns)):
                # Collect before parsing: parse_dates on this column only works on DataFrames.
                file = file.select(cols).collect(streaming=self.streaming).pipe(parse_dates, 'D020AT3')
                to_concat.append(file)
        self.demo_df = pl.concat(to_concat).unique()
        self.demo_df = self.demo_df.rename(dict(zip(cols, new_col_names)))

    def get_deaths(self):
        """Load death records into ``self.deaths_df`` (columns ``ENC_HN``, ``Death_date``)."""
        folder_path = self.deaths
        to_concat = []
        for path in folder_path.iterdir():
            file = (
                scan_file(path)
                .select(pl.col('ENC_HN', 'D001KEY')).pipe(parse_dates, 'D001KEY').rename({'D001KEY': 'Death_date'})
            )
            to_concat.append(file.collect(streaming=self.streaming))
        self.deaths_df = pl.concat(to_concat).unique()

    def run_all(self):
        """Run the diagnosis, medication and death loaders, printing each stage name when done.

        Demographics are deliberately not loaded here; call ``get_demo`` separately if needed.
        """
        self.get_dx()
        print('dx')
        self.get_meds()
        print('meds')
        self.get_deaths()
        print('deaths')

        self.ran_all = True

    def merge(self):
        """Left-join diagnoses and deaths onto the medication rows into ``self.merged_df``.

        Medication rows are the left side, so every patient in ``meds_df`` is kept;
        diagnosis columns are null when no selected diagnosis falls on the same
        ``ENC_HN`` and ``D001KEY``.

        Raises:
            RuntimeError: If ``run_all`` has not completed on this instance.
        """
        if not self.ran_all:
            raise RuntimeError('Please run all first.')

        self.merged_df = (
            self.meds_df
            .join(self.dx_df, on=['ENC_HN', 'D001KEY'], how='left') # switched orders of med and dx dfs
            .join(self.deaths_df, on=['ENC_HN'], how='left') # changed to left 07-10-24
            .unique()
        )


In [5]:
# Build the cohort helper with its default folders, ICD-10 prefixes and wards.
s = ClopidogrelStrokeIdentify()
# Load diagnoses, clopidogrel billing rows and deaths; each stage prints its name when done.
s.run_all()
# Left-join diagnoses and deaths onto the medication rows -> s.merged_df.
s.merge()

readme not included.
dx
meds
deaths


### The following numbers are valid for data between 2010-2023.

Number of patients who received any form of clopidogrel

In [6]:
# The left join must neither drop nor add patients relative to the medication table.
n_hn_meds = s.meds_df['ENC_HN'].n_unique()
n_hn_merged = s.merged_df['ENC_HN'].n_unique()
assert n_hn_meds == n_hn_merged # sanity check for left join
print(n_hn_merged)

29867


Visits at each ward

In [7]:
# Rows per ward; a null ward means the medication row had no matching selected diagnosis.
s.merged_df.get_column('D108KEY').value_counts()

D108KEY,count
str,u32
,607658
"""MDJ1""",1868
"""OER101""",276


Patients at each ward (duplicates included: a patient seen at more than one ward is counted under each)

In [8]:
# Distinct patients per ward (the aggregation only needs ENC_HN and D108KEY).
(
    s.merged_df
    .group_by('D108KEY')
    .agg(pl.col('ENC_HN').n_unique())
)

D108KEY,ENC_HN
str,u32
"""MDJ1""",928
"""OER101""",152
,29844


In [9]:
# Keep medication rows that coincide with a selected primary diagnosis at one of the cohort wards.
# Reuse the ward list the diagnoses were filtered with (s.select_dept) instead of repeating the
# literal codes, so this filter cannot drift from the cohort configuration.
clop_df_1 = s.merged_df.filter(pl.col('D108KEY').is_in(s.select_dept))
clop_hn_1 = clop_df_1['ENC_HN'].unique().to_list()
print(f'Number of patients who are given clopidogrel with a selected primary diagnosis at these two wards: {len(clop_hn_1)}')

Number of patients who are given clopidogrel with a selected primary diagnosis at these two wards: 1035


Remove AFib

In [10]:
# AFib cohort exported earlier, restricted to patients whose first AFib date is in 2023 or before.
af_df = pl.read_csv(r'D:\Prut\Warehouses\output\Jun24\n\af\af_cohort_n=24928_Jun2024.csv')
af_df = af_df.with_columns(pl.col('First_date').cast(pl.Date))
af_df = af_df.filter(pl.col('First_date').dt.year() <= 2023)
print(f'Number of AFib patients up to 2023: {af_df.height}')
# NOTE(review): takes the first column of the CSV -- presumably the patient id (ENC_HN); confirm.
af_hn = af_df.to_series(0).to_list()

Number of AFib patients up to 2023: 23337


In [11]:
# Drop every patient who appears in the AFib cohort.
clop_hn_2 = set(clop_hn_1).difference(af_hn)
n_without_afib = len(clop_hn_2)
print(f'Number of patients receiving clopidogrel who does not have AFib: {n_without_afib}')

Number of patients receiving clopidogrel who does not have AFib: 897


In [12]:
# Additionally drop every patient who has ever been billed for an OAC (collected in s.has_oac).
clop_hn_3 = clop_hn_2 - set(s.has_oac)
# Pass a list: is_in is given a plain sequence rather than a Python set.
clop_df_3 = clop_df_1.filter(pl.col('ENC_HN').is_in(list(clop_hn_3)))
assert clop_df_3['ENC_HN'].n_unique() == len(clop_hn_3) # sanity check
# Both exclusions apply at this point, so the count is "neither AFib nor OAC" (not "or").
print(f'Number of patients receiving clopidogrel who have neither AFib nor any OAC record: {len(clop_hn_3)}')

Number of patients receiving clopidogrel who does not have AFib or does not recevie OACs: 843


## Exporting here

In [13]:
# Display the cohort after the AFib and OAC exclusions (this cell only shows it; nothing is written to disk).
clop_df_3

ENC_HN,D001KEY,D033KEY,M1022,D035KEY,D108KEY,D195KEY,Death_date
str,date,str,str,str,str,str,date
"""5503812C4CC9D4…",2012-10-09,"""PLAP-T1""","""0""","""I633""","""MDJ1""","""1""",
"""2B0E1D905C20F8…",2013-07-11,"""PLAP-T1""","""3""","""I635""","""MDJ1""","""1""",
"""5F49688AD67C70…",2013-12-13,"""PLAP-T1""","""0""","""I638""","""MDJ1""","""1""",
"""A7C5ED58AF853D…",2013-07-02,"""PLAP-T1""","""15""","""I634""","""MDJ1""","""1""",2019-08-07
"""2378AD195A9EA0…",2014-10-22,"""PLAP-T1""","""10""","""I633""","""MDJ1""","""1""",
"""52D42A333B94F5…",2014-10-13,"""PLAP-T1""","""3""","""I634""","""MDJ1""","""1""",
"""7CA13FD56852A9…",2014-09-03,"""PLAP-T1""","""3""","""I638""","""MDJ1""","""1""",
"""DFAF4870DF6BF7…",2014-09-24,"""PLAP-T1""","""13""","""I635""","""MDJ1""","""1""",
"""F144A5A0FB96ED…",2015-06-14,"""CLOD-T-""","""15""","""I639""","""MDJ1""","""1""",
"""059D3E1EE71E9F…",2016-07-29,"""PLAV-T-""","""4""","""I639""","""OER101""","""1""",


---

Added Nov 2024:

### More Exclusion Criteria
#### Already excluded above:
- OAC
- AFib
  
#### To Do
- Cardioembolism I634
- Malignancies C00 - C97
- Hepatic disease K70 - K77
- eGFR < 30
- Plt < 100k or > 450k
- Hct < 25%
- Acute illness / Drug abuse / Sickle cell / Cognitive impairments --- not excluded here.

And add deaths.

In [14]:
# Pull all ICD10s
# Prefixes to exclude: I634 (cardioembolism), C (malignancies C00 - C97), K7 (hepatic disease K70 - K77).
exclude_more_prefixes = ['I634', 'C', 'K7']
exclude_more_re = '^' + '|^'.join(exclude_more_prefixes)

# Labs required


---

Number of **patients** with **readmissions**, defined as more than one visit with a primary diagnosis of stroke.

In [15]:
# One row per patient with more than one distinct (ENC_HN, D001KEY) date in the cohort.
_readmission_df = (
    clop_df_3
    .select(['ENC_HN', 'D001KEY'])
    .unique()
    .group_by('ENC_HN')
    .len()
    .filter(pl.col('len') > 1)
)
readmission_hn = (
    _readmission_df
    .select('ENC_HN')
    .unique()
    .to_series()
    .to_list()
)
print(len(readmission_hn))

60


In [16]:
# Persist the readmitted patients' identifiers (a single ENC_HN column).
(
    _readmission_df
    .select('ENC_HN')
    .unique()
    .write_csv(r'D:\Prut\Warehouses\output\Jun24\wh\intermediate\readmission_df.csv')
)

Organise

In [17]:
# Split the cohort rows by whether the patient is in the readmission list.
single_admission_df = clop_df_3.filter(~pl.col('ENC_HN').is_in(readmission_hn))
readmission_df = clop_df_3.filter(pl.col('ENC_HN').is_in(readmission_hn))

# Row counts, then distinct-patient counts.
# The outer quotes differ from the inner ones so the f-strings also parse on Python < 3.12;
# the printed text is unchanged because `=` echoes the expression exactly as written.
print(f'{len(single_admission_df) = }')
print(f'{len(readmission_df) = }')
print(f"{single_admission_df['ENC_HN'].n_unique() = }")
print(f"{readmission_df['ENC_HN'].n_unique() = }")

len(single_admission_df) = 1506
len(readmission_df) = 253
single_admission_df['ENC_HN'].n_unique() = 783
readmission_df['ENC_HN'].n_unique() = 60


Deaths

Deaths in single admissions

In [18]:
# Share of single-admission patients who have a recorded death date.
n_single_patients = single_admission_df['ENC_HN'].n_unique()
n_single_deaths = single_admission_df.filter(pl.col('Death_date').is_not_null())['ENC_HN'].n_unique()

# The ratio is computed outside the f-string so the cell also parses on Python < 3.12,
# and an empty group is reported instead of raising ZeroDivisionError.
if n_single_patients:
    print(f'Percentage of deaths: {n_single_deaths / n_single_patients:.1%}')
else:
    print('Percentage of deaths: n/a (no patients in this group)')

Percentage of deaths: 5.1%


Deaths in readmissions

In [19]:
# Share of readmitted patients who have a recorded death date.
n_readmitted_patients = readmission_df['ENC_HN'].n_unique()
n_readmitted_deaths = readmission_df.filter(pl.col('Death_date').is_not_null())['ENC_HN'].n_unique()

# The ratio is computed outside the f-string so the cell also parses on Python < 3.12,
# and an empty group is reported instead of raising ZeroDivisionError.
if n_readmitted_patients:
    print(f'Percentage of deaths: {n_readmitted_deaths / n_readmitted_patients:.1%}')
else:
    print('Percentage of deaths: n/a (no patients in this group)')

Percentage of deaths: 3.3%
