# Clopidogrel-Stroke Cohort
(Not derived from the stroke cohort)

In [1]:
from folder import StandardFolder
from polars_utils import *
import polars as pl
from pathlib import Path
import matplotlib.pyplot as plt

**Please correct me if I'm wrong, or suggest other ICD-10 codes to add (reference: https://icd.who.int/browse10/2016/en#/I60-I69).**

- Clopidogrel (P2Y12 inhibitor) inhibits platelet aggregation.

- Clopidogrel is mainly converted into its active metabolite through CYP2C19.

This study aims to look at patients with CYP2C19 polymorphisms and the complications they may develop.

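- Poor metabolisers (less active metabolite is formed, so the antiplatelet effect is reduced)
    - I63: Ischemic stroke (cerebral infarction)

- Rapid metabolisers (more active metabolite is formed, so the bleeding risk is increased)
    - I60: Subarachnoid haemorrhage
    - I61: Intracerebral haemorrhage
    - I62: Other nontraumatic intracranial haemorrhage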

Ward mapping: https://docs.google.com/spreadsheets/d/1qzxQRRRC0Vs576MvMdiJMfCCxkfvwZlQpzI1Ap9r3fQ/edit#gid=440115548

In [2]:
# ICD-10 three-character categories used to select stroke diagnoses.
target_icds = ['I60', 'I61', 'I62', 'I63']

warehouse_folder = StandardFolder(folder='D:/Datalake/Data/20231231_fu_nc')

admission_visit_cols = ['ENC_HN', 'D001KEY', 'D108KEY']
stroke_unit = 'MDJ1' # in admissions
emergency_department = 'OER101' # in visits

dx_cols = ['ENC_HN', 'D001KEY', 'D035KEY', 'D108KEY', 'D195KEY'] # Primary diagnosis is when D195KEY is "1"

# Sanity check: both ward codes must occur in the D108KEY column of a sample diagnosis file.
# iterdir() yields entries in arbitrary order, so sort to make "the last file" deterministic.
sample_file = pl.read_parquet(sorted(warehouse_folder.dx.iterdir())[-1])
# Codes are compared after stripping surrounding spaces; null/empty values are skipped.
observed_wards = {x.strip(' ') for x in sample_file['D108KEY'].unique() if x}
missing_wards = {stroke_unit, emergency_department} - observed_wards
assert not missing_wards, f'Ward codes not found in sample diagnosis file: {sorted(missing_wards)}'

readme not included.


In [3]:
class ClopidogrelStrokeIdentify(StandardFolder):
    """Collect primary stroke diagnoses and clopidogrel billing rows, then join them.

    Typical use is ``run_all()`` followed by ``merge()``.

    Data attributes populated by the loaders:
        dx_df: rows from the diagnosis folder matching the ICD-10 prefixes,
            the selected departments and the primary-diagnosis flag (``get_dx``).
        meds_df: rows from the billing folder whose ``D033KEY`` is one of the
            clopidogrel codes (``get_meds``).
        demo_df: ``ENC_HN`` / ``DOB`` / ``Sex`` rows (``get_demo``; not called
            by ``run_all``).
        merged_df: inner join of ``dx_df`` and ``meds_df`` (``merge``).

    NOTE(review): ``self.dx``, ``self.bill`` and ``self.demo`` are presumably
    directory paths set up by ``StandardFolder.__init__``; ``scan_file`` and
    ``parse_dates`` presumably come from the ``polars_utils`` star import --
    confirm against those modules.
    """

    def __init__(
            self, 
            folder: str = 'D:/Datalake/Data/20231231_fu_nc', 
            export_folder: str = 'D:/Prut/Warehouses/output/Dec23/n/Clopidogrel',
            select_dx: list[str] = target_icds, 
            select_dept: list[str] = [emergency_department, stroke_unit],
            streaming: bool = True
            ) -> None:
        """Store the configuration and read the clopidogrel medication codes.

        Args:
            folder: warehouse root, passed to ``StandardFolder.__init__``.
            export_folder: output directory; only stored here as a ``Path``.
            select_dx: ICD-10 prefixes; a diagnosis code matches when it
                starts with any of them.
            select_dept: ``D108KEY`` values to keep in ``get_dx``.
            streaming: forwarded to ``LazyFrame.collect(streaming=...)``.
        """
        super().__init__(folder)

        self.streaming = streaming
        self.folder = Path(folder)
        self.export_folder = Path(export_folder)
        # Copy the lists so the instance never aliases the module-level
        # `target_icds` or the shared default-argument list.
        self.select_dx = list(select_dx)
        # Alternation of anchored prefixes, e.g. '^I60|^I61|^I62|^I63'.
        self.select_dx_re = '^' + '|^'.join(self.select_dx)
        # First column of the CSV holds the clopidogrel medication codes.
        self.clopidogrel = pl.read_csv('../std/clopidogrel_med_code.csv').to_series(0).to_list()
        self.select_dept = list(select_dept)
        self.primary_dx = '1'  # D195KEY value that marks a primary diagnosis
        self.ran_all = False   # set by run_all(); checked by merge()
        self.n_list = []

    def get_dx(self) -> None:
        """Load matching diagnosis rows from every file in ``self.dx`` into ``self.dx_df``.

        A row is kept when ``D035KEY`` starts with one of the selected ICD-10
        prefixes, ``D108KEY`` is one of the selected departments and
        ``D195KEY`` equals the primary-diagnosis flag. Exact duplicate rows
        are dropped after concatenation.
        """
        folder_path = self.dx
        to_concat = []
        for path in folder_path.iterdir():
            file = (
                scan_file(path)
                .select(pl.col(['ENC_HN', 'D001KEY', 'D035KEY', 'D108KEY', 'D195KEY']))
                .filter(pl.col('D035KEY').str.contains(self.select_dx_re))
                # NOTE(review): this is an exact match; the notebook's sanity
                # check strips spaces from D108KEY before comparing, so padded
                # raw values would be dropped here -- confirm.
                .filter(pl.col('D108KEY').is_in(self.select_dept))
                .filter(pl.col('D195KEY') == self.primary_dx)
                .pipe(parse_dates, 'D001KEY')
            )
            # Alternative (disabled): one row per patient/date with the codes joined.
            # file = file.group_by(pl.col(['ENC_HN', 'D001KEY'])).agg(pl.col('D035KEY')).with_columns(pl.col('D035KEY').list.unique().list.sort().list.join(', '))
            to_concat.append(file.collect(streaming=self.streaming))
        self.dx_df = pl.concat(to_concat).unique()

    def get_meds(self) -> None:
        """Load clopidogrel billing rows from every file in ``self.bill`` into ``self.meds_df``.

        Files that use the alternative column names (``PER_DATE_2``,
        ``SERVICE_ID``, ``CAL_SER_AMT``) are renamed to the standard ones
        first. Rows are not de-duplicated.
        """
        folder_path = self.bill
        to_concat = []
        for path in folder_path.iterdir():
            file = scan_file(path)
            # Deal with alternative file structures
            if {'PER_DATE_2', 'SERVICE_ID', 'CAL_SER_AMT'}.issubset(file.columns):
                file = file.rename({'PER_DATE_2': 'D001KEY', 'SERVICE_ID': 'D033KEY', 'CAL_SER_AMT': 'M1022'})
            file = (
                file
                .select(pl.col(['ENC_HN', 'D001KEY', 'D033KEY', 'M1022']))
                .pipe(parse_dates, 'D001KEY')
            )
            # Select clopidogrel
            file = file.filter(pl.col('D033KEY').is_in(self.clopidogrel))

            to_concat.append(file.collect(streaming=self.streaming))

        self.meds_df = pl.concat(to_concat)

        # Alternative (disabled): pivot to one column per medication code.
        # self.meds_df = self.meds_df.pivot(index=['ENC_HN', 'D001KEY'], values='M1022', columns='D033KEY', aggregate_function='max').unique()

    def get_demo(self) -> None:
        """Load demographics from ``self.demo`` into ``self.demo_df``.

        Only files that contain all of ``ENC_HN``, ``D020AT3`` and ``H2L1KEY``
        are used. The columns are renamed to ``ENC_HN``, ``DOB`` and ``Sex``,
        and exact duplicate rows are dropped.
        """
        folder_path = self.demo
        cols = ['ENC_HN', 'D020AT3', 'H2L1KEY']
        new_col_names = ['ENC_HN', 'DOB', 'Sex']
        to_concat = []
        for path in folder_path.iterdir():
            file = scan_file(path)
            if set(cols).issubset(set(file.columns)):
                # Author's note: parse_dates only works on DataFrames for this
                # column ("new bug"), so the frame is collected first.
                file = file.select(cols).collect(streaming=self.streaming).pipe(parse_dates, 'D020AT3')
                to_concat.append(file)
        self.demo_df = pl.concat(to_concat).unique()
        self.demo_df = self.demo_df.rename(dict(zip(cols, new_col_names)))

    def run_all(self) -> None:
        """Run ``get_dx`` and ``get_meds``, printing a marker after each, and set ``ran_all``.

        Demographics loading is currently disabled.
        """
        self.get_dx()
        print('dx')
        # self.get_demo()
        # print('demo')
        self.get_meds()
        print('meds')
        
        self.ran_all = True

    def merge(self) -> None:
        """Inner-join ``dx_df`` with ``meds_df`` on ``ENC_HN`` and ``D001KEY`` into ``self.merged_df``.

        Only diagnosis rows with a clopidogrel billing row for the same
        patient and the same ``D001KEY`` value survive the join. Exact
        duplicate rows are dropped.

        Raises:
            RuntimeError: if ``run_all`` has not been called yet.
        """
        if not self.ran_all:
            # RuntimeError is still caught by callers handling `Exception`.
            raise RuntimeError('Please run all first.')
        
        self.merged_df = (
            self.dx_df
            # .join(self.demo_df, on=['ENC_HN'], how='left')
            .join(self.meds_df, on=['ENC_HN', 'D001KEY'], how='inner')
            .unique()
        )


In [4]:
# Build the cohort with the default folders and filters.
s = ClopidogrelStrokeIdentify()
s.run_all()  # loads diagnoses, then clopidogrel billing rows; prints 'dx' and 'meds'
s.merge()  # stores the joined frame on s.merged_df

readme not included.
dx
meds


### The following numbers are valid for data between 2010-2023.

A flowchart will follow later after your initial comments and my adjustments.

Number of **patients** with a **primary diagnosis** of _stroke_, from **visits** to the **ER** or **admissions** to the **stroke unit**.

In [5]:
# Distinct patients (ENC_HN) among the selected primary stroke diagnoses.
n_stroke_patients = s.dx_df.get_column('ENC_HN').n_unique()
print(n_stroke_patients)

4479


Number of **patients** with **readmissions**, defined as more than one distinct visit date (`D001KEY`) with a primary diagnosis of stroke.

In [6]:
# Count distinct (patient, date) stroke visits per patient ...
visits_per_patient = (
    s.dx_df
    .select(['ENC_HN', 'D001KEY'])
    .unique()
    .group_by('ENC_HN')
    .len()
)
# ... and keep the patients who have more than one.
readmission_hn = visits_per_patient.filter(pl.col('len') > 1).get_column('ENC_HN')
print(len(readmission_hn))

501


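Distribution of ICD-10 categories (first three characters) across all primary stroke diagnosis records of patients with readmissions, including their first stroke visit (I63 is ischemic stroke)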

In [7]:
# Tally the three-character ICD-10 category of every diagnosis row that
# belongs to a patient in the readmission list.
temp = (
    s.dx_df
    .filter(pl.col('ENC_HN').is_in(readmission_hn))
    .select(pl.col('D035KEY').str.slice(0, 3).alias('ICD10'))
    .get_column('ICD10')
    .value_counts()
    .sort('ICD10')
)
temp

ICD10,count
str,u32
"""I60""",8
"""I61""",203
"""I62""",41
"""I63""",849


Number of **patients** who received clopidogrel (all patients in the billing data, not restricted to the stroke cohort)

In [8]:
# Distinct patients (ENC_HN) with at least one clopidogrel billing row.
n_clopidogrel_patients = s.meds_df.get_column('ENC_HN').n_unique()
print(n_clopidogrel_patients)

29867


Number of patients who have **ever received clopidogrel** at **any time** and have had a **stroke recurrence**.

In [9]:
# Clopidogrel recipients (any date) who are also in the readmission list.
(
    s.meds_df
    .filter(pl.col('ENC_HN').is_in(readmission_hn))
    .get_column('ENC_HN')
    .n_unique()
)

257