# Note Events

In [4]:
import pandas as pd
import numpy as np, warnings
from pathlib import Path
import os
from importlib import reload

np.warnings = warnings

In [5]:
import sys
sys.path.append('..')  # Replace with the actual path to ROOT

# setup OS agnostic pathnames
ROOT_DIR = Path('..')

# Note: to run main notebook from root directory, use:
#ROOT_DIR = Path('')

## Load data

In [6]:
from datetime import timedelta

import re

import requests
import certifi
import nltk

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [7]:
df_noteevents = pd.read_csv(Path(ROOT_DIR / 'data' /  "NOTEEVENTS.csv"))

  df_noteevents = pd.read_csv(Path(ROOT_DIR / 'data' /  "NOTEEVENTS.csv"))


In [8]:
df_admissions = pd.read_csv(Path(ROOT_DIR / 'data' /  "ADMISSIONS.csv"))

In [9]:
df_labevents = pd.read_csv(Path(ROOT_DIR / 'data' /  "LABEVENTS.csv"))

## Data Exploration

In the TEXT, we can see that the dates and PHI have been converted for confidentiality. There are '\n' characters, numbers and punctuation.

We will filter Notes that are made 6hours after admit time.

There appears to be a lot of notes with NA charttimes. We will keep these for now. 


In [6]:
df_noteevents.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
0,174,22532,167853.0,2151-08-04,,,Discharge summary,Report,,,Admission Date: [**2151-7-16**] Dischar...
1,175,13702,107527.0,2118-06-14,,,Discharge summary,Report,,,Admission Date: [**2118-6-2**] Discharg...
2,176,13702,167118.0,2119-05-25,,,Discharge summary,Report,,,Admission Date: [**2119-5-4**] D...
3,177,13702,196489.0,2124-08-18,,,Discharge summary,Report,,,Admission Date: [**2124-7-21**] ...
4,178,26880,135453.0,2162-03-25,,,Discharge summary,Report,,,Admission Date: [**2162-3-3**] D...


In [None]:
#df_admissions.columns
#df_noteevents.columns

In [10]:
# merging admissions with noteevents to get ADMITTIME
df_noteevents = pd.merge(df_noteevents, df_admissions[['HADM_ID', 'ADMITTIME']], on='HADM_ID', how='left')

# convert to datetime
df_noteevents['CHARTTIME'] = pd.to_datetime(df_noteevents['CHARTTIME'])
df_noteevents['ADMITTIME'] = pd.to_datetime(df_noteevents['ADMITTIME'])

In [11]:
# Filter to 6 hours from ADMITTIME
# keep NA values, as not sure how to handle them yet
df_noteevents_filtered = df_noteevents[
    (df_noteevents['CHARTTIME'].isna()) | 
    (df_noteevents['CHARTTIME'] <= df_noteevents['ADMITTIME'] + timedelta(hours=6))
]

In [12]:
print(df_noteevents.shape[0], ": Noteevents" )
print(df_noteevents_filtered.shape[0], ": Noteevents Filtered")

2083180 : Noteevents
436942 : Noteevents Filtered


In [13]:
# total NA values in dataframe
df_noteevents_filtered['CHARTTIME'].isna().sum().sum()

316566

In [14]:
# Categories of notes
df_noteevents_filtered['CATEGORY'].unique()

array(['Discharge summary', 'Echo', 'ECG', 'Physician ', 'Nursing',
       'Respiratory ', 'General', 'Social Work', 'Nutrition', 'Consult',
       'Case Management ', 'Pharmacy', 'Rehab Services', 'Radiology',
       'Nursing/other'], dtype=object)

At this point, we have to make a choice on what notes to use. We have to remove notes relating to discharge summary, in order to handle data leakage. We could use all the notes by concatenating them.

We are particularly interested in 'General', 'Physician', 'Nursing', 'Radiology', 'Nursing/other'.

In [15]:
# remove all 'Discharge Summary' notes to handle data leakage
df_noteevents_filtered = df_noteevents_filtered[df_noteevents_filtered['CATEGORY'] != 'Discharge summary']

# categories to drop
categories_to_drop = ['Nutrition', 'Rehab Services', 'Social Work']

# Filter out rows with the specified categories
df_noteevents_filtered = df_noteevents_filtered[~df_noteevents_filtered['CATEGORY'].isin(categories_to_drop)]

In [16]:
def get_proportions_na_category(df):
  print("Total count:", df.shape[0])
  print("\nTotal Charttime NA count:", df['CHARTTIME'].isna().sum().sum())
  
  print("\nTotal Charttime NA proportion:", ((df['CHARTTIME'].isna().sum().sum() / df.shape[0]) * 100).round(2) )

  category_proportions = (df['CATEGORY'].value_counts(normalize=True) * 100).round(2)
  print("\nCATEGORY proportions: \n", category_proportions)
  
  na_counts = df['CHARTTIME'].isna().groupby(df['CATEGORY']).sum().astype(int)
  total_counts = df.groupby('CATEGORY').size()
  percentage_na = (na_counts / total_counts) * 100
  print("\nProportion of CHARTTIME NA by CATEGORY: \n", percentage_na)

In [17]:
get_proportions_na_category(df_noteevents_filtered)

Total count: 377178

Total Charttime NA count: 256883

Total Charttime NA proportion: 68.11

CATEGORY proportions: 
 CATEGORY
ECG                 55.43
Radiology           19.44
Echo                12.14
Nursing/other        7.48
Physician            2.58
Nursing              2.52
General              0.21
Respiratory          0.19
Consult              0.01
Case Management      0.00
Pharmacy             0.00
Name: proportion, dtype: float64

Proportion of CHARTTIME NA by CATEGORY: 
 CATEGORY
Case Management       0.000000
Consult               0.000000
ECG                 100.000000
Echo                100.000000
General               5.222930
Nursing              14.566888
Nursing/other         0.000000
Pharmacy            100.000000
Physician             5.911937
Radiology             0.000000
Respiratory           5.000000
dtype: float64


In [58]:
df_noteevents[df_noteevents['HADM_ID']==167853][['CHARTTIME','CATEGORY', 'DESCRIPTION','TEXT', 'ADMITTIME']].values

array([[NaT, 'Discharge summary', 'Report',
        'Admission Date:  [**2151-7-16**]       Discharge Date:  [**2151-8-4**]\n\n\nService:\nADDENDUM:\n\nRADIOLOGIC STUDIES:  Radiologic studies also included a chest\nCT, which confirmed cavitary lesions in the left lung apex\nconsistent with infectious process/tuberculosis.  This also\nmoderate-sized left pleural effusion.\n\nHEAD CT:  Head CT showed no intracranial hemorrhage or mass\neffect, but old infarction consistent with past medical\nhistory.\n\nABDOMINAL CT:  Abdominal CT showed lesions of\nT10 and sacrum most likely secondary to osteoporosis. These can\nbe followed by repeat imaging as an outpatient.\n\n\n\n                            [**First Name8 (NamePattern2) **] [**First Name4 (NamePattern1) 1775**] [**Last Name (NamePattern1) **], M.D.  [**MD Number(1) 1776**]\n\nDictated By:[**Hospital 1807**]\nMEDQUIST36\n\nD:  [**2151-8-5**]  12:11\nT:  [**2151-8-5**]  12:21\nJOB#:  [**Job Number 1808**]\n',
        Timestamp('2151-07

In [59]:
df_noteevents_filtered[df_noteevents_filtered['HADM_ID']==167853][['CHARTTIME','CATEGORY', 'DESCRIPTION','TEXT', 'ADMITTIME']].values

array([[NaT, 'Echo', 'Report',
        'PATIENT/TEST INFORMATION:\nIndication: Aortic valve disease. ?endocarditis\nHeight: (in) 67\nWeight (lb): 110\nBSA (m2): 1.57 m2\nBP (mm Hg): 106/72\nStatus: Inpatient\nDate/Time: [**2151-8-2**] at 17:09\nTest: Portable TTE(Complete)\nDoppler: Complete pulse and color flow\nContrast: None\nTechnical Quality: Adequate\n\n\nINTERPRETATION:\n\nFindings:\n\nLEFT ATRIUM: The left atrium is moderately dilated. The left atrium is\nelongated.\n\nRIGHT ATRIUM/INTERATRIAL SEPTUM: The right atrium is markedly dilated.\n\nLEFT VENTRICLE: Left ventricular wall thicknesses are normal. The left\nventricular cavity size is normal. There is mild global left ventricular\nhypokinesis. Overall left ventricular systolic function is moderately\ndepressed.\n\nRIGHT VENTRICLE: The right ventricular free wall is hypertrophied. The right\nventricular cavity is moderately dilated. There is moderate global right\nventricular free wall hypokinesis.\n\nAORTA: The aortic root 

The last note, a report starting with 'FOCUS; ADMISSION NOTE', is likely to be of significant use. 

We have to be aware of the medical terminologies used during the triage procedure, and clarify that this is standardised.

Medical Terminologies from this example: 
* "MS CHANGES" refers to changes in mental state. 

* "PMH" refers to past medical history
* "HX" refers to history
  * hypertention (HTN)
  * chronic atrial fibrillation (Chronic AFIB)
  * transient ischemic attack (TIA)

* "HPI" refers to History of Present Illness
* ED: Emergency Department
* MICU: Medical Intensive Care Unit

In [111]:

out = df_noteevents_filtered[df_noteevents_filtered['HADM_ID'] == sepsis_patient_03]['TEXT']
out.to_csv('sepsis_text.csv')


## Sepsis Note events
Manually inspecting a patient diagnosed with sepsis:

In [20]:
# get sepsis admissions
df_sepsis_patients = pd.read_csv(Path(ROOT_DIR / 'data' / 'sepsis_patients.csv'))
sepsis_admissions = df_sepsis_patients['HADM_ID'].values

In [21]:
# get notes for sepsis admisions 
df_noteevents_sepsis = df_noteevents_filtered[df_noteevents_filtered['HADM_ID'].isin(sepsis_admissions)]

# get 3 test admissions
sepsis_patient_01 = df_noteevents_sepsis.iloc[0]['HADM_ID']
sepsis_patient_02 = df_noteevents_sepsis.iloc[1]['HADM_ID']
sepsis_patient_03 = df_noteevents_sepsis.iloc[2]['HADM_ID']

In [114]:
# proportion of categories for sepsis patients
get_proportions_na_category(df_noteevents_sepsis)

Total count: 37406

Total Charttime NA count: 23705

Total Charttime NA proportion: 63.37

Category proportions: 
 CATEGORY
ECG                 49.35
Radiology           23.96
Echo                14.03
Nursing/other        4.77
Physician            4.02
Nursing              3.19
General              0.36
Respiratory          0.33
Case Management      0.01
Name: proportion, dtype: float64

Percentage NA: 
 CATEGORY
Case Management       0.0
ECG                 100.0
Echo                100.0
General               0.0
Nursing               0.0
Nursing/other         0.0
Physician             0.0
Radiology             0.0
Respiratory           0.0
dtype: float64


All the CHARTTIMES with NA timestamps are from ECG and Echo notes.

In [44]:
df_noteevents_filtered[df_noteevents_filtered['HADM_ID']==sepsis_patient_01][['CHARTTIME','CATEGORY', 'DESCRIPTION','TEXT', 'ADMITTIME']].values

array([[NaT, 'Echo', 'Report',
        'PATIENT/TEST INFORMATION:\nIndication: Cerebrovascular event/TIA. Streptococcal bacteremia. Evaluate for endocarditis.\nHeight: (in) 72\nWeight (lb): 187\nBSA (m2): 2.07 m2\nBP (mm Hg): 147/87\nHR (bpm): 90\nStatus: Inpatient\nDate/Time: [**2158-2-16**] at 13:48\nTest: Portable TEE (Complete)\nDoppler: Full Doppler and color Doppler\nContrast: None\nTechnical Quality: Adequate\n\n\nINTERPRETATION:\n\nFindings:\n\nLEFT ATRIUM: Dilated LA. Mild spontaneous echo contrast in the LAA. No\nthrombus in the LAA.\n\nRIGHT ATRIUM/INTERATRIAL SEPTUM: No mass or thrombus in the RA or RAA. No ASD\nby 2D or color Doppler.\n\nAORTA: No atheroma in descending aorta.\n\nAORTIC VALVE: Bicuspid aortic valve. Mildly thickened aortic valve leaflets.\nNo masses or vegetations on aortic valve. Trace AR.\n\nMITRAL VALVE: Mildly thickened mitral valve leaflets. No mass or vegetation on\nmitral valve. Mild (1+) MR.\n\nTRICUSPID VALVE: Normal tricuspid valve leaflets with 

In [45]:
df_noteevents_filtered[df_noteevents_filtered['HADM_ID']==sepsis_patient_02][['CHARTTIME','CATEGORY', 'DESCRIPTION','TEXT', 'ADMITTIME']].values

array([[NaT, 'Echo', 'Report',
        "PATIENT/TEST INFORMATION:\nIndication: Left ventricular function.\nHeight: (in) 63\nWeight (lb): 126\nBSA (m2): 1.59 m2\nBP (mm Hg): 88/53\nHR (bpm): 115\nStatus: Inpatient\nDate/Time: [**2119-6-7**] at 14:59\nTest: Portable TTE (Complete)\nDoppler: Full Doppler and color Doppler\nContrast: None\nTechnical Quality: Adequate\n\n\nINTERPRETATION:\n\nFindings:\n\nThis study was compared to the prior study of [**2118-1-26**].\n\n\nLEFT ATRIUM: Elongated LA.\n\nRIGHT ATRIUM/INTERATRIAL SEPTUM: Normal RA size.\n\nLEFT VENTRICLE: Normal LV wall thickness, cavity size and regional/global\nsystolic function (LVEF >55%). Estimated cardiac index is high (>4.0L/min/m2).\nTDI E/e' < 8, suggesting normal PCWP (<12mmHg). Doppler parameters are most\nconsistent with normal LV diastolic function. No resting LVOT gradient.\n\nRIGHT VENTRICLE: Normal RV chamber size and free wall motion.\n\nAORTIC VALVE: Mildly thickened aortic valve leaflets (3). No AS. No AR.\n\n

In [81]:
df_noteevents_filtered[df_noteevents_filtered['HADM_ID']==sepsis_patient_03][['CATEGORY','TEXT']].values

array([['Echo',
        'PATIENT/TEST INFORMATION:\nIndication: Acute MR worsening,ESRD, on HD, NSLS lung Ca, DM2,HTN,respiratory distress\nHeight: (in) 59\nWeight (lb): 136\nBSA (m2): 1.57 m2\nBP (mm Hg): 126/61\nHR (bpm): 95\nStatus: Inpatient\nDate/Time: [**2200-10-22**] at 15:14\nTest: Portable TTE (Complete)\nDoppler: Full Doppler and color Doppler\nContrast: None\nTechnical Quality: Adequate\n\n\nINTERPRETATION:\n\nFindings:\n\nThis study was compared to the prior study of [**2200-1-2**].\n\n\nLEFT ATRIUM: Normal LA size.\n\nRIGHT ATRIUM/INTERATRIAL SEPTUM: Normal RA size.\n\nLEFT VENTRICLE: Mild symmetric LVH with normal cavity size. Moderately\ndepressed LVEF. No resting LVOT gradient.\n\nRIGHT VENTRICLE: Normal RV chamber size and free wall motion.\n\nAORTA: Normal aortic diameter at the sinus level. Normal ascending aorta\ndiameter.\n\nAORTIC VALVE: Mildly thickened aortic valve leaflets (3). No AS. Trace AR.\n\nMITRAL VALVE: Mildly thickened mitral valve leaflets. No MVP. Ph

"MEDICAL CONDITION", "IMPRESSION:" and "FINDINGS:" have other relevant information.

There are medical terminologies that we will have to make sure are present:
* etoh: ethanol, referring to alcohol-related conditions
* sob: shortness of breath
* WET READ:  refers to a preliminary report or reading of a radiological study. It's a quick review meant to identify any urgent or significant findings.
* "HD-dependent CKD": Hemodialysis-dependent Chronic Kidney Disease
* "RLL dx by CXR": Right Lower Lobe (RLL) of the lungs, determined by "Chest X-Ray" (CXR)
* "sp ett check placement": status post surgery (SP or SPS), check endotracheal tube (ett)


Each note appears to follow a standardised structure. We can use these headings to reduce dimensionality:
* Reason:
* PATIENT/TEST INFORMATION:
  * Indication:
* INTERPRETATION:
* Findings:
* MEDICAL CONDITION:
* HISTORY:
* IMPRESSION:
* COMPARISON:

* FINAL REPORT:
* FINAL REPORT HISTORY:
* FINDINGS:

## Pivot Text

In [24]:
grouped = df_noteevents_filtered.groupby(['HADM_ID', 'CATEGORY'])
concatenated_notes = grouped['TEXT'].agg(lambda x: ' '.join(x)).reset_index()

# Pivot the dataframe to have categories as columns
df_pivot_text = concatenated_notes.pivot(index='HADM_ID', columns='CATEGORY', values='TEXT').reset_index()
df_pivot_text

CATEGORY,HADM_ID,Case Management,Consult,ECG,Echo,General,Nursing,Nursing/other,Physician,Radiology,Respiratory
0,100001.0,,,,,,,,,[**2117-9-11**] 11:12 AM\n CHEST (PA & LAT) ...,
1,100003.0,,,Sinus rhythm\nProlonged QT interval is nonspec...,PATIENT/TEST INFORMATION:\nIndication: Left ve...,,,,Chief Complaint: GIB/HOTN\n I saw and exami...,[**2150-4-17**] 3:32 PM\n LIVER OR GALLBLADDER...,
2,100006.0,,,Sinus tachycardia\nLeft axis deviation - anter...,PATIENT/TEST INFORMATION:\nIndication: Left ve...,,,PMICU Nursing Progress Notet 1630-1900\n\nPlea...,,[**2108-4-6**] 11:45 AM\n CHEST (PORTABLE AP) ...,
3,100007.0,,,Sinus rhythm\nAtrial premature complex\nConsid...,,,,,,,
4,100009.0,,,Sinus bradycardia. Left atrial abnormality. ...,PATIENT/TEST INFORMATION:\nIndication: Abnorma...,,,,,[**2162-5-16**] 7:23 PM\n CHEST (PA & LAT) ...,
...,...,...,...,...,...,...,...,...,...,...,...
55991,199993.0,,,Sinus rhythm\n- frequent premature ventricular...,,,,CSRU Admission Note\nS/O:Admit from [**Hospita...,,,
55992,199994.0,,,Sinus rhythm\nLeft atrial abnormality\nRight b...,,,,,,[**2188-7-7**] 5:21 PM\n CHEST (PORTABLE AP) ...,
55993,199995.0,,,Sinus rhythm\nLeft ventricular hypertrophy wit...,PATIENT/TEST INFORMATION:\nIndication: Endocar...,,,,,[**2137-12-11**] 10:07 PM\n CHEST (PA & LAT) ...,
55994,199998.0,,,Sinus rhythm\nConsider left atrial abnormality...,PATIENT/TEST INFORMATION:\nIndication: Abnorma...,,,,,[**2119-2-18**] 5:59 PM\n CT CHEST W/O CONTRAS...,


In [22]:
import sys
sys.path.append("..")

# import utility functions
import src.utils as util
reload(util)

import src.load_parquet as load_parquet
reload(load_parquet)

# setup OS agnostic pathnames
ROOT_DIR = Path('..')

# Note: to run main notebook from root directory, use:
# ROOT_DIR = Path('')

import src.DataLoader as DataLoader
reload(DataLoader)

dataLoader = DataLoader.DataLoader(ROOT_DIR)

In [23]:
df_desc_icd, df_desc_labitems, df_desc_items = dataLoader.load_descriptions()
df_diagnoses_icd = dataLoader.load_diagnoses_icd(df_desc_icd)
df_demographic = dataLoader.load_demographic(df_diagnoses_icd)
df_demographic = dataLoader.demographic_clean_AGE(df_demographic)
df_labevents = dataLoader.load_labevents(df_demographic)
df_labevents = dataLoader.labevents_compute_TIME(df_labevents, df_demographic)

# t4
dataLoader.create_train_data(df_labevents, df_demographic, df_desc_labitems, hours=4, feature_no=20)

path_t4_df = ROOT_DIR / 'data' / 'model_input' / "t4.csv"  
t4_df = pd.read_csv(path_t4_df)
t4_df

Removed 8210 admissions with AGE < 18


Unnamed: 0,SUBJECT_ID,HADM_ID,AGE,IS_SEPSIS,GENDER_NUM,ITEMID_51279,ITEMID_51222,ITEMID_51221,ITEMID_50931,ITEMID_51006,...,ITEMID_50912,ITEMID_51237,ITEMID_50893,ITEMID_50882,ITEMID_50813,ITEMID_50821,ITEMID_51265,ITEMID_50820,ITEMID_50818,ITEMID_51275
0,3,145834.0,76,0,0,2.79,8.2,25.6,281.0,43.0,...,2.5,1.7,7.4,11.0,8.0,329.0,253.0,7.26,28.0,46.4
1,4,185777.0,47,0,1,3.80,11.5,34.2,140.0,9.0,...,0.5,1.0,8.9,24.0,-999.0,-999.0,207.0,-999.00,-999.0,31.3
2,9,150750.0,41,0,0,5.46,15.4,45.2,129.0,16.0,...,1.2,1.1,-999.0,30.0,-999.0,-999.0,258.0,-999.00,-999.0,21.7
3,11,194540.0,50,0,1,4.31,12.5,36.9,121.0,12.0,...,0.7,1.1,-999.0,25.0,-999.0,-999.0,229.0,-999.00,-999.0,28.3
4,13,143045.0,39,0,1,4.08,12.3,35.6,169.0,13.0,...,0.6,1.2,8.9,23.0,-999.0,-999.0,216.0,-999.00,-999.0,44.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46059,8666,129517.0,43,0,0,-999.00,-999.0,-999.0,-999.0,-999.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,7.41,-999.0,-999.0
46060,11861,120052.0,21,0,1,-999.00,-999.0,-999.0,-999.0,-999.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,7.22,-999.0,-999.0
46061,13542,115692.0,84,0,1,-999.00,-999.0,-999.0,-999.0,-999.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,7.51,-999.0,-999.0
46062,25337,116249.0,81,0,1,-999.00,-999.0,-999.0,-999.0,-999.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,7.44,-999.0,-999.0


In [65]:
merged_df = t4_df.merge(df_pivot_text, on='HADM_ID', how='left')
merged_df.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,AGE,IS_SEPSIS,GENDER_NUM,ITEMID_51279,ITEMID_51222,ITEMID_51221,ITEMID_50931,ITEMID_51006,...,Case Management,Consult,ECG,Echo,General,Nursing,Nursing/other,Physician,Radiology,Respiratory
0,3,145834.0,76,0,0,2.79,8.2,25.6,281.0,43.0,...,,,Sinus rhythm\nP-R interval increased\nLate R w...,PATIENT/TEST INFORMATION:\nIndication: S/P Car...,,,,,[**2101-10-20**] 10:23 PM\n CHEST (PORTABLE AP...,
1,4,185777.0,47,0,1,3.80,11.5,34.2,140.0,9.0,...,,,Sinus tachycardia. Borderline low limb lead vo...,PATIENT/TEST INFORMATION:\nIndication: R/O End...,,,[**2191-3-16**] 0500\nGeneral: Pt in to EW fro...,,[**2191-3-15**] 7:55 PM\n LIVER OR GALLBLADDER...,
2,9,150750.0,41,0,0,5.46,15.4,45.2,129.0,16.0,...,,,Sinus rhythm\nPossible LVH with ST-T changes\n...,,,,NURSING NOTE 13:30-7PM\nADMITTED FROM ER S/P C...,,[**2149-11-9**] 11:50 AM\n CHEST (PORTABLE AP)...,
3,11,194540.0,50,0,1,4.31,12.5,36.9,121.0,12.0,...,,,Sinus bradycardia with sinus arrhythmia\nST-T ...,,,,,,[**2178-4-16**] 5:01 AM\n MR HEAD W & W/O CONT...,
4,13,143045.0,39,0,1,4.08,12.3,35.6,169.0,13.0,...,,,Sinus rhythm/ Non-diagnostic inferior and late...,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46059,8666,129517.0,43,0,0,-999.00,-999.0,-999.0,-999.0,-999.0,...,,,Sinus rhythm\nLead(s) unsuitable for analysis:...,,,,,,,
46060,11861,120052.0,21,0,1,-999.00,-999.0,-999.0,-999.0,-999.0,...,,,"Baseline sinus tachycardia. Otherwise, finding...",,,,admission note\nNeuro: Pt. alert and oriented ...,,[**2126-11-11**] 10:34 PM\n CHEST (PORTABLE AP...,
46061,13542,115692.0,84,0,1,-999.00,-999.0,-999.0,-999.0,-999.0,...,,,Sinus rhythm. Left atrial abnormality. Left bu...,PATIENT/TEST INFORMATION:\nIndication: Hypoten...,,,,,[**2106-2-24**] 12:40 PM\n PELVIS (AP ONLY) IN...,
46062,25337,116249.0,81,0,1,-999.00,-999.0,-999.0,-999.0,-999.0,...,,,Sinus rhythm.\nInferior infarct - age undeterm...,,,,,,[**2120-1-10**] 9:24 PM\n HUMERUS (AP & LAT) L...,


## Splitting notes by headers

In [94]:
# get notes for a single admission
individual_test_notes = df_noteevents_filtered[df_noteevents_filtered['HADM_ID']==df_noteevents_sepsis.iloc[2]['HADM_ID']][['TEXT']].values

In [97]:
def split_notes_by_header(note):
    # Define structure headings
    topics = [
        'Reason:',
        'REASON FOR THIS EXAMINATION:',
        'PATIENT/TEST INFORMATION:',
        'Admitting Diagnosis:',
        'Indication:',
        'INTERPRETATION:',
        'Findings:',
        'MEDICAL CONDITION:',
        'IMPRESSION:',
        'COMPARISON:',
        'FINAL REPORT',
        'HISTORY:',
        'Conclusions:',
        'FINDINGS:',
        'ADMISSION NOTE',
        'Chief Complaint:'
    ]

    # Create a regular expression pattern to split the note
    pattern = '|'.join(map(re.escape, topics))

    # Use the re.split function to split the note
    segments = re.split(pattern, note)
    segments = [seg.strip() for seg in segments if seg and seg.strip()]

    # Create a dictionary with topics as keys
    topic_dict = {}
    for topic in topics:
        if topic in note:
            index = note.index(topic) + len(topic)
            next_topic_index = [note.index(t) for t in topics if t in note[index:]]
            end_index = min(next_topic_index) if next_topic_index else None
            topic_dict[topic] = note[index:end_index].strip()

    return topic_dict

# Test function with individual
notes_topics = [split_notes_by_header(note[0]) for note in individual_test_notes]
notes_topics

[{'PATIENT/TEST INFORMATION:': '',
  'Indication:': 'Acute MR worsening,ESRD, on HD, NSLS lung Ca, DM2,HTN,respiratory distress\nHeight: (in) 59\nWeight (lb): 136\nBSA (m2): 1.57 m2\nBP (mm Hg): 126/61\nHR (bpm): 95\nStatus: Inpatient\nDate/Time: [**2200-10-22**] at 15:14\nTest: Portable TTE (Complete)\nDoppler: Full Doppler and color Doppler\nContrast: None\nTechnical Quality: Adequate',
  'INTERPRETATION:': '',
  'Findings:': 'This study was compared to the prior study of [**2200-1-2**].\n\n\nLEFT ATRIUM: Normal LA size.\n\nRIGHT ATRIUM/INTERATRIAL SEPTUM: Normal RA size.\n\nLEFT VENTRICLE: Mild symmetric LVH with normal cavity size. Moderately\ndepressed LVEF. No resting LVOT gradient.\n\nRIGHT VENTRICLE: Normal RV chamber size and free wall motion.\n\nAORTA: Normal aortic diameter at the sinus level. Normal ascending aorta\ndiameter.\n\nAORTIC VALVE: Mildly thickened aortic valve leaflets (3). No AS. Trace AR.\n\nMITRAL VALVE: Mildly thickened mitral valve leaflets. No MVP. Physiol

In [100]:
def get_medical_conditions_for_hadm_id(hadm_id, df):
    # Filter the dataframe for the given HADM_ID
    individual_test_notes = df[df['HADM_ID'] == hadm_id][['TEXT']].values

    # Process the notes using the split_notes_by_topic function
    notes_topics = [split_notes_by_header(note[0]) for note in individual_test_notes]

    # Extract 'MEDICAL CONDITION:' entries from the processed notes
    medical_conditions = [note.get('MEDICAL CONDITION:', None) for note in notes_topics]

    # Filter out any None or empty string values
    medical_conditions = [condition for condition in medical_conditions if condition]

    return medical_conditions

In [101]:
get_medical_conditions_for_hadm_id(sepsis_patient_03, df_noteevents_filtered)

['61 year old woman with HD-dependent CKD, RLL dx by CXR today',
 '63 year old woman with acute sob, dialysis patient',
 '63 year old woman with dyspnea, ams, eval for acute CT process']

In [56]:
history = [note.get('HISTORY:', None) for note in notes_topics]
history

[None,
 None,
 None,
 None,
 'Catheter placement.',
 'Acute shortness of breath in this dialysis patient.  Endotracheal\n tube placement.',
 'Dyspnea and altered mental status.']

# Preprocessing Note Events


Text cleaning: 
* Remove special characters
* Converting entire text to uppercase
* Handling medical abbreviations
    *  Consider expanding them, or ensuring that they're standardised
    *  Remove extra whitespace

In [58]:
def remove_patterns(text):
    patterns_to_remove = [
        r'CLIP #',
        r'BY DIFFERENT PHYSICIAN #',
        r'BY SAME PHYSICIAN #'
    ]
    for pattern in patterns_to_remove:
        text = re.sub(pattern, '', text, flags=re.I)
    return text

In [57]:
def clean_text(text):
    # text = re.sub(r'[^a-z0-9\s]', '', text.upper())
    text = text.upper() # convert to upper case
    text = re.sub(r'_{2,}', '', text) # remove sequence of underscores
    text = re.sub(r'\[\*\*.*?\*\*\]', '', text) # remove confidential text
    text = re.sub(r'\s+', ' ', text).strip() # replace multiple space characters
    text = remove_patterns(text)
    text = re.sub(r'\s+', ' ', text).strip() # replace multiple space characters
    # text = text.replace("\n", " ")  # Replace with space
    return text

In [59]:
# Consider expanding abbreviations
# Due to the large number of them, we likely won't utilise this

abbreviation_map = {
    "htn": "hypertension",
    "tia": "transient ischemic attack"
}

def expand_abbreviations(text):
    for abbr, expansion in abbreviation_map.items():
        text = text.replace(abbr, expansion)
    return text


In [60]:
def preprocess_text(text):
    text = clean_text(text)
    
    # text = expand_abbreviations(text)
    return text

In [58]:
individual_test_notes

array([['PATIENT/TEST INFORMATION:\nIndication: Acute MR worsening,ESRD, on HD, NSLS lung Ca, DM2,HTN,respiratory distress\nHeight: (in) 59\nWeight (lb): 136\nBSA (m2): 1.57 m2\nBP (mm Hg): 126/61\nHR (bpm): 95\nStatus: Inpatient\nDate/Time: [**2200-10-22**] at 15:14\nTest: Portable TTE (Complete)\nDoppler: Full Doppler and color Doppler\nContrast: None\nTechnical Quality: Adequate\n\n\nINTERPRETATION:\n\nFindings:\n\nThis study was compared to the prior study of [**2200-1-2**].\n\n\nLEFT ATRIUM: Normal LA size.\n\nRIGHT ATRIUM/INTERATRIAL SEPTUM: Normal RA size.\n\nLEFT VENTRICLE: Mild symmetric LVH with normal cavity size. Moderately\ndepressed LVEF. No resting LVOT gradient.\n\nRIGHT VENTRICLE: Normal RV chamber size and free wall motion.\n\nAORTA: Normal aortic diameter at the sinus level. Normal ascending aorta\ndiameter.\n\nAORTIC VALVE: Mildly thickened aortic valve leaflets (3). No AS. Trace AR.\n\nMITRAL VALVE: Mildly thickened mitral valve leaflets. No MVP. Physiologic MR\n(w

In [57]:
notes_preprocessed = [preprocess_text(note[0]) for note in individual_test_notes]
notes_preprocessed

['PATIENT/TEST INFORMATION: INDICATION: ACUTE MR WORSENING,ESRD, ON HD, NSLS LUNG CA, DM2,HTN,RESPIRATORY DISTRESS HEIGHT: (IN) 59 WEIGHT (LB): 136 BSA (M2): 1.57 M2 BP (MM HG): 126/61 HR (BPM): 95 STATUS: INPATIENT DATE/TIME: AT 15:14 TEST: PORTABLE TTE (COMPLETE) DOPPLER: FULL DOPPLER AND COLOR DOPPLER CONTRAST: NONE TECHNICAL QUALITY: ADEQUATE INTERPRETATION: FINDINGS: THIS STUDY WAS COMPARED TO THE PRIOR STUDY OF . LEFT ATRIUM: NORMAL LA SIZE. RIGHT ATRIUM/INTERATRIAL SEPTUM: NORMAL RA SIZE. LEFT VENTRICLE: MILD SYMMETRIC LVH WITH NORMAL CAVITY SIZE. MODERATELY DEPRESSED LVEF. NO RESTING LVOT GRADIENT. RIGHT VENTRICLE: NORMAL RV CHAMBER SIZE AND FREE WALL MOTION. AORTA: NORMAL AORTIC DIAMETER AT THE SINUS LEVEL. NORMAL ASCENDING AORTA DIAMETER. AORTIC VALVE: MILDLY THICKENED AORTIC VALVE LEAFLETS (3). NO AS. TRACE AR. MITRAL VALVE: MILDLY THICKENED MITRAL VALVE LEAFLETS. NO MVP. PHYSIOLOGIC MR (WITHIN NORMAL LIMITS). TRICUSPID VALVE: MILDLY THICKENED TRICUSPID VALVE LEAFLETS. INDET

In [56]:
merged_df['Nursing'].isna().sum()
merged_df['Nursing'].isna().sum().sum()
merged_df['Nursing']

0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
        ... 
46059    NaN
46060    NaN
46061    NaN
46062    NaN
46063    NaN
Name: Nursing, Length: 46064, dtype: object

In [68]:
# preprocess Nursing/other
merged_df['Nursing_other_processed'] = merged_df['Nursing/other'].apply(lambda x: preprocess_text(x) if isinstance(x, str) else x)


In [69]:
merged_df

Unnamed: 0,SUBJECT_ID,HADM_ID,AGE,IS_SEPSIS,GENDER_NUM,ITEMID_51279,ITEMID_51222,ITEMID_51221,ITEMID_50931,ITEMID_51006,...,Consult,ECG,Echo,General,Nursing,Nursing/other,Physician,Radiology,Respiratory,Nursing_other_processed
0,3,145834.0,76,0,0,2.79,8.2,25.6,281.0,43.0,...,,Sinus rhythm\nP-R interval increased\nLate R w...,PATIENT/TEST INFORMATION:\nIndication: S/P Car...,,,,,[**2101-10-20**] 10:23 PM\n CHEST (PORTABLE AP...,,
1,4,185777.0,47,0,1,3.80,11.5,34.2,140.0,9.0,...,,Sinus tachycardia. Borderline low limb lead vo...,PATIENT/TEST INFORMATION:\nIndication: R/O End...,,,[**2191-3-16**] 0500\nGeneral: Pt in to EW fro...,,[**2191-3-15**] 7:55 PM\n LIVER OR GALLBLADDER...,,0500 GENERAL: PT IN TO EW FROM HOME WITH C/O F...
2,9,150750.0,41,0,0,5.46,15.4,45.2,129.0,16.0,...,,Sinus rhythm\nPossible LVH with ST-T changes\n...,,,,NURSING NOTE 13:30-7PM\nADMITTED FROM ER S/P C...,,[**2149-11-9**] 11:50 AM\n CHEST (PORTABLE AP)...,,NURSING NOTE 13:30-7PM ADMITTED FROM ER S/P CV...
3,11,194540.0,50,0,1,4.31,12.5,36.9,121.0,12.0,...,,Sinus bradycardia with sinus arrhythmia\nST-T ...,,,,,,[**2178-4-16**] 5:01 AM\n MR HEAD W & W/O CONT...,,
4,13,143045.0,39,0,1,4.08,12.3,35.6,169.0,13.0,...,,Sinus rhythm/ Non-diagnostic inferior and late...,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46059,8666,129517.0,43,0,0,-999.00,-999.0,-999.0,-999.0,-999.0,...,,Sinus rhythm\nLead(s) unsuitable for analysis:...,,,,,,,,
46060,11861,120052.0,21,0,1,-999.00,-999.0,-999.0,-999.0,-999.0,...,,"Baseline sinus tachycardia. Otherwise, finding...",,,,admission note\nNeuro: Pt. alert and oriented ...,,[**2126-11-11**] 10:34 PM\n CHEST (PORTABLE AP...,,ADMISSION NOTE NEURO: PT. ALERT AND ORIENTED X...
46061,13542,115692.0,84,0,1,-999.00,-999.0,-999.0,-999.0,-999.0,...,,Sinus rhythm. Left atrial abnormality. Left bu...,PATIENT/TEST INFORMATION:\nIndication: Hypoten...,,,,,[**2106-2-24**] 12:40 PM\n PELVIS (AP ONLY) IN...,,
46062,25337,116249.0,81,0,1,-999.00,-999.0,-999.0,-999.0,-999.0,...,,Sinus rhythm.\nInferior infarct - age undeterm...,,,,,,[**2120-1-10**] 9:24 PM\n HUMERUS (AP & LAT) L...,,


# Tokenization

Tokenization and removal of stop words.

In [2]:
print(certifi.where())
nltk.__path__
nltk.download()

/Users/alexturner/CITS5553/Project/data-science-capstone-project/.venv/lib/python3.10/site-packages/certifi/cacert.pem


In [71]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [72]:
# Tokenize without punctuation
def nltk_tokenize(text):
    tokens = word_tokenize(text)
    return [token for token in tokens if token.isalpha()]  # Only keep words

# Tokenize with punctuation
def nltk_tokenize_punctuation(text):
    return word_tokenize(text)

stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    return filtered_tokens

def remove_stopwords_harsh(text):
        text = re.sub('\*\*[^]*\*\*\]', '', text)
        text = re.sub('<[^>]*>', '', text)
        text = re.sub('[\W]+', ' ', text.lower()) 
        text = re.sub(" \d+", " ", text)
        return " ".join([i for i in text.split() if i not in stopwords])

In [92]:
merged_df['Nursing_other_tokens'] = np.array(merged_df['Nursing_other_processed'].apply(lambda x: nltk_tokenize(x) if isinstance(x, str) else x))
merged_df['Nursing_other_tokens']

0                                                      NaN
1        [GENERAL, PT, IN, TO, EW, FROM, HOME, WITH, FE...
2        [NURSING, NOTE, ADMITTED, FROM, ER, CVA, AT, H...
3                                                      NaN
4                                                      NaN
                               ...                        
46059                                                  NaN
46060    [ADMISSION, NOTE, NEURO, PT, ALERT, AND, ORIEN...
46061                                                  NaN
46062                                                  NaN
46063                                                  NaN
Name: Nursing_other_tokens, Length: 46064, dtype: object

In [93]:
merged_df['Nursing_other_tokens_better'] = merged_df['Nursing_other_tokens'].apply(lambda x: remove_stopwords(x) if isinstance(x, list) else x)
merged_df['Nursing_other_tokens_better']

0                                                      NaN
1        [GENERAL, PT, EW, HOME, FEVER, CHILLS, SOB, CO...
2        [NURSING, NOTE, ADMITTED, ER, CVA, HOME, FOUND...
3                                                      NaN
4                                                      NaN
                               ...                        
46059                                                  NaN
46060    [ADMISSION, NOTE, NEURO, PT, ALERT, ORIENTED, ...
46061                                                  NaN
46062                                                  NaN
46063                                                  NaN
Name: Nursing_other_tokens_better, Length: 46064, dtype: object

In [None]:
#from gensim import corpora, models
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize

# 2. LDA Preparation
dictionary = corpora.Dictionary(merged_df['Nursing_other_tokens_better'])
corpus = [dictionary.doc2bow(text) for text in merged_df['Nursing_other_tokens_better']]


In [None]:

# 3. Apply LDA
num_topics = 10  # Choose based on domain knowledge and experimentation
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=num_topics)

# 4. Feature Creation
def topic_distribution(text):
    bow = dictionary.doc2bow(text)
    dist = lda.get_document_topics(bow)
    return [tup[1] for tup in dist]

topic_features = data['tokens'].apply(topic_distribution).tolist()
topic_df = pd.DataFrame(topic_features, columns=[f'topic_{i}' for i in range(num_topics)])

# Combine with original data
data_combined = pd.concat([data, topic_df], axis=1)

# 5. Model Training with LDA features and lab results
X = data_combined[['WBC_count', 'Creatinine'] + [f'topic_{i}' for i in range(num_topics)]]
y = data_combined['is_sepsis']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")


## Lemmatization: 
Convert words to their base form. For example, "running" becomes "run".



# Vectorization:
## TF IDF

Vectorization: Convert the text data into numerical form.  For LSTM, you can use word embeddings such as Word2Vec, GloVe, or embeddings from transformers like BERT.

In [None]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.95, min_df=2, max_features=5000)

In [None]:
# fit vectorizer to text data
tfidf_matrix = tfidf_vectorizer.fit_transform(texts)

In [None]:
# (a lazy function) specify fields from dataset
# Every operation returns back a new key. Takes a dictionary, gives a key back

# binaryEncode
# OHEv
# testtrainsplit
# dropnans
# drop columns - drop target
# runmodels (LR, adaboost, RF)
# print metrics


# topic modelling
vectorizer = TfidfVectorizer()
vectorizer.fit_transform()
num_topics = 15
lda_model = LatentDirichletAllocation(n_components=2, max_iter=10)


# use topics instead
# categorises or groups words together (with LDA)
# co-ocurrence of the words - heart and cardiac
# reduces the dimensionality

# will get accuracy for each model

# LDA with count vectorizer

In [107]:
merged_df['Nursing_other_tokens_better'][1]

['GENERAL',
 'PT',
 'EW',
 'HOME',
 'FEVER',
 'CHILLS',
 'SOB',
 'COUGH',
 'DOE',
 'FATIGUE',
 'PT',
 'ALSO',
 'EPISTAXIS',
 'REPORTS',
 'CHRONIC',
 'PROBLEM',
 'X',
 'YEARS',
 'EW',
 'PT',
 'NEEDED',
 'PUT',
 'NRM',
 'SATS',
 'DROPPED',
 'LOW',
 'PT',
 'HIV',
 'DX',
 'CIRRHOSIS',
 'LIVER',
 'BX',
 'IDDM',
 'SINCE',
 'HYPOTHYROIDISM',
 'EGD',
 'NEG',
 'VARICES',
 'CV',
 'BP',
 'HR',
 'SINUS',
 'RESP',
 'LUNGS',
 'DIM',
 'THROUGHOUT',
 'CRACKLES',
 'HEARD',
 'LLL',
 'PT',
 'FINDS',
 'DIFFICULT',
 'TAKE',
 'DEEP',
 'BREATH',
 'PT',
 'REMOVES',
 'NRB',
 'MASK',
 'DESATS',
 'LOW',
 'RESP',
 'MANAGER',
 'PLACED',
 'NASAL',
 'CANULA',
 'HUMIDIFIED',
 'MASK',
 'SATS',
 'REMAINED',
 'UPPER',
 'PT',
 'TOLERATING',
 'HIGH',
 'BEGIN',
 'WEAN',
 'POSSIBLE',
 'PT',
 'SPUTUM',
 'ORDERED',
 'COLLECTED',
 'WRITING',
 'PT',
 'FOLEY',
 'VOIDING',
 'AMBER',
 'COLORED',
 'URINE',
 'BED',
 'PAN',
 'URIN',
 'SPEC',
 'SENT',
 'LAB',
 'INCONTINANT',
 'SMALL',
 'AMOUNT',
 'STOOL',
 'UPON',
 'ARRIVAL',
 'FLOOR'

In [114]:
#documents = []
merged_df['Nursing_other_tokens_better'] = merged_df['Nursing_other_tokens_better'].fillna()
documents = merged_df['Nursing_other_tokens_better']


# Vectorize the documents - Convert Text Data to Document-Term Matrix:
# For LDA, it's typically better to use raw term counts rather than TF-IDF
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
data_vectorized = vectorizer.fit_transform(documents)

# Train LDA model
# n_components is the number of topics
num_topics = 2
lda = LatentDirichletAllocation(n_components=num_topics, max_iter=10)
lda.fit(data_vectorized)

# Print topics
def print_top_words(model, vectorizer, n_words=10):
    words = vectorizer.get_feature_names_out()
    for topic_idx, topic in enumerate(model.components_):
        top_words = [words[i] for i in topic.argsort()[:-n_words - 1:-1]]
        print(f"Topic #{topic_idx + 1}: {', '.join(top_words)}")

print_top_words(lda, vectorizer)

# Assign Topics to Documents
# transform original matrix to get the topic distribution for each document:
doc_topic_dist = lda.transform(data_vectorized)

# A matrix where:
# each row corresponds to a document in your original dataset 
# each column gives the probability of that document belonging to a particular topic
doc_topic_dist

ValueError: np.nan is an invalid document, expected byte or unicode string.

In [None]:
# Extract keywords from topics
def display_topics(model, feature_names, n_words):
    topics = {}
    for topic_idx, topic in enumerate(model.components_):
        keywords = [feature_names[i] for i in topic.argsort()[:-n_words - 1:-1]]
        topics[f"Topic {topic_idx}"] = keywords
    return topics

n_words = 5  # Number of top words to extract from each topic
feature_names = vectorizer.get_feature_names_out()
topics_keywords = display_topics(lda, feature_names, n_words)

In [None]:
# Filtering topics by keywords
relevant_topics = []
for topic, words in topics_keywords.items():
    if any(word.upper() in keywords for word in words):
        relevant_topics.append(topic)

In [74]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

from nltk.stem import WordNetLemmatizer


def preprocess_text_test(text):
    text = re.sub(r'\W', ' ', str(text))
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    return text

# Triage notes 
merged_df.Nursing_other_processed
triage_notes = ['Sample text 1 and this', 'Sample text 2']
lab_results = [[1.2, 3.4], [2.3, 4.5]] 

# Step 1: Preprocess the Text Data
triage_notes = [preprocess_text_test(note) for note in triage_notes]

# Step 2: Vectorize the Text Data
vectorizer = CountVectorizer(max_features=5000)
X_text = vectorizer.fit_transform(triage_notes)

# Step 3: Perform LDA for Topic Modeling
num_topics = 15
lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=0)
X_topics = lda_model.fit_transform(X_text)

# Step 4: Combine Topic Features with Numerical Data
X_numeric = np.array(lab_results)
X_combined = np.hstack((X_numeric, X_topics))

X_combined

array([[1.2       , 3.4       , 0.81333325, 0.01333333, 0.01333333,
        0.01333342, 0.01333333, 0.01333333, 0.01333333, 0.01333333,
        0.01333333, 0.01333333, 0.01333333, 0.01333333, 0.01333333,
        0.01333333, 0.01333333],
       [2.3       , 4.5       , 0.02222226, 0.02222222, 0.02222222,
        0.68888885, 0.02222222, 0.02222222, 0.02222222, 0.02222222,
        0.02222222, 0.02222222, 0.02222222, 0.02222222, 0.02222222,
        0.02222222, 0.02222222]])

## Keywords

In [97]:
keywords = [
    "SUPPLEMENTAL OXYGEN", "VENTILATOR", "ALTERED MENTAL STATUS", "ABSCESS", 
    "ACUTE", "ALTERED", "BACTEREMIA", "CELLULITIS", "CYSTITIS", "DIABETES", 
    "FAILURE", "LACTIC", "LEUKOCYTOSIS", "PNA", "PNEUMONIA", "PYELONEPHRITIS", 
    "RESPIRATORY", "SEPSIS", "SEPTIC", "UROSEPSIS", "UTI"
]

In [None]:
# Assuming df is your DataFrame with chart events notes stored in a column named 'TEXT'
features = pd.DataFrame(index=df.index, columns=keywords)
for keyword in keywords:
    features[keyword] = df['TEXT'].str.contains(keyword, case=False, regex=False).astype(int)


In [100]:
# Initialize TF-IDF Vectorizer with the keywords as vocabulary
vectorizer_keywords = TfidfVectorizer(vocabulary=keywords)

# list of medical notes
tfidf_matrix = vectorizer_keywords.fit_transform(sepsis_text)

# Convert this matrix to a DataFrame for better visibility 
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer_keywords.get_feature_names_out())
df_tfidf

Unnamed: 0,SUPPLEMENTAL OXYGEN,VENTILATOR,ALTERED MENTAL STATUS,ABSCESS,ACUTE,ALTERED,BACTEREMIA,CELLULITIS,CYSTITIS,DIABETES,...,LACTIC,LEUKOCYTOSIS,PNA,PNEUMONIA,PYELONEPHRITIS,RESPIRATORY,SEPSIS,SEPTIC,UROSEPSIS,UTI
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [95]:
sepsis_text = pd.DataFrame(df_noteevents_sepsis['TEXT']).reset_index(drop=True)
sepsis_text

Unnamed: 0,TEXT
0,PATIENT/TEST INFORMATION:\nIndication: Cerebro...
1,PATIENT/TEST INFORMATION:\nIndication: Left ve...
2,PATIENT/TEST INFORMATION:\nIndication: Acute M...
3,PATIENT/TEST INFORMATION:\nIndication: Endocar...
4,PATIENT/TEST INFORMATION:\nIndication: Endocar...
...,...
37412,Umbilical Line Placement\nUmbilical line place...
37413,Neonatology Attending\n\n740 gram 25 [**1-29**...
37414,Neonatology Attending\n\nCorrection: maternal ...
37415,Respiratory Care Note\nPt arrived to the NICU ...


In [None]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
df = pd.DataFrame({'diagnoises_text': ['patient has', 'second note']})
df.columns = ['diag']

class CustomLLM(CustomLLM):
  @property
  
  response = requests.post(URI, json=request)
  return response.json()['results'][0]['text']

text_splitter = 
documents - text_splitter.
# if you load sentences, it gives you a list of numbers
# embed whole csv into a vector
# how to encode this



In [None]:
# Sentence transformers - will group sentences together
# Sentence BERT
from sentence_transformers import SentenceTransformer

model_name = 'BAII/bge-large-ev-v1.5'

model = SentenceTransformer(model_name)

embeddings = model.encode(sentences)

ix=0
iy=1
# cosine similarity
np.dot(embeddings[ix], embeddings[iy])/(np.linalg.norm(embeddings[ix]) * np.linalg.norm(embeddings[iy]))

# distance
#higher distance, the worse it is
np.linalg.norm(embeddings[ix]-embeddings[iy])

In [None]:
import transformers
from transformers import pipeline

classifier()

classifier("question", ['response', 'response2', 'sharks'])

# zero shot if it's not trained on anything

# embeddings are many dimensions
embeddings.shape

# projector.tensorflow.org

Doing the same thing for full sentences. Sepsis group would be be concentrated to the edges. 
