# **Safe and Personalized Medication Recommendation Project (MARS)**
## 1. MIMIC-III Data Preprocessing

#### This notebook is the first part of cleaning and organizing data from various MIMIC-III database tables for AI modeling.
### Tables Processed in This Notebook:

* **PRESCRIPTIONS**: Contains medication orders and prescriptions.
* **PATIENTS**: Identifies each unique patient (SUBJECT_ID).
* **ADMISSIONS**: Details each hospital admission (HADM_ID) for a patient.
* **LABEVENTS**: Includes all laboratory measurements, covering both inpatient and outpatient data.
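* **NOTEEVENTS**: Contains clinical notes for patients.
* **DIAGNOSES_ICD**: Contains the ICD-9 diagnosis codes assigned to each hospital admission (HADM_ID).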

For more information on these tables, refer to the MIMIC-III documentation: [https://mimic.mit.edu/docs/iii/](https://mimic.mit.edu/docs/iii/)

## A. Data Loading and Cleaning

In [None]:
import pandas as pd

# Read the raw tables from the attached Kaggle datasets.
# The generator is consumed left to right, so the files are read in the listed
# order and each frame is bound to the name in the matching position.
patients, admissions, labevents, prescriptions, diagnoses_icd, noteevents = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/mimic3/PATIENTS.csv',
        '/kaggle/input/mimic3/ADMISSIONS.csv',
        '/kaggle/input/lab-events-mimic-iii/LABEVENTS.csv',
        '/kaggle/input/mimic-iii/PRESCRIPTIONS.csv',
        '/kaggle/input/diagnosis-icd-mimic-iii/DIAGNOSES_ICD.csv',
        '/kaggle/input/noteevents-mimic-iii/NOTEEVENTS.csv',
    )
)

In [None]:
# Data that was already used in an earlier run; its SUBJECT_ID column is read in
# the subsampling section to exclude those patients from the new sample.
seendf = pd.read_csv('/kaggle/input/seen-fine-tuning-data/Rich Fine Tuning Data.csv')
# Bare expression: the frame is shown as the cell output.
seendf

## B. Subsampling Data to deal with fewer records due to the database size

In [None]:
import numpy as np

# Sampling configuration.
SAMPLE_FRACTION = 0.013  # share of the eligible patients to keep
RANDOM_SEED = 42         # fixed seed: the same inputs always give the same sample;
                         # change it to draw a different subset

# Step 1: Get unique patient IDs from prescriptions
unique_patients = prescriptions['SUBJECT_ID'].unique()

# Step 2: Get IDs that were used before and should be excluded
seen_patients = seendf['SUBJECT_ID'].unique()

# Step 3: Exclude previously used IDs from the unique patients
available_patients = np.setdiff1d(unique_patients, seen_patients)

# Step 4: Randomly select a subset of patient IDs from the remaining patients
subset_size = int(SAMPLE_FRACTION * len(available_patients))
rng = np.random.default_rng(RANDOM_SEED)
selected_patients = rng.choice(available_patients, size=subset_size, replace=False)

# Step 5: Keep every row that belongs to a selected patient.
# `.copy()` detaches each subset from its source table so that later column
# assignments (e.g. the datetime conversions) act on an independent frame and
# do not raise SettingWithCopyWarning.
presc_selected = prescriptions[prescriptions['SUBJECT_ID'].isin(selected_patients)].copy()
adm_selected = admissions[admissions['SUBJECT_ID'].isin(selected_patients)].copy()
patient_selected = patients[patients['SUBJECT_ID'].isin(selected_patients)].copy()
noteevents_selected = noteevents[noteevents['SUBJECT_ID'].isin(selected_patients)].copy()
labevents_selected = labevents[labevents['SUBJECT_ID'].isin(selected_patients)].copy()
diagnoses_icd_selected = diagnoses_icd[diagnoses_icd['SUBJECT_ID'].isin(selected_patients)].copy()


In [None]:
import numpy as np
# Alternative sampling cell. It rebinds `selected_patients` and every
# `*_selected` frame, so whatever this cell draws is what the rest of the
# notebook uses.
SAMPLE_FRACTION = 0.013  # share of the eligible patients to keep
RANDOM_SEED = 42         # fixed seed: the same inputs always give the same sample

# Step 1: Get unique patient IDs
unique_patients = prescriptions['SUBJECT_ID'].unique()

# Step 1b: When the previously used data (`seendf`) has been loaded, keep
# excluding its patients. Without this, running the notebook top to bottom
# would re-admit already-seen patients into the sample.
if 'seendf' in globals():
    unique_patients = np.setdiff1d(unique_patients, seendf['SUBJECT_ID'].unique())

# Step 2: Randomly select a subset of patient IDs
subset_size = int(SAMPLE_FRACTION * len(unique_patients))
rng = np.random.default_rng(RANDOM_SEED)
selected_patients = rng.choice(unique_patients, size=subset_size, replace=False)

# Step 3: Keep every row that belongs to a selected patient.
# `.copy()` detaches each subset from its source table so later column
# assignments do not raise SettingWithCopyWarning.
presc_selected = prescriptions[prescriptions['SUBJECT_ID'].isin(selected_patients)].copy()

adm_selected = admissions[admissions['SUBJECT_ID'].isin(selected_patients)].copy()

patient_selected = patients[patients['SUBJECT_ID'].isin(selected_patients)].copy()

noteevents_selected = noteevents[noteevents['SUBJECT_ID'].isin(selected_patients)].copy()

labevents_selected = labevents[labevents['SUBJECT_ID'].isin(selected_patients)].copy()

diagnoses_icd_selected = diagnoses_icd[diagnoses_icd['SUBJECT_ID'].isin(selected_patients)].copy()

## C. Data Transformation

In [None]:
# Convert date columns to datetime format.
# `assign` builds a new frame instead of writing into a filtered slice of the
# raw table, which avoids SettingWithCopyWarning and makes the cell safe to
# re-run (converting an already-converted column is a no-op).
patient_selected = patient_selected.assign(DOB=pd.to_datetime(patient_selected['DOB']))
adm_selected = adm_selected.assign(ADMITTIME=pd.to_datetime(adm_selected['ADMITTIME']))
labevents_selected = labevents_selected.assign(CHARTTIME=pd.to_datetime(labevents_selected['CHARTTIME']))

In [None]:
# Calculate age at the time of each admission.
# Columns added by a previous run of this cell are dropped first; otherwise a
# re-run would merge DOB/GENDER a second time (producing DOB_x / DOB_y) and the
# AGE line below would fail with a KeyError.
demographic_cols = ['DOB', 'GENDER']
adm_selected = (
    adm_selected
    .drop(columns=demographic_cols + ['AGE'], errors='ignore')
    .merge(patient_selected[['SUBJECT_ID'] + demographic_cols], on='SUBJECT_ID')
)

# AGE is the difference of calendar years only (month and day are ignored).
# NOTE(review): the MIMIC-III documentation describes shifted dates of birth
# for patients older than 89, which would show up here as ages of roughly 300
# years - confirm whether AGE should be capped before modelling.
adm_selected['AGE'] = adm_selected['ADMITTIME'].dt.year - adm_selected['DOB'].dt.year

## D. Grouping Data

In [None]:
# Attach the ICD-9 diagnosis codes to the admissions. The left join keeps
# admissions that have no matching diagnosis row (ICD9_CODE is then NaN).
admission_keys = ['SUBJECT_ID', 'HADM_ID']
diagnosis_codes = diagnoses_icd_selected[admission_keys + ['ICD9_CODE']]
combined_data = pd.merge(adm_selected, diagnosis_codes, on=admission_keys, how='left')
combined_data

In [None]:
# Print column dtypes and non-null counts of the admissions + diagnoses frame.
combined_data.info()

In [None]:
# Add admission, demographic and diagnosis context to every prescription.
# `combined_data` holds one row per (admission, ICD9 code) pair, so a
# prescription from an admission with several codes is repeated once per code;
# exact duplicate rows are removed afterwards.
admission_context = combined_data[[
    'SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'ADMISSION_TYPE', 'ADMISSION_LOCATION',
    'DISCHARGE_LOCATION', 'DIAGNOSIS', 'GENDER', 'AGE', 'ICD9_CODE',
]]
presc_selected = (
    presc_selected
    .merge(admission_context, on=['SUBJECT_ID', 'HADM_ID'], how='left')
    .drop_duplicates()
)
presc_selected

In [None]:
# Print column dtypes and non-null counts of the enriched prescriptions frame.
presc_selected.info()

### Merging lab events with the combined dataset

In [None]:
# Show the lab events of the selected patients before they are matched to prescriptions.
labevents_selected

### Extracting lab results for the selected patients (method 1)


#### This method takes less time, but it attaches only one lab result to each prescription row (the one closest in time to the prescription start date), so it does not return all lab results for a patient visit.

In [None]:
import pandas as pd
from tqdm import tqdm

# Method 1: for every prescription row, attach the single lab event of the same
# patient and admission whose CHARTTIME is closest to the prescription STARTDATE.

# Prescription columns copied unchanged into every result row
# (the list order is the output column order).
PRESC_FIELDS = [
    'SUBJECT_ID', 'HADM_ID', 'STARTDATE', 'ENDDATE',
    'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'DISCHARGE_LOCATION',
    'DIAGNOSIS', 'ICD9_CODE', 'GENDER', 'AGE',
    'DRUG', 'DRUG_TYPE', 'PROD_STRENGTH', 'DOSE_VAL_RX', 'DOSE_UNIT_RX',
    'FORM_VAL_DISP', 'FORM_UNIT_DISP', 'ROUTE',
]
# Output column -> column of the matched lab event it is read from.
LAB_FIELDS = {
    'LAB_CHARTTIME': 'CHARTTIME',
    'LAB_ITEMID': 'ITEMID',
    'LAB_VALUE': 'VALUE',
    'LAB_VALUEUOM': 'VALUEUOM',
    'LAB_FLAG': 'FLAG',
}

# Reduce the DataFrame size by selecting only necessary columns
adm_reduced = adm_selected[['SUBJECT_ID', 'ADMITTIME', 'HADM_ID']]
# Copy BEFORE assigning into the frame, so the conversion below does not write
# into a slice of `labevents_selected`.
labevents_reduced = labevents_selected[['SUBJECT_ID','HADM_ID','CHARTTIME', 'ITEMID','VALUE', 'VALUEUOM', 'FLAG']].copy()
labevents_reduced['CHARTTIME'] = pd.to_datetime(labevents_reduced['CHARTTIME'])
presc_selected['STARTDATE'] = pd.to_datetime(presc_selected['STARTDATE'])

# Split the lab events per (patient, admission) once instead of re-filtering the
# whole table for every prescription row. Rows keep their original order inside
# each group. Lab rows without a HADM_ID are dropped by groupby; they could
# never match a prescription's admission anyway.
labs_by_admission = {
    admission_key: admission_labs
    for admission_key, admission_labs in labevents_reduced.groupby(['SUBJECT_ID', 'HADM_ID'], sort=False)
}

results = []

# Use tqdm for the progress bar
for _, presc_row in tqdm(presc_selected.iterrows(), total=presc_selected.shape[0]):
    closest_labevent = None  # stays None when the admission has no usable lab event

    admission_labs = labs_by_admission.get((presc_row['SUBJECT_ID'], presc_row['HADM_ID']))
    if admission_labs is not None:
        time_diff = (admission_labs['CHARTTIME'] - presc_row['STARTDATE']).abs()
        # Every difference is NaT when STARTDATE (or every CHARTTIME) is missing.
        if time_diff.notna().any():
            closest_labevent = admission_labs.loc[time_diff.idxmin()]

    record = {field: presc_row[field] for field in PRESC_FIELDS}
    for out_col, lab_col in LAB_FIELDS.items():
        record[out_col] = None if closest_labevent is None else closest_labevent[lab_col]
    results.append(record)

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Display some of the results
results_df


### Extracting lab results for the selected patients (method 2)

#### This method takes more time, but it covers more of the lab results for a single patient visit, because each lab result is attached to at most one prescription row.

In [None]:
import pandas as pd
from tqdm import tqdm

# Method 2: for every prescription row, attach the closest lab event of the same
# patient and admission that has NOT already been attached to an earlier row.

# Prescription columns copied unchanged into every result row
# (the list order is the output column order).
PRESC_FIELDS = [
    'SUBJECT_ID', 'HADM_ID', 'STARTDATE', 'ENDDATE',
    'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'DISCHARGE_LOCATION',
    'DIAGNOSIS', 'ICD9_CODE', 'GENDER', 'AGE',
    'DRUG', 'DRUG_TYPE', 'PROD_STRENGTH', 'DOSE_VAL_RX', 'DOSE_UNIT_RX',
    'FORM_VAL_DISP', 'FORM_UNIT_DISP', 'ROUTE',
]
# Output column -> column of the matched lab event it is read from.
LAB_FIELDS = {
    'LAB_CHARTTIME': 'CHARTTIME',
    'LAB_ITEMID': 'ITEMID',
    'LAB_VALUE': 'VALUE',
    'LAB_VALUEUOM': 'VALUEUOM',
    'LAB_FLAG': 'FLAG',
}

# Ensure necessary columns are in reduced DataFrames for processing
adm_reduced = adm_selected[['SUBJECT_ID', 'ADMITTIME', 'HADM_ID']]
# `.copy()` so the datetime conversion below does not write into a slice of
# `labevents_selected`.
labevents_reduced = labevents_selected[['SUBJECT_ID', 'HADM_ID', 'CHARTTIME', 'ITEMID', 'VALUE', 'VALUEUOM', 'FLAG']].copy()

# Optional: uncomment to match abnormal lab results only (currently ALL lab events are used)
#labevents_reduced = labevents_reduced[labevents_reduced['FLAG'] == 'abnormal']

# Convert date columns to datetime format
labevents_reduced['CHARTTIME'] = pd.to_datetime(labevents_reduced['CHARTTIME'])
presc_selected['STARTDATE'] = pd.to_datetime(presc_selected['STARTDATE'])

# Split the lab events per (patient, admission) once instead of re-filtering the
# whole table for every prescription row. Rows keep their original order inside
# each group. Lab rows without a HADM_ID are dropped by groupby; they could
# never match a prescription's admission anyway.
labs_by_admission = {
    admission_key: admission_labs
    for admission_key, admission_labs in labevents_reduced.groupby(['SUBJECT_ID', 'HADM_ID'], sort=False)
}

# Create an empty list to store results
results = []

# Track used lab events to avoid reusing
used_lab_ids = set()

# Loop over each prescription row to find a matching unique lab test
for _, presc_row in tqdm(presc_selected.iterrows(), total=presc_selected.shape[0]):
    subid = presc_row['SUBJECT_ID']
    hadm_id = presc_row['HADM_ID']
    startdate = presc_row['STARTDATE']

    matched_lab = None  # stays None when no unused lab event exists for this admission

    admission_labs = labs_by_admission.get((subid, hadm_id))
    if admission_labs is not None:
        # Order this admission's lab events by distance in time from the prescription start
        candidates = admission_labs.assign(
            TIME_DIFF=(admission_labs['CHARTTIME'] - startdate).abs()
        ).sort_values(by='TIME_DIFF')

        for _, lab_row in candidates.iterrows():
            # The identifier includes the patient and admission so that an
            # identical (ITEMID, CHARTTIME, VALUE, VALUEUOM) combination from a
            # different admission cannot mark this lab event as already used.
            lab_id = (subid, hadm_id, lab_row['ITEMID'], lab_row['CHARTTIME'], lab_row['VALUE'], lab_row['VALUEUOM'])
            if lab_id not in used_lab_ids:
                used_lab_ids.add(lab_id)
                matched_lab = lab_row
                break  # Stop once a unique lab test is assigned to this prescription

    # Prescription fields first, then the lab fields (None when nothing matched)
    record = {field: presc_row[field] for field in PRESC_FIELDS}
    for out_col, lab_col in LAB_FIELDS.items():
        record[out_col] = None if matched_lab is None else matched_lab[lab_col]
    results.append(record)

# Convert the list of dictionaries into a DataFrame
results_df = pd.DataFrame(results)

# Display some of the results
results_df


In [None]:
# Save the prescriptions + lab-test dataset to the working directory.
# `index=False` is not passed, so the DataFrame index is written as an extra,
# unnamed first column.
results_df.to_csv(r"Prescriptions with Labtest 14-11.csv")

In [None]:
# Number of distinct patients present in the combined dataset.
results_df['SUBJECT_ID'].nunique()

In [None]:
# Collect the distinct patient IDs of the combined dataset
unique_patients = results_df['SUBJECT_ID'].unique()

# Write them to a text file, one ID per line
with open('unique patients 13-9.txt', 'w') as f:
    f.writelines(f"{patient_id}\n" for patient_id in unique_patients)


In [None]:
# Save the reduced admissions and patients tables of the selected patients.
# (Disabled export of an earlier frame, kept for reference:)
#df.to_csv(r"Reduced Mimic noisy 13-9.csv")
adm_selected.to_csv(r"Reduced admission 13-9.csv")
patient_selected.to_csv(r"Reduced Patients 13-9.csv")

## New session for grouping the last dataset with note events table
#### Please restart the kernel and upload the last combined dataset (the data with prescriptions and lab events).

In [None]:
import pandas as pd
import numpy as np

# Combined prescriptions + lab events dataset produced in the previous session
results_df = pd.read_csv('/kaggle/input/mimic-pre-final-27-11/MIMIC PRE FINAL 27-11.csv')

# Patients present in that dataset
unique_patients = results_df['SUBJECT_ID'].unique()

# Load the clinical notes and keep only the notes of those patients
noteevents = pd.read_csv('/kaggle/input/noteevents-mimic-iii/NOTEEVENTS.csv')
belongs_to_sample = noteevents['SUBJECT_ID'].isin(unique_patients)
noteevents_selected = noteevents[belongs_to_sample]

In [None]:
# Uncomment to drop the unnamed index column when the loaded CSV contains one.
#results_df = results_df.drop(['Unnamed: 0'], axis=1)
# Show the loaded dataset.
results_df

***Run the following cell only if you have reduced the dataset size previously, otherwise it will crash your memory***

In [None]:
# Attach the note text to every row of the same patient and admission.
# This can be computationally intensive: a left join repeats a dataset row once
# for each note of its admission, so pre-filter noteevents where possible.
note_columns = ['SUBJECT_ID', 'HADM_ID', 'TEXT']
noteevents_filtered = noteevents_selected[note_columns]
results_df = pd.merge(results_df, noteevents_filtered, how='left', on=['SUBJECT_ID', 'HADM_ID'])
results_df

In [None]:
# Save the dataset with the note text attached.
# NOTE(review): the file name ends in ".csv.csv" - looks like a typo; confirm
# that nothing downstream reads this exact name before renaming it.
results_df.to_csv("MIMIC without diag.csv.csv")

## Old Symptoms Extraction from NoteEvents

#### This method extracts symptoms from patients' clinical notes using spaCy token-matching patterns for common trigger phrases (e.g. "presents with", "complaints of").

In [None]:
import pandas as pd
import spacy
from spacy.matcher import Matcher
from tqdm import tqdm

# Load the small English spaCy model; `nlp` is used by extract_symptoms below
nlp = spacy.load('en_core_web_sm')

# Initialize the token Matcher with the model's shared vocabulary
matcher = Matcher(nlp.vocab)

# Trigger phrases that commonly introduce symptoms in clinical notes.
# Each inner list is one pattern; each dict describes one token. "LOWER"
# compares the lower-cased token text, so matching is case-insensitive.
patterns = [
    [{"LOWER": "complaints"}, {"LOWER": "of"}],  # to match "complaints of..."
    [{"LOWER": "complaint"}, {"IS_SPACE": True, "OP": "*"}, {"LOWER": "is"}],  # "Chief complaint is..." (zero or more whitespace tokens allowed in between)
    [{"LOWER": "presenting"}, {"LOWER": "symptoms"}],  # "presenting symptoms are..."
    [{"LOWER": "presents"}, {"LOWER": "with"}],  # "patient presents with..."
    [{"LOWER": "symptoms"}, {"LOWER": "include"}],  # "symptoms include..."
    [{"LOWER": "symptom"}, {"LOWER": "of"}]  # For any singular references to symptoms
]

# Register all patterns under a single rule name
matcher.add("SYMPTOM_DETECTION", patterns)

# Function to apply the matcher to the text and extract a reasonable amount of following content
def extract_symptoms(text):
    """Return the text snippets that follow symptom trigger phrases in a note.

    Parameters
    ----------
    text : str
        Raw note text. Any non-string value (e.g. NaN for a missing note)
        yields an empty list instead of raising inside spaCy.

    Returns
    -------
    list of str
        One snippet per match of the module-level `matcher`: the matched
        trigger phrase plus up to 10 following tokens, cut at the end of the
        document.
    """
    if not isinstance(text, str):
        return []

    # The registered patterns only use lexical token attributes (LOWER,
    # IS_SPACE), so tokenizing is enough; the tagger/parser/NER steps of the
    # full pipeline are not needed. Switch back to `nlp(text)` if a pattern
    # ever relies on POS tags, lemmas or entities.
    doc = nlp.make_doc(text)

    symptoms = []
    for _match_id, start, end in matcher(doc):
        extended_end = min(end + 10, len(doc))
        symptoms.append(doc[start:extended_end].text)
    return symptoms

# Use tqdm to show a progress bar when applying the function across the DataFrame
tqdm.pandas(desc="Extracting symptoms")

# `assign` returns a new frame with the extra column. `noteevents_selected` is a
# filtered slice of the full notes table, so writing a column into it directly
# would raise SettingWithCopyWarning.
noteevents_selected = noteevents_selected.assign(
    extracted_symptoms=noteevents_selected['TEXT'].progress_apply(extract_symptoms)
)

# Save the updated DataFrame and preview the first rows
noteevents_selected.to_csv('updated_noteevents.csv', index=False)
noteevents_selected[['TEXT', 'extracted_symptoms']].head()

## New Symptoms Extraction from NoteEvents

#### This method extracts symptoms from patient's clinical notes based on a set of symptoms given to be extracted if they are found

In [None]:
import spacy
from spacy.matcher import Matcher
from tqdm import tqdm

# Load spaCy model
nlp = spacy.load('en_core_web_sm')

# Step 2: Initialize Matcher for pattern-based symptom extraction
matcher = Matcher(nlp.vocab)
patterns = [
    [{"LOWER": "complaints"}, {"LOWER": "of"}, {"OP": "*"}],  # Capture "complaints of [symptoms]"
    [{"LOWER": "complaint"}, {"IS_SPACE": True, "OP": "*"}, {"LOWER": "is"}, {"OP": "*"}],  # "Chief complaint is..."
    [{"LOWER": "presenting"}, {"LOWER": "symptoms"}, {"OP": "*"}],  # "presenting symptoms are..."
    [{"LOWER": "presents"}, {"LOWER": "with"}, {"OP": "*"}],  # "presents with [symptoms]"
    [{"LOWER": "symptoms"}, {"LOWER": "include"}, {"OP": "*"}],  # "symptoms include [symptoms]"
    [{"LOWER": "symptom"}, {"LOWER": "of"}, {"OP": "*"}]  # For any singular references to symptoms
]
matcher.add("SYMPTOM_DETECTION", patterns)

# Step 3: Define list of symptoms for keyword matching
all_symptoms = ['heart murmur', 'hardening', 'neck pain', 'spasm','ulcers on tongue',
                'orthopnea', 'stomatitis', 'abscess', 'pus filled pimples', 'deaf',
                'walking difficulty', 'ileitis','acute gastritis', 'hematoma',
                'mal de mer', 'skin rash','inclusion body myositis', "Kernig's sign",
                'radiation sickness','wasting', 'scratching', 'laryngopharyngitis',
                'lumbago', 'squint','parotitis', 'smell disturbance', 'balanitis',
                'exanthema',
       'oedema', 'mucoid sputum', 'drying and tingling lips', 'uraturia',
       'growing pains', 'breathing difficulty', 'hyperlipoidemia',
       'familial hypercholesterolemia', 'haemothorax', 'hematocele',
       'acute glossitis', 'fasciculation', 'acute liver failure',
       'trismus', 'stomach rumbles', 'minimal brain dysfunction',
       'kraurosis', 'esophagitis', 'griping', 'ophthalmia neonatorum',
       'swelling of stomach', 'epidemic encephalitis', 'sweating',
       'stomach upset', 'spotting  urination', 'birth pangs', 'wrick',
       'polyuria', 'chafe', "Paget's disease", 'pulmonary congestion',
       'primary dysmenorrhea', 'oophoritis', 'hydrarthrosis',
       'Marie-Strumpell disease', 'hearing loss', 'toxic look (typhos)',
       'receiving blood transfusion', 'oesophagitis', 'lymphangitis',
       'watering from eyes', 'skin peeling', 'carditis', 'dacryocystitis',
       'keloid', 'phantom limb syndrome', 'regional ileitis',
       'stomachache', 'bagascosis', 'hypersplenism',
       'malabsorption syndrome', 'photophobia', 'sweaty', 'ADD',
       'movement impairment', 'haemoptysis', 'formication', 'agony',
       'lead colic', 'myositis trichinosa', 'myodynia', 'migraine',
       'memory issue', 'haematocyturia', 'encephalitis', 'neuralgy',
       'hip joint pain', 'heartburn', 'ophthalmitis', 'hyperlipaemia',
       'coma', 'lipemia', 'swelled lymph nodes', 'dischromic  patches',
       'pain', 'cervical root syndrome', 'nauseated', 'megrim',
       'glossodynia exfoliativa', 'afterpains', 'hypocalcemia',
       'prominent veins on calf', 'effect', 'abnormal menstruation',
       "Reiter's disease", 'cough', 'rusty sputum',
       'inflammatory bowel disease', 'ammoniuria', 'pleuralgia',
       'yellow crust ooze', 'tumidity', 'encephalomyelitis',
       'ulcerative colitis', "Noonan's syndrome", 'swelling', 'thirsty',
       'rheumatoid spondylitis', 'lethargy', 'tooth pain', 'colitis',
       'panencephalitis', 'looseness', 'looseness of the bowels',
       'acetonemia', 'yellowing of eyes', 'back pain', 'cephalitis',
       'cerebral edema', 'jaundice of the newborn', 'otitis interna',
       'swelling joints', 'ague', 'nephrosis', 'urtication',
       'Williams syndrome', 'proctitis', 'anaemia', 'mastalgia',
       'steatorrhea', 'weak', 'autism', 'ache', 'proctalgia',
       'epididymitis', 'chorditis', 'iridoncus', 'blackheads',
       'otitis media', 'hemoglobinuria', 'tympanitis',
       'lateral humeral epicondylitis', 'scleredema',
       'temporal arteritis', 'urodynia', 'skin eruption', 'ear pain',
       'Van Bogaert encephalitis', 'hyperlipidaemia',
       'periodic apnea of the newborn', 'smarting', 'prodrome',
       'uricaciduria', 'yellowish skin', 'shin splints', 'laminitis',
       'red spots over body', 'hypocalcaemia', 'ochronosis',
       'swollen legs', "writer's cramp", 'haemoglobinuria',
       'loss of balance', 'pain behind the eyes', 'sleepy',
       'Persian Gulf illness', 'tendinitis', 'cystoid macular edema',
       'hydrophobia', 'kaliuresis', 'speech issue', 'intumescency',
       'small dents in nails', 'SOB', 'backache', 'thyroiditis',
       'eosinopenia', 'attention deficit hyperactivity disorder',
       'inclusion body encephalitis', 'infectious polyneuritis',
       'tracheobronchitis', "Kaposi's varicelliform eruption", 'callus',
       'photalgia', 'vasculitis', 'areflexia', 'keratitis', 'mood swings',
       'enanthem', 'rubor', 'kraurosis vulvae', 'tinnitus',
       'abscessed tooth', 'febricity', 'alveolitis', 'chloasma',
       'peritonitis', 'anasarca', 'alkalinuria', "Crohn's disease",
       'nodal skin eruptions', 'glossodynia', 'multiple neuritis',
       'podalgia', 'polymyositis', 'ketoaciduria', 'abdominal pain',
       "Montezuma's revenge", 'blister', 'graphospasm', 'tenonitis',
       'aching', 'muscle spasm', 'fever', 'knee pain',
       'chronic gastritis', 'leg pain', 'vaccination',
       'West Nile encephalitis', 'air sickness', 'pollinosis',
       'palpitation', 'pockmark', 'tonsillitis', 'kinetosis', 'palsy',
       'MBD', 'angiitis', 'scalenus syndrome', 'wheeziness', 'phrenitis',
       'hypercalcemia', 'chronic glossitis', 'vomiting', 'megacardia',
       'polyneuritis', 'toxic shock', 'lipaemia', 'hypercholesteremia',
       'excessive hunger', 'head pain', 'rectum pain', 'cholecystitis',
       'alkaluria', 'thoracic outlet syndrome', 'endocarditis',
       'apyretic tetanus', 'ventricular fibrillation', 'hyperlipidemia',
       "Quincke's edema", 'eruption', 'hypercalciuria',
       'radiation syndrome', 'FAS', 'hydrops', 'patches in throat',
       'melasma', 'stomach bleeding', 'myometritis', 'hyperkalemia',
       'scleritis', "Graves' disease", 'inflammatory nails',
       'fetal alcohol syndrome', 'odynophagia', 'lipoidaemia',
       'yellow urine', 'pancreatitis', 'regional enteritis', 'itching',
       'shortness of breath', 'oscheocoele', 'burning', 'blepharism',
       'vesiculitis', 'tendonitis', 'episcleritis', 'tetanilla',
       'keratomalacia', "Fallot's syndrome", 'cardiomegaly', 'hemoptysis',
       'aftereffect', 'headache', "Fallot's tetralogy", 'anemia',
       'squeamishness', 'black tongue', 'febrility', 'cellulitis',
       'retrobulbar neuritis', 'spastic colon', 'myoglobinuria',
       'subacute inclusion body encephalitis', 'dyspnoea',
       'breathlessness', 'swollen extremeties', "Reiter's syndrome",
       'tic douloureux', 'prodroma', 'cardiac murmur', 'cicatrice',
       'pericarditis', 'lipidemia', 'myoclonus', 'secondary dysmenorrhea',
       'obesity', 'constipation', 'irritable bowel syndrome',
       'restless legs syndrome', 'prostatitis', 'salpingitis',
       'defecation issue', 'hypercholesterolemia',
       'maple syrup urine disease', 'diuresis', 'purulence', 'stinging',
       'twitching', 'Gilles de la Tourette syndrome', 'chest pain',
       'dehydration', 'congestion', 'arthralgia', 'lipoidemia',
       'burning micturition', 'the shits', 'pyrexia', 'unsteadiness',
       'subacute sclerosing panencephalitis', 'phalangitis', 'sneezing',
       "Munchausen's syndrome", 'crib death', 'giddiness', 'uratemia',
       'hyperemia', 'pang', 'sialadenitis', 'stomach pain', 'hiccup',
       'cheloid', 'smartness', 'hairy tongue', 'runny nose', 'vaginitis',
       'hydrothorax', 'peritoneal inflammation', 'periodic edema',
       'furry tongue', 'muscle weakness', "Klinefelter's syndrome",
       'snotty nose', 'tarsitis', 'roseola', 'bloody stool', 'rachitis',
       'trichiniasis', 'urticaria', 'posthitis', 'pelvis pain',
       'writing difficulty', 'cot death', 'weight loss',
       'history of alcohol consumption', 'Ramsay Hunt syndrome',
       'vasovesiculitis', 'haematuria', 'gastralgia', 'lymphedema',
       'balanoposthitis', 'epiglottitis', 'mucous colitis', 'throb',
       'haemorrhoid', "housemaid's knee", 'jejunoileitis', 'lump',
       'nausea', 'hay fever', 'suffering', 'otitis', 'pancarditis',
       'tetany', 'dark urine', 'keratoiritis', "Landry's paralysis",
       'tenosynovitis', 'physiological jaundice of the newborn',
       'anxiety', 'aortitis', 'hyperkinetic syndrome', 'appendicitis',
       'cervicitis', 'altered sensorium', "Takayasu's arteritis",
       'barking cough', 'periarteritis', 'cramp', 'hyperglycemia',
       'silver like dusting', 'aerodontalgia', 'iridocyclitis',
       'kernicterus', 'osteomyelitis', 'menorrhagia',
       'pain during bowel movements', 'rebound tenderness',
       'pansinusitis', 'pain in anal region', 'nephrotic syndrome',
       'cluster headache', 'metritis', 'tennis elbow', 'burn',
       'equine encephalomyelitis', 'phlebitis', 'intumescence',
       'increased appetite', 'hot flash', 'hoarse', 'acrocyanosis',
       'gut pain', 'infant death', 'dry mouth', 'crepitation rale',
       'histamine headache', 'retinitis', 'diarrhoea', "Conn's syndrome",
       'oscheocele', 'subacute sclerosing leukoencephalitis',
       'swallowing difficulty', 'snuffles', 'costalgia',
       'blurred and distorted vision', 'hives', 'acute encephalitis',
       'haematocoele', 'muscae volitantes', 'crick', 'enanthema',
       'icterus neonatorum', 'thermalgesia', 'weak heart', 'joint pain',
       'atrophy', 'fecal impaction', 'continuous feel of urine',
       'restlessness', 'stomach ache', 'dyspepsia', 'dizzy',
       'secondary amenorrhea', 'sleeping sickness', 'hemoglobinemia',
       'leukoencephalitis', 'polyarteritis', "Bosin's disease",
       'arteritis', 'skin pain', 'funiculitis', 'hyperaemia',
       'brain edema', 'proteinuria', 'taste disturbance',
       'laryngotracheobronchitis', 'parametritis', "Moeller's glossitis",
       'albuminuria', 'bellyache', 'fibromyositis', 'hypoglycaemia',
       'labour pains', 'pinkeye', 'conjunctivitis', 'hyperpyrexia',
       'sore throat', 'enlarged thyroid', 'branched chain ketoaciduria',
       'haematocele', 'blood in sputum', 'irritation in anus', 'flush',
       'withdrawal symptom', 'coryza', 'bubo', 'equine encephalitis',
       'cheilitis', 'giant hives', 'vertigo', 'fast heart rate',
       'hurting', 'throe', 'acidity', 'distention of abdomen', 'furring',
       'puffy face and eyes', 'myalgia', 'palpitations',
       'premenstrual syndrome', 'cicatrix', 'haemoglobinemia',
       'calcification', 'costochondritis', 'ulitis', 'infantile autism',
       'passage of gases', 'lateral epicondylitis', 'seasickness',
       'vulvovaginitis', 'cold hands and feets', 'visual disturbances',
       "farmer's lung", 'bruising', 'nephralgia', 'Ekbom syndrome',
       'hypoglycemia', 'systolic murmur', 'sternutation', 'callosity',
       'inflammation', 'folliculitis', 'rhinitis', 'fructosuria',
       'the trots', "Reye's syndrome", 'upset stomach',
       'Gulf War syndrome', 'irritability', 'sting', 'ulalgia',
       'pleurodynia', 'neuritis', 'sciatica', 'clubbing', 'monocytosis',
       'Klinefelter syndrome', 'red sore around nose', 'hyperglycaemia',
       'efflorescence', 'tension headache', 'eczema vaccinatum',
       'herpes encephalitis', 'causalgia', 'jejunitis',
       'primary amenorrhea', 'abdomen pain', 'encephalomeningitis',
       'fluid overload', 'encephalitis lethargica', 'sleep disturbance',
       'hemicrania', 'numbness', 'uveitis', 'XXY-syndrome',
       'swollen blood vessels', 'high fever', 'referred pain',
       'intermittent cramp', 'glossitis', 'glucosuria',
       'exophthalmic goiter', 'tetralogy of Fallot', 'aminoaciduria',
       'musca volitans', 'toxic shock syndrome', 'trigeminal neuralgia',
       'amyotrophy', 'weight gain', 'mask of pregnancy',
       'ankylosing spondylitis', 'trichinosis', 'apnea',
       'redness of eyes', 'exophthalmos', 'puffiness',
       'irregular sugar level', 'jaundice', 'ketonuria', 'hemothorax',
       'necrotizing enteritis', 'pneumonitis', 'continuous sneezing',
       'attention deficit disorder', 'fibrillation', 'intestinal colic',
       'murmur', 'blepharitis', 'labyrinthitis', 'vulvitis',
       'hypernatremia', 'gastritis', 'thrombocytosis',
       "Tourette's syndrome", 'foul smell of urine',
       'weakness of one body side', 'movement stiffness',
       'intermittent tetanus', 'otitis externa', 'splenitis',
       'odontalgia', 'qualm', 'nasal congestion', 'hiccough',
       'car sickness', 'rubella panencephalitis', 'meningoencephalitis',
       'chafing', 'tenesmus', 'slurred speech', 'naupathia',
       'tendosynovitis', 'cramps', 'dropsy', 'iritis', 'tumidness',
       'charley horse', 'light-headed', "thresher's lung", 'rhinorrhea',
       'head ache', 'lymphogranuloma', 'extra marital contacts',
       'belly pain', 'muscle wasting', 'bagassosis', 'cerebromeningitis',
       'hemosiderosis', 'mastoiditis', 'radiculitis', 'spermatocele',
       'bursitis', 'earache', 'TSS', 'uvulitis', 'blepharospasm',
       'lymphuria', 'mild fever', 'edema', 'internal itching',
       'dyschezia', 'chemosis', 'malaise', 'megalocardia', 'meralgia',
       'spinning movements', 'myelitis', 'irregularity', 'colpocystitis',
       "Horner's syndrome", 'paresthesia', 'opisthotonos',
       'lethargic encephalitis', 'endometritis', 'natriuresis',
       'amenorrhea', "Dawson's encephalitis", 'sunken eyes', 'toothache',
       'rheumatic aortitis', 'dermatomyositis', 'hyponatremia',
       'minimal brain damage', 'haematoma', 'bloat',
       'lack of concentration', 'corditis', 'fatigue', 'dizziness',
       'dysmenorrhea', "Koplik's spots", 'neuralgia',
       'atrial fibrillation', 'cyanosis', 'chronic pain', 'hemorrhoid',
       'sleep apnea', 'tendonous synovitis', 'chorioretinitis',
       'ureteritis', 'osteitis deformans', 'engorgement', 'hyperlipemia',
       'weakness in limbs', 'chiralgia', 'diverticulitis',
       'subacute bacterial endocarditis', 'melagra', 'iridokeratitis',
       'clavus', 'loss of appetite', "Tietze's syndrome", 'valvulitis',
       'lightheadedness', 'shivering', 'purulency', 'fart', 'tired',
       'sclerosing leukoencephalitis', 'metralgia', 'quartan', 'pyuria',
       'glossalgia', 'distress', 'lazy eye', 'ketosis', 'stridor',
       'papilledema', 'angioedema', 'cephalalgia', 'hypoproteinemia',
       'keratoconjunctivitis', 'myocardial inflammation', 'ophthalmia',
       'bummer', 'adenitis', 'brittle nails', 'belch', 'otalgia',
       'Munchausen syndrome', 'stuffiness', 'hypermenorrhea', 'kaluresis',
       'chesty cough', 'SSPE', 'chill', 'pulseless disease',
       'throat irritation', 'cervical disc syndrome', 'mastitis',
       'phlegm', 'enlarged heart', "Jacquemier's sign", 'loss of smell',
       'Waterhouse-Friderichsen syndrome', 'oliguria', 'stiff neck',
       'ADHD', 'bunion', 'hematocyturia', 'muscle pain', 'torment',
       'spondylitis', 'painful walking', 'dry socket', 'morning sickness',
       'exanthem', 'orchitis', 'soreness', 'airsickness',
       'haemosiderosis', 'pyrosis', 'chills and fever', 'motion sickness',
       'ketonemia', 'sinus pressure', 'hematuria', "painter's colic",
       'eosinophilia', 'myocarditis', 'fibrositis', 'waterworks',
       'endocervicitis', 'amyotrophia', 'chills', 'ovaritis',
       'indigestion', 'dyspnea', 'keratoscleritis', 'hypokalemia',
       'renal colic', 'sneeze', 'lymphadenitis', 'orchidalgia',
       'sudden infant death syndrome', 'hypercalcaemia', 'cholangitis',
       'hypercalcinuria', 'urination issue', 'colpitis', 'vellication',
       'paraesthesia', 'labor pains', 'postnasal drip', 'queasiness',
       'depression', 'sick headache', 'endarteritis', 'tabes',
       'acetonuria', 'spots', 'catarrh', 'tracheitis', 'laryngitis',
       'tic', 'Zollinger-Ellison syndrome', 'osteitis',
       'herpes simplex encephalitis', 'intertrigo', 'epicondylitis',
       'icterus', 'family history', 'lipidaemia', 'coughing',
       'festination', 'Kayser-Fleischer ring', 'keratalgia', 'PMS',
       'aura', 'vesicular stomatitis', 'obstipation', 'prickly heat',
       'allergic rhinitis', 'meningism', 'water on the knee', 'scurring',
       'twitch', 'tumescence', 'hyperlipoidaemia', 'myositis',
       'sinusitis', 'heat rash', 'miliaria', 'glycosuria', 'labor pain',
       'diarrhea', 'feverishness', 'enteritis', 'bladder discomfort',
       'gripes', 'gastric problems', 'nettle rash', 'floater',
       'vision issue', 'hematocoele', 'colic', 'lumbar pain',
       'thrombophlebitis', 'tenderness', 'excruciation', 'torture',
       'synovitis', 'sinus headache', 'wasting away', 'sword-cut',
       'Chinese restaurant syndrome', 'receiving unsterile injections']
# Normalise every vocabulary entry (trim whitespace, lower-case) and drop the
# duplicates that normalisation can create, keeping first-seen order.
# The result is still a plain list, so existing `in` checks and iteration work.
unique_symptoms = list(dict.fromkeys(symptom.strip().lower() for symptom in all_symptoms))

# Step 4: Function to handle negations and extract symptoms (both single and multi-word phrases)
def is_negated(token):
    """Return True if any direct dependency child of ``token`` is labelled ``neg``.

    Only the token's immediate children are inspected; a negation attached
    elsewhere in the parse tree is not detected by this check.
    """
    return any(child.dep_ == 'neg' for child in token.children)

def extract_symptoms(text, max_phrase_tokens=6):
    """Extract symptom mentions from one clinical note.

    Two strategies are combined:

    1. Pattern matches from the module-level ``matcher``: when the matched
       text starts with one of the trigger words, up to 10 tokens following
       the match are kept as a free-text symptom phrase.
    2. Vocabulary lookup: every window of 1..``max_phrase_tokens``
       consecutive tokens is compared (lower-cased, whitespace-collapsed)
       with ``unique_symptoms``. Comparing windows rather than single tokens
       is what lets multi-word entries such as 'sore throat' match.
       A window is skipped if ``is_negated`` flags any of its tokens.

    Parameters
    ----------
    text : str
        Raw note text. Anything that is not a non-blank string (e.g. the NaN
        pandas uses for a missing note) yields an empty list instead of
        being passed to spaCy.
    max_phrase_tokens : int, default 6
        Longest token window tried against the vocabulary. Punctuation such
        as hyphens and possessive markers count as tokens of their own.
        NOTE(review): 6 covers the longest vocabulary entries visible here;
        raise it if longer phrases are added.

    Returns
    -------
    list of str
        Distinct mentions in first-seen order, with the casing found in the
        note. The order is deterministic across runs.
    """
    # Missing notes come through as NaN/None; return "no symptoms" for them.
    if not isinstance(text, str) or not text.strip():
        return []

    doc = nlp(text)
    n_tokens = len(doc)
    symptoms_found = []

    # 1. Pattern-based matching for sentence-like symptoms
    for _match_id, start, end in matcher(doc):
        if doc[start:end].text.startswith(('presents', 'complaints', 'symptoms', 'complaint')):
            # Capture up to 10 tokens after the trigger, clamped to the document end.
            window_end = min(end + 10, n_tokens)
            phrase = doc[end:window_end].text.strip()
            # A trigger at the very end of the note has nothing after it;
            # skip the empty string so it is not counted as a symptom.
            if phrase:
                symptoms_found.append(phrase)

    # 2. Vocabulary-based extraction for single- and multi-word symptoms
    vocabulary = set(unique_symptoms)  # O(1) membership instead of a list scan per window
    for start in range(n_tokens):
        longest_stop = min(start + max_phrase_tokens, n_tokens)
        for stop in range(start + 1, longest_stop + 1):
            span = doc[start:stop]
            # Collapse line breaks / repeated spaces inside the window.
            candidate = ' '.join(span.text.split())
            if not candidate or candidate.lower() not in vocabulary:
                continue
            if any(is_negated(tok) for tok in span):
                continue
            symptoms_found.append(candidate)

    # Remove duplicates while keeping first-seen order.
    return list(dict.fromkeys(symptoms_found))

# Step 5: Run the extractor over every note's TEXT, showing a progress bar
tqdm.pandas(desc="Extracting symptoms")
noteevents_selected['extracted_symptoms'] = (
    noteevents_selected['TEXT'].progress_apply(extract_symptoms)
)

# Step 6: Write the notes together with their extracted symptoms to disk
noteevents_selected.to_csv(
    'updated_noteevents_with_combined_symptoms.csv',
    index=False,
)

# Preview the note text next to what was extracted from it
print(noteevents_selected.loc[:, ['TEXT', 'extracted_symptoms']].head())

In [None]:
# Keep only the notes from which at least one symptom was extracted.
# .copy() makes the result an independent frame: later cells assign converted
# columns to it, which on a boolean-mask slice raises SettingWithCopyWarning.
filtered_noteevents = noteevents_selected[
    noteevents_selected['extracted_symptoms'].apply(len) > 0
].copy()
filtered_noteevents

In [None]:
# Display results_df (defined earlier in the notebook) to inspect it before
# its STARTDATE values are matched against note CHARTDATEs in the next cells.
results_df

In [None]:
# Drop rows with missing dates. .copy() gives independent frames so the column
# assignments below do not act on a slice of the previous objects.
results_df = results_df.dropna(subset=['STARTDATE']).copy()
filtered_noteevents = filtered_noteevents.dropna(subset=['CHARTDATE']).copy()

results_df['STARTDATE'] = pd.to_datetime(results_df['STARTDATE'])
filtered_noteevents['CHARTDATE'] = pd.to_datetime(filtered_noteevents['CHARTDATE'])

# merge_asof needs the key sorted and returns its rows in the left frame's
# order under a fresh 0..n-1 index. Sort results_df itself first so the
# returned rows line up position-by-position with results_df.
results_df = results_df.sort_values('STARTDATE')

# Diagnostic only: nearest note date across ALL notes (no per-patient grouping),
# used to see how far prescription dates are from any note date.
# .to_numpy() assigns by position; assigning the Series directly would align on
# index labels and attach each date to the wrong prescription row.
results_df['nearest_chartdate'] = pd.merge_asof(
    results_df[['STARTDATE']],
    filtered_noteevents[['CHARTDATE']].sort_values('CHARTDATE'),
    left_on='STARTDATE',
    right_on='CHARTDATE',
    direction='nearest',
    allow_exact_matches=True
)['CHARTDATE'].to_numpy()

# Calculate the absolute time difference to the nearest note
results_df['date_diff'] = (results_df['STARTDATE'] - results_df['nearest_chartdate']).abs()

# Check statistics of date differences
print(results_df['date_diff'].describe())

In [None]:
import pandas as pd

# Notes without an admission id cannot be matched per admission: drop them,
# then give both join keys the same int64 dtype in both frames.
filtered_noteevents = (
    filtered_noteevents
    .dropna(subset=['HADM_ID'])
    .astype({'SUBJECT_ID': 'int64', 'HADM_ID': 'int64'})
)
results_df = results_df.astype({'SUBJECT_ID': 'int64', 'HADM_ID': 'int64'})

# For each prescription row, attach the symptoms of the closest-in-time note
# of the same patient and admission, but only if it lies within 7 days.
merged_df = pd.merge_asof(
    results_df.sort_values(by='STARTDATE'),
    filtered_noteevents.loc[
        :, ['SUBJECT_ID', 'HADM_ID', 'CHARTDATE', 'extracted_symptoms']
    ].sort_values(by='CHARTDATE'),
    left_on='STARTDATE',
    right_on='CHARTDATE',
    by=['SUBJECT_ID', 'HADM_ID'],
    direction='nearest',
    tolerance=pd.Timedelta(days=7),
)

# Number of rows that received symptoms from a note
print("Non-null symptoms count:", merged_df['extracted_symptoms'].notna().sum())


In [None]:
# Count the missing values in each column of the merged frame.
merged_df.isna().sum()

In [None]:
# Show the rows that received no symptoms from the merge.
# `== np.nan` is always False (NaN never compares equal, not even to itself),
# so that comparison can only return an empty frame; .isna() is the correct test.
merged_df[merged_df['extracted_symptoms'].isna()]

In [None]:
import pandas as pd
from tqdm.auto import tqdm  # auto variant works in both notebooks and terminals

# Prepare independent copies: integer join keys, parsed dates, and rows ordered
# by the date column (merge_asof requires sorted keys).
results_df = (
    results_df.copy()
    .astype({'SUBJECT_ID': int, 'HADM_ID': int})
    .assign(STARTDATE=lambda frame: pd.to_datetime(frame['STARTDATE']))
    .sort_values('STARTDATE')
)
filtered_noteevents = (
    filtered_noteevents.copy()
    .astype({'SUBJECT_ID': int, 'HADM_ID': int})
    .assign(CHARTDATE=lambda frame: pd.to_datetime(frame['CHARTDATE']))
    .sort_values('CHARTDATE')
)

# Register tqdm's pandas integration under this description
tqdm.pandas(desc="Preparing for merge")

# Attach to each results row the symptoms of the nearest-in-time note of the
# same patient and admission (no tolerance: any time distance is accepted).
merged_df = pd.merge_asof(
    left=results_df,
    right=filtered_noteevents.loc[
        :, ['SUBJECT_ID', 'HADM_ID', 'CHARTDATE', 'extracted_symptoms']
    ],
    left_on='STARTDATE',
    right_on='CHARTDATE',
    by=['SUBJECT_ID', 'HADM_ID'],
    direction='nearest',
)

print("Merge completed.")
print(merged_df.head())

# Persist the merged frame without the index column
merged_df.to_csv('merged_results.csv', index=False)

In [None]:
# Rows of the merged frame that did receive symptoms from a note
non_null_symptoms_df = merged_df.loc[merged_df['extracted_symptoms'].notna()]
non_null_symptoms_df

In [None]:
# Print the merged frame's column dtypes, non-null counts and memory usage.
merged_df.info()

In [None]:
# Export the merged frame to CSV without the index column.
merged_df.to_csv('MIMIC without diag 15-11.csv', index=False)