# In [None]:
import pandas as pd
import numpy as np
import re
import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from tqdm import tqdm
tqdm.pandas()

# Location of the input CSV (a Kaggle dataset path; the directory name suggests
# a MIMIC-IV EHR export -- confirm against the dataset page).
file_path = "/kaggle/input/ehr-mimic-iv/final.csv"

# Read the whole CSV into a DataFrame; the code below reads its 'target' and
# 'augmented_input' columns.
df = pd.read_csv(file_path)


# Cardiac vocabulary used to decide whether a note mentions heart disease.
# Entries are lowercase; matching is made case-insensitive by the regex flags.
cardiac_keywords = [
    # Anatomy and general terms
    "heart", "cardiac", "cardiovascular", "myocardium", "pericardium",
    "atrium", "atria", "ventricle", "coronary",
    # Tests and biomarkers
    "ecg", "ekg", "electrocardiogram", "echo", "echocardiogram", "holter",
    "stress test", "troponin", "bnp", "nt-probnp",
    # Conditions
    "myocardial infarction", "heart failure", "coronary artery disease",
    "cad", "arrhythmia", "bradycardia", "tachycardia", "atherosclerosis",
    "angina", "atrial fibrillation", "afib", "ventricular fibrillation",
    "hypertrophic cardiomyopathy", "dilated cardiomyopathy",
    "valvular heart disease", "congenital heart defect",
    "ischemic heart disease",
    # Devices and procedures
    "stent", "angioplasty", "pacemaker", "defibrillator", "cabg",
]

# One whole-word alternation over the escaped keywords. The \b anchors mean
# inflected forms such as "stents" or "heartbroken" do NOT match.
_alternation = "|".join(re.escape(keyword) for keyword in cardiac_keywords)
pattern = re.compile(r"\b(?:%s)\b" % _alternation, re.IGNORECASE)

# Keep a note only if it mentions a cardiac keyword; otherwise use a fixed label
def label_cardiac(text):
    """Return ``text`` unchanged if it mentions a cardiac keyword.

    Missing values (as reported by ``pd.isnull``) and texts in which the
    module-level ``pattern`` finds no match are replaced by the fixed label
    ``"No cardiac disease mentioned"``.
    """
    fallback = "No cardiac disease mentioned"
    if pd.isnull(text):
        return fallback
    if pattern.search(text):
        return text
    return fallback

# Overwrite 'target' in place: rows without a cardiac keyword (or with a
# missing value) become the fixed "no cardiac disease" label.
df["target"] = df["target"].apply(label_cardiac)

# Words that must survive stop-word filtering.
keep_words = {"yes", "no"}

# NOTE(review): clean_ehr_text only reads lemma_, is_stop and text from the
# tokens, so the parser/NER components loaded here may be unnecessary --
# confirm nothing else relies on them before disabling any.
nlp = spacy.load("en_core_web_sm")

# Clear the stop-word flag on the vocabulary entries of the kept words
# (only the exact lowercase spellings listed in keep_words are touched).
for word in keep_words:
    nlp.vocab[word].is_stop = False

# Standalone lookup sets built from the imported constants (not read back
# from the pipeline): spaCy's English stop list minus the kept words, and the
# individual ASCII punctuation characters.
stop_words = STOP_WORDS.difference(keep_words)
punctuations = set(punctuation)

# Section tags whose tag AND content are dropped before tokenizing. A section
# runs from its opening tag up to (not including) the next "<" or the end of
# the text.
_REMOVED_SECTIONS = (
    "SEX", "SERVICE", "Age", "Mortality", "Note Type", "ATTENDING",
    "SOCIAL HISTORY", "FAMILY HISTORY", "DISCHARGE INSTRUCTIONS",
    "DISCHARGE DISPOSITION", "DISCHARGE DIAGNOSIS", "DISCHARGE CONDITION",
    "FOLLOWUP INSTRUCTIONS", "DISCHARGE MEDICATIONS", "PAST MEDICAL HISTORY",
)

# Built once at import time instead of on every call (the function is applied
# row by row to whole DataFrame columns).
_SECTION_RE = re.compile(
    "|".join("<" + name + r">.*?(?=<|$)" for name in _REMOVED_SECTIONS),
    re.IGNORECASE | re.DOTALL,
)


def _normalize_ehr_text(text):
    """Return ``text`` with unwanted sections and formatting noise removed."""
    text = _SECTION_RE.sub("", text)

    # Collapse every whitespace run (newlines included) to a single space.
    text = re.sub(r"\s+", " ", text).strip()
    # Separate the remaining upper-case section tags from adjacent text.
    text = re.sub(r"(</?[A-Z ]+>)", r"\n\1\n", text)
    # Drop a "." or ":" that only introduces a "___" placeholder. This must
    # run BEFORE the generic underscore removal below, otherwise there are no
    # underscores left for it to match.
    text = re.sub(r"[\.\:]\s*___", "", text)
    text = re.sub(r"_+", "", text)
    # Remove clock times such as "10:30" or "7:05 pm".
    text = re.sub(r"\d{1,2}:\d{2}\s*(?:AM|PM)?", "", text, flags=re.IGNORECASE)
    # Normalize spacing around , : ; -- none before, exactly one after.
    text = re.sub(r"\s+([,:;])", r"\1", text)
    text = re.sub(r"([,:;])\s+", r"\1 ", text)
    # Final squeeze; after this no run of two or more whitespace characters
    # remains, so no separate blank-line cleanup is needed.
    text = re.sub(r"\s{2,}", " ", text)
    return text.strip()


def clean_ehr_text(raw_text):
    """Clean one EHR note and return its lemmatized, lower-cased tokens.

    Steps: drop the sections listed in ``_REMOVED_SECTIONS``, strip
    placeholders / clock times / redundant whitespace, run the module-level
    spaCy pipeline ``nlp``, and keep the lemma of every token that is either
    in ``keep_words`` or is neither a stop word nor a single punctuation
    character from ``punctuations``.

    Args:
        raw_text: The note text. Missing values (``pd.isnull``) yield ``[]``;
            any other non-string value is converted with ``str()`` rather
            than raising inside the regex calls.

    Returns:
        list[str]: Lower-cased lemmas in document order. Whitespace-only
        tokens are skipped so the list never contains empty strings.
    """
    if pd.isnull(raw_text):
        return []
    if not isinstance(raw_text, str):
        raw_text = str(raw_text)

    doc = nlp(_normalize_ehr_text(raw_text))
    return [
        token.lemma_.lower().strip()
        for token in doc
        # Whitespace tokens (e.g. a newline left next to a tag) would
        # otherwise be emitted as "" after strip().
        if not token.is_space
        and (
            token.text.lower() in keep_words
            or (not token.is_stop and token.text not in punctuations)
        )
    ]


# Tokenize both text columns with clean_ehr_text; progress_apply is the
# tqdm-instrumented variant of Series.apply.
token_columns = {
    "target": "target_tokens",
    "augmented_input": "augmented_input_tokens",
}
for source_column, token_column in token_columns.items():
    df[token_column] = df[source_column].progress_apply(clean_ehr_text)

# Keep only the two token columns; every other column is discarded.
df = df[list(token_columns.values())]

# Write the result without the index. NOTE(review): each cell holds a Python
# list, which presumably ends up in the CSV as its string form -- confirm how
# downstream readers parse it back.
df.to_csv('/kaggle/working/final.csv', index=False)