# 1. Clean up the data

In [None]:
import pandas as pd
import re
from collections import Counter


def clean_text(text, abbrev_map: dict = None):
    # Remove special characters and standardize
    text = re.sub(r"[:#\(\)\[\].,\-\'\"/\\, *, ?]", " ", text.lower())
    # Remove numbers and pound signs
    text = re.sub(r"#", "", text)
    text = re.sub(r"\d+", "", text)
    # Standardize whitespace
    text = " ".join(text.split())

    if abbrev_map:
        # Replace abbreviations
        for abbrev, full in abbrev_map.items():
            text = re.sub(rf"\b{abbrev}\b", full, text)

    return text

In [2]:
df = pd.read_csv("/home/yl3427/cylab/SOAP_MA/data/mergedBioNLP2023.csv")
df["cleaned_expanded_Summary"] = df["Summary_expanded"].apply(
    lambda x: clean_text(x) if pd.notnull(x) else x
)

"cleaned_expanded_Summary" is now the reference

In [None]:
# Step 1: Split the 'cleaned_expanded_Summary' into lists of terms
df["terms"] = (
    df["cleaned_expanded_Summary"]
    .dropna()
    .str.split(";")
    .apply(lambda x: [term.strip() for term in x if term.strip()])
)

# Step 2: Explode the lists into separate rows
exploded_df = df.explode("terms")

# Step 3: Count the frequency of each term using value_counts
disease_series = exploded_df["terms"].value_counts()

# Step 4: Convert the Series to a DataFrame
disease_df = disease_series.reset_index().rename(
    columns={"terms": "Disease", "cound": "Frequency"}
)

# Step 5: Display top 20 most frequent terms
print("Top 20 most frequent terms:")
print(disease_df.head(20))

Top 20 most frequent terms:
                                  Disease  count
0                            hypertension    132
1                     acute renal failure    116
2                     atrial fibrillation    100
3                                  anemia     90
4                 coronary artery disease     83
5                             hypotension     77
6                   altered mental status     64
7                     respiratory failure     51
8                       diabetes mellitus     46
9                            leukocytosis     44
10                       thrombocytopenia     41
11  chronic obstructive pulmonary disease     39
12                                hypoxia     35
13                              pneumonia     33
14                                 sepsis     31
15         acute on chronic renal failure     30
16                         hyperlipidemia     27
17                end stage renal disease     26
18               congestive heart failure

In [None]:
# df.to_csv('/home/yl3427/cylab/SOAP_MA/data/mergedBioNLP2023.csv', index=False)

In [4]:
disease_df.to_csv(
    "/home/yl3427/cylab/SOAP_MA/data/disease_frequencies.csv", index=False
)

In [44]:
disease_df[:30]

Unnamed: 0,Disease,count
0,hypertension,132
1,acute renal failure,116
2,atrial fibrillation,100
3,anemia,90
4,coronary artery disease,83
5,hypotension,77
6,altered mental status,64
7,respiratory failure,51
8,diabetes mellitus,46
9,leukocytosis,44


In [42]:
disease_df = pd.read_csv("/home/yl3427/cylab/SOAP_MA/data/disease_frequencies.csv")

In [None]:
from quickumls import QuickUMLS

filtered_matcher = QuickUMLS(
    "/secure/shared_data/QuickUMLS",
    overlapping_criteria="length",
    threshold=1.0,
    similarity_name="dice",
    accepted_semtypes=["T047"],  # Disease or Syndrome
)

In [9]:
disease_lst = []
for row in disease_df["Disease"]:
    result = filtered_matcher.match(row, best_match=True, ignore_syntax=False)
    if result:
        disease_lst.append(row)

In [10]:
disease_lst

['hypertension',
 'acute renal failure',
 'atrial fibrillation',
 'anemia',
 'coronary artery disease',
 'respiratory failure',
 'diabetes mellitus',
 'leukocytosis',
 'thrombocytopenia',
 'chronic obstructive pulmonary disease',
 'pneumonia',
 'sepsis',
 'acute on chronic renal failure',
 'hyperlipidemia',
 'end stage renal disease',
 'congestive heart failure',
 'hypoxic respiratory failure',
 'diabetes',
 'diabetes mellitus type',
 'hyponatremia',
 'human immunodeficiency virus',
 'renal failure',
 'hypernatremia',
 'hypothyroidism',
 'cirrhosis',
 'atrial fibrillation with rapid ventricular rate',
 'non st elevation myocardial infarction',
 'diabetic ketoacidosis',
 'pancytopenia',
 'urinary tract infection',
 'coagulopathy',
 'peripheral vascular disease',
 'hypercarbic respiratory failure',
 'end stage renal disease on hemodialysis',
 'st elevation myocardial infarction',
 'seizure disorder',
 'pancreatitis',
 'obstructive sleep apnea',
 'type diabetes mellitus',
 'asthma',
 'asp

In [12]:
import ast

df = pd.read_csv("/home/yl3427/cylab/SOAP_MA/data/mergedBioNLP2023.csv")
disease_file_dict = {}
for search_term in disease_lst:
    mask = df["terms"].apply(
        lambda x: search_term in eval(x) if pd.notnull(x) else False
    )
    count = mask.sum()
    print(f"Number of rows containing '{search_term}':", count)
    file_ids = df.loc[mask, "File ID"].tolist()
    print(len(file_ids))
    disease_file_dict[search_term] = file_ids

Number of rows containing 'hypertension': 132
132
Number of rows containing 'acute renal failure': 116
116
Number of rows containing 'atrial fibrillation': 100
100
Number of rows containing 'anemia': 90
90
Number of rows containing 'coronary artery disease': 83
83
Number of rows containing 'respiratory failure': 51
51
Number of rows containing 'diabetes mellitus': 46
46
Number of rows containing 'leukocytosis': 44
44
Number of rows containing 'thrombocytopenia': 41
41
Number of rows containing 'chronic obstructive pulmonary disease': 39
39
Number of rows containing 'pneumonia': 33
33
Number of rows containing 'sepsis': 31
31
Number of rows containing 'acute on chronic renal failure': 30
30
Number of rows containing 'hyperlipidemia': 27
27
Number of rows containing 'end stage renal disease': 26
26
Number of rows containing 'congestive heart failure': 25
25
Number of rows containing 'hypoxic respiratory failure': 24
24
Number of rows containing 'diabetes': 24
24
Number of rows containing

In [38]:
disease_file_dict

{'hypertension': ['193604.txt',
  '189703.txt',
  '190086.txt',
  '110692.txt',
  '187277.txt',
  '105738.txt',
  '147696.txt',
  '186757.txt',
  '197276.txt',
  '185114.txt',
  '190524.txt',
  '105852.txt',
  '148588.txt',
  '184814.txt',
  '102324.txt',
  '109844.txt',
  '111039.txt',
  '149619.txt',
  '111314.txt',
  '111857.txt',
  '148314.txt',
  '149546.txt',
  '110341.txt',
  '147435.txt',
  '148470.txt',
  '198734.txt',
  '184453.txt',
  '111845.txt',
  '198330.txt',
  '103677.txt',
  '192754.txt',
  '105172.txt',
  '194131.txt',
  '103853.txt',
  '101063.txt',
  '100590.txt',
  '101324.txt',
  '183333.txt',
  '194061.txt',
  '106110.txt',
  '104263.txt',
  '184503.txt',
  '148647.txt',
  '111040.txt',
  '101737.txt',
  '186671.txt',
  '194088.txt',
  '185556.txt',
  '106883.txt',
  '191719.txt',
  '199580.txt',
  '185625.txt',
  '189180.txt',
  '194847.txt',
  '195768.txt',
  '105347.txt',
  '148251.txt',
  '199781.txt',
  '197960.txt',
  '149006.txt',
  '183813.txt',
  '11301

In [18]:
import pickle

# with open('/home/yl3427/cylab/SOAP_MA/data/disease_file_dict.pkl', 'wb') as f:
#     pickle.dump(disease_file_dict, f) # be used in test

with open("/home/yl3427/cylab/SOAP_MA/data/disease_file_dict.pkl", "rb") as f:
    disease_file_dict = pickle.load(f)

In [20]:
len(disease_file_dict)

1043

In [22]:
df = pd.read_csv("/home/yl3427/cylab/SOAP_MA/data/mergedBioNLP2023.csv")

arf_ltm_file_ids = disease_file_dict["acute renal failure"][:16]
arf_test_file_ids_positive = disease_file_dict["acute renal failure"][16:]
arf_test_file_ids_negative = list(
    set(df["File ID"].tolist())
    - set(arf_ltm_file_ids)
    - set(arf_test_file_ids_positive)
)

len(arf_ltm_file_ids), len(arf_test_file_ids_positive), len(arf_test_file_ids_negative)

(16, 100, 886)

In [23]:
(
    len(arf_ltm_file_ids)
    + len(arf_test_file_ids_positive)
    + len(arf_test_file_ids_negative)
)

1002

# check the generated ltm (generated the ltm using run_LTMbuild_test.py)

In [24]:
with open(
    "/home/yl3427/cylab/SOAP_MA/ltm_per_disease/acute_renal_failure_raw.pkl", "rb"
) as f:
    arf_ltm_raw = pickle.load(f)

In [25]:
len(arf_ltm_raw)

140

In [26]:
with open(
    "/home/yl3427/cylab/SOAP_MA/ltm_per_disease/acute_renal_failure_refined.pkl", "rb"
) as f:
    arf_ltm_refined = pickle.load(f)

len(arf_ltm_refined)

15

In [27]:
arf_ltm_refined

['Evaluate fluid balance and urine output: Significant discrepancies between fluid intake and output, such as negative or positive fluid balance, can indicate renal impairment.',
 'Monitor serum creatinine levels: An increase from baseline is a key indicator of renal dysfunction, while a decrease may suggest improvement.',
 'Assess urine output: Oliguria or anuria, especially when fluid intake is adequate, can be a sign of acute renal failure.',
 'Review medication history: Recent use of nephrotoxic drugs or changes in medication can contribute to renal impairment.',
 'Consider underlying conditions: Chronic diseases like diabetes mellitus or cardiovascular issues can predispose patients to renal failure.',
 'Evaluate hemodynamic stability: Hypotension or significant blood pressure changes can lead to renal hypoperfusion and acute renal failure.',
 'Check for signs of systemic infection or inflammation: Elevated white blood cell count or fever can precipitate renal dysfunction.',
 'Ass

In [29]:
# training data, test data(positive), test data(negative)
len(arf_ltm_file_ids), len(arf_test_file_ids_positive), len(arf_test_file_ids_negative)

(16, 100, 886)

In [36]:
tp = 0
fp = 0
tn = 0
fn = 0

result_with_ltm = pd.read_csv("/home/yl3427/cylab/SOAP_MA/soap_result/1114_arf_ltm.csv")
result_without_ltm = pd.read_csv(
    "/home/yl3427/cylab/SOAP_MA/soap_result/1114_arf_without_ltm.csv"
)
disease = "acute renal failure"
for idx, row in result_with_ltm[["terms", "pred"]].iterrows():
    if pd.isnull(row["terms"]):
        continue
    if disease in row["terms"]:
        if disease in row["pred"].lower():
            tp += 1
        else:
            fn += 1
    else:
        if disease in row["pred"].lower():
            fp += 1
        else:
            tn += 1

In [31]:
tp, fp, tn, fn  # 1114_arf_without_ltm.csv

(7, 8, 852, 114)

In [35]:
tp, fp, tn, fn  # 1114_arf_ltm.csv

(93, 437, 423, 28)

In [40]:
import pandas as pd

# disease of interest
disease = "acute renal failure"


def compute_confusion_matrix(df, disease):
    tp = fp = tn = fn = 0

    for idx, row in df[["terms", "pred"]].iterrows():
        if pd.isnull(row["terms"]):
            # print('null')
            continue
        if disease in row["terms"]:
            if disease in row["pred"].lower():
                tp += 1
            else:
                fn += 1
        else:
            if disease in row["pred"].lower():
                fp += 1
            else:
                tn += 1

    return {"TP": tp, "FP": fp, "TN": tn, "FN": fn}


result_with_ltm = pd.read_csv("/home/yl3427/cylab/SOAP_MA/soap_result/1114_arf_ltm.csv")
result_without_ltm = pd.read_csv(
    "/home/yl3427/cylab/SOAP_MA/soap_result/1114_arf_without_ltm.csv"
)

confusion_with_ltm = compute_confusion_matrix(result_with_ltm, disease)
confusion_without_ltm = compute_confusion_matrix(result_without_ltm, disease)

comparison_df = pd.DataFrame(
    {
        "Metric": [
            "True Positives",
            "False Positives",
            "True Negatives",
            "False Negatives",
        ],
        "With LTM": [
            confusion_with_ltm["TP"],
            confusion_with_ltm["FP"],
            confusion_with_ltm["TN"],
            confusion_with_ltm["FN"],
        ],
        "Without LTM": [
            confusion_without_ltm["TP"],
            confusion_without_ltm["FP"],
            confusion_without_ltm["TN"],
            confusion_without_ltm["FN"],
        ],
    }
)

print("Comparison of Confusion Matrix Metrics:")
print(comparison_df)

Comparison of Confusion Matrix Metrics:
            Metric  With LTM  Without LTM
0   True Positives        93            7
1  False Positives       437            8
2   True Negatives       423          852
3  False Negatives        28          114
