<a href="https://colab.research.google.com/github/DarthCoder501/GAAP/blob/main/Impressions_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re

import numpy as np
import pandas as pd
from transformers import AutoTokenizer, TFAutoModel
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input
from sklearn.metrics import roc_auc_score, classification_report

In [None]:
# Read both CSV splits, asking pandas to parse the note timestamp column as datetimes.
date_columns = ["note_DATETIME"]
train, test = (
    pd.read_csv(csv_path, parse_dates=date_columns)
    for csv_path in ("train.csv", "test.csv")
)

In [None]:
# Lookup table mapping an acronym (as written in the text) to its spelled-out
# replacement string.
#   * Entries whose value is "" have no expansion filled in yet.
#   * Commented-out entries are disabled.
# NOTE(review): if this dict is handed to a regex/string replace as-is, every
# "" value removes the matched acronym from the text instead of leaving it
# untouched -- confirm that is intended before using the placeholders.
acronyms = {
    "MM": "millimeter",
    #"ET": "eastern time",
    #"DP": "",
    "CT": "computed tomography",
    #"JM": "",
    "HCW": "healthcare Worker",
    "BAC": "bronchioloalveolar carcinoma",
    "ED": "emergency department",
    "PT": "pacific time",
    "MRI": "magnetic resonance imaging",
    "RN": "registered nurse",
    "CVI": "",
    "PACS": "",
    "FOCI": "",
    "PET": "",
    "OPE": "",
    "SVC": "",
    "CM": "centimeter",
    "RLL": "",
    "RUL": "",
    "LAD": "",
    "VS": "",
    "TB": "",
    "IPMT": "",
    "IVC": "",
    "PE": "",
    "PEs": "",
    "FDG": "",
    "SFV": "",
    "DVT": "",
    "SMA": "",
    "NSIP": "",
    "SITU": "",
    "HR": "",
    "4A": "",
    "PST": "",
    "ID": "",
    "CTA": "",
    "NG": "",
    "IPMN": "",
    "UIP": "",
    "ER": "",
    "ARDS": "",
    "MRN": "",
    "RV": "",
    "CHF": "",
    "CAVA": "",
    "VENA": "",
    "PEG": "",
    "PICC": "",
    "GI": "",
    "CYST": "",
    "ASD": "",
    "MR": "",
    "EST": "",
    "CTs": "",
    "3D": "",
    "MAC": "",
    "METS": "",
    "MICU": "",
    "MAI": "",
    "PJP": "",
    "LIMA": "",
    "LV": "",
    "EGD": "",
    "PAU": "",
    "VP": "",
    "CSF": "",
    "SAC": "",
    "HCC": "",
    "SABR": "",
    "ILD": "",
    "IVP": "",
    "MRCP": "",
    "IV": "",
    "RCA": "",
    "COVID": "",
    "2D": "",
    "SMV": "",
    "FNA": "",
    "BAL": "",
    "AVMs": "",
    "AVM": "",
    "MRA": "",
    "AP": "",
    "MRIs": "",
    "COVID19": "",
    "BHD": "",
    "CTEPH": "",
    "RML": "",
    "ITA": "",
    "NGT": "",
    "GE": "",
    "MDS": "",
    "UVJ": "",
    "ERCP": "",
    "OP": "",
    "IJ": "",
    "VSD": "",
    "EMR": "",
    "TE": "",
    "AV": "",
    "PAN": "",
    "ACR": "",
    "III": "",
    "SLE": "",
    "CTS": "",
    "IPF": "",
    "3MM": "",
    "4MM": "",
    "PAPVR": "",
    "ANCA": "",
    "HILA": "",
    "VQ": "",
    "PA": "",
    "PCP": "",
    "CMV": "",
    "PSOAS": "",
    "RVH": "",
    "TSH": "",
    "CBD": "",
    "BNP": "",
    "16MM": "",
    "SM": "",
    "NP": "",
    "ST": "",
    "CVC": "",
    "SVG": "",
    "PDA": "",
    "VIII": "",
    "5A": "",
    "ICU": "",
    "CPR": "",
    "DAH": "",
    "PAP": "",
    "II": "",
    "ENT": "",
    "FNH": "",
    "LLL": "",
    "CTPA": "",
    "LA": "",
    "ABPA": "",
    "IMA": "",
    "CDA": "",
    "RT": "",
    "CCU": "",
    "ALS": "",
    "LT": "",
    "RCC": "",
    "AML": "",
    "HCG": "",
    "2R": "",
    "IJV": "",
    "LE": "",
    "ASAP": "",
    "1L": "",
    "IHSS": "",
    "13MM": "",
    "PFO": "",
    "CCA": "",
    "SCA": "",
    "CRUS": "",
    "ANS": "",
    "IgG4": "",
    "ICD": "",
    "T9": "",
    "CVICU": "",
    "T12": "",
    "L5": "",
    "L1": "",
    "L3": "",
    "T4": "",
    "T5": "",
    "T7": "",
    "T8": "",
    "T10": "",
    "L2": "",
    "8MM": "",
    "T2": "",
    "PM": "",
    "IUD": "",
    "T3": "",
    "T6": "",
    "C7": "",
    "S4": "",
    "T11": "",
    "L4": "",
    "T1": "",
    "S1": "",
    "PAH": "",
    "S9": "",
    "IMH": "",
    "ILL": "",
    "VATS": "",
    "X3": "",
    "S2": "",
    "LVAD": "",
    "ATTHE": ""
}

In [None]:
# Expand acronyms in the raw impressions text.
#
# Fixes relative to the previous version of this cell:
#   * it referenced `abbreviations`, which is never defined -- the lookup table
#     in this notebook is called `acronyms`;
#   * `Series.replace(..., regex=True)` matches substrings, so e.g. "ED" or "ER"
#     would rewrite the inside of longer upper-case words; keys are now matched
#     only as whole words (`\b...\b`);
#   * entries whose expansion is "" are placeholders and are skipped, rather
#     than deleting the acronym from the text.
acronym_expansions = {short: full for short, full in acronyms.items() if full}

# One alternation over all usable keys; longest first so a longer key is always
# preferred over a shorter key that is its prefix. Keys are escaped so they are
# matched literally.
acronym_pattern = re.compile(
    r"\b(?:"
    + "|".join(
        re.escape(short)
        for short in sorted(acronym_expansions, key=len, reverse=True)
    )
    + r")\b"
)


def expand_acronyms(impressions):
    """Return a copy of a text Series with whole-word acronyms spelled out.

    Matching is case-sensitive and uses the keys of ``acronym_expansions``.
    Missing values (NaN) are passed through unchanged by ``Series.str.replace``.
    """
    if not acronym_expansions:
        # An empty alternation would match the empty string everywhere.
        return impressions.copy()
    return impressions.str.replace(
        acronym_pattern,
        lambda match: acronym_expansions[match.group(0)],
        regex=True,
    )


train["impressions_clean"] = expand_acronyms(train["impressions"])
test["impressions_clean"] = expand_acronyms(test["impressions"])

In [None]:
# Lower-case the text and strip every character that is not a lower-case
# letter, a digit or whitespace.
#
# `regex=True` is required: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, so without it nothing is removed.
# The pattern is a raw string so that `\s` is not read as a string escape.
NON_ALNUM_PATTERN = r"[^a-z0-9\s]"

train["impressions_clean"] = (
    train["impressions_clean"].str.lower().str.replace(NON_ALNUM_PATTERN, "", regex=True)
)
test["impressions_clean"] = (
    test["impressions_clean"].str.lower().str.replace(NON_ALNUM_PATTERN, "", regex=True)
)

# Drop training rows whose impression is missing (NaN); empty strings are kept.
# NOTE(review): `test` is not filtered here, so a missing impression in the
# test split will still reach the tokenizer -- confirm how those rows should
# be handled.
train.dropna(subset=["impressions_clean"], inplace=True)

In [None]:
# Tokenizer and TensorFlow encoder are loaded from the same pretrained checkpoint.
BERT_CHECKPOINT = "emilyalsentzer/Bio_ClinicalBERT"

tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
model = TFAutoModel.from_pretrained(BERT_CHECKPOINT)

In [None]:
# Tokenize the cleaned text of both splits with identical settings:
# padded, truncated, and returned as TensorFlow tensors.
tokenize_kwargs = dict(padding=True, truncation=True, return_tensors="tf")

train_encodings = tokenizer(train["impressions_clean"].tolist(), **tokenize_kwargs)
test_encodings = tokenizer(test["impressions_clean"].tolist(), **tokenize_kwargs)

In [None]:
# Extract sentence embeddings: the hidden state at position 0 ([CLS]) of the
# encoder's last layer.
EMBED_BATCH_SIZE = 32


def embed_cls(encoder, encodings, batch_size=EMBED_BATCH_SIZE):
    """Return the [CLS] embedding of every tokenized text as a NumPy array.

    Args:
        encoder: the pretrained transformer, called with ``input_ids`` and
            ``attention_mask``.
        encodings: tokenizer output holding ``input_ids`` and
            ``attention_mask`` tensors with one row per text.
        batch_size: number of rows sent through the encoder per forward pass.

    Returns:
        Array with one row per text, taken from ``last_hidden_state[:, 0, :]``.

    The attention mask is passed so padding tokens are ignored by the encoder
    (the inputs were tokenized with ``padding=True``), and rows are processed
    in batches instead of one forward pass over the whole split, which keeps
    memory bounded.
    """
    input_ids = encodings["input_ids"]
    attention_mask = encodings["attention_mask"]

    cls_batches = []
    for start in range(0, int(input_ids.shape[0]), batch_size):
        stop = start + batch_size
        hidden = encoder(
            input_ids=input_ids[start:stop],
            attention_mask=attention_mask[start:stop],
            training=False,  # inference only: keep dropout disabled
        ).last_hidden_state
        cls_batches.append(hidden[:, 0, :].numpy())
    return np.concatenate(cls_batches, axis=0)


# `model` is the BERT encoder at this point; it is passed explicitly because a
# later cell rebinds the name `model` to the Keras classifier.
X_train = embed_cls(model, train_encodings)
X_test = embed_cls(model, test_encodings)

In [None]:
# Multi-task classifier: one shared hidden layer feeding a separate sigmoid
# head per target, each head named after its target.
# NOTE(review): `targets` is not defined in any visible cell -- it must be an
# iterable of target names before this cell runs.
input_layer = Input(shape=(X_train.shape[1],))
dense = Dense(128, activation="relu")(input_layer)

outputs = [Dense(1, activation="sigmoid", name=target)(dense) for target in targets]

# NOTE(review): this rebinds `model`, which an earlier cell uses for the BERT
# encoder; re-running the embedding cell after this one will call the
# classifier instead.
model = Model(inputs=input_layer, outputs=outputs)
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    # Spelled out per head: a single shared metrics list is rejected for
    # multi-output models by newer Keras releases, while a dict keyed by
    # output name is accepted by both old and new versions.
    metrics={target: ["accuracy"] for target in targets},
)

In [None]:
# Fit the classifier, passing one label vector per head in the order of `targets`.
# NOTE(review): `y_train` / `y_test` are not defined in any visible cell --
# presumably they hold one column per target; confirm against their source.
train_labels = [y_train[target] for target in targets]
holdout_labels = [y_test[target] for target in targets]

history = model.fit(
    x=X_train,
    y=train_labels,
    batch_size=32,
    epochs=10,
    validation_data=(X_test, holdout_labels),
)

In [None]:
# Evaluate every head on the test embeddings.
#
# `predictions` was previously never computed, and it was indexed as a 2-D
# array (`predictions[:, i]`) although the classifier is built with a list of
# output heads. Predict once, then index the result per head.
predictions = model.predict(X_test)
if not isinstance(predictions, (list, tuple)):
    # Not a list/tuple (e.g. a single array): wrap it so indexing below is uniform.
    predictions = [predictions]

for i, target in enumerate(targets):
    y_true = y_test[target]
    y_score = predictions[i].ravel()        # flatten the head's output to 1-D scores
    y_pred = (y_score > 0.5).astype(int)    # hard labels at the 0.5 threshold

    print(f"Results for {target}:")
    print(classification_report(y_true, y_pred))
    if len(set(y_true)) > 1:
        print("AUC:", roc_auc_score(y_true, y_score))
    else:
        # roc_auc_score raises when y_true contains a single class.
        print("AUC: undefined (only one class present in y_true)")