In [2]:
import sqlite3
import pandas as pd

# Connect to SQLite database
conn = sqlite3.connect("MIMIC3_demo.db")

# Load ADMISSIONS and DIAGNOSES_ICD tables
admissions_df = pd.read_sql_query("SELECT HADM_ID, DIAGNOSIS FROM admissions", conn)
diagnoses_df = pd.read_sql_query("SELECT HADM_ID, ICD9_CODE FROM diagnoses_icd", conn)

conn.close()

# Preview the data
print(admissions_df.head())
print(diagnoses_df.head())

  hadm_id            diagnosis
0  142345               SEPSIS
1  105331          HEPATITIS B
2  165520               SEPSIS
3  199207     HUMERAL FRACTURE
4  177759  ALCOHOLIC HEPATITIS
  hadm_id icd9_code
0  142345     99591
1  142345     99662
2  142345      5672
3  142345     40391
4  142345     42731


In [20]:
# Merge tables on HADM_ID
merged_df = admissions_df.merge(diagnoses_df, on="hadm_id", how="inner")

# Preview the merged data
print(merged_df.head())

  hadm_id diagnosis icd9_code
0  142345    SEPSIS     99591
1  142345    SEPSIS     99662
2  142345    SEPSIS      5672
3  142345    SEPSIS     40391
4  142345    SEPSIS     42731


In [21]:
import re

def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and digits
    text = re.sub(r"[^a-z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Apply cleaning to DIAGNOSIS column
merged_df["clean_diagnosis"] = merged_df["diagnosis"].apply(clean_text)

# Drop rows with missing or empty text
merged_df = merged_df.dropna(subset=["clean_diagnosis"])
merged_df = merged_df[merged_df["clean_diagnosis"] != ""]

# Find the top 10 most frequent ICD codes
top_icd_codes = merged_df["icd9_code"].value_counts().head(10).index

# Filter the dataset to include only these ICD codes
filtered_df = merged_df[merged_df["icd9_code"].isin(top_icd_codes)]

# Preview cleaned data
print(filtered_df.head())

   hadm_id diagnosis icd9_code clean_diagnosis
4   142345    SEPSIS     42731          sepsis
5   142345    SEPSIS      4280          sepsis
15  142345    SEPSIS     25000          sepsis
30  165520    SEPSIS       486          sepsis
31  165520    SEPSIS     42731          sepsis


In [22]:
from sklearn.preprocessing import MultiLabelBinarizer

# Convert ICD9_CODE into lists (one code per admission in this case)
filtered_df["icd_list"] = filtered_df["icd9_code"].apply(lambda x: [x])

# Use MultiLabelBinarizer to encode ICD codes
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(filtered_df["icd_list"])

# Preview encoded ICD codes
print("Classes:", mlb.classes_)
print("Encoded Labels Example:", y[:5])

Classes: ['25000' '2724' '4019' '42731' '4280' '486' '51881' '5849' '5990' '99592']
Encoded Labels Example: [[0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0]]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["icd_list"] = filtered_df["icd9_code"].apply(lambda x: [x])


In [25]:
from sklearn.model_selection import train_test_split

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    filtered_df["clean_diagnosis"], y, test_size=0.2, random_state=42
)

print("Training examples:", len(X_train))
print("Test examples:", len(X_test))


Training examples: 283
Test examples: 71


In [29]:
## Augementation 

import nltk
nltk.download('wordnet')
import random

# Augment text with synonym replacement
def synonym_replacement(sentence, n=2):  # Replace `n` words
    words = sentence.split()
    new_words = words.copy()
    for _ in range(n):
        word_to_replace = random.choice(words)
        synonyms = wordnet.synsets(word_to_replace)
        if synonyms:
            synonym = synonyms[0].lemmas()[0].name()
            new_words = [synonym if w == word_to_replace else w for w in new_words]
    return " ".join(new_words)

# Apply augmentation to training data
X_train_augmented = X_train.apply(lambda x: synonym_replacement(x, n=1))

# Combine original and augmented data
X_train_final = pd.concat([X_train, X_train_augmented])
y_train_final = pd.concat([pd.DataFrame(y_train), pd.DataFrame(y_train)])


[nltk_data] Downloading package wordnet to /Users/ecomak/nltk_data...


In [30]:
from transformers import AutoTokenizer

# Load ClinicalBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

# Tokenize training and test data
def tokenize_texts(texts):
    return tokenizer(
        list(texts),
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )

train_encodings = tokenize_texts(X_train_final)
test_encodings = tokenize_texts(X_test)

In [31]:
from transformers import AutoModelForSequenceClassification

# Load ClinicalBERT pre-trained model
model = AutoModelForSequenceClassification.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT",
    num_labels=y_train_final.shape[1]
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
from torch.utils.data import DataLoader, Dataset
import torch
from transformers import AdamW

# Custom Dataset class
class NotesDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

# Create DataLoaders
train_dataset = NotesDataset(train_encodings, y_train_final.values)
test_dataset = NotesDataset(test_encodings, y_test)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)




In [33]:
from tqdm import tqdm

# Move model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Training loop
for epoch in range(3):  # Train for 3 epochs
    model.train()
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels.float())
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Update progress bar
        loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=loss.item())


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 71/71 [02:43<00:00,  2.31s/it, loss=0.338]
Epoch 1: 100%|██████████| 71/71 [03:03<00:00,  2.58s/it, loss=0.299]
Epoch 2: 100%|██████████| 71/71 [02:56<00:00,  2.49s/it, loss=0.332]


In [34]:
from sklearn.metrics import classification_report

# Evaluation loop
model.eval()
y_pred = []
y_true = []

for batch in test_loader:
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = batch["labels"].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    predictions = (torch.sigmoid(logits) > 0.5).cpu().numpy()

    y_pred.extend(predictions)
    y_true.extend(labels.cpu().numpy())

# Classification report
print(classification_report(y_true, y_pred, target_names=mlb.classes_))

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


              precision    recall  f1-score   support

       25000       0.00      0.00      0.00         3
        2724       0.00      0.00      0.00         9
        4019       0.00      0.00      0.00        14
       42731       0.00      0.00      0.00         7
        4280       0.00      0.00      0.00         5
         486       0.00      0.00      0.00         4
       51881       0.00      0.00      0.00        10
        5849       0.00      0.00      0.00         5
        5990       0.00      0.00      0.00         8
       99592       0.00      0.00      0.00         6

   micro avg       0.00      0.00      0.00        71
   macro avg       0.00      0.00      0.00        71
weighted avg       0.00      0.00      0.00        71
 samples avg       0.00      0.00      0.00        71



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
