In [20]:
# Mount Google Drive inside the Colab runtime so files on the drive can be read below.

from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
# Load the notes table from Google Drive into a DataFrame.
# NOTE(review): the original prompt comment asked for df_notes_demographics.csv, but the
# file actually read below is df_notes.csv — confirm which file is intended.

import pandas as pd

train_notes_df= pd.read_csv("/content/drive/My Drive/data/df_notes.csv")


In [None]:
# Quick look at the first two rows of the notes table.
train_notes_df.head(n=2)

In [22]:
# Read the MIMIC-SBDH annotation CSV straight from its GitHub repository.
url = 'https://raw.githubusercontent.com/hibaahsan/MIMIC-SBDH/main/MIMIC-SBDH.csv'
train_ann_df = pd.read_csv(filepath_or_buffer=url)

In [23]:
# Lookup tables that collapse the multi-level annotation codes into binary labels:
# code 0 stays 0, every other listed code becomes 1.
alcohol_level_dict = {
    0: 0,
    1: 1,
    2: 1,
    3: 1,
    4: 1
}

environment_level_dict = {
    0: 0,
    1: 1,
    2: 1
}

# Attach the annotations to the notes; merge() defaults to an inner join, so only
# notes whose ROW_ID has a matching annotation row_id are kept.
train_set_df = train_notes_df.merge(train_ann_df, left_on="ROW_ID", right_on="row_id")

# recode the levels of 'behavior_alcohol' based on the alcohol level dict
train_set_df['alcohol_binary'] = train_set_df['behavior_alcohol'].map(alcohol_level_dict)

# recode the levels of 'sdoh_environment' based on the environment level dict
train_set_df['environment_binary'] = train_set_df['sdoh_environment'].map(environment_level_dict)

# if we have documentation of community present OR absent, code the binary version as 1 else 0.
# The integer column is built in a single assignment: writing a bool column first and then
# overwriting it with ints through `.loc[:, col] = ...` makes the resulting dtype depend on
# the pandas version's in-place setitem rules.
train_set_df['community_binary'] = (
    (train_set_df['sdoh_community_present'] == 1)
    | (train_set_df['sdoh_community_absent'] == 1)
).astype(int)

# Series.map() yields NaN for any code missing from the lookup tables; fail loudly here
# instead of letting NaN labels reach the loss function.
binary_label_columns = ['alcohol_binary', 'environment_binary', 'community_binary']
unmapped_counts = train_set_df[binary_label_columns].isna().sum()
assert not unmapped_counts.any(), (
    f"Unmapped annotation codes produced missing labels:\n{unmapped_counts}"
)

In [5]:
# Frequency of each raw alcohol annotation code.
train_set_df['behavior_alcohol'].value_counts()

behavior_alcohol
3    2444
1    2077
0    1657
2     515
4     332
Name: count, dtype: int64

In [6]:
# Joint frequency of the community present / absent flags.
train_set_df.value_counts(subset=['sdoh_community_present', 'sdoh_community_absent'])

sdoh_community_present  sdoh_community_absent
1                       0                        3878
0                       0                        2363
1                       1                         585
0                       1                         199
Name: count, dtype: int64

In [7]:
# Class balance of the binarised environment label.
train_set_df['environment_binary'].value_counts()

environment_binary
1    4420
0    2605
Name: count, dtype: int64

In [8]:
# Class balance of the binarised community label.
train_set_df['community_binary'].value_counts()

community_binary
1    4662
0    2363
Name: count, dtype: int64

In [9]:
# Class balance of the binarised alcohol label.
train_set_df['alcohol_binary'].value_counts()

alcohol_binary
1    5368
0    1657
Name: count, dtype: int64

In [10]:
# Preview the merged frame, including the derived binary label columns.
train_set_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,ROW_ID,TEXT,row_id,sdoh_community_present,sdoh_community_absent,sdoh_education,sdoh_economics,sdoh_environment,behavior_alcohol,behavior_tobacco,behavior_drug,alcohol_binary,environment_binary,community_binary
0,0,37988,Admission Date: [**2166-6-5**] D...,37988,0,0,0,0,1,2,3,0,1,1,0
1,1,37282,Admission Date: [**2109-12-23**] ...,37282,1,1,0,1,0,2,0,0,1,0,1
2,2,26313,Admission Date: [**2114-12-17**] Discha...,26313,0,0,0,0,0,3,2,0,1,0,0
3,3,13852,Admission Date: [**2112-5-16**] ...,13852,0,0,0,0,0,1,0,0,1,0,0
4,4,51031,Admission Date: [**2155-2-26**] ...,51031,1,1,0,0,1,0,0,0,0,1,1


1. Data Preprocessing

In [24]:
import pandas as pd
import re
def extract_section(text, start_keyword):
    """
    Return the text that follows ``start_keyword`` up to the next blank line.

    The keyword is matched case-insensitively and the first occurrence wins.
    The section ends at the first blank line (two consecutive newlines) after
    the keyword, or at the end of the text when there is none.

    Args:
        text: Full note text. Non-string values (e.g. NaN from a missing cell)
            are treated as containing no section.
        start_keyword: Literal header to look for, e.g. "Social History:".

    Returns:
        The section body with surrounding whitespace stripped, or None when
        the keyword does not occur in ``text``.
    """
    if not isinstance(text, str):
        return None

    # Search the original string directly. Offsets computed on a lowercased copy
    # can be misaligned because lowercasing may change the length of some
    # Unicode text (e.g. "İ" lowercases to two code points).
    header = re.search(re.escape(start_keyword), text, flags=re.IGNORECASE)
    if header is None:
        # Return None if the section isn't found
        return None

    start_index = header.end()
    # A double newline marks the end of the section
    end_index = text.find('\n\n', start_index)
    if end_index == -1:  # If no double newline is found, take the rest of the text
        end_index = len(text)
    return text[start_index:end_index].strip()

def clean_text(text):
    """
    Strip de-identification placeholders and normalise whitespace.

    Spans of the form ``[** ... **]`` are removed, every run of whitespace is
    collapsed to a single space, and the result is trimmed. ``None`` is passed
    through unchanged.
    """
    if text is None:
        return text

    without_placeholders = re.sub(r'\[\*\*.*?\*\*\]', '', text)
    return re.sub(r'\s+', ' ', without_placeholders).strip()




In [25]:
# Header that marks the start of the section of interest
# (extract_section compares it case-insensitively).
start_keyword = "Social History:"


def _social_history(note):
    """Extract the Social History section of one note and clean it."""
    return clean_text(extract_section(note, start_keyword))


# Extract and clean the Social History section from each row
train_set_df['processed_text'] = train_set_df['TEXT'].map(_social_history)

# Count how many rows have None or missing 'processed_text'
missing_count = train_set_df['processed_text'].isna().sum()

print(f"Number of records with missing 'Social History' sections: {missing_count}")


Number of records with missing 'Social History' sections: 0


In [26]:
# Number of whitespace-separated words in each row of 'processed_text'.
# The vectorised .str accessor leaves rows without a section (None) as missing
# instead of raising AttributeError the way `len(x.split())` would.
train_set_df['word_count'] = train_set_df['processed_text'].str.split().str.len()

# Get descriptive statistics of the word count distribution
train_set_df['word_count'].describe()


count    7025.000000
mean       25.492242
std        19.714779
min         0.000000
25%        12.000000
50%        21.000000
75%        34.000000
max       234.000000
Name: word_count, dtype: float64

2. Model Selection and Training

In [27]:
pip install transformers torch


Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [28]:
from transformers import DistilBertTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np

class TextDataset(Dataset):
    """
    Map-style dataset that tokenizes one text per item for multi-label training.

    Args:
        texts: Indexable collection of strings (indexed by position).
        labels: Indexable collection of per-item label vectors, aligned with ``texts``.
        tokenizer: Callable Hugging Face-style tokenizer.
        max_length: Every item is padded or truncated to this many tokens.

    Each item is a dict with:
        'input_ids'      – 1-D tensor of token ids
        'attention_mask' – 1-D tensor marking real tokens vs. padding
        'labels'         – 1-D float tensor holding the item's label vector
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.texts = texts
        self.labels = labels
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        labels = self.labels[idx]

        # Call the tokenizer directly: `encode_plus` is deprecated in favour of `__call__`.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # Coerce through a float32 array first so label rows taken from an
        # object-dtype array (e.g. a mixed-dtype DataFrame) still convert.
        label_vector = np.asarray(labels, dtype=np.float32)

        return {
            # return_tensors='pt' adds a leading batch dimension; drop it
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label_vector, dtype=torch.float)
        }

# Load the pretrained DistilBERT (uncased) tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Model inputs are the cleaned section texts; targets are the three binary label columns
label_columns = ['alcohol_binary', 'environment_binary', 'community_binary']
texts = train_set_df['processed_text'].tolist()
labels = train_set_df[label_columns].to_numpy()

# Dataset and shuffled batch loader over the full (unsplit) data
dataset = TextDataset(texts=texts, labels=labels, tokenizer=tokenizer)
loader = DataLoader(dataset=dataset, batch_size=16, shuffle=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [29]:
# Only the model class is imported here. `transformers.AdamW` is deprecated and missing
# from recent transformers releases; the optimizer used for training is torch.optim.AdamW.
from transformers import DistilBertForSequenceClassification

# Classification head with 3 outputs: one logit per binary label
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=3  # We have three binary outputs
)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device);  # trailing ';' keeps the long module repr out of the cell output


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [30]:
from sklearn.model_selection import train_test_split

SPLIT_SEED = 42  # fixed seed so both splits are reproducible

# Stage 1: hold out 20% of the examples; the remaining 80% is the training set
texts_train, texts_temp, labels_train, labels_temp = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=SPLIT_SEED,
)

# Stage 2: cut the held-out part in half to get the validation and test sets
texts_val, texts_test, labels_val, labels_test = train_test_split(
    texts_temp,
    labels_temp,
    test_size=0.5,
    random_state=SPLIT_SEED,
)

# Report how many examples ended up in each split
print(f"Training set size: {len(texts_train)}")
print(f"Validation set size: {len(texts_val)}")
print(f"Test set size: {len(texts_test)}")


Training set size: 5620
Validation set size: 702
Test set size: 703


In [31]:
BATCH_SIZE = 16

# One tokenizing dataset per split
train_dataset = TextDataset(texts=texts_train, labels=labels_train, tokenizer=tokenizer)
val_dataset = TextDataset(texts=texts_val, labels=labels_val, tokenizer=tokenizer)
test_dataset = TextDataset(texts=texts_test, labels=labels_test, tokenizer=tokenizer)

# Only the training batches are shuffled; validation and test keep their order
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [None]:
from torch.nn import BCEWithLogitsLoss
from torch.optim import AdamW
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, accuracy_score

# Binary cross-entropy on raw logits, applied to each of the model's outputs
criterion = BCEWithLogitsLoss()
# AdamW over all model parameters
optimizer = AdamW(params=model.parameters(), lr=5e-5)

def train_epoch(model, data_loader, optimizer, device, criterion):
    """
    Run one optimisation pass over ``data_loader`` and return the mean batch loss.

    Each batch is a dict with "input_ids", "attention_mask" and "labels". The
    model is called with ``labels=None`` and the loss is computed by
    ``criterion`` on the returned logits.
    """
    model = model.train()
    batch_losses = []

    for batch in data_loader:
        targets = batch["labels"].to(device)
        logits = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            labels=None,
        ).logits

        batch_loss = criterion(logits, targets)
        batch_losses.append(batch_loss.item())

        # Backpropagate, update the weights, then clear the gradients
        batch_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    return np.mean(batch_losses)
def evaluate(model, data_loader, device):
    """
    Score the model on ``data_loader`` and return per-label metrics.

    Logits and targets are gathered over all batches with gradients disabled.
    For every label column, accuracy is computed on the predictions
    ``logit > 0`` and ROC AUC on the raw logits.

    Returns:
        (accuracies, aucs): two lists with one entry per label column.
    """
    model = model.eval()
    collected_logits = []
    collected_targets = []

    with torch.no_grad():
        for batch in data_loader:
            batch_targets = batch["labels"].to(device)
            batch_logits = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            ).logits

            collected_logits.extend(batch_logits.detach().cpu().numpy())
            collected_targets.extend(batch_targets.detach().cpu().numpy())

    logits = np.array(collected_logits)
    targets = np.array(collected_targets)
    n_labels = targets.shape[1]

    accuracies = [accuracy_score(targets[:, k], logits[:, k] > 0) for k in range(n_labels)]
    aucs = [roc_auc_score(targets[:, k], logits[:, k]) for k in range(n_labels)]

    return accuracies, aucs
# Per-epoch metric history; every entry is a list with one value per label
train_accuracies, train_aucs = [], []
val_accuracies, val_aucs = [], []

for epoch in range(3):
    train_loss = train_epoch(model, train_loader, optimizer, device, criterion)

    # Score both splits after this epoch's updates
    train_acc, train_auc = evaluate(model, train_loader, device)
    val_acc, val_auc = evaluate(model, val_loader, device)

    for history, value in (
        (train_accuracies, train_acc),
        (train_aucs, train_auc),
        (val_accuracies, val_acc),
        (val_aucs, val_auc),
    ):
        history.append(value)

    print(f"Epoch {epoch + 1}, Training Loss: {train_loss:.2f}")
    print(f"Train Acc: {train_acc}, Train AUC: {train_auc}")
    print(f"Val Acc: {val_acc}, Val AUC: {val_auc}")

In [None]:
# Derive the x-axis and the number of curves from the recorded history, so the plot
# keeps working when the number of epochs or labels changes.
epochs = range(1, len(train_accuracies) + 1)
n_labels = len(train_accuracies[0]) if train_accuracies else 0

fig, ax = plt.subplots(2, 1, figsize=(10, 10))

# Plot accuracies (solid = train, dashed = validation)
for i in range(n_labels):
    ax[0].plot(epochs, [acc[i] for acc in train_accuracies], label=f'Train Class {i+1} Accuracy')
    ax[0].plot(epochs, [acc[i] for acc in val_accuracies], label=f'Val Class {i+1} Accuracy', linestyle='--')
ax[0].set_title('Accuracy over Epochs by Class')
ax[0].set_xlabel('Epochs')
ax[0].set_ylabel('Accuracy')
ax[0].legend()

# Plot AUCs (solid = train, dashed = validation)
for i in range(n_labels):
    ax[1].plot(epochs, [auc[i] for auc in train_aucs], label=f'Train Class {i+1} AUC')
    ax[1].plot(epochs, [auc[i] for auc in val_aucs], label=f'Val Class {i+1} AUC', linestyle='--')
ax[1].set_title('AUC over Epochs by Class')
ax[1].set_xlabel('Epochs')
ax[1].set_ylabel('AUC')
ax[1].legend()

plt.tight_layout()
plt.show()
