In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "6"
import pandas as pd
import random
from dataclasses import dataclass
import numpy as np
import torch
# import seaborn as sns
import transformers
import json
# import glob
from tqdm import tqdm
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer
from torchmetrics import MetricCollection
from torchmetrics.classification import Accuracy, AUROC, F1Score, Precision, Recall
# from itertools import chain
import medspacy
from medspacy.section_detection import SectionRule

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# get cpu, gpu or mps device for training.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"using device: {device}...")

using device: cuda...


In [3]:
torch.cuda.is_available()

True

In [4]:
def seed_script(seed: int):
    # set torch seed
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    
    # set numpy seed
    np.random.seed(seed)
    print(f"seed set to {seed}...")
    
SEED = 13
seed_script(SEED)

seed set...


## Data Import and Preprocessing

In [5]:
# handle data import here
train_notes_df = pd.read_csv("./data/df_notes.csv")
train_notes_df

Unnamed: 0.1,Unnamed: 0,ROW_ID,TEXT
0,0,37988,Admission Date: [**2166-6-5**] D...
1,1,37282,Admission Date: [**2109-12-23**] ...
2,2,26313,Admission Date: [**2114-12-17**] Discha...
3,3,13852,Admission Date: [**2112-5-16**] ...
4,4,51031,Admission Date: [**2155-2-26**] ...
...,...,...,...
7020,7020,14096,Admission Date: [**2136-5-6**] Discharg...
7021,7021,12564,Admission Date: [**2152-2-28**] Dischar...
7022,7022,24492,Admission Date: [**2146-2-7**] D...
7023,7023,26304,Admission Date: [**2167-2-10**] ...


In [6]:
# load the dataset of annotations
train_ann_url = 'https://raw.githubusercontent.com/hibaahsan/MIMIC-SBDH/main/MIMIC-SBDH.csv'
train_ann_df = pd.read_csv(train_ann_url)
train_ann_df

Unnamed: 0,row_id,sdoh_community_present,sdoh_community_absent,sdoh_education,sdoh_economics,sdoh_environment,behavior_alcohol,behavior_tobacco,behavior_drug
0,5,0,0,0,0,0,0,1,0
1,42,0,0,0,0,0,0,2,0
2,136,1,0,0,2,1,3,4,0
3,442,1,1,0,0,1,3,1,2
4,328,1,0,0,2,1,3,3,3
...,...,...,...,...,...,...,...,...,...
7020,58064,1,0,1,1,1,0,0,0
7021,58873,1,0,0,0,1,3,3,3
7022,58947,1,0,1,2,2,0,0,0
7023,58624,1,0,0,1,0,1,2,0


In [7]:
# recode our target variables
alcohol_level_dict = {
    0: 0, 
    1: 1, 
    2: 1, 
    3: 1, 
    4: 1
}
environment_level_dict = {
    0: 0, 
    1: 1, 
    2: 1
}

train_set_df = train_notes_df.merge(train_ann_df, left_on="ROW_ID", right_on="row_id")
train_set_df

# recode the levels of 'behavior_alcohol' based on the alcohol level dict
train_set_df.loc[:, 'alcohol_binary'] = train_set_df.loc[:, 'behavior_alcohol'].map(alcohol_level_dict)

# recode the levels of 'sdoh_environment' based on the environment level dict
train_set_df.loc[:, 'environment_binary'] = train_set_df.loc[:, 'sdoh_environment'].map(environment_level_dict)

# if we have documentation of community present OR absent, code the binary version as 1 else 0
train_set_df.loc[:, 'community_binary'] = (train_set_df.sdoh_community_present == 1) | (train_set_df.sdoh_community_absent == 1)
train_set_df.loc[:, 'community_binary'] = train_set_df.loc[:, 'community_binary'].astype(int)

In [8]:
train_set_df.behavior_alcohol.value_counts()

behavior_alcohol
3    2444
1    2077
0    1657
2     515
4     332
Name: count, dtype: int64

In [9]:
train_set_df[['sdoh_community_present', 'sdoh_community_absent']].value_counts()

sdoh_community_present  sdoh_community_absent
1                       0                        3878
0                       0                        2363
1                       1                         585
0                       1                         199
Name: count, dtype: int64

In [10]:
train_set_df.environment_binary.value_counts()

environment_binary
1    4420
0    2605
Name: count, dtype: int64

In [11]:
train_set_df.community_binary.value_counts()

community_binary
1    4662
0    2363
Name: count, dtype: int64

In [12]:
train_set_df.alcohol_binary.value_counts()

alcohol_binary
1    5368
0    1657
Name: count, dtype: int64

In [13]:
def social_history_sectionizer(dataframe, text_col: str = 'TEXT'):
    dataframe.loc[:, 'social_history'] = ''

    # build our medspacy pipeline
    nlp = medspacy.load()
    sectionizer = nlp.add_pipe("medspacy_sectionizer")
    pattern_list = [
        SectionRule(
            category="social_history", 
            literal="SOCIAL HISTORY:"
        )
    ]
    sectionizer.add(pattern_list)
    
    for i, data in tqdm(dataframe.iterrows(), total=len(dataframe)):
        # apply the spacy pipeline to the text
        text_sections = nlp(data[text_col])

        # zip the titles and bodies together
        title_body_zip = zip(
            text_sections._.section_categories, 
            text_sections._.section_bodies
        )

        # extract the social history
        soc_hist = [str(body).strip() for title, body in title_body_zip if title == "social_history"]
        
        # if a social history section was found, add it to the dataframe
        if soc_hist:
            dataframe.at[i, 'social_history'] = soc_hist[0]

    return dataframe
    
train_set_df = social_history_sectionizer(train_set_df)
# test_sections = social_history_sectionizer(train_set_df)

 55%|█████▌    | 3872/7025 [23:35<20:31,  2.56it/s]  

Admission Date:  [**2129-10-18**]     Discharge Date:  [**2129-10-22**]

Date of Birth:   [**2072-10-1**]     Sex:  F

Service:
CHIEF COMPLAINT:  Pelvic organ prolapse status post
anterior-posterior repair.

HISTORY OF PRESENT ILLNESS:  This is a 56-year-old G3, P3 who
has noticed an increasing vaginal bulge in [**Month (only) 205**] of this year.
but did notice an increase in urinary frequency, nocturia,
and urgency.  She had no change in her bowel habits, and is
not sexually active.

Preoperative physical examination showed a Stage II pelvic
organ prolapse mostly cystocele.  The decision was made to
proceed with an anterior-posterior colporrhaphy.
PAST MEDICAL HISTORY AND PAST SURGICAL HISTORY:  Uterine
suspension and total abdominal hysterectomy, left
salpingo-oophorectomy in [**2106**], three right breast biopsies
all benign, tonsil and adenoidectomy, and appendectomy,
irritable bowel syndrome, pernicious anemia, migraine
headaches.

PAST OB HISTORY:  Three full term normal spontan

100%|██████████| 7025/7025 [42:43<00:00,  2.74it/s]


In [40]:
train_set_df.to_csv('data/train_sample.csv', index=False)

In [39]:
train_set_df[pd.isnull(train_set_df.social_history)]

Unnamed: 0.1,Unnamed: 0,ROW_ID,TEXT,row_id,sdoh_community_present,sdoh_community_absent,sdoh_education,sdoh_economics,sdoh_environment,behavior_alcohol,behavior_tobacco,behavior_drug,alcohol_binary,environment_binary,community_binary,social_history


In [15]:
train_set_df = pd.read_csv("./train_sample.csv")
train_set_df

Unnamed: 0.1,Unnamed: 0,ROW_ID,TEXT,row_id,sdoh_community_present,sdoh_community_absent,sdoh_education,sdoh_economics,sdoh_environment,behavior_alcohol,behavior_tobacco,behavior_drug,alcohol_binary,environment_binary,community_binary,social_history
0,0,37988,Admission Date: [**2166-6-5**] D...,37988,0,0,0,0,1,2,3,0,1,1,0,Lives at [**First Name4 (NamePattern1) 2299**]...
1,1,37282,Admission Date: [**2109-12-23**] ...,37282,1,1,0,1,0,2,0,0,1,0,1,She is divorced with 2 children. Recently move...
2,2,26313,Admission Date: [**2114-12-17**] Discha...,26313,0,0,0,0,0,3,2,0,1,0,0,Negative for alcohol. She does admit to\nprevi...
3,3,13852,Admission Date: [**2112-5-16**] ...,13852,0,0,0,0,0,1,0,0,1,0,0,"- ETOH, -Tob"
4,4,51031,Admission Date: [**2155-2-26**] ...,51031,1,1,0,0,1,0,0,0,0,1,1,Patient is single without children. She lives ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7020,7020,14096,Admission Date: [**2136-5-6**] Discharg...,14096,1,0,0,0,1,3,3,0,1,1,1,She denies any history of tobacco and no\nhist...
7021,7021,12564,Admission Date: [**2152-2-28**] Dischar...,12564,0,0,0,0,0,0,2,0,0,0,0,The patient quit smoking 40 years previous.
7022,7022,24492,Admission Date: [**2146-2-7**] D...,24492,1,0,0,2,1,1,1,2,1,1,1,- Retired and lives at [**Hospital1 1426**]/[*...
7023,7023,26304,Admission Date: [**2167-2-10**] ...,26304,0,0,0,0,0,1,2,3,1,0,0,[**1-19**] pack per day smoking history for 20...


In [16]:
from sklearn.model_selection import train_test_split

# 70/15/15 split
# perform a split on the idxs stratified by the target classes
# to make sure we have an even class distribution
train_df, val_test_df = train_test_split(
    train_set_df,
    test_size=0.3,
    random_state=SEED
)

val_df, test_df = train_test_split(
    val_test_df,
    test_size=0.5,
    random_state=SEED
)

## Data Import and Preprocessing

In [19]:
class TextDataset(Dataset):
    """
    class is very closely based on the huggingface tutorial implementation
    """
    def __init__(self, dataframe, tokenizer, max_len, target_cols: list[str], id_col: str = 'row_id',
                 text_col: str = 'TEXT'):
        self.tokenizer = tokenizer
        # self.data = dataframe
        self.text_id_list = list(dataframe[id_col])
        self.text_list = list(dataframe[text_col])
        self.label_list = self._get_labels(dataframe, target_cols)
        self.max_len = max_len
        
    def _get_labels(self, dataframe, target_col_list):
        label_list_container = list()
        
        for target_col in target_col_list:
            label_list_container.append(
                list(dataframe[target_col].astype(float))
            )
            
        return list(zip(*label_list_container))

    def __len__(self):
        # get length of dataset (required for dataloader)
        return len(self.text_list)

    def __getitem__(self, idx):
        # extract text
        text = str(self.text_list[idx])

        # extract label
        label = self.label_list[idx]

        # tokenize text
        encoded_text = self.tokenizer.encode_plus(
            text,
            # add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True
        )

        # unpack encoded text
        ids = encoded_text['input_ids']
        attention_mask = encoded_text['attention_mask']
        token_type_ids = encoded_text["token_type_ids"]

        # wrap outputs in dict
        out_dict = {
            'text_id_list': self.text_id_list,
            'id_tensor': torch.tensor(ids, dtype=torch.long),
            'mask_tensor': torch.tensor(attention_mask, dtype=torch.long),
            'token_type_tensor': torch.tensor(token_type_ids, dtype=torch.long),
            'label_tensor': torch.tensor(label, dtype=torch.float)
        }

        return out_dict

In [20]:
# load roberta base as a tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)

target_col_list = ['environment_binary', 'community_binary', 'alcohol_binary']

MAX_LEN = 256

# load dataframes into dataset objects
train_ds = TextDataset(
    dataframe=train_df, 
    tokenizer=tokenizer, 
    max_len=MAX_LEN, 
    target_cols=target_col_list,
    text_col='social_history'
)

val_ds = TextDataset(
    dataframe=val_df, 
    tokenizer=tokenizer, 
    max_len=MAX_LEN, 
    target_cols=target_col_list,
    text_col='social_history'
)

test_ds = TextDataset(
    dataframe=test_df, 
    tokenizer=tokenizer, 
    max_len=MAX_LEN, 
    target_cols=target_col_list,
    text_col='social_history'
)

In [11]:
def get_dataloader(dataset, batch_size, shuffle: bool = True,
                   pin_memory: bool = True, num_workers: int = 0,
                   prefetch_factor: int or None = None):
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        pin_memory=pin_memory,
        num_workers=num_workers,
        prefetch_factor=prefetch_factor
    )
    return dataloader

BATCH_SIZE = 128

# load datasets into loaders
train_loader = get_dataloader(train_ds, BATCH_SIZE)
val_loader = get_dataloader(val_ds, BATCH_SIZE)
test_loader = get_dataloader(test_ds, BATCH_SIZE)

## Instantiate RoBERTa

In [12]:
class CustomRoberta(torch.nn.Module):
    """
    model subclass to define the RoBERTa architecture, also closely based on
    the huggingface tutorial implementation
    """
    def __init__(self, drop_percent, num_classes, pt_model_name: str = 'roberta-base'):
        super().__init__()
        self.base_model = RobertaModel.from_pretrained(pt_model_name)
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(drop_percent)
        self.classifier = torch.nn.Linear(768, num_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        # get outputs from base model
        base_outputs = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        # extract hidden state from roberta base outputs
        hidden_state = base_outputs[0]
        x = hidden_state[:, 0]

        # define the linear layer preceding the classifier
        # and apply ReLU activation to its outputs
        x = self.pre_classifier(x)
        x = torch.nn.ReLU()(x)

        # define the dropout layer and classifier
        x = self.dropout(x)
        x = self.classifier(x)
        return x

In [13]:
# define metric collection
# TASK_TYPE = 'binary'
TASK_TYPE = 'multilabel'
NUM_CLASSES = 2
NUM_LABELS = 3

AVERAGE_STRATEGY = 'global'

metric_collection = MetricCollection({
    'acc': Accuracy(task=TASK_TYPE, num_labels=NUM_LABELS, num_classes=NUM_CLASSES, multidim_average=AVERAGE_STRATEGY),
    'auc': AUROC(task=TASK_TYPE, num_labels=NUM_LABELS, num_classes=NUM_CLASSES),
    'prec': Precision(task=TASK_TYPE, num_labels=NUM_LABELS, num_classes=NUM_CLASSES, multidim_average=AVERAGE_STRATEGY),
    'rec': Recall(task=TASK_TYPE, num_labels=NUM_LABELS, num_classes=NUM_CLASSES, multidim_average=AVERAGE_STRATEGY),
    'f1': F1Score(task=TASK_TYPE, num_labels=NUM_LABELS, num_classes=NUM_CLASSES, multidim_average=AVERAGE_STRATEGY)
})

metric_collection.to(device)

MetricCollection(
  (acc): MultilabelAccuracy()
  (auc): MultilabelAUROC()
  (f1): MultilabelF1Score()
  (prec): MultilabelPrecision()
  (rec): MultilabelRecall()
)

## Train

In [14]:
# weight_tensor = torch.Tensor()

model = CustomRoberta(0.5, 3)
model.to(device)

LEARNING_RATE = 1e-4

# define loss and optimizer
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
from model.train import train_model
loader_dict = {'train': train_loader, 'val': val_loader, 'test': test_loader}

seed_script(SEED)

train_model(
    device=device, 
    model=model, 
    loader_dict=loader_dict, 
    metric_collection=metric_collection, 
    criterion=criterion,
    optimizer=optimizer, 
    n_epochs=10, 
    save_dir='roberta-test', 
    monitor_metric="val_loss"
)

seed set...

Epoch 0 ----------------------------------------


  0%|          | 0/9 [00:00<?, ?batch/s]

tensor([0., 0., 1.])
tensor([1., 1., 1.])
tensor([0., 0., 1.])
tensor([0., 1., 1.])
tensor([1., 1., 1.])
tensor([1., 0., 1.])
tensor([1., 1., 0.])
tensor([1., 1., 1.])
tensor([1., 0., 1.])
tensor([0., 0., 0.])
tensor([1., 1., 1.])
tensor([0., 0., 0.])
tensor([1., 1., 1.])
tensor([0., 0., 1.])
tensor([1., 1., 1.])
tensor([0., 1., 1.])
tensor([1., 1., 1.])
tensor([1., 1., 1.])
tensor([1., 1., 1.])
tensor([0., 0., 1.])
tensor([0., 1., 1.])
tensor([1., 0., 1.])
tensor([1., 0., 1.])
tensor([0., 0., 0.])
tensor([0., 0., 1.])
tensor([0., 0., 1.])
tensor([1., 0., 1.])
tensor([1., 1., 0.])
tensor([1., 1., 1.])
tensor([1., 0., 1.])
tensor([1., 1., 1.])
tensor([1., 1., 1.])


 11%|█         | 1/9 [00:01<00:13,  1.69s/batch, train_loss=0.685, train_acc=0.552, train_auc=0.459, train_f1=0.606, train_prec=0.733, train_rec=0.516]

tensor([0., 0., 1.])
tensor([1., 1., 1.])
tensor([1., 1., 1.])
tensor([1., 1., 0.])
tensor([0., 1., 0.])
tensor([1., 0., 0.])
tensor([1., 0., 1.])
tensor([1., 1., 1.])
tensor([1., 1., 0.])
tensor([1., 1., 1.])
tensor([1., 0., 1.])
tensor([1., 0., 1.])
tensor([1., 1., 1.])
tensor([0., 0., 0.])
tensor([0., 0., 1.])
tensor([1., 0., 0.])
tensor([0., 0., 1.])
tensor([1., 1., 1.])
tensor([0., 1., 0.])
tensor([0., 1., 1.])
tensor([0., 1., 1.])
tensor([1., 1., 1.])
tensor([1., 1., 1.])
tensor([0., 1., 1.])
tensor([1., 1., 1.])
tensor([1., 1., 1.])
tensor([1., 0., 1.])
tensor([1., 1., 1.])
tensor([1., 1., 1.])
tensor([0., 0., 0.])
tensor([0., 1., 1.])
tensor([0., 1., 1.])


 22%|██▏       | 2/9 [00:03<00:10,  1.57s/batch, train_loss=0.681, train_acc=0.531, train_auc=0.432, train_f1=0.612, train_prec=0.689, train_rec=0.55] 

tensor([1., 1., 1.])
tensor([1., 1., 0.])
tensor([1., 1., 1.])
tensor([0., 0., 1.])
tensor([1., 1., 1.])
tensor([1., 1., 1.])
tensor([1., 1., 1.])
tensor([1., 1., 1.])
tensor([1., 1., 1.])
tensor([1., 1., 0.])
tensor([1., 0., 0.])
tensor([0., 1., 1.])
tensor([0., 0., 0.])
tensor([1., 1., 1.])
tensor([1., 1., 1.])
tensor([0., 1., 1.])
tensor([0., 0., 0.])
tensor([1., 1., 1.])
tensor([1., 1., 1.])
tensor([1., 1., 1.])
tensor([1., 1., 1.])
tensor([1., 0., 1.])
tensor([0., 1., 1.])
tensor([0., 0., 0.])
tensor([1., 1., 1.])
tensor([1., 0., 0.])
tensor([1., 0., 1.])
tensor([1., 1., 0.])
tensor([1., 1., 1.])
tensor([1., 0., 1.])
tensor([0., 0., 1.])
tensor([1., 0., 1.])


 22%|██▏       | 2/9 [00:04<00:16,  2.32s/batch, train_loss=0.681, train_acc=0.531, train_auc=0.432, train_f1=0.612, train_prec=0.689, train_rec=0.55]


KeyboardInterrupt: 