# Data shift experiment

We create a new version of the text data in which phrases describing the presence or absence of symptoms are randomly masked out of the clinical notes. We train our text-only model on the original notes (nothing masked out), fit the C-BN-text and V-C-BN-text models on these original notes as well, and then evaluate all models' performance on the redacted notes. This way, we find out how our method deals with distribution shifts in the text data at inference time.

## Redact symptom mentions from notes

We can use the span annotations released with the synsum dataset to identify where in the note each symptom is mentioned. We then mask each of these spans out with 50% probability by dropping the sentence(s) containing the span.

In [None]:
import pandas as pd
from run_experiments import load_simsum

# Load the dataset through the project's own loader. The cells below read the
# note text from the "text" column and use each row's index label (row.name)
# to look up that note's span annotations.
df = load_simsum()

In [None]:
import json
# Span annotations for the notes. Further down, `ann` is indexed with the
# note's index label converted to a string, and every span entry is read
# through its "text" key.
# The encoding is given explicitly so the file is decoded the same way on
# every platform instead of relying on the locale's default encoding.
with open("simsum/normal_span_annotations.json", "r", encoding="utf-8") as file:
    ann = json.load(file)

In [None]:
from nltk.tokenize import sent_tokenize

def split_into_sentences(note):
    """Split a clinical note into sentences, keeping section headings intact.

    The text between the **History** and **Physical Examination** headings and
    the text after **Physical Examination** are each passed to
    nltk.tokenize.sent_tokenize. The two headings are emitted as their own
    list items (with surrounding newlines) so they are never merged into a
    neighbouring sentence. Any text before the **History** heading is not
    included in the result.

    Args:
        note: Full text of one clinical note.

    Returns:
        A list of strings: the History heading, the History sentences, the
        Physical Examination heading, and the Physical Examination sentences.
        If either heading is missing, or they appear in the opposite order,
        the whole note is sentence-tokenized as-is and no headings are added.
    """
    history_token = "**History**"
    pe_token = "**Physical Examination**"
    history_start = note.find(history_token)
    pe_start = note.find(pe_token)

    # str.find returns -1 when a heading is absent. Using that value in the
    # slices below would silently cut characters off the note and emit headings
    # the note never contained, so fall back to plain sentence splitting.
    if history_start == -1 or pe_start == -1 or pe_start < history_start:
        return sent_tokenize(note)

    history = note[history_start + len(history_token):pe_start]
    pe = note[pe_start + len(pe_token):]
    return (
        [history_token + '\n']
        + sent_tokenize(history)
        + ['\n' + pe_token + '\n']
        + sent_tokenize(pe)
    )

In [None]:
import numpy as np
import textwrap
np.random.seed(2025)  # fixed seed so the same spans are redacted on every run

def mask_spans(row, p=0.5):
    """Return the note text with randomly chosen symptom mentions removed.

    The note is split into sentences with split_into_sentences. For every
    annotated span of this note, one random number is drawn from NumPy's
    global random state; with probability ``p`` all sentences that contain
    the span's text (substring match) are dropped.

    Args:
        row: One row of the notes DataFrame. ``row.name`` (the index label),
            converted to a string, is the key into ``ann``; ``row["text"]``
            is the note text.
        p: Probability of masking each individual span. Defaults to 0.5.

    Returns:
        The remaining sentences (including section headings) joined with a
        single space.

    Raises:
        KeyError: If ``ann`` has no entry for this note.
    """
    note_id = row.name
    sentences = split_into_sentences(row["text"])
    for span in ann[str(note_id)]:
        # Exactly one draw per span, in annotation order, so that results stay
        # reproducible under the fixed seed.
        if np.random.rand() < p:
            span_text = span["text"]
            sentences = [sent for sent in sentences if span_text not in sent]
    return ' '.join(sentences)

In [None]:
# Redact every note row by row. mask_spans draws from NumPy's global random
# state, so the outcome depends on the seed set above and on the row order.
redacted_notes = df.apply(mask_spans, axis=1)
df["redacted"] = redacted_notes

In [None]:
# Sanity check: print the first few notes next to their redacted versions.
# head() simply returns fewer rows when the frame is shorter than 5, so this
# preview no longer raises IndexError on small frames.
separator = "--------"
preview = df.head(5)
for note_text, note_redacted in zip(preview["text"], preview["redacted"]):
    print(textwrap.fill(note_text))
    print(separator)
    print(textwrap.fill(note_redacted))
    print(separator)

## Get BioLORD embeddings for the redacted notes

In [None]:
from sentence_transformers import SentenceTransformer
# Load the BioLORD sentence-embedding model by its model identifier.
BIOLORD_MODEL_NAME = "FremyCompany/BioLORD-2023"
model = SentenceTransformer(BIOLORD_MODEL_NAME)

In [None]:
import nltk
# Fetch the 'punkt_tab' resource (presumably the data sent_tokenize relies on).
# NOTE(review): sent_tokenize is already called in earlier cells of this
# notebook (through split_into_sentences / mask_spans), so this resource
# probably has to be available before those cells run — consider moving this
# download to the top of the notebook.
nltk.download('punkt_tab')

def get_embeddings(row):
    """Embed one redacted note as the mean of its sentence embeddings.

    The text in ``row["redacted"]`` is split into sentences with NLTK, each
    sentence is encoded with the module-level ``model`` (asking it for
    normalized embeddings), and the sentence vectors are averaged along the
    first axis. The averaged vector itself is not re-normalized.

    Args:
        row: One row of the notes DataFrame; must have a "redacted" entry.

    Returns:
        The mean-pooled embedding of the note (presumably a 1-D NumPy array,
        one value per embedding dimension — confirm against the model's
        ``encode`` output type).
    """
    sentences = nltk.tokenize.sent_tokenize(row["redacted"])
    sentence_vectors = model.encode(
        sentences,
        normalize_embeddings=True,
        show_progress_bar=False,
    )
    # Mean pooling over the sentence axis.
    return sentence_vectors.mean(axis=0)

In [None]:
# Embed the redacted notes in blocks of 100 rows, printing progress per block.
# Stepping through the row positions with range(start, stop, step) also covers
# a final, shorter block; the earlier int(len(df)/100) bound rounded down and
# left trailing rows without an embedding whenever len(df) was not a multiple
# of 100.
block_size = 100
for k, start in enumerate(range(0, len(df), block_size)):
    print(f"retrieving embeddings for round {k}")
    df_subset = df.iloc[start:start + block_size].copy()
    df_subset["redacted_embedding"] = df_subset.apply(get_embeddings, axis=1)
    # Write the block back by index label so rows stay aligned with df.
    df.loc[df_subset.index, "redacted_embedding"] = df_subset["redacted_embedding"]