In [1]:
import pandas as pd

In [2]:
df_multiple_notes = pd.read_csv("../data/cleaned_notes.csv")
df_multiple_notes.head()

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,diagnosis_category,gender,anchor_age,anchor_year,anchor_year_group,...,note_id,note_type,note_seq,charttime,storetime,text,note_length,anchor_date,days_since_anchor,cleaned_note_text
0,10070024,26769931,1,F323,10,Major Depression,F,23,2142,2014 - 2016,...,10070024-DS-13,DS,13.0,2145-01-29,2145-01-29 09:59:00,\nName: ___ Unit No: ___...,20409,2142-01-01,1124,\nName: ___ Unit No: ___...
1,10070024,26398294,1,F3189,10,Bipolar Disorder,F,23,2142,2014 - 2016,...,10070024-DS-14,DS,14.0,2145-02-09,2145-02-09 11:46:00,\nName: ___ Unit No: ___...,17868,2142-01-01,1135,"Chief Complaint:\n""I made a mistake. ""\n \nMaj..."
2,10080985,24679803,1,F332,10,Major Depression,F,22,2179,2014 - 2016,...,10080985-DS-16,DS,16.0,2179-05-13,2179-05-13 15:28:00,\nName: ___ Unit No: ___...,14035,2179-01-01,132,\nName: ___ Unit No: ___...
3,10080985,26523165,1,F329,10,Major Depression,F,22,2179,2014 - 2016,...,10080985-DS-15,DS,15.0,2179-05-11,2180-08-14 18:55:00,\nName: ___ Unit No: ___...,7168,2179-01-01,130,Chief Complaint:\ngenetic predisposition to br...
4,10266157,29245849,1,F332,10,Major Depression,F,76,2194,2011 - 2013,...,10266157-DS-22,DS,22.0,2198-10-22,2198-10-26 07:11:00,\nName: ___ Unit No: ___\...,11648,2194-01-01,1755,\nName: ___ Unit No: ___\...


In [3]:
#!pip install --quiet nltk Levenshtein

In [7]:
import spacy

# Load English tokenizer, tagger, parser and NER
nlp = spacy.load("en_core_web_sm")

def split_into_sentences(text):
    doc = nlp(text)
    return [sent.text.strip() for sent in doc.sents if sent.text.strip()]

In [4]:
from nltk.util import ngrams

# Match sentences exactly
def exact_sentence_match(curr_sentences, prev_sentences):
    prev_set = set(prev_sentences)
    return [sent for sent in curr_sentences if sent in prev_set]

# Match n-grams (6-10 words)
def exact_ngram_match(curr_sentences, prev_sentences, n_min=6, n_max=10):
    prev_ngrams = set()
    for sent in prev_sentences:
        tokens = sent.split()
        for n in range(n_min, n_max+1):
            prev_ngrams.update(ngrams(tokens, n))

    filtered_sentences = []
    for sent in curr_sentences:
        tokens = sent.split()
        keep = True
        for n in range(n_min, n_max+1):
            for gram in ngrams(tokens, n):
                if gram in prev_ngrams:
                    keep = False
                    break
            if not keep:
                break
        if keep:
            filtered_sentences.append(sent)
    return filtered_sentences


In [5]:
import Levenshtein

def levenshtein_match(curr_sentences, prev_sentences, threshold=0.85):
    repeated = []
    for curr in curr_sentences:
        if any(Levenshtein.ratio(curr, prev) >= threshold for prev in prev_sentences):
            repeated.append(curr)
    return repeated


In [8]:
from collections import Counter

methods = {
    "Exact Sentence Match": exact_sentence_match,
    "Exact Ngram Match": exact_ngram_match,
    "Levenshtein Match": levenshtein_match,
    # "Cosine Match": cosine_match,
    # "Sentence Transformer Match": sentence_transformer_match,
}

# To record results
results = {
    method: {
        "total_sentences": 0,
        "repeated_sentences": 0,
        "total_words": 0,
        "repeated_words": 0
    } for method in methods.keys()
}

# Sort the notes first
df_multiple_notes['storetime'] = pd.to_datetime(df_multiple_notes['storetime'])
df_multiple_notes = df_multiple_notes.sort_values(by=["subject_id", "storetime"])

for subject_id, group in df_multiple_notes.groupby('subject_id'):
    group = group.sort_values('storetime')
    prev_note = None
    
    for idx, row in group.iterrows():
        curr_note = row['text']
        curr_sentences = split_into_sentences(curr_note)
        curr_words = sum(len(sent.split()) for sent in curr_sentences)

        if prev_note:
            prev_sentences = split_into_sentences(prev_note)

            for method_name, method_func in methods.items():
                repeated_sents = method_func(curr_sentences, prev_sentences)
                repeated_words = sum(len(sent.split()) for sent in repeated_sents)

                results[method_name]["total_sentences"] += len(curr_sentences)
                results[method_name]["repeated_sentences"] += len(repeated_sents)
                results[method_name]["total_words"] += curr_words
                results[method_name]["repeated_words"] += repeated_words

        prev_note = curr_note  # next iteration

In [9]:
for method_name, stat in results.items():
    sent_total = stat["total_sentences"]
    sent_repeated = stat["repeated_sentences"]
    word_total = stat["total_words"]
    word_repeated = stat["repeated_words"]

    sent_ratio = (sent_repeated / sent_total * 100) if sent_total else 0
    word_ratio = (word_repeated / word_total * 100) if word_total else 0

    print(f"\n {method_name}")
    print(f"  - Total sentences: {sent_total}")
    print(f"  - Repeated sentences: {sent_repeated} ({sent_ratio:.2f}%)")
    print(f"  - Total words: {word_total}")
    print(f"  - Repeated words: {word_repeated} ({word_ratio:.2f}%)")



 Exact Sentence Match
  - Total sentences: 59816
  - Repeated sentences: 6792 (11.35%)
  - Total words: 905795
  - Repeated words: 63276 (6.99%)

 Exact Ngram Match
  - Total sentences: 59816
  - Repeated sentences: 50852 (85.01%)
  - Total words: 905795
  - Repeated words: 699583 (77.23%)

 Levenshtein Match
  - Total sentences: 59816
  - Repeated sentences: 10693 (17.88%)
  - Total words: 905795
  - Repeated words: 120173 (13.27%)


In [10]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.6.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting scipy (from sentence-transformers)
  Downloading scipy-1.13.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn->sentence-transformers)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading sentence_transformers-4.1.0-py3-none-any.whl (345 kB)
Downloading scikit_learn-1.6.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m [36m0:00:04[0m
[?25hDownloading scipy-1.13.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [11]:
from sentence_transformers import SentenceTransformer

# Load the model
model = SentenceTransformer('all-mpnet-base-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [13]:
# Repetition detection using MPNet + cosine similarity
def mpnet_similarity_filter(curr_sentences, prev_sentences, threshold=0.85):
    if not prev_sentences or not curr_sentences:
        return []

    embeddings_curr = model.encode(curr_sentences, convert_to_tensor=True)
    embeddings_prev = model.encode(prev_sentences, convert_to_tensor=True)

    repeated = []
    for i in range(len(curr_sentences)):
        scores = util.cos_sim(embeddings_curr[i], embeddings_prev)
        if scores.max() >= threshold:
            repeated.append(curr_sentences[i])
    return repeated

In [17]:
#!pip install tqdm

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [14]:
from tqdm import tqdm
# Prepare result counters
results = {
    "Sentence Transformer Match": {
        "total_sentences": 0,
        "repeated_sentences": 0,
        "total_words": 0,
        "repeated_words": 0
    }
}

# Ensure sorted notes
df_multiple_notes['storetime'] = pd.to_datetime(df_multiple_notes['storetime'])
df_multiple_notes = df_multiple_notes.sort_values(by=["subject_id", "storetime"])

# Process per patient
for subject_id, group in tqdm(df_multiple_notes.groupby("subject_id"), desc="Processing Patients"):
    group = group.sort_values("storetime")
    prev_note = None

    for idx, row in group.iterrows():
        curr_text = row['text']
        curr_sentences = split_into_sentences(curr_text)
        curr_word_count = sum(len(sent.split()) for sent in curr_sentences)

        if prev_note:
            prev_sentences = split_into_sentences(prev_note)
            repeated_sents = mpnet_similarity_filter(curr_sentences, prev_sentences)
            repeated_word_count = sum(len(sent.split()) for sent in repeated_sents)

            stats = results["Sentence Transformer Match"]
            stats["total_sentences"] += len(curr_sentences)
            stats["repeated_sentences"] += len(repeated_sents)
            stats["total_words"] += curr_word_count
            stats["repeated_words"] += repeated_word_count

        prev_note = curr_text


In [16]:
stat = results["Sentence Transformer Match"]
sent_total = stat["total_sentences"]
sent_repeated = stat["repeated_sentences"]
word_total = stat["total_words"]
word_repeated = stat["repeated_words"]

sent_ratio = (sent_repeated / sent_total * 100) if sent_total else 0
word_ratio = (word_repeated / word_total * 100) if word_total else 0

print(f" Sentence Transformer Match (all-mpnet-base-v2)")
print(f"  - Total sentences: {sent_total}")
print(f"  - Repeated sentences: {sent_repeated} ({sent_ratio:.2f}%)")
print(f"  - Total words: {word_total}")
print(f"  - Repeated words: {word_repeated} ({word_ratio:.2f}%)")


 Sentence Transformer Match (all-mpnet-base-v2)
  - Total sentences: 59816
  - Repeated sentences: 12649 (21.15%)
  - Total words: 905795
  - Repeated words: 149325 (16.49%)
