# Cell 1: Download models if needed

In [9]:
import stanza

# Make sure both model packages are usable before the later cells need them.
# Building a throwaway pipeline serves as the "is it installed?" probe; if the
# probe fails for any reason, fall back to an explicit download.
for (lang, package) in [('en', None), ('en', 'bc5cdr')]:
    # None stands for the stock package. Resolve the name once so the probe
    # and the fallback download always request the same package instead of
    # the download receiving a bare None.
    package_name = package or 'default'
    try:
        stanza.Pipeline(lang=lang, processors='tokenize,ner', package=package_name, verbose=False)
    except Exception:
        # Deliberately broad: any failure to build the probe pipeline is
        # treated as "models missing" and answered with a download attempt.
        stanza.download(lang, package=package_name)


# Cell 2: Load the sample text

In [10]:
# Sample patient/therapist dialogue, held as one multi-line string.
# Each turn sits on its own line in the form `Speaker: "quoted utterance"`,
# with a blank line between turns; the string starts and ends with a newline.
text = """
Patient: "Hi Dr. Carter. My name is Sarah, I’m here because I’ve been struggling with anxiety and depression since starting at Newton University in September."

Therapist: "Nice to meet you, Sarah. I’m Dr. Carter, a psychiatrist at the Boston Mind Health Center. Can you tell me more about your symptoms?"

Patient: "Mostly I feel constant fatigue, and I’ve had headaches every morning since the Halloween event on campus. My mom, Robyn, is worried because she thinks I might need Zoloft or Lexapro, but I haven’t tried either."

Therapist: "It’s understandable to feel overwhelmed, especially with your course load and the transition to university life. Are you currently taking any medication or supplements?"

Patient: "I tried Tylenol and Advil for the headaches, but they don’t help. I also bought a new sleep tracker and some noise-cancelling headphones, but nothing seems to make a difference."

Therapist: "Have you considered attending the wellness group for freshmen? Sometimes talking to peers with similar experiences can be helpful, especially for students from different backgrounds."

Patient: "I heard about that group during orientation, but I haven’t joined. I worry my schedule won’t allow it, and honestly, I’m not sure if talking will help."

Therapist: "Have you spoken with your primary care doctor at Newton General Hospital about these symptoms? Also, are there any other medical conditions in your family history?"

Patient: "My mom’s side has a history of migraine and insomnia, but I don’t think anyone’s had anxiety like this. Robyn suggested I journal my symptoms and use the meditation app she downloaded on my phone."

Therapist: "Let’s try some practical steps: keep a daily log of your headaches and sleep patterns, consider meeting with the freshman group, and follow up with me in two weeks. If symptoms worsen, we can discuss medication options like Zoloft or Lexapro at your next appointment."
"""


# Cell 3: Run Stanza pipelines & display NER, dependency, constituency

In [11]:
# Build both Stanza pipelines a single time; they share language and runtime
# options and differ only in which processors/models they load.
shared_options = dict(lang='en', use_gpu=False, verbose=False)

# General English: tokenization through dependency and constituency parsing.
nlp_en = stanza.Pipeline(
    processors='tokenize,pos,lemma,ner,depparse,constituency',
    **shared_options
)

# Biomedical: the default tokenizer feeding the 'bc5cdr' NER model.
nlp_bc5cdr = stanza.Pipeline(
    processors='tokenize,ner',
    package={'ner': 'bc5cdr', 'tokenize': 'default'},
    **shared_options
)

# Annotate the whole dialogue as one document per pipeline.
doc_en = nlp_en(text)
doc_bc5cdr = nlp_bc5cdr(text)

def _show_entities(doc):
    """Print each entity span of `doc` with its label, one entity per line."""
    for ent in doc.ents:
        print(f"  {ent.text:30}  ({ent.type})")


rule = "=" * 30

# ---- General English pipeline: entities, dependencies, constituency ----
print(rule, "\nGENERAL ENGLISH (default)\n", rule)
print("-- Named Entities --")
_show_entities(doc_en)

print("\n-- Dependency Parse (token/parent/rel) --")
for sent in doc_en.sentences:
    words = sent.words
    for word in words:
        # `head` is used as a 1-based position in the sentence (hence the -1);
        # a value of 0 has no parent word and is displayed as ROOT.
        if word.head > 0:
            head = words[word.head - 1].text
        else:
            head = 'ROOT'
        print(f"  {word.text:15} ← {head:15}  ({word.deprel})")

print("\n-- Constituency Parse --")
for tree in (sent.constituency for sent in doc_en.sentences):
    print(tree)

# ---- Biomedical pipeline: entities only ----
print("\n" + rule, "\nBIOMEDICAL NER (BC5CDR)\n", rule)
print("-- Named Entities --")
_show_entities(doc_bc5cdr)


GENERAL ENGLISH (default)
-- Named Entities --
  Carter                          (PERSON)
  Sarah                           (PERSON)
  Newton University               (ORG)
  September                       (DATE)
  Sarah                           (PERSON)
  Carter                          (PERSON)
  the Boston Mind Health Center   (ORG)
  Halloween                       (DATE)
  Robyn                           (PERSON)
  Lexapro                         (PRODUCT)
  Newton General Hospital         (FAC)
  Robyn                           (PERSON)
  two weeks                       (DATE)
  Zoloft                          (PRODUCT)
  Lexapro                         (PRODUCT)

-- Dependency Parse (token/parent/rel) --
  Patient         ← ROOT             (root)
  :               ← Dr.              (punct)
  "               ← Dr.              (punct)
  Hi              ← Dr.              (discourse)
  Dr.             ← Patient          (dep)
  Carter          ← Dr.              (flat)
  .    

# Cell 4: Split text into utterances (by speaker, crude version)

In [12]:
import re

# A turn is a line that begins with a speaker label, followed by a colon and a
# double-quoted body. MULTILINE lets ^ anchor at every line start, so no
# lookbehind is required.
utterance_pattern = re.compile(r'^(Patient|Therapist):\s*"([^"]+)"', re.MULTILINE)

# findall returns one (label, body) tuple per turn; keep only turns whose body
# is non-empty after trimming, with the speaker label lower-cased.
utterances = [
    {"speaker": label.lower(), "text": body.strip()}
    for label, body in utterance_pattern.findall(text)
    if body.strip()
]

# Sanity check: list every utterance with its index and speaker.
for i, utt in enumerate(utterances):
    print(f"[{i}] {utt['speaker'].capitalize()}: {utt['text']}")


[0] Patient: Hi Dr. Carter. My name is Sarah, I’m here because I’ve been struggling with anxiety and depression since starting at Newton University in September.
[1] Therapist: Nice to meet you, Sarah. I’m Dr. Carter, a psychiatrist at the Boston Mind Health Center. Can you tell me more about your symptoms?
[2] Patient: Mostly I feel constant fatigue, and I’ve had headaches every morning since the Halloween event on campus. My mom, Robyn, is worried because she thinks I might need Zoloft or Lexapro, but I haven’t tried either.
[3] Therapist: It’s understandable to feel overwhelmed, especially with your course load and the transition to university life. Are you currently taking any medication or supplements?
[4] Patient: I tried Tylenol and Advil for the headaches, but they don’t help. I also bought a new sleep tracker and some noise-cancelling headphones, but nothing seems to make a difference.
[5] Therapist: Have you considered attending the wellness group for freshmen? Sometimes talk

# Cell 5: Build Clipboard dictionary (prototype)

In [13]:
from collections import defaultdict

def _make_instance(utterance_id, utt):
    """Return a fresh dict describing one mention of an entity in `utt`."""
    return {
        "utterance_id": utterance_id,
        "speaker": utt["speaker"],
        "rawText": utt["text"],
        "han_vector": "[HAN vector placeholder]"
    }


def build_clipboard_dict(doc_en, doc_bc5cdr, utterances):
    """Build the prototype clipboard from two parsed documents and the utterances.

    Parameters
    ----------
    doc_en : object exposing `.ents` (items with `.text`/`.type`) and
        `.sentences` (items with `.words`, each word having `.text`, `.head`
        and `.deprel`).
    doc_bc5cdr : object exposing `.ents` (items with `.text`/`.type`).
    utterances : list of dicts with "speaker" and "text" keys.

    Returns
    -------
    dict with two keys:
      * "named_entities": maps entity text to its type, source ("bc5cdr" or
        "en"), the list of utterance indices that contain it ("timestamps"),
        and snapshots of the first and latest containing utterance.
      * "dependency_relations": one record per `nsubj` word, holding the
        subject, the word it attaches to ("verb"), the `obj` dependent of that
        same head (or None if it has none) and a "timestamp". Note that this
        timestamp is the sentence index within `doc_en`, not an utterance
        index.

    Entities are located in utterances by plain substring search, so an entity
    text also matches when it occurs inside a longer word.
    """
    clipboard = {"named_entities": {}, "dependency_relations": []}
    entity_index = {}

    # Biomedical labels take priority: register them first ...
    for ent in doc_bc5cdr.ents:
        entity_index[ent.text] = {"label": ent.type, "source": "bc5cdr"}

    # ... and only add a general-English entity if its text is still unknown.
    for ent in doc_en.ents:
        entity_index.setdefault(ent.text, {"label": ent.type, "source": "en"})

    # Map each entity to the utterances that mention it.
    named_entities = clipboard["named_entities"]
    for i, utt in enumerate(utterances):
        for ent_text, info in entity_index.items():
            if ent_text not in utt["text"]:
                continue
            record = named_entities.get(ent_text)
            if record is None:
                # First sighting: first and latest instance coincide, but are
                # kept as separate dict objects so they can diverge later.
                named_entities[ent_text] = {
                    "type": info["label"],
                    "source": info["source"],
                    "timestamps": [i],
                    "FirstInstance": _make_instance(i, utt),
                    "LatestInstance": _make_instance(i, utt),
                }
            else:
                record["timestamps"].append(i)
                record["LatestInstance"] = _make_instance(i, utt)

    # Dependency relations (sentence-level).
    for i, sent in enumerate(doc_en.sentences):
        words = sent.words

        # Pass 1: remember the first direct object hanging off each head, so a
        # subject can be paired with the object of the same head word.
        obj_by_head = {}
        for word in words:
            if word.deprel == "obj" and word.head > 0:
                obj_by_head.setdefault(word.head, word.text)

        # Pass 2: emit one record per subject, in word order.
        for word in words:
            if word.deprel != "nsubj" or word.head <= 0:
                continue
            # `head` is a 1-based position within the sentence.
            verb = words[word.head - 1].text
            if word.text and verb:
                clipboard["dependency_relations"].append(
                    {
                        "subj": word.text,
                        "verb": verb,
                        "obj": obj_by_head.get(word.head),
                        "timestamp": i,
                    }
                )
    return clipboard

# Build the clipboard from the two parsed documents (general English and
# BC5CDR) plus the speaker-split utterance list.
clipboard = build_clipboard_dict(doc_en, doc_bc5cdr, utterances)


# Cell 6: Pretty-print the Clipboard

In [14]:
import json

def print_clipboard(clipboard, show_dependencies=False):
    """Pretty-print the contents of a clipboard dict to stdout.

    Parameters
    ----------
    clipboard : dict
        Must contain "named_entities", mapping entity text to a dict with
        "type", "source", "timestamps", "FirstInstance" and "LatestInstance"
        (the latter two each holding "speaker" and "rawText").
    show_dependencies : bool, optional
        When True, also list the records under "dependency_relations"
        (each with "subj", "verb", "obj" and "timestamp"). Off by default, so
        the default output contains the named-entity section only.
    """
    print("\n\n=== Named Entities ===\n")
    for ent, info in clipboard["named_entities"].items():
        first = info["FirstInstance"]
        latest = info["LatestInstance"]
        print(f"{ent!r} ({info['type']}, {info['source']})")
        print(f"  Timestamps: {info['timestamps']}")
        print(f"  FirstInstance: [{first['speaker']}] {first['rawText']}")
        print(f"  LatestInstance: [{latest['speaker']}] {latest['rawText']}")
        print("")

    if show_dependencies:
        print("\n=== Dependency Relations ===\n")
        # .get keeps this usable for clipboards built without relations.
        for rel in clipboard.get("dependency_relations", []):
            print(f"  subj: {rel['subj']}, verb: {rel['verb']}, obj: {rel['obj']}, t={rel['timestamp']}")

# Display the clipboard built in the previous cell.
print_clipboard(clipboard)




=== Named Entities ===

'anxiety' (DISEASE, bc5cdr)
  Timestamps: [0, 8]
  FirstInstance: [patient] Hi Dr. Carter. My name is Sarah, I’m here because I’ve been struggling with anxiety and depression since starting at Newton University in September.
  LatestInstance: [patient] My mom’s side has a history of migraine and insomnia, but I don’t think anyone’s had anxiety like this. Robyn suggested I journal my symptoms and use the meditation app she downloaded on my phone.

'depression' (DISEASE, bc5cdr)
  Timestamps: [0]
  FirstInstance: [patient] Hi Dr. Carter. My name is Sarah, I’m here because I’ve been struggling with anxiety and depression since starting at Newton University in September.
  LatestInstance: [patient] Hi Dr. Carter. My name is Sarah, I’m here because I’ve been struggling with anxiety and depression since starting at Newton University in September.

'Carter' (PERSON, en)
  Timestamps: [0, 1]
  FirstInstance: [patient] Hi Dr. Carter. My name is Sarah, I’m here because 