In [1]:
import os
import re

In [2]:
# Directory holding the CADEC BRAT ``.ann`` annotation files.
# Set the CADEC_ANNOTATIONS_DIR environment variable to point the notebook at
# a different location; when it is unset, the original local path is used.
ANNOTATIONS_DIR = os.environ.get(
    'CADEC_ANNOTATIONS_DIR',
    '/Users/anjalikulkarni/Desktop/Assignment1/CADEC-lPWNPfjE-/data/cadec/original',
)

In [3]:
# Entity types whose surface forms are tallied from the annotations.
LABELS = ("ADR", "Drug", "Disease", "Symptom")

# One independent, initially empty set of entity texts per label.
label_entities = dict((label, set()) for label in LABELS)

In [4]:
def normalize(text: str) -> str:
    """Return a canonical form of an entity's surface text.

    The text is trimmed and lower-cased, each run of whitespace is collapsed
    to a single space, and spaces, punctuation, brackets and quotes hanging
    off either end are removed. Interior punctuation is left untouched.
    """
    edge_chars = ' .,:;!?()[]{}"\''
    lowered = text.strip().lower()
    single_spaced = re.sub(r'\s+', ' ', lowered)
    return single_spaced.strip(edge_chars)

In [5]:
def collect_entities(dirpath: str):
    """Gather distinct entity texts per label from BRAT ``.ann`` files.

    Scans every ``.ann`` file directly inside ``dirpath``. For each
    text-bound annotation line (one starting with ``T``), the normalized
    entity text is added to the module-level ``label_entities`` set for that
    line's label. Labels that are not keys of ``label_entities`` are ignored,
    as are malformed lines.

    Args:
        dirpath: Directory containing the ``.ann`` annotation files.

    Returns:
        None. Results accumulate in ``label_entities``; a message is printed
        when the directory contains no ``.ann`` files.
    """
    ann_files = [f for f in os.listdir(dirpath) if f.endswith('.ann')]
    if not ann_files:
        print("No .ann files found. Is the path correct?")
        return

    for fname in ann_files:
        fpath = os.path.join(dirpath, fname)
        with open(fpath, 'r', encoding='utf-8') as fh:
            for line in fh:
                line = line.rstrip('\n')
                # Only 'T' lines carry text spans; blank lines, '#' notes,
                # 'A' attributes, 'R' relations and anything else are skipped.
                if not line.startswith('T'):
                    continue

                # BRAT format: ID \t TYPE + SPAN(S) \t TEXT
                parts = line.split('\t')
                if len(parts) < 3:
                    continue  # malformed

                # e.g. "ADR 9 19" or "ADR 9 19;25 30"; the first token is the label
                meta_tokens = parts[1].split()
                if not meta_tokens:
                    continue  # blank type/span field → malformed, skip instead of IndexError

                label = meta_tokens[0]
                if label in label_entities:
                    # parts[2] is the entity surface text from the post
                    label_entities[label].add(normalize(parts[2]))

In [6]:
# Fill label_entities in place from the .ann files under ANNOTATIONS_DIR.
collect_entities(ANNOTATIONS_DIR)

In [7]:
# Report, for every label, how many distinct entities were collected and the
# first ten of them in sorted order.
for label in label_entities:
    ents = label_entities[label]
    preview = sorted(ents)[:10]
    print(f"Label: {label}")
    print(f"Distinct Entities: {len(ents)}")
    print(f"Sample (up to 10): {preview}")
    print("=" * 60)

Label: ADR
Distinct Entities: 3399
Sample (up to 10): ['2-3 periods a month instead of once a month', '5-6 times at night to pee; normally 1 or 2', 'a lot grouchier', 'abdominal cramping', 'abdominal cramps', 'abdominal cramps and pain', 'abdominal discomfort', 'abdominal distention - feel full', 'abdominal flu symptoms', 'abdominal gas']
Label: Drug
Distinct Entities: 323
Sample (up to 10): ['acidophilous', 'aciphex', 'actose', 'advicor', 'advil', 'aleve', 'allegra', 'alleve', 'alpha lipoic acid', 'alpha lipoid acid']
Label: Disease
Distinct Entities: 164
Sample (up to 10): ['abdominal hematoma', 'acute gastritis', 'acute shoulder tendonitis', 'adenomyosis', 'allergy', 'als', 'alzheimers', 'amyotrophic lateral sclerosis', 'anemia', 'arch problem']
Label: Symptom
Distinct Entities: 148
Sample (up to 10): ['abdominal pain', 'abortion', 'ache', 'aches', 'aches and pains', 'achiness', 'acid reflux', 'afterpain', 'agony', 'anxiety']
