In [2]:
import re
from collections import Counter
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# ---------- Setup Transformer Model for POS Tagging ----------
model_name = "vblagoje/bert-english-uncased-finetuned-pos"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
pos_tagger = pipeline("token-classification", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# ---------- POS Patterns ----------
POS_PATTERNS = [
    ['ADJ', 'NOUN'],
    ['NOUN', 'NOUN'],
    ['ADJ', 'NOUN', 'NOUN'],
    ['PROPN', 'NOUN'],
    ['NOUN', 'ADP', 'NOUN']
]

# ---------- Preprocessing ----------
def preprocess(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text)
    return text

# ---------- Clean WordPiece Fragments ----------
def clean_wordpieces(tagged):
    words = []
    current_word = ""
    for t in tagged:
        w = t['word']
        if w.startswith("##"):
            current_word += w[2:]
        else:
            if current_word:
                words.append(current_word)
            current_word = w
    if current_word:
        words.append(current_word)
    return [w.lower() for w in words]

# ---------- PoS Tagging ----------
def get_lemmas_and_pos(text):
    tagged = pos_tagger(text)
    cleaned_words = clean_wordpieces(tagged)
    pos_tags = [t['entity_group'] for t in tagged if not t['word'].startswith("##")]
    return cleaned_words, pos_tags

# ---------- Pattern Matching ----------
def match_patterns(words, pos_tags, patterns):
    matched = []
    for window in range(2, 5):
        for i in range(len(words) - window + 1):
            sub_words = words[i:i + window]
            sub_tags = pos_tags[i:i + window]
            if sub_tags in patterns:
                matched.append((" ".join(sub_words), sub_tags))
    return matched

# ---------- Unithood Score ----------
def unithood_score(phrase, doc_freqs):
    score = 0
    if doc_freqs[phrase] >= 1: score += 1
    if len(phrase.split()) > 1: score += 1
    if re.search(r"\b[a-z]+\s[a-z]+", phrase): score += 1
    return score

# ---------- Termhood Score ----------
def termhood_score(phrase):
    score = 0
    stop_words = {"the", "and", "in", "of", "no"}
    tokens = phrase.split()
    if len(tokens) > 1: score += 1
    if not any(w in stop_words for w in tokens): score += 1
    if any(char.isalpha() for char in phrase): score += 1
    return score

# ---------- Main Extraction Function ----------
def extract_terms_verbose(emr_texts, threshold_u=2, threshold_t=2):
    all_phrases = []
    all_matches = []

    print("\n🔹 Preprocessing Output\n")
    for text in emr_texts:
        print(f"Original: {text}")
        clean_text = preprocess(text)
        words, pos_tags = get_lemmas_and_pos(clean_text)
        phrases = match_patterns(words, pos_tags, POS_PATTERNS)
        simplified = sorted(set([p[0] for p in phrases]))
        all_phrases.extend(simplified)
        all_matches.extend(phrases)
        for s in simplified:
            print(f"- {s}")
        print()

    doc_freqs = Counter(all_phrases)

    print("\n🔹 Matched Patterns\n")
    for phrase, tags in all_matches:
        print(f"{phrase:<30} {tags}")

    print("\n🔹 Scoring (Unithood & Termhood)\n")
    results = []
    for phrase in sorted(set(all_phrases)):
        u_score = unithood_score(phrase, doc_freqs)
        t_score = termhood_score(phrase)
        accepted = u_score >= threshold_u and t_score >= threshold_t
        results.append((phrase, u_score, t_score, accepted))
        print(f"{phrase:<30} U: {u_score} | T: {t_score} | {'✅' if accepted else '❌'}")

    final_terms = [r[0] for r in results if r[3]]

    print("\n✅ Final Extracted Terms")
    print(final_terms)
    return final_terms

# ---------- Example Usage ----------
if __name__ == "__main__":
    EMR_Texts = [
    "Patient reports chest pain radiating to the left arm and jaw, suggestive of angina.",
    "Diagnosed with type 2 diabetes mellitus and prescribed metformin 500mg twice daily.",
    "Complains of blurred vision and frequent urination over the past few weeks.",
    "MRI scan reveals degenerative disc disease at L4-L5 level with mild spinal stenosis.",
    "Noted history of hypertension and hyperlipidemia for over 10 years.",
    "Recent lab tests show elevated liver enzymes and fatty infiltration of the liver.",
    "Experiencing abdominal bloating, nausea, and reduced appetite.",
    "Underwent laparoscopic cholecystectomy last month due to gallstones.",
    "Echocardiogram shows reduced ejection fraction consistent with systolic heart failure.",
    "The patient has mild cognitive impairment and difficulty with short-term memory."
]


    extract_terms_verbose(EMR_Texts)


Some weights of the model checkpoint at vblagoje/bert-english-uncased-finetuned-pos were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu



🔹 Preprocessing Output

Original: Patient reports chest pain radiating to the left arm and jaw, suggestive of angina.
- left arm

Original: Diagnosed with type 2 diabetes mellitus and prescribed metformin 500mg twice daily.

Original: Complains of blurred vision and frequent urination over the past few weeks.
- blurred vision
- frequent urination
- past few weeks

Original: MRI scan reveals degenerative disc disease at L4-L5 level with mild spinal stenosis.
- degenerative disc disease
- disc disease at ll level
- mild spinal stenosis

Original: Noted history of hypertension and hyperlipidemia for over 10 years.
- history of hypertension
- hyperlipidemia for over years

Original: Recent lab tests show elevated liver enzymes and fatty infiltration of the liver.
- fatty infiltration
- recent lab tests

Original: Experiencing abdominal bloating, nausea, and reduced appetite.
- abdominal bloating
- abdominal bloating nausea
- bloating nausea

Original: Underwent laparoscopic cholecystectom

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import time

def evaluate_extraction(emr_texts, ground_truth):
    start_time = time.time()
    predicted_terms = extract_terms_verbose(emr_texts)
    end_time = time.time()

    # Flatten all terms
    pred_set = set(predicted_terms)
    true_set = set([item for sublist in ground_truth for item in sublist])

    tp = len(pred_set & true_set)
    fp = len(pred_set - true_set)
    fn = len(true_set - pred_set)
    tn = 0  # Hard to define in open vocabulary tasks

    # Avoid division by zero
    precision = tp / (tp + fp) if (tp + fp) else 0
    recall = tp / (tp + fn) if (tp + fn) else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0
    accuracy = tp / len(true_set) if true_set else 0

    print("\n📊 Quantitative Evaluation")
    print(f"Precision    : {precision:.2f}")
    print(f"Recall       : {recall:.2f}")
    print(f"F1 Score     : {f1:.2f}")
    print(f"Accuracy     : {accuracy:.2f}")
    print(f"Exec Time    : {end_time - start_time:.2f} sec")
