In [1]:
# %%
# 0. INSTALL DEPENDENCIES (Run this cell first!)
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# After installing, the kernel may need a restart before the new packages are usable.
# NOTE(review): versions are not pinned here, so a fresh install may pull different
# releases than the ones this notebook was last run with — consider pinning.
%pip install transformers torch numpy scipy

# %% [markdown]
# # 🧠 NLP: Medical Entity & Embedding Demo
# This notebook uses DistilBERT to convert medical text into mathematical vectors.

# %%
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from scipy.spatial.distance import cosine

# %%
# 1. LOAD MODEL
# This will download about 250MB of weights the first time you run it.
# The checkpoint name is defined once so the tokenizer and the model are always
# loaded from the same checkpoint.
MODEL_NAME = "distilbert-base-uncased"

print("⏳ Loading DistilBERT Model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
print("✅ Model Loaded!")

# %%
# 2. DEFINE SAMPLE TEXTS
# Texts A and C both describe a viral fever; Text B describes a fracture.
# The later cells check whether the embeddings rate A and C as the closer pair.
text_1 = "Patient diagnosed with viral fever and high temperature. Prescribed Paracetamol."
text_2 = "Diagnosis: Compound fracture in left tibia. Surgery required immediately."
text_3 = "Viral fever symptoms observed. Recommended bed rest and fluids."

# Echo the three samples, one per line, under the labels used in the rest of the notebook.
print(
    f"Text A: {text_1}",
    f"Text B: {text_2}",
    f"Text C: {text_3}",
    sep="\n",
)

# %%
# 3. GENERATE EMBEDDINGS
def get_embedding(text, pooling="cls"):
    """Encode ``text`` with the notebook's model and return one flat vector.

    Uses the module-level ``tokenizer`` and ``model`` created in the
    "LOAD MODEL" cell.

    Args:
        text: The sentence to embed. Input is truncated to at most 512 tokens.
        pooling: How the per-token hidden states are reduced to one vector.
            ``"cls"`` (the default, and the original behaviour) takes the
            hidden state at position 0. ``"mean"`` averages the hidden states
            of all positions the attention mask marks as real tokens.

    Returns:
        A 1-D NumPy array obtained by flattening the pooled hidden state.

    Raises:
        ValueError: If ``pooling`` is neither ``"cls"`` nor ``"mean"``.
    """
    if pooling not in ("cls", "mean"):
        raise ValueError(f"pooling must be 'cls' or 'mean', got {pooling!r}")

    # Convert text to tokens (numbers)
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    # Inference only: skip gradient tracking so the result converts to NumPy directly.
    with torch.no_grad():
        outputs = model(**inputs)
    hidden = outputs.last_hidden_state

    if pooling == "cls":
        # Position 0 of the sequence dimension, kept as the default for backward compatibility.
        pooled = hidden[:, 0, :]
    else:
        # Weight every position by its attention-mask value so padding does not
        # contribute, then divide by the number of real tokens.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        # clamp guards against division by zero if a row had no real tokens.
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)

    return pooled.numpy().flatten()

# Embed the three samples in order (A, B, C); each result is the flat vector
# returned by get_embedding.
vec_1, vec_2, vec_3 = map(get_embedding, (text_1, text_2, text_3))

# Show the shape of one vector as a sanity check.
print(f"\nVector Shape: {vec_1.shape} (768 dimensions)")

# %%
# 4. CHECK SIMILARITY
# SciPy's `cosine` is a *distance* (smaller = closer), so similarity = 1 - distance.
# A similarity of 1.0 means the two vectors point in the same direction; lower
# values mean the vectors are less alike.
# NOTE: in the recorded run of this notebook both scores came out above 0.93, so
# read the two numbers relative to each other rather than as absolute values.

def _cosine_similarity(u, v):
    """Return 1 minus the SciPy cosine distance between ``u`` and ``v``."""
    return 1 - cosine(u, v)

score_a_b = _cosine_similarity(vec_1, vec_2)  # Fever vs Fracture
score_a_c = _cosine_similarity(vec_1, vec_3)  # Fever vs Fever

print("\n--- Similarity Scores (Higher is Better) ---")
print(f"Fever vs Fracture: {score_a_b:.4f}  (Expected: Low)")
print(f"Fever vs Fever:    {score_a_c:.4f}  (Expected: High)")

# %%
# 5. KEYWORD EXTRACTION (Simple Rule-Based)
# In a full system, we would use a Named Entity Recognition (NER) model here.
medical_keywords = ["fever", "fracture", "surgery", "paracetamol", "viral", "orthopedic"]


def extract_keywords(text, keywords):
    """Return the words of ``text`` that appear in ``keywords``, in text order.

    Matching is case-insensitive. Every non-alphanumeric character (periods,
    commas, colons, ...) is treated as a word separator, so punctuation
    attached to a word does not prevent a match.

    Args:
        text: The text to scan.
        keywords: Iterable of keyword strings to look for.

    Returns:
        A list of the lower-cased matching words, one entry per occurrence.
    """
    # A set gives constant-time membership checks regardless of keyword count.
    vocabulary = {keyword.lower() for keyword in keywords}
    # Replace punctuation with spaces rather than deleting it, so neighbouring
    # words are never glued together.
    normalized = "".join(ch if ch.isalnum() else " " for ch in text.lower())
    return [word for word in normalized.split() if word in vocabulary]


found_keywords = extract_keywords(text_1, medical_keywords)
print(f"\nExtracted Keywords from Text A: {found_keywords}")


[notice] A new release of pip is available: 23.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip
  from .autonotebook import tqdm as notebook_tqdm


Note: you may need to restart the kernel to use updated packages.
‚è≥ Loading DistilBERT Model...
‚úÖ Model Loaded!
Text A: Patient diagnosed with viral fever and high temperature. Prescribed Paracetamol.
Text B: Diagnosis: Compound fracture in left tibia. Surgery required immediately.
Text C: Viral fever symptoms observed. Recommended bed rest and fluids.

Vector Shape: (768,) (768 dimensions)

--- Similarity Scores (Higher is Better) ---
Fever vs Fracture: 0.9315  (Expected: Low)
Fever vs Fever:    0.9566  (Expected: High)

Extracted Keywords from Text A: ['viral', 'fever', 'paracetamol']
