In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Choose model name
model_name = "d4data/biomedical-ner-all"

# Load and save locally
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Save to local folder
save_path = "./drive/MyDrive/BERT/models/biomedical-ner-all"
tokenizer.save_pretrained(save_path)
model.save_pretrained(save_path)

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Load from local path
local_model_path = "./models/biomedical-ner-all"

tokenizer = AutoTokenizer.from_pretrained(local_model_path)
model = AutoModelForTokenClassification.from_pretrained(local_model_path)

# Create NER pipeline
nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Example text
text = """The patient was diagnosed with pneumonia and prescribed  amoxicillin.
He also has a history of hypertension and diabetes mellitus."""

# Run NER
entities = nlp(text)

# Display results
for ent in entities:
    print(f"{ent['word']} → {ent['entity_group']} (score: {ent['score']:.3f})")


Device set to use cpu


pneumonia → Disease_disorder (score: 0.996)
am → Medication (score: 1.000)
##oxici → Medication (score: 0.976)
hyper → History (score: 0.781)
diabetes → Family_history (score: 0.838)


In [None]:
!pip install PyPDF2
!pip install reportlab

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m225.3/232.6 kB[0m [31m8.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting reportlab
  Downloading reportlab-4.4.4-py3-none-any.whl.metadata (1.7 kB)
Downloading reportlab-4.4.4-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: reportlab
Successfully installed reportlab-4.4.4


In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
from PyPDF2 import PdfReader
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas

def create_fake_ehr_pdf(filename="fake_ehr_report.pdf"):
    c = canvas.Canvas(filename, pagesize=letter)
    c.setFont("Helvetica", 11)

    c.drawString(50, 750, "Patient Name: John Doe")
    c.drawString(50, 735, "Age: 54")
    c.drawString(50, 720, "Gender: Male")
    c.drawString(50, 705, "Date of Admission: 2023-08-12")
    c.drawString(50, 690, "Date of Discharge: 2023-08-18")

    c.drawString(50, 665, "Chief Complaint:")
    c.drawString(70, 650, "Shortness of breath, fever, and productive cough for 3 days.")

    c.drawString(50, 625, "History of Present Illness:")
    text = (
        "The patient presented with symptoms of pneumonia. "
        "He reported chills, mild chest pain, and difficulty breathing. "
        "Chest X-ray revealed patchy infiltrates in the right lower lobe. "
        "Oxygen saturation was 89% on room air."
    )
    c.drawString(70, 610, text)

    c.drawString(50, 575, "Past Medical History:")
    c.drawString(70, 560, "Hypertension, Type 2 Diabetes Mellitus, Hyperlipidemia")

    c.drawString(50, 535, "Medications on Admission:")
    c.drawString(70, 520, "Metformin 500 mg BID, Lisinopril 10 mg daily, Atorvastatin 20 mg nightly")

    c.drawString(50, 495, "Treatment & Hospital Course:")
    text = (
        "Patient was started on intravenous amoxicillin-clavulanate and azithromycin. "
        "Supplemental oxygen was given via nasal cannula. "
        "Blood glucose levels were monitored and controlled. "
        "Patient showed gradual improvement and was discharged after 6 days."
    )
    c.drawString(70, 480, text)

    c.drawString(50, 445, "Discharge Medications:")
    c.drawString(70, 430, "Amoxicillin-Clavulanate 625 mg TID for 7 days")
    c.drawString(70, 415, "Metformin 500 mg BID")
    c.drawString(70, 400, "Lisinopril 10 mg daily")

    c.drawString(50, 375, "Follow-up:")
    c.drawString(70, 360, "Follow up with primary care physician in 2 weeks for repeat chest X-ray.")

    c.drawString(50, 330, "Physician: Dr. Sarah Thompson, MD")
    c.save()

create_fake_ehr_pdf()
print("✅ Fake EHR PDF created: fake_ehr_report.pdf")


# --- Step 1: Extract text from PDF ---
def extract_text_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text() + "\n"
    return text

pdf_text = extract_text_from_pdf("fake_ehr_report.pdf")

# --- Step 2: Summarize the text ---
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
summary = summarizer(pdf_text, max_length=300, min_length=80, do_sample=False)[0]['summary_text']

print("\n--- Summary ---\n", summary)

# --- Step 3: Load biomedical NER model ---
model_name = "/content/drive/MyDrive/BERT/models/biomedical-ner-all"  # local path if already saved
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# --- Step 4: Extract entities from summary ---
entities = ner_pipeline(summary)
print("\n--- Extracted Entities ---")
for ent in entities:
    print(f"{ent['word']} → {ent['entity_group']} (score: {ent['score']:.3f})")


✅ Fake EHR PDF created: fake_ehr_report.pdf


Device set to use cpu
Your max_length is set to 300, but your input_length is only 293. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=146)



--- Summary ---
 The patient presented with symptoms of pneumonia. He reported chills, mild chest pain, and difficulty breathing. Chest X-ray revealed patchy infiltrates in the right lower lobe. Oxygen saturation was 89% on room air. He was started on intravenous amoxicillin-clavulanate and azithromycin. Blood glucose levels were monitored and controlled. Patient showed gradual improvement and was discharged after 6 days.


Device set to use cpu



--- Extracted Entities ---
chill → Sign_symptom (score: 0.998)
mild → Severity (score: 1.000)
chest → Biological_structure (score: 1.000)
difficulty breathing → Sign_symptom (score: 0.999)
chest → Biological_structure (score: 1.000)
x → Diagnostic_procedure (score: 1.000)
- ray → Diagnostic_procedure (score: 1.000)
patchy → Detailed_description (score: 0.997)
infiltrate → Sign_symptom (score: 1.000)
right lower → Biological_structure (score: 1.000)
oxygen saturation → Diagnostic_procedure (score: 1.000)
89 % → Lab_value (score: 0.999)
room → Detailed_description (score: 0.999)
intra → Administration (score: 1.000)
##ven → Administration (score: 0.999)
amoxicillin - → Medication (score: 0.863)
clavulanate → Medication (score: 0.997)
azith → Medication (score: 0.994)
##romy → Medication (score: 0.682)
blood glucose → Diagnostic_procedure (score: 1.000)
improvement → Lab_value (score: 0.709)
discharged → Clinical_event (score: 1.000)


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Choose smaller summarization model
model_name = "facebook/bart-base"

# Load and save locally
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

save_path = "./models/bart-base"
tokenizer.save_pretrained(save_path)
model.save_pretrained(save_path)

print("✅ Model saved locally to", save_path)


config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]



✅ Model saved locally to ./models/bart-base


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Load from local path
local_model_path = "./models/bart-base"

tokenizer = AutoTokenizer.from_pretrained(local_model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(local_model_path)

# Create summarization pipeline
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

# Example text
pdf_text = extract_text_from_pdf("fake_ehr_report.pdf")

summary = summarizer(pdf_text, max_length=120, min_length=40, do_sample=False)[0]['summary_text']

print("\n--- Summary ---\n", summary)


Device set to use cpu
Both `max_new_tokens` (=256) and `max_length`(=120) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



--- Summary ---
 Patient Name: John Doe, MDAge: 54Gender: MaleDate of Admission: 2023-08-12, 2017Date of Discharge: 2019-08 -18, 2017Chief Complaint: "Shortness of breath, fever, and productive cough for 3 days.”History of Present Illness: “The patient presented with symptoms of pneumonia. He reported chills, mild chest pain, and difficulty breathing. Chest X-ray revealed patchy infiltrates in the right lower lobe. Oxygen saturation was 89% on room air. ”Past Medical History:“Hypertension, Type 2 Diabetes Mellitus, Hyperlipidemia, and Blood Pressure”Medications on Admission:”Metformin 500 mg BID, Lisinopril 10 mg daily, Atorvastatin 20 mg nightly”Treatment & Hospital Course: ”Patient was started on intravenous amoxicillin-clavulanate and azithromycin. Supplemental oxygen was given via nasal cannula. Blood glucose levels were monitored and controlled. Patient showed gradual improvement and was discharged after 6 days. “”Discharge Medications: ___________________________________________

In [None]:
from datasets import load_dataset
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
import transformers

print(f"Transformers version: {transformers.__version__}")

# --- Step 1: Load Dataset ---
dataset = load_dataset("ccdv/pubmed-summarization")

# Use small subset for demo
train_data = dataset['train'].select(range(2000))   # 2K samples
val_data = dataset['validation'].select(range(200)) # 200 samples

# --- Step 2: Load Tokenizer & Model ---
model_name = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# --- Step 3: Tokenization ---
def preprocess(batch):
    inputs = tokenizer(batch["article"], max_length=512, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(batch["abstract"], max_length=128, truncation=True, padding="max_length")
    inputs["labels"] = labels["input_ids"]
    return inputs

tokenized_train = train_data.map(preprocess, batched=True, remove_columns=["article", "abstract"])
tokenized_val = val_data.map(preprocess, batched=True, remove_columns=["article", "abstract"])

# --- Step 4: Training Arguments ---
training_args_dict = {
    "output_dir": "./bart_pubmed_finetuned",
    "eval_steps": 500,
    "save_steps": 1000,
    "logging_steps": 100,
    "per_device_train_batch_size": 2,
    "per_device_eval_batch_size": 2,
    "num_train_epochs": 2,
    "learning_rate": 3e-5,
    "weight_decay": 0.01,
    "save_total_limit": 2,
    "fp16": False,  # change to True if using GPU
}

args = TrainingArguments(**training_args_dict)


# --- Step 5: Trainer ---
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

trainer.train()

# --- Step 6: Save Model ---
model.save_pretrained("./bart_pubmed_finetuned")
tokenizer.save_pretrained("./bart_pubmed_finetuned")

print("✅ Fine-tuned model saved at ./bart_pubmed_finetuned")

Transformers version: 4.57.1


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]



Map:   0%|          | 0/200 [00:00<?, ? examples/s]



Step,Training Loss


KeyboardInterrupt: 

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration

model_dir = "./bart_pubmed_finetuned"
tokenizer = BartTokenizer.from_pretrained(model_dir)
model = BartForConditionalGeneration.from_pretrained(model_dir)

ehr_text = """
The patient was admitted with shortness of breath and fever.
Chest X-ray showed bilateral infiltrates. Treated with antibiotics and oxygen support.
Patient improved and was discharged on oral medication.
"""

inputs = tokenizer([ehr_text], return_tensors="pt", truncation=True, max_length=512)
summary_ids = model.generate(**inputs, max_length=120, min_length=30, do_sample=False)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("Summary:", summary)
