In [1]:
import pandas as pd
import numpy as np
import re
import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from tqdm import tqdm
tqdm.pandas()

In [2]:
import os

# Location of the input CSV. Set the EHR_DATA_CSV environment variable to
# point the notebook at a different copy of the data; when it is unset the
# original local path is used, so existing runs behave exactly as before.
file_path = os.environ.get("EHR_DATA_CSV", r"D:\archive\final.csv")

In [3]:
# Read the whole CSV referenced by `file_path` into a DataFrame.
df = pd.read_csv(filepath_or_buffer=file_path)

In [8]:
# Show the loaded DataFrame as this cell's output.
df

Unnamed: 0,text,label
0,<ALLERGIES> No Known Allergies / Adverse Drug...,"HCV cirrhosis c/b ascites, hiv on ART, h/o IV..."
1,<ALLERGIES> Percocet <CHIEF COMPLAINT> abdomi...,"with HIV on HAART, HCV cirrhosis with ascites..."
2,<ALLERGIES> omeprazole <CHIEF COMPLAINT> dysp...,No cardiac disease mentioned
3,<ALLERGIES> omeprazole / Iodine and Iodide Co...,No cardiac disease mentioned
4,<ALLERGIES> No Known Allergies / Adverse Drug...,No cardiac disease mentioned
...,...,...
270028,<ALLERGIES> No Known Allergies / Adverse Drug...,No cardiac disease mentioned
270029,<ALLERGIES> Lamictal / hydrochlorothiazide <C...,No cardiac disease mentioned
270030,<ALLERGIES> Patient recorded as having No Kno...,No cardiac disease mentioned
270031,<ALLERGIES> Patient recorded as having No Kno...,Mr. was admitted with status epilepticus. His ...


In [5]:
pip install pandas numpy scikit-learn transformers torch tqdm


Note: you may need to restart the kernel to use updated packages.


In [None]:
# Expose the model input/target under the canonical names `text` and `label`.
# Exports that use the tokenised column names are copied across; a frame that
# already carries `text`/`label` (like the one displayed above) is left as is.
# Guarding on column presence avoids a KeyError there and makes this cell
# safe to re-run after the frame has been reduced to the two columns.
for source_col, target_col in (
    ("augmented_input_tokens", "text"),
    ("target_tokens", "label"),
):
    if source_col in df.columns:
        df[target_col] = df[source_col]

# Keep only the two columns used downstream.
df = df[["text", "label"]]

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"],
    df["label"],
    random_state=42,
    test_size=0.2,
)


In [10]:
from transformers import AutoTokenizer
from tqdm import tqdm
import torch

# Checkpoint identifier; the same name is reused when the model is loaded later.
model_name = "emilyalsentzer/Bio_ClinicalBERT"
# Fetch the tokenizer that matches this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

def batch_tokenize(texts, batch_size=100, max_length=512):
    """Tokenize ``texts`` in fixed-size batches with the module-level ``tokenizer``.

    Every sequence is truncated and padded to exactly ``max_length`` tokens, so
    the per-batch tensors can be concatenated along the first dimension.

    Args:
        texts: Iterable of strings. It is materialised into a list first, so a
            pandas Series or a generator is accepted as well as a list.
        batch_size: Number of texts handed to the tokenizer per call.
        max_length: Fixed sequence length used for truncation and padding.

    Returns:
        A tuple ``(input_ids, attention_masks)`` of tensors, each with one row
        per input text and ``max_length`` columns. Empty input yields two
        ``(0, max_length)`` tensors.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # The tokenizer call below expects a plain list of strings; slicing a
    # Series would hand it a Series instead.
    texts = list(texts)

    # torch.cat refuses an empty list, so return correctly shaped empty tensors.
    if not texts:
        empty_ids = torch.empty((0, max_length), dtype=torch.long)
        return empty_ids, empty_ids.clone()

    input_ids = []
    attention_masks = []

    for start in tqdm(range(0, len(texts), batch_size), desc="Tokenizing"):
        encodings = tokenizer(
            texts[start:start + batch_size],
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_tensors="pt"  # returns PyTorch tensors directly
        )
        input_ids.append(encodings['input_ids'])
        attention_masks.append(encodings['attention_mask'])

    # Concatenate all batches along the row dimension.
    return torch.cat(input_ids), torch.cat(attention_masks)


  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Encode both splits; the Series are converted to plain lists before tokenizing.
train_input_ids, train_attention_masks = batch_tokenize(
    train_texts.tolist(),
    batch_size=64,
)
val_input_ids, val_attention_masks = batch_tokenize(
    val_texts.tolist(),
    batch_size=64,
)


Tokenizing: 100%|██████████| 3376/3376 [02:27<00:00, 22.83it/s]
Tokenizing: 100%|██████████| 844/844 [00:43<00:00, 19.25it/s]


In [None]:
from sklearn.preprocessing import LabelEncoder
import torch

# Initialize encoder
label_encoder = LabelEncoder()

# Fit on the union of train and validation labels so that transforming the
# validation split never meets an unseen class. pd.concat stacks the two
# Series; `train_labels + val_labels` would instead add them element-wise by
# index, and because the two splits share no index values every result
# would be NaN.
# NOTE(review): the label column shown earlier holds free-text strings, so
# this produces one class per distinct string — confirm that is intended.
all_labels = pd.concat([train_labels, val_labels])
label_encoder.fit(all_labels)

# Transform string labels to integer class ids
train_labels_encoded = label_encoder.transform(train_labels)
val_labels_encoded = label_encoder.transform(val_labels)

# Convert to PyTorch tensors; class-index targets are stored as int64 (long).
train_labels_tensor = torch.tensor(train_labels_encoded, dtype=torch.long)
val_labels_tensor = torch.tensor(val_labels_encoded, dtype=torch.long)


In [None]:
from torch.utils.data import TensorDataset

# TensorDataset requires every member to be a tensor with the same first
# dimension, so pass the integer-encoded label tensors rather than the raw
# pandas Series of label strings.
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels_tensor)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels_tensor)


In [None]:
import inspect

import torch
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score

# Size the classification head from the fitted encoder so that every encoded
# label id is a valid class index (a fixed 2-way head breaks as soon as the
# encoder has produced more than two classes).
num_labels = len(label_encoder.classes_)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

training_kwargs = dict(
    output_dir="./ehr_model",
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_strategy="epoch",
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)
# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`;
# use whichever keyword the installed version accepts.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)
training_kwargs[eval_strategy_key] = "epoch"
training_args = TrainingArguments(**training_kwargs)


def compute_metrics(eval_pred):
    """Return accuracy and F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)`` supplied by the Trainer.

    Returns:
        Dict with ``"accuracy"`` and ``"f1"``. F1 uses binary averaging for a
        two-class head and macro averaging otherwise, because binary averaging
        is rejected by scikit-learn for multi-class targets.
    """
    logits, labels = eval_pred
    preds = logits.argmax(-1)
    f1_average = "binary" if num_labels == 2 else "macro"
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average=f1_average),
    }


def collate_tensor_tuples(batch):
    """Turn a list of ``(input_ids, attention_mask, labels)`` tuples into model kwargs.

    The datasets are built as TensorDatasets, which yield plain tuples; the
    model is called with keyword arguments, so the fields are stacked and named
    here.
    """
    input_ids, attention_mask, labels = zip(*batch)
    return {
        "input_ids": torch.stack(input_ids),
        "attention_mask": torch.stack(attention_mask),
        "labels": torch.stack(labels),
    }


trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_tensor_tuples,
    compute_metrics=compute_metrics,
)
# Newer transformers releases take the tokenizer as `processing_class`;
# older ones only know `tokenizer`.
tokenizer_key = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)
trainer_kwargs[tokenizer_key] = tokenizer

trainer = Trainer(**trainer_kwargs)
