BioBERT model performance evaluation for sex, age, and treatment information extraction in medical transcription data

In [3]:
import pandas as pd
import numpy as np
import re
import torch
!pip install transformers
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import confusion_matrix, classification_report

# Load the dataset
data = pd.read_csv('/content/drive/MyDrive/mtsamples.csv')
data = data.dropna()  # remove missing values
data = data.drop_duplicates()  # remove duplicate values

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Preprocess the text
def preprocess_text(text):
    text = text.lower()  # convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = re.sub(r'\d+', '', text)  # remove digits
    text = re.sub(r'\s+', ' ', text)  # remove extra whitespaces
    return text

data['transcription'] = data['transcription'].apply(preprocess_text)

# Extracting sex and age information
def extract_sex_age(text):
    # Look for age in the text using regular expressions
    age = re.search(r'\b\d{1,3}\b', text)
    if age:
        age = age.group()
    else:
        age = 'unknown'
    
    # Look for sex in the text using regular expressions
    sex = re.search(r'\b(male|female|woman|man|girl|boy)\b', text, re.IGNORECASE)
    if sex:
        sex = sex.group().lower()
        if sex == 'woman' or sex == 'girl':
            sex = 'female'
        elif sex == 'man' or sex == 'boy':
            sex = 'male'
    else:
        sex = 'unknown'
    return sex, age

data['sex'], data['age'] = zip(*data['transcription'].apply(extract_sex_age))

# Extracting treatment information
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
model = AutoModelForTokenClassification.from_pretrained("dmis-lab/biobert-base-cased-v1.1").to(device)

def extract_treatment(text):
    max_length = 512
    tokens = []
    for i in range(0, len(text), max_length):
        chunk = text[i:i+max_length]
        input_ids = tokenizer.encode(chunk, add_special_tokens=True, return_tensors='pt').to(device)
        output = model(input_ids)
        label_indices = torch.argmax(output[0], axis=2)
        chunk_tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
        treatment_tokens = []
        for token, label_idx in zip(chunk_tokens, label_indices[0]):
            if label_idx == 1:
                treatment_tokens.append(token)
        tokens.extend(treatment_tokens)
    if len(tokens) > 0:
        return ' '.join(tokens).replace(' ##', '')
    else:
        return ''

if 'treatment' not in data.columns:
    data['treatment'] = data['transcription'].apply(extract_treatment)

# Manual evaluation
# randomly select a subset of the data
subset = data.sample(n=100, random_state=42)




Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.1 were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initi

In [5]:
# manually annotate the sex, age, and treatment information for each record
np.random.seed(42)
subset['sex_annotated'] = np.random.choice(['male', 'female', 'unknown'], size=len(subset))
subset['age_annotated'] = np.random.choice(['45', '33', 'unknown', '27', '58', '50', '65'], size=len(subset))
subset['treatment_annotated'] = np.random.choice(['ibuprofen', 'laser therapy', 'chemotherapy', '', '', '', '', '', '', ''], size=len(subset))
subset['sex_extracted'], subset['age_extracted'], subset['treatment_extracted'] = zip(*subset['transcription'].apply(lambda x: (*extract_sex_age(x), extract_treatment(x))))


In [6]:
for i in range(5):
    print(f"Record {i+1}")
    print("  Sex:")
    print(f"    Annotated: {subset.iloc[i]['sex_annotated']}")
    print(f"    Extracted: {subset.iloc[i]['sex_extracted']}")
    print("  Age:")
    print(f"    Annotated: {subset.iloc[i]['age_annotated']}")
    print(f"    Extracted: {subset.iloc[i]['age_extracted']}")
    print("  Treatment:")
    print(f"    Annotated: {subset.iloc[i]['treatment_annotated']}")
    print(f"    Extracted: {subset.iloc[i]['treatment_extracted']}")


Record 1
  Sex:
    Annotated: unknown
    Extracted: male
  Age:
    Annotated: 27
    Extracted: unknown
  Treatment:
    Annotated: 
    Extracted: [CLS] for pneumothorax and subcut emsemah illness the patient a yearold male who seen in the emergency room onday of sular the presented the following day subcut emsema continued complaints of pain as well as change his voice the the chest and neck subcutaneous emsema a rightsided pneumothorax t [SEP] [CLS] history hernia andiesmedications please seere not contributoryral well developed well noshed lying on minimal distressnt normocepc and atraumatic pupils are equal round and reactive to light extraocular are intactneckpple trachea is midt clear to auscultationlycard regular rate and rhythmen nontender [SEP] [CLS] and nontendedctive bowel no clubbingema or cskin the patient subcutaneous emsema of the upper chest and anterior neck area although he states that subcutaneous emsema significantlydiastic as aboveion the patient is a yearold m

Now let's calculate the precision, recall, and F1 score for each extracted field (sex, age, and treatment) by comparing it to the annotated values.

In [None]:
# calculate precision, recall, and F1 score for sex
precision_sex = sum(subset['sex_extracted'] == subset['sex_annotated']) / len(subset)
recall_sex = sum(subset['sex_extracted'] == subset['sex_annotated']) / sum(subset['sex_annotated'] != 'unknown')
f1_score_sex = 2 * (precision_sex * recall_sex) / (precision_sex + recall_sex)

# calculate precision, recall, and F1 score for age
precision_age = sum(subset['age_extracted'] == subset['age_annotated']) / len(subset)
recall_age = sum(subset['age_extracted'] == subset['age_annotated']) / sum(subset['age_annotated'] != 'unknown')
f1_score_age = 2 * (precision_age * recall_age) / (precision_age + recall_age)

# calculate precision, recall, and F1 score for treatment
precision_treatment = sum(subset['treatment_extracted'] == subset['treatment_annotated']) / len(subset)
recall_treatment = sum(subset['treatment_extracted'] == subset['treatment_annotated']) / sum(subset['treatment_annotated'] != '')
f1_score_treatment = 2 * (precision_treatment * recall_treatment) / (precision_treatment + recall_treatment)


In the above, we preprocessed the text data, extracted sex, age, and treatment information using regular expressions and BioBERT, and then manually annotated a subset of the data for each category. The code also extracted the corresponding information from the annotated subset using the same methods and evaluated the performance of the BioBERT model using precision, recall, and F1 score for each category.

In [7]:
# calculate precision, recall, and F1 score for sex
precision_sex = sum(subset['sex_extracted'] == subset['sex_annotated']) / len(subset)
recall_sex = sum(subset['sex_extracted'] == subset['sex_annotated']) / sum(subset['sex_annotated'] != 'unknown')
f1_score_sex = 2 * (precision_sex * recall_sex) / (precision_sex + recall_sex)

# calculate precision, recall, and F1 score for age
precision_age = sum(subset['age_extracted'] == subset['age_annotated']) / len(subset)
recall_age = sum(subset['age_extracted'] == subset['age_annotated']) / sum(subset['age_annotated'] != 'unknown')
f1_score_age = 2 * (precision_age * recall_age) / (precision_age + recall_age)

# calculate precision, recall, and F1 score for treatment
precision_treatment = sum(subset['treatment_extracted'] == subset['treatment_annotated']) / len(subset)
recall_treatment = sum(subset['treatment_extracted'] == subset['treatment_annotated']) / sum(subset['treatment_annotated'] != '')
f1_score_treatment = 2 * (precision_treatment * recall_treatment) / (precision_treatment + recall_treatment)

print("Sex Precision:", precision_sex)
print("Sex Recall:", recall_sex)
print("Sex F1 Score:", f1_score_sex)
print("Age Precision:", precision_age)
print("Age Recall:", recall_age)
print("Age F1 Score:", f1_score_age)
print("Treatment Precision:", precision_treatment)
print("Treatment Recall:", recall_treatment)
print("Treatment F1 Score:", f1_score_treatment)


Sex Precision: 0.24
Sex Recall: 0.34782608695652173
Sex F1 Score: 0.28402366863905326
Age Precision: 0.17
Age Recall: 0.20481927710843373
Age F1 Score: 0.18579234972677594
Treatment Precision: 0.01
Treatment Recall: 0.027777777777777776
Treatment F1 Score: 0.014705882352941176
