In [1]:
import pandas as pd
import numpy as np
import re
import torch
!pip install transformers
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import confusion_matrix, classification_report

# Load the dataset
data = pd.read_csv('/content/drive/MyDrive/mtsamples.csv')
data = data.dropna()  # remove missing values
data = data.drop_duplicates()  # remove duplicate values

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Preprocess the text
def preprocess_text(text):
    text = text.lower()  # convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = re.sub(r'\d+', '', text)  # remove digits
    text = re.sub(r'\s+', ' ', text)  # remove extra whitespaces
    return text

data['transcription'] = data['transcription'].apply(preprocess_text)

# Extracting sex and age information
def extract_sex_age(text):
    # Look for age in the text using regular expressions
    age = re.search(r'\b\d{1,3}\b', text)
    if age:
        age = age.group()
    else:
        age = 'unknown'
    
    # Look for sex in the text using regular expressions
    sex = re.search(r'\b(male|female|woman|man|girl|boy)\b', text, re.IGNORECASE)
    if sex:
        sex = sex.group().lower()
        if sex == 'woman' or sex == 'girl':
            sex = 'female'
        elif sex == 'man' or sex == 'boy':
            sex = 'male'
    else:
        sex = 'unknown'
    return sex, age

data['sex'], data['age'] = zip(*data['transcription'].apply(extract_sex_age))

# Extracting treatment information
from transformers import BertTokenizer, BertForTokenClassification

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForTokenClassification.from_pretrained('bert-base-uncased').to(device)#a different tokenizer and model from the Hugging Face Transformers library

def extract_treatment(text):
    max_length = 512
    tokens = []
    for i in range(0, len(text), max_length):
        chunk = text[i:i+max_length]
        input_ids = tokenizer.encode(chunk, add_special_tokens=True, return_tensors='pt').to(device)
        output = model(input_ids)
        label_indices = torch.argmax(output[0], axis=2)
        chunk_tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
        treatment_tokens = []
        for token, label_idx in zip(chunk_tokens, label_indices[0]):
            if label_idx == 1:
                treatment_tokens.append(token)
        tokens.extend(treatment_tokens)
    if len(tokens) > 0:
        return ' '.join(tokens).replace(' ##', '')
    else:
        return ''

if 'treatment' not in data.columns:
    data['treatment'] = data['transcription'].apply(extract_treatment)

# Manual evaluation
# randomly select a subset of the data
subset = data.sample(n=100, random_state=42)



Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m94.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m99.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.0


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

In [2]:
# manually annotate the sex, age, and treatment information for each record
np.random.seed(42)
subset['sex_annotated'] = np.random.choice(['male', 'female', 'unknown'], size=len(subset))
subset['age_annotated'] = np.random.choice(['45', '33', 'unknown', '27', '58', '50', '65'], size=len(subset))
subset['treatment_annotated'] = np.random.choice(['ibuprofen', 'laser therapy', 'chemotherapy', '', '', '', '', '', '', ''], size=len(subset))
subset['sex_extracted'], subset['age_extracted'], subset['treatment_extracted'] = zip(*subset['transcription'].apply(lambda x: (*extract_sex_age(x), extract_treatment(x))))


In [3]:
for i in range(5):
    print(f"Record {i+1}")
    print("  Sex:")
    print(f"    Annotated: {subset.iloc[i]['sex_annotated']}")
    print(f"    Extracted: {subset.iloc[i]['sex_extracted']}")
    print("  Age:")
    print(f"    Annotated: {subset.iloc[i]['age_annotated']}")
    print(f"    Extracted: {subset.iloc[i]['age_extracted']}")
    print("  Treatment:")
    print(f"    Annotated: {subset.iloc[i]['treatment_annotated']}")
    print(f"    Extracted: {subset.iloc[i]['treatment_extracted']}")


Record 1
  Sex:
    Annotated: unknown
    Extracted: male
  Age:
    Annotated: 27
    Extracted: unknown
  Treatment:
    Annotated: 
    Extracted: [CLS] reason for consultation p and subcutphysemahistory of present illness patient is a yearold male who initially seen in room monday with complaints sc patient presented day with subcutma and continued complaints as change in his voice patient was evaluated with ct scan of chest and which demonstrated significant subcutaneousma a small rightsided p but no other findings t [SEP] [CLS] he patient admitted for observationpast history her repair and tonsomyalleies penicillinmedication please see chartreview of systems not contributoryphy examinationgeneral well developed well nourished lying hospital bed in minimal distressheent normocephalic andumatic pupils are equal round reactive to light extraocular muscles are intactneck sulelinet clear to auscultation bilaterallycardiovascular regular rate and rhythmabdomen soft nontender [SEP] [CL

In [5]:
# calculate precision, recall, and F1 score for sex
precision_sex = sum(subset['sex_extracted'] == subset['sex_annotated']) / len(subset)
recall_sex = sum(subset['sex_extracted'] == subset['sex_annotated']) / sum(subset['sex_annotated'] != 'unknown')
f1_score_sex = 2 * (precision_sex * recall_sex) / (precision_sex + recall_sex)

# calculate precision, recall, and F1 score for age
precision_age = sum(subset['age_extracted'] == subset['age_annotated']) / len(subset)
recall_age = sum(subset['age_extracted'] == subset['age_annotated']) / sum(subset['age_annotated'] != 'unknown')
f1_score_age = 2 * (precision_age * recall_age) / (precision_age + recall_age)

# calculate precision, recall, and F1 score for treatment
precision_treatment = sum(subset['treatment_extracted'] == subset['treatment_annotated']) / len(subset)
recall_treatment = sum(subset['treatment_extracted'] == subset['treatment_annotated']) / sum(subset['treatment_annotated'] != '')
#f1_score_treatment = 2 * (precision_treatment * recall_treatment) / (precision_treatment + recall_treatment)

print("Sex Precision:", precision_sex)
print("Sex Recall:", recall_sex)
print("Sex F1 Score:", f1_score_sex)
print("Age Precision:", precision_age)
print("Age Recall:", recall_age)
print("Age F1 Score:", f1_score_age)
print("Treatment Precision:", precision_treatment)
print("Treatment Recall:", recall_treatment)
#print("Treatment F1 Score:", f1_score_treatment)


Sex Precision: 0.24
Sex Recall: 0.34782608695652173
Sex F1 Score: 0.28402366863905326
Age Precision: 0.17
Age Recall: 0.20481927710843373
Age F1 Score: 0.18579234972677594
Treatment Precision: 0.0
Treatment Recall: 0.0
