#### NER on medical dataset to generate the Problem (including diagnosis), Test and treatment 
#### using the HUMADEX/english_ner_dataset

##### About the dataset 
##### Contain a text of medical text that contains the diagnosis, symtoms, test and treatment 

In [7]:
import transformers 
from datasets import load_dataset
from transformers import  AutoTokenizer, TFAutoModelForTokenClassification 

#### Importing and analysis the dataset

In [9]:
data = load_dataset('HUMADEX/english_ner_dataset')
data

README.md:   0%|          | 0.00/4.32k [00:00<?, ?B/s]

en_medical_data.parquet:   0%|          | 0.00/14.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/445284 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'tags'],
        num_rows: 445284
    })
})

In [10]:
#using just the first 40000 text 
subdata = data['train'].select(range(40000))

subdata

Dataset({
    features: ['sentence', 'tags'],
    num_rows: 40000
})

In [11]:
dic = {}
for i, j in zip(subdata['sentence'][6], subdata['tags'][6]):
    dic[i] = j
    
dic

{'this': 0,
 'may': 0,
 'suggest': 0,
 'autoimmune': 1,
 'or': 2,
 'drug': 2,
 'induced': 2,
 'hemolytic': 2,
 'anemia': 3}

In [12]:
# the nar name and the corresponding nar tag 
name = ["O", "B-PROBLEM", "I-PROBLEM", "E-PROBLEM", "S-PROBLEM", "B-TREATMENT", "I-TREATMENT", "E-TREATMENT", "S-TREATMENT", "B-TEST", "I-TEST", "E-TEST", "S-TEST"]
dig = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

name_dig = {}
for i, j in zip(name, dig):
    name_dig[i] = j
    
dig_name = {v: k for k, v in name_dig.items()}
dig_name
name_dig

{'O': 0,
 'B-PROBLEM': 1,
 'I-PROBLEM': 2,
 'E-PROBLEM': 3,
 'S-PROBLEM': 4,
 'B-TREATMENT': 5,
 'I-TREATMENT': 6,
 'E-TREATMENT': 7,
 'S-TREATMENT': 8,
 'B-TEST': 9,
 'I-TEST': 10,
 'E-TEST': 11,
 'S-TEST': 12}

In [13]:
for i, j in dic.items():
    dic[i] = dig_name[j]
dic

{'this': 'O',
 'may': 'O',
 'suggest': 'O',
 'autoimmune': 'B-PROBLEM',
 'or': 'I-PROBLEM',
 'drug': 'I-PROBLEM',
 'induced': 'I-PROBLEM',
 'hemolytic': 'I-PROBLEM',
 'anemia': 'E-PROBLEM'}

In [14]:
# max and min sentence count 

import pandas as pd
df = pd.DataFrame(subdata)

df['count'] = df['sentence'].apply(lambda x: len(x))

print(max(df['count']),'---', min(df['count']))


303 --- 1


#### Tokenisation and adjust ner_tag to match the new tokenised text

In [15]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]



In [16]:
# word count is now 17 compare to 9 before tokenisation
Token_data = tokenizer(subdata['sentence'][6], is_split_into_words = True)
print(Token_data.tokens(), '\n', subdata['sentence'][6])
print(len(Token_data.tokens()), '\n', len(subdata['sentence'][6]))

['[CLS]', 'this', 'may', 'suggest', 'auto', '##im', '##mu', '##ne', 'or', 'drug', 'induced', 'hem', '##oly', '##tic', 'an', '##emia', '[SEP]'] 
 ['this', 'may', 'suggest', 'autoimmune', 'or', 'drug', 'induced', 'hemolytic', 'anemia']
17 
 9


In [17]:
# word id to track the work index after tokenisation 
tokenizer.is_fast
word_ids = Token_data.word_ids()
tag = subdata['tags'][6]

print(word_ids, '--', tag)
print(subdata['sentence'][6])

[None, 0, 1, 2, 3, 3, 3, 3, 4, 5, 6, 7, 7, 7, 8, 8, None] -- [0, 0, 0, 1, 2, 2, 2, 2, 3]
['this', 'may', 'suggest', 'autoimmune', 'or', 'drug', 'induced', 'hemolytic', 'anemia']


In [18]:
# word_id length to token length

def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label in [3, 7, 11]:
                new_labels.append(label)
                
            else :
                if label % 2 == 1: #and label not in [3,7, 11]:
                    label += 1
                new_labels.append(label)

    return new_labels




In [19]:
# printing one example 
print(tag)
align_labels_with_tokens(tag, word_ids)

[0, 0, 0, 1, 2, 2, 2, 2, 3]


[-100, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, -100]

In [20]:
def tokenize_labels(data):
    tokenized_inputs = tokenizer(
        data["sentence"], truncation=True, is_split_into_words=True
    )
    ner_labels = data["tags"]
    new_labels = []
    for i, labels in enumerate(ner_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [21]:
tokenized_subdata = subdata.map(
    tokenize_labels,
    batched=True,
    remove_columns=subdata.column_names,
)

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

In [22]:
tokenized_subdata

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 40000
})

In [23]:
tokenized_subdata = tokenized_subdata.train_test_split(train_size = 0.8)
tokenized_subdata

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 32000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 8000
    })
})

In [24]:
# Data collator to batch size each batche
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer, return_tensors="tf"
)

In [25]:
# convert to tf 
tf_train_data = tokenized_subdata["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator, shuffle=True,batch_size=16,
)

tf_eval_data = tokenized_subdata["test"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,shuffle=False, batch_size=16,
)

#### Training the model

In [26]:
from transformers import TFAutoModelForTokenClassification

model = TFAutoModelForTokenClassification.from_pretrained(
    "bert-base-cased",
    id2label=dig_name,
    label2id=name_dig,
)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
model.config.num_labels

13

In [36]:
from transformers import create_optimizer
import tensorflow as tf

tf.keras.mixed_precision.set_global_policy("mixed_float16")


num_epochs = 3
num_train_steps = len(tf_train_data) * num_epochs

optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

In [38]:
from huggingface_hub import notebook_login

notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [39]:
from transformers.keras_callbacks import PushToHubCallback
from tensorflow.keras.callbacks import ModelCheckpoint


callback = PushToHubCallback(output_dir="bert-clinical-ner", tokenizer=tokenizer)

#checkpoint_callback = ModelCheckpoint(
#    filepath='clinic_best_model.keras',           
#    monitor='val_loss',                 
#    save_best_only=True,)

model.fit(
    tf_train_data,
    validation_data=tf_eval_data,
    callbacks=[callback],
    epochs=num_epochs,
)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/jeje01/bert-clinical-ner into local empty directory.


Download file tf_model.h5:   0%|          | 8.00k/411M [00:00<?, ?B/s]

Clean file tf_model.h5:   0%|          | 1.00k/411M [00:00<?, ?B/s]

Epoch 1/3
Cause: for/else statement not yet supported
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <gast.gast.Expr object at 0x78af868fc880>
Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x78b384269330>

#### model eveluation

In [44]:
import seqeval
import evaluate

metric = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [45]:
# concate all the predictions into a single list 


import numpy as np

all_predictions = []
all_labels = []
for batch in tf_eval_data:
    logits = model.predict_on_batch(batch)["logits"]
    labels = batch["labels"]
    predictions = np.argmax(logits, axis=-1)
    for prediction, label in zip(predictions, labels):
        for predicted_idx, label_idx in zip(prediction, label):
            if label_idx == -100:
                continue
            all_predictions.append(name[predicted_idx])
            all_labels.append(name[label_idx])
metric.compute(predictions=[all_predictions], references=[all_labels])

{'PROBLEM': {'precision': 0.7518525295032477,
  'recall': 0.7633045416550571,
  'f1': 0.7575352567056871,
  'number': 21534},
 'TEST': {'precision': 0.6954234613361389,
  'recall': 0.6932354483481908,
  'f1': 0.694327731092437,
  'number': 1907},
 'TREATMENT': {'precision': 0.7457210098416773,
  'recall': 0.7444468175993165,
  'f1': 0.7450833689610944,
  'number': 9364},
 'overall_precision': 0.7468817009453051,
 'overall_recall': 0.7538484987044658,
 'overall_f1': 0.7503489289398628,
 'overall_accuracy': 0.8569394906451411}

In [46]:
# Visualising matrics in pandas 

metric_data = metric.compute(predictions=[all_predictions], references=[all_labels])

metric_data = pd.DataFrame(metric_data)

In [47]:
metric_data

Unnamed: 0,PROBLEM,TEST,TREATMENT,overall_precision,overall_recall,overall_f1,overall_accuracy
precision,0.751853,0.695423,0.745721,0.746882,0.753848,0.750349,0.856939
recall,0.763305,0.693235,0.744447,0.746882,0.753848,0.750349,0.856939
f1,0.757535,0.694328,0.745083,0.746882,0.753848,0.750349,0.856939
number,21534.0,1907.0,9364.0,0.746882,0.753848,0.750349,0.856939


#### Model testing 

In [53]:
from transformers import pipeline 
from tensorflow.keras.models import load_model



clinical_data = pipeline("token-classification", 'jeje01/bert-clinical-ner',  aggregation_strategy="simple")
extracted_data = clinical_data('Seyi presented with cough, fever, and chest pain. he had a Chest xray that suggest pnuemonia and sputum culture was taken which later grew Streptococcus pneumoniae. he was started on Co-amox, and paracetamol')
extracted_data2 = clinical_data('James presented with 3 weeks history of fever, headache and vomiting. he had a CT scan that showed viral encephalitis, 4 days after he was having seizures and developed aspiration pneumonia. he was moved to ITU for oxygen, fluids and antibiotics')



Some layers from the model checkpoint at jeje01/bert-clinical-ner were not used when initializing TFBertForTokenClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForTokenClassification were initialized from the model checkpoint at jeje01/bert-clinical-ner.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForTokenClassification for predictions without further training.
Hardware accelerator e.g. GPU is available in the environment, but no `devic

In [54]:
import pandas as pd
pd.DataFrame(extracted_data)

Unnamed: 0,entity_group,score,word,start,end
0,PROBLEM,0.994141,cough,20,25
1,PROBLEM,0.996094,fever,27,32
2,PROBLEM,0.97998,chest,38,43
3,PROBLEM,0.99707,pain,44,48
4,TEST,0.976562,a Chest,57,64
5,TEST,0.977539,xray,65,69
6,PROBLEM,0.988281,pnuemonia,83,92
7,TEST,0.971191,sputum,97,103
8,TEST,0.986328,culture,104,111
9,PROBLEM,0.990234,Streptococcus,139,152


In [55]:
import pandas as pd
pd.DataFrame(extracted_data2)

Unnamed: 0,entity_group,score,word,start,end
0,PROBLEM,0.994141,fever,40,45
1,PROBLEM,0.99707,headache,47,55
2,PROBLEM,0.99707,vomiting,60,68
3,TEST,0.985352,a CT,77,81
4,TEST,0.987305,scan,82,86
5,PROBLEM,0.992188,viral,99,104
6,PROBLEM,0.993652,encephalitis,105,117
7,PROBLEM,0.994141,seizures,146,154
8,PROBLEM,0.995605,aspiration,169,179
9,PROBLEM,0.996094,pneumonia,180,189


### Identify all the problems(history and diagnosis), test and treatment correctly

### Comparing with Hugging face question-answering model

In [8]:
from transformers import pipeline

# Initialize the question-answering pipeline
question_answering = pipeline('question-answering')

# Define the context (text)
text = """
James presented with 3 weeks history of fever, headache and vomiting. 
He had a CT scan that showed viral encephalitis, 4 days after he was having seizures 
and developed aspiration pneumonia. He was moved to ITU for oxygen, fluids, and antibiotics.
"""
questions = ['What test was done?', 'What diagnosis was made?', 'What is the treatment?']

# Loop through each question and get the answer
for q in questions:
    answer = question_answering(question=q, context=text)
    print(f"Q: {q}")
    print(f"A: {answer['answer']}\n")


No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


Q: What test was done?
A: CT scan

Q: What diagnosis was made?
A: aspiration pneumonia

Q: What is the treatment?
A: antibiotics



In [11]:
# imappropriate reply on this text

question_answering = pipeline('question-answering')

text = """ Seyi presented with cough, fever, and chest pain. he had a Chest xray that suggest pnuemonia and sputum culture was taken which later grew Streptococcus pneumoniae. 
he was started on Co-amox, and paracetamol """

questions = ['What test was done?', 'What diagnosis was made?', 'What is the treatment ?']

for q in questions:
    answer = question_answering(question=q, context=text)
    print(f"Q: {q}")
    print(f"A: {answer['answer']}\n")


No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


Q: What test was done?
A: Co-amox, and paracetamol

Q: What diagnosis was made?
A: Chest xray

Q: What is the treatment ?
A: Chest xray



#### Combine the ner on clinical text to extract the data and question and answer pipeline to extract specific information 

In [13]:
# fuction for NER pipleine 
data_pipeline = pipeline("token-classification", 'jeje01/bert-clinical-ner',  aggregation_strategy="simple")


def detail_extractor(text):
    clinical_data = data_pipeline(text)
    clinical_data = pd.DataFrame(clinical_data)
    problem = clinical_data.loc[clinical_data['entity_group'] =='PROBLEM', 'word']
    test = clinical_data.loc[clinical_data['entity_group'] =='TEST', 'word']
    treatment = clinical_data.loc[clinical_data['entity_group'] =='TREATMENT', 'word']


    return (' '.join(list(problem)), ' '.join(list(test)), ' '.join(list(treatment)))

    
    

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/431M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForTokenClassification.

All the layers of TFBertForTokenClassification were initialized from the model checkpoint at jeje01/bert-clinical-ner.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForTokenClassification for predictions without further training.


tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [15]:
text = 'Patient presented with breast pain, fever and redness around the breast skin. she had a breast Ultrasound scan that showed breast abscess, she was treated with gentamicin and drainage in theatre'
extracted_text = detail_extractor(text)
extracted_text

('breast pain fever redness around the breast skin breast abscess',
 'a breast Ultrasound scan',
 'gentamicin drainage')

In [17]:
# Initialize the question-answering pipeline

def QandA(extracted_text):
    Q_A = pipeline('question-answering')
    final_diagnosis = Q_A(question = 'what is the diagnosis', context = extracted_text[0])
    final_investigations = Q_A(question = 'which investigation was done', context = extracted_text[1])
    final_treatment = Q_A(question = 'what treatment was done', context = extracted_text[2])
    return(final_diagnosis, final_investigations, final_treatment)


In [23]:
QandA(extracted_text)


No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


({'score': 0.0950365737080574,
  'start': 49,
  'end': 63,
  'answer': 'breast abscess'},
 {'score': 0.36759376525878906,
  'start': 2,
  'end': 24,
  'answer': 'breast Ultrasound scan'},
 {'score': 0.6242347359657288,
  'start': 0,
  'end': 19,
  'answer': 'gentamicin drainage'})

In [110]:
text2 = """ Seyi presented with cough, fever, and chest pain. he had a Chest xray that suggest pnuemonia and sputum culture was taken which later grew Streptococcus pneumoniae. 
he was started on Co-amox, and paracetamol """

text = detail_extractor(text2)
QandA(text)




No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


({'score': 0.2887604832649231,
  'start': 23,
  'end': 57,
  'answer': 'pnuemonia Streptococcus pneumoniae'},
 {'score': 0.23432958126068115,
  'start': 0,
  'end': 27,
  'answer': 'a Chest xray sputum culture'},
 {'score': 0.8832352161407471,
  'start': 0,
  'end': 21,
  'answer': 'Co - amox paracetamol'})

## Worked much better when the 2 models are combined 