In [1]:
# !pip install seqeval

In [2]:
import pandas as pd
import re
from transformers import AutoTokenizer
from datasets import Dataset, Features, Value, ClassLabel, Sequence
from seqeval.metrics import accuracy_score
# from seqeval.metrics import classification_report
# from seqeval.metrics import precision_score

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df = pd.read_csv('../datasets/mtsamples.csv')
df.head()

Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


### Required Functions

In [4]:
def get_transcription_types():
    sub_ls = []
    for ts in df['transcription']:
        data = get_initials(ts)
        if data is not None:
            sub_ls.append(data)


    unique_sub_ls = tuple()
    for sub in sub_ls:
        if unique_sub_ls.count(sub) == 0:
            unique_sub_ls = unique_sub_ls + (sub,)

    return sorted(unique_sub_ls)


def get_initials(data):
    data = str(data)
    subject = ''
    subject = data.split(',')
    if len(subject[0]) < 50:
        subject[0] = subject[0].strip()
        words = subject[0].split(' ')
        if words[0] == words[0].upper():
            if subject[0][-1] == ':':
                subject[0] = subject[0][:-1]
                return subject[0]
    

def get_tokens_with_entities(raw_text: str):
    # split the text by spaces only if the space does not occur between square brackets
    # we do not want to split "multi-word" entity value yet
    raw_tokens = re.split(r"\s(?![^\[]*\])", raw_text)

    # a regex for matching the annotation according to our notation [entity_value](entity_name)
    entity_value_pattern = r"\[(?P<value>.+?)\]\((?P<entity>.+?)\)"
    entity_value_pattern_compiled = re.compile(entity_value_pattern, flags=re.I|re.M)

    tokens_with_entities = []

    for raw_token in raw_tokens:
        match = entity_value_pattern_compiled.match(raw_token)
        if match:
            raw_entity_name, raw_entity_value = match.group("entity"), match.group("value")

            # we prefix the name of entity differently
            # B- indicates beginning of an entity
            # I- indicates the token is not a new entity itself but rather a part of existing one
            for i, raw_entity_token in enumerate(re.split("\s", raw_entity_value)):
                entity_prefix = "B" if i == 0 else "I"
                entity_name = f"{entity_prefix}-{raw_entity_name}"
                tokens_with_entities.append((raw_entity_token, entity_name))
        else:
            tokens_with_entities.append((raw_token, "O"))

    return tokens_with_entities

def remove_symptom_notation(text):
    # Remove [symptom] notation
    text = re.sub(r'\[(.*?)\]', r'\1', text)
    # Remove (symptom) notation
    text = re.sub(r'\((.*?)\)', r'\1', text)
    # Remove the word "symptom"
    text = text.replace('symptom', '')
    if not text[-1].isalpha(): text = text[:-1]
    return text

def set_token_entities(text, predicted_text):
    
    symptom_pos = []
    for entity in predicted_text:
        symptom_pos.append((entity['start'], entity['end']))
    
    entity_set = []
    done = []
    
    for coord in symptom_pos:
        extracted_text = text[coord[0]: coord[1]]
        
        words = extracted_text.split(' ')
        if len(words) == 1:
            entity_set.append((words[0], 'B-symptom'))
            done.append(words[0])
        else:
            for i in range(len(words)):
                if i == 0: 
                    entity_set.append((words[i], 'B-symptom'))
                else:
                    entity_set.append((words[i], 'I-symptom'))
                done.append(words[i])
                
    words = text.split(' ')
    for i in range(len(words)):
        if words[i] not in done:
            entity_set.append((words[i], 'O'))
            
    # reorder list 
    final_output = []
    for word in words:
        for item in entity_set:
            if word == item[0]:
                final_output.append(item)
                break
    
    return final_output

def evaluate(y_true, y_pred, verbose):
    
    a = []
    b = []
    for tple in y_true:
        a.append(tple[1])
        
    for tple in y_pred:
        b.append(tple[1])
    
    if verbose:
        print("Accuracy: ", round(accuracy_score([a], [b]), 2))
        print("===\n")
        
    return round(accuracy_score([a], [b]), 2)
    
    
print('done')

done


### Exploring the Dataset
**It's necessary as we want to find out the appropiate data points that are suitable to train our model**

In [5]:
# finding the counts of each of transcription types
ts_type = get_transcription_types()
ts_type_count = {}

for item in ts_type:
    ts_type_count[item] = 0
        
for idx, ndf in df.iterrows():
    data = ndf['transcription']
    initial = get_initials(data)
    if initial is not None:
        ts_type_count[str(initial)] += 1

sorted_ts_count = dict(sorted(ts_type_count.items(), key=lambda item: item[0], reverse=False))
# for ts, cnt in sorted_ts_count.items():
#     print(ts, cnt)

In [6]:
with open('complain.txt', 'w+') as f:
    # save only the subjective transciptions
    for idx, ndf in df.iterrows():
        data = ndf['transcription']
        initial = get_initials(data)
        if initial == 'SUBJECTIVE':
            transcript = ndf['transcription']
            parts = transcript.split(':,')
#             print(transcript)
            f.write(transcript)
            f.write('\n\n')

#             print('>>', end=' ')
#             print(ndf['keywords'])
#             print('--\n')
            
    f.close

### Dataset Maker Function for Training with Transformers

In [15]:
class NERDataMaker:
    def __init__(self, texts):
        self.unique_entities = []
        self.processed_texts = []

        temp_processed_texts = []
        for text in texts:
            tokens_with_entities = get_tokens_with_entities(text)
            for _, ent in tokens_with_entities:
                if ent not in self.unique_entities:
                    self.unique_entities.append(ent)
            temp_processed_texts.append(tokens_with_entities)

        self.unique_entities.sort(key=lambda ent: ent if ent != "O" else "")

        for tokens_with_entities in temp_processed_texts:
            self.processed_texts.append([(t, self.unique_entities.index(ent)) for t, ent in tokens_with_entities])

    @property
    def id2label(self):
        return dict(enumerate(self.unique_entities))

    @property
    def label2id(self):
        return {v:k for k, v in self.id2label.items()}

    def __len__(self):
        return len(self.processed_texts)

    def __getitem__(self, idx):
        def _process_tokens_for_one_text(id, tokens_with_encoded_entities):
            ner_tags = []
            tokens = []
            for t, ent in tokens_with_encoded_entities:
                ner_tags.append(ent)
                tokens.append(t)

            return {
                "id": id,
                "ner_tags": ner_tags,
                "tokens": tokens
            }

        tokens_with_encoded_entities = self.processed_texts[idx]
        if isinstance(idx, int):
            return _process_tokens_for_one_text(idx, tokens_with_encoded_entities)
        else:
            return [_process_tokens_for_one_text(i+idx.start, tee) for i, tee in enumerate(tokens_with_encoded_entities)]

    def as_hf_dataset(self, tokenizer):
        
        def tokenize_and_align_labels(examples):
            tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

            labels = []
            for i, label in enumerate(examples[f"ner_tags"]):
                word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
                previous_word_idx = None
                label_ids = []
                for word_idx in word_ids:  # Set the special tokens to -100.
                    if word_idx is None:
                        label_ids.append(-100)
                    elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                        label_ids.append(label[word_idx])
                    else:
                        label_ids.append(-100)
                    previous_word_idx = word_idx
                labels.append(label_ids)

            tokenized_inputs["labels"] = labels
            return tokenized_inputs

        ids, ner_tags, tokens = [], [], []
        for i, pt in enumerate(self.processed_texts):
            ids.append(i)
            pt_tokens,pt_tags = list(zip(*pt))
            ner_tags.append(pt_tags)
            tokens.append(pt_tokens)
        data = {
            "id": ids,
            "ner_tags": ner_tags,
            "tokens": tokens
        }
        features = Features({
            "tokens": Sequence(Value("string")),
            "ner_tags": Sequence(ClassLabel(names=self.unique_entities)),
            "id": Value("int32")
        })
        print("Features: ", features)
        ds = Dataset.from_dict(data, features)
        tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)
        return tokenized_ds
    
print('done')

done


### Reading Data 

In [16]:
data = ""
with open('../datasets/annotated.txt', 'r') as f:
    lines = f.readlines()
    for line in lines:
        data += line

In [17]:
import random

temp = """1. This 35-year-old male patient comes in with complaints of [headache](symptom) and [nausea](symptom) after a recent head injury.
2. A 28-year-old female reports [shortness of breath](symptom), [chest pain](symptom), and [palpitations](symptom) during physical activity.
3. The 42-year-old patient describes [fever](symptom), [cough](symptom), and [fatigue](symptom) as their main symptoms.
4. This 19-year-old student presents with [fever](symptom), [sore throat](symptom), and [loss of taste and smell](symptom) for the past few days.
5. A 50-year-old male patient experiences [joint pain](symptom), [muscle weakness](symptom), and [fatigue](symptom).
6. The 31-year-old athlete complains of [knee pain](symptom), [swelling](symptom), and [limited range of motion](symptom) after a sports injury.
7. This 60-year-old woman reports [back pain](symptom), [numbness in legs](symptom), and [weakness](symptom) in her lower limbs.
8. A 25-year-old individual presents with [abdominal pain](symptom), [bloating](symptom), and [constipation](symptom).
9. The 48-year-old patient describes [vision problems](symptom), [headache](symptom), and [dizziness](symptom) for the past week.
10. This 22-year-old male reports [skin rash](symptom), [itching](symptom), and [redness](symptom) on various parts of the body."""

lines = data.strip().split('\n')
random.shuffle(lines)
print(len(lines))

train_ratio = 0.8  # 80% for training, 20% for testing

# Calculate the split point based on the ratio
split_point = int(len(lines) * train_ratio)

train_lines = lines[:split_point]
test_lines = lines[split_point:]

train_text = '\n'.join(train_lines)
test_text = '\n'.join(test_lines)

train_dm = NERDataMaker(train_text.split('\n'))
test_dm = NERDataMaker(test_text.split('\n'))

print(len(train_dm), len(test_dm))

128
102 26


In [18]:
from transformers import AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

print(train_dm.unique_entities)

model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=len(train_dm.unique_entities), id2label=train_dm.id2label, label2id=train_dm.label2id)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,             # keep it 2e-5
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=11,
    weight_decay=0.01,
)

train_ds = train_dm.as_hf_dataset(tokenizer=tokenizer)
test_ds = test_dm.as_hf_dataset(tokenizer=tokenizer)

print(train_ds[0])

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds, # eval on training set! ONLY for DEMO!! have to split the data into train test split
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

['O', 'B-symptom', 'I-symptom']


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Features:  {'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-symptom', 'I-symptom'], id=None), length=-1, id=None), 'id': Value(dtype='int32', id=None)}


Map: 100%|██████████| 102/102 [00:00<00:00, 5407.29 examples/s]


Features:  {'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-symptom', 'I-symptom'], id=None), length=-1, id=None), 'id': Value(dtype='int32', id=None)}


Map: 100%|██████████| 26/26 [00:00<00:00, 3885.28 examples/s]


{'id': 0, 'ner_tags': [0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0], 'tokens': ['She', 'also', 'experiences', 'vertigo', 'and', 'lightheadedness', 'but', 'does', 'not', 'have', 'these', 'symptoms', 'presently.'], 'input_ids': [101, 2016, 2036, 6322, 28246, 1998, 2422, 4974, 2098, 2791, 2021, 2515, 2025, 2031, 2122, 8030, 12825, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 0, 0, 0, 1, 0, 1, -100, -100, -100, 0, 0, 0, 0, 0, 0, 0, -100, -100]}


  0%|          | 0/77 [00:00<?, ?it/s]You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
                                              
  9%|▉         | 7/77 [00:13<01:57,  1.68s/it]

{'eval_loss': 0.7826511859893799, 'eval_runtime': 0.635, 'eval_samples_per_second': 40.946, 'eval_steps_per_second': 3.15, 'epoch': 1.0}


                                               
 18%|█▊        | 14/77 [00:29<02:11,  2.09s/it]

{'eval_loss': 0.6918734908103943, 'eval_runtime': 1.8731, 'eval_samples_per_second': 13.881, 'eval_steps_per_second': 1.068, 'epoch': 2.0}


                                               
 27%|██▋       | 21/77 [00:40<01:30,  1.61s/it]

{'eval_loss': 0.5975918173789978, 'eval_runtime': 0.6101, 'eval_samples_per_second': 42.615, 'eval_steps_per_second': 3.278, 'epoch': 3.0}


                                               
 36%|███▋      | 28/77 [00:51<01:08,  1.41s/it]

{'eval_loss': 0.5262717008590698, 'eval_runtime': 0.5407, 'eval_samples_per_second': 48.087, 'eval_steps_per_second': 3.699, 'epoch': 4.0}


                                               
 45%|████▌     | 35/77 [01:02<00:54,  1.30s/it]

{'eval_loss': 0.4824139475822449, 'eval_runtime': 0.5364, 'eval_samples_per_second': 48.47, 'eval_steps_per_second': 3.728, 'epoch': 5.0}


                                               
 55%|█████▍    | 42/77 [01:13<00:49,  1.40s/it]

{'eval_loss': 0.44765955209732056, 'eval_runtime': 0.5908, 'eval_samples_per_second': 44.007, 'eval_steps_per_second': 3.385, 'epoch': 6.0}


                                               
 64%|██████▎   | 49/77 [01:25<00:40,  1.45s/it]

{'eval_loss': 0.4399275779724121, 'eval_runtime': 0.6226, 'eval_samples_per_second': 41.761, 'eval_steps_per_second': 3.212, 'epoch': 7.0}


                                               
 73%|███████▎  | 56/77 [01:39<00:36,  1.73s/it]

{'eval_loss': 0.4371403455734253, 'eval_runtime': 0.5504, 'eval_samples_per_second': 47.234, 'eval_steps_per_second': 3.633, 'epoch': 8.0}


                                               
 82%|████████▏ | 63/77 [01:49<00:19,  1.40s/it]

{'eval_loss': 0.4178934693336487, 'eval_runtime': 0.6113, 'eval_samples_per_second': 42.53, 'eval_steps_per_second': 3.272, 'epoch': 9.0}


                                               
 91%|█████████ | 70/77 [02:00<00:09,  1.35s/it]

{'eval_loss': 0.41518300771713257, 'eval_runtime': 0.5037, 'eval_samples_per_second': 51.613, 'eval_steps_per_second': 3.97, 'epoch': 10.0}


                                               
100%|██████████| 77/77 [02:12<00:00,  1.72s/it]

{'eval_loss': 0.41558510065078735, 'eval_runtime': 0.6527, 'eval_samples_per_second': 39.835, 'eval_steps_per_second': 3.064, 'epoch': 11.0}
{'train_runtime': 132.101, 'train_samples_per_second': 8.494, 'train_steps_per_second': 0.583, 'train_loss': 0.49372541749632204, 'epoch': 11.0}





TrainOutput(global_step=77, training_loss=0.49372541749632204, metrics={'train_runtime': 132.101, 'train_samples_per_second': 8.494, 'train_steps_per_second': 0.583, 'train_loss': 0.49372541749632204, 'epoch': 11.0})

In [19]:
from datasets import load_metric
metric = load_metric("accuracy")

# Evaluate the model on the test dataset
results = trainer.evaluate()

# Calculate accuracy
accuracy = results
print(f"Validation Loss: {round(accuracy['eval_loss'], 3)}")

  metric = load_metric("accuracy")
Downloading builder script: 4.21kB [00:00, 6.47MB/s]                   
100%|██████████| 2/2 [00:00<00:00,  7.59it/s]

Validation Loss: 0.416





In [20]:
# Evaluating the model
from transformers import pipeline

pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple") # pass device=0 if using gpu

test_data = test_text.split('\n')
avg_acc = []
for data in test_data:
    
    y_true = get_tokens_with_entities(data)
    
    data = remove_symptom_notation(data)
    predicted_text = pipe(data)
    
    y_pred = set_token_entities(data, predicted_text)
    
    avg_acc.append(evaluate(y_true, y_pred, False))
    
print("Average Accuracy: ", round(sum(avg_acc)/len(avg_acc), 3))

Average Accuracy:  0.809


In [25]:
from transformers import pipeline

pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple") # pass device=0 if using gpu
data = 'A sign for example may be a higher or lower temperature than normal, raised or lowered blood pressure or an abnormality showing on a medical scan.'
data = remove_symptom_notation(data)
predicted_text = pipe(data)
for i in range(len(predicted_text)):
    print('Symptom: ', predicted_text[i]['word'])

Symptom:  higher
Symptom:  lower temperature
Symptom:  raised
Symptom:  lowered blood pressure
Symptom:  abnormal
