In [None]:
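# Dataset (MACCROBAT, distributed via figshare): https://figshare.com/articles/dataset/MACCROBAT2018/9764942
# Annotation file format (brat standoff, used by the .ann files): https://brat.nlplab.org/standoff.html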

In [None]:
!pip -q install evaluate accelerate -U

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25h

## Load Dataset

In [None]:
!mkdir MACCROBAT2020

In [None]:
!unzip /content/MACCROBAT2020.zip -d ./MACCROBAT2020

Archive:  /content/MACCROBAT2020.zip
  inflating: ./MACCROBAT2020/26530965.ann  
  inflating: ./MACCROBAT2020/25410883.ann  
  inflating: ./MACCROBAT2020/23864579.ann  
  inflating: ./MACCROBAT2020/23468586.ann  
  inflating: ./MACCROBAT2020/23155491.ann  
  inflating: ./MACCROBAT2020/23124805.ann  
  inflating: ./MACCROBAT2020/22520024.ann  
  inflating: ./MACCROBAT2020/19610147.ann  
  inflating: ./MACCROBAT2020/19307547.ann  
  inflating: ./MACCROBAT2020/19816630.ann  
  inflating: ./MACCROBAT2020/21672201.ann  
  inflating: ./MACCROBAT2020/25572898.ann  
  inflating: ./MACCROBAT2020/23033875.ann  
  inflating: ./MACCROBAT2020/23033875.txt  
  inflating: ./MACCROBAT2020/21129213.ann  
  inflating: ./MACCROBAT2020/28154700.txt  
  inflating: ./MACCROBAT2020/28154700.ann  
  inflating: ./MACCROBAT2020/28154281.txt  
  inflating: ./MACCROBAT2020/28154281.ann  
  inflating: ./MACCROBAT2020/27990013.txt  
  inflating: ./MACCROBAT2020/27990013.ann  
  inflating: ./MACCROBAT2020/27842605.t

# Preprocessing

In [None]:
import os
from typing import List, Dict, Tuple

class Preprocessing_Maccrobat:
    """Turn brat stand-off files (``<id>.txt`` + ``<id>.ann``) into token / BIO-label sequences.

    Every ``.txt`` file found in ``dataset_folder`` is paired with the ``.ann``
    file of the same name. Only text-bound annotations (lines starting with
    ``T``) whose offsets are two plain integers are used.

    Attributes:
        file_ids: Sorted document ids (file names without extension).
        num_samples: Number of documents.
        texts: Raw text of every document, aligned with ``file_ids``.
        tags: Per document, the parsed annotations as dicts with the keys
            ``text``, ``label``, ``start`` and ``end`` (offsets kept as strings).
        tokenizer: Object exposing ``tokenize(text) -> list of tokens``.
    """

    def __init__(self, dataset_folder, tokenizer):
        """Read every document and its annotations from ``dataset_folder``.

        Raises:
            FileNotFoundError: if a ``.txt`` file has no matching ``.ann`` file.
        """
        # Sorted so that sample order (and therefore the label ids and any
        # seeded train/validation split) does not depend on os.listdir order.
        self.file_ids = sorted(
            os.path.splitext(f)[0] for f in os.listdir(dataset_folder) if f.endswith('.txt')
        )
        self.num_samples = len(self.file_ids)

        self.texts: List[str] = []
        self.tags: List[List[Dict[str, str]]] = []
        for file_id in self.file_ids:
            text_path = os.path.join(dataset_folder, file_id + '.txt')
            tag_path = os.path.join(dataset_folder, file_id + '.ann')

            # Text
            with open(text_path, 'r', encoding='utf-8') as file:
                self.texts.append(file.read())

            # Tag
            with open(tag_path, 'r', encoding='utf-8') as file:
                self.tags.append(self._parse_annotations(file.read()))
        self.tokenizer = tokenizer

    @staticmethod
    def _parse_annotations(ann_content: str) -> List[Dict[str, str]]:
        """Parse the text-bound (``T``) lines of one ``.ann`` file."""
        parsed = []
        for line in ann_content.split('\n'):
            if not line.startswith('T'):
                continue
            fields = line.split('\t')
            try:
                label = fields[1].split(' ')
                int(label[1])
                int(label[2])
            except (IndexError, ValueError):
                # Lines without a "<label> <int> <int>" field are skipped on
                # purpose, e.g. discontinuous spans written as "10 20;30 40".
                continue
            parsed.append({
                'text': fields[-1],
                'label': label[0],
                'start': label[1],
                'end': label[2]
            })
        return parsed

    @staticmethod
    def _select_spans(tags: List[Dict[str, str]]) -> List[Dict[str, str]]:
        """Return the annotations ordered by offset, without empty or overlapping spans.

        A flat BIO sequence cannot represent nesting, and emitting overlapping
        spans would duplicate their text. When spans overlap, the one that
        starts first (the longer one on ties) is kept.
        """
        ordered = sorted(tags, key=lambda tag: (int(tag['start']), -int(tag['end'])))
        kept = []
        cursor = 0
        for tag in ordered:
            start, end = int(tag['start']), int(tag['end'])
            if end <= start or start < cursor:
                continue
            kept.append(tag)
            cursor = end
        return kept

    @staticmethod
    def _span_text(full_text: str, offsets: List[int]) -> str:
        """Return the text covered by a non-empty run of consecutive character offsets."""
        return full_text[offsets[0]:offsets[-1] + 1]

    def process(self) -> Tuple[List[List[str]], List[List[str]]]:
        """Tokenize every document and label each token.

        Returns:
            ``(input_texts, input_labels)``: two parallel lists with one entry
            per document; tokens outside any annotation are labelled ``"O"``,
            the first token of an annotation ``"B-<label>"`` and its remaining
            tokens ``"I-<label>"``.
        """
        input_texts = []
        input_labels = []

        for i in range(self.num_samples):
            full_text = self.texts[i]
            tags = self._select_spans(self.tags[i])

            # {'text': '53-year-old', 'label': 'Age', 'start': '2', 'end': '13'}
            # The end offset is exclusive: the annotation covers characters 2..12.
            label_offset = [list(range(int(tag['start']), int(tag['end']))) for tag in tags]
            # A set keeps the membership test below O(1) per character.
            labelled = {position for offset in label_offset for position in offset}
            zero_offset = Preprocessing_Maccrobat.find_continuous_ranges(
                [position for position in range(len(full_text)) if position not in labelled]
            )

            self.tokens = [] # ['a', '53', '-', 'year', '-', 'old', 'man', 'came', 'to', 'our', 'hospital',...]
            self.labels = [] # ['O', 'B-Age', 'I-Age', 'I-Age', 'I-Age', 'I-Age', 'B-Sex', 'B-Clinical_event',...]
            self._merge_offset(full_text, tags, zero_offset, label_offset)
            assert len(self.tokens) == len(self.labels), (
                f"Length of tokens and labels are not equal for document {self.file_ids[i]}"
            )

            input_texts.append(self.tokens)
            input_labels.append(self.labels)

        return input_texts, input_labels

    def _merge_offset(self, full_text, tags, zero_offset, label_offset):
        """Emit unlabelled and labelled runs in text order.

        ``label_offset[j]`` must be the offsets of ``tags[j]``, and both lists
        of runs must be sorted by their first offset.
        """
        i = j = 0
        while i < len(zero_offset) and j < len(label_offset):
            if zero_offset[i][0] < label_offset[j][0]:
                self._add_zero(full_text, zero_offset, i)
                i += 1
            else:
                self._add_label(full_text, label_offset, j, tags)
                j += 1

        while i < len(zero_offset):
            self._add_zero(full_text, zero_offset, i)
            i += 1

        while j < len(label_offset):
            self._add_label(full_text, label_offset, j, tags)
            j += 1

    def _add_zero(self, full_text, offset, index):
        """Tokenize the unlabelled run ``offset[index]`` and label its tokens ``"O"``."""
        text_tokens = self.tokenizer.tokenize(self._span_text(full_text, offset[index]))

        self.tokens.extend(text_tokens)
        self.labels.extend(["O"] * len(text_tokens))

    def _add_label(self, full_text, offset, index, tags):
        """Tokenize the annotated run ``offset[index]`` and label it with ``tags[index]``."""
        text_tokens = self.tokenizer.tokenize(self._span_text(full_text, offset[index]))
        if not text_tokens:
            # Nothing to label (e.g. a whitespace-only span); emitting a lone
            # "B-" label here would desynchronise tokens and labels.
            return

        label = tags[index]['label']
        self.tokens.extend(text_tokens)
        self.labels.extend([f"B-{label}"] + [f"I-{label}"] * (len(text_tokens) - 1))

    @staticmethod
    def build_label2id(tokens: List[List[str]]):
        """Map every label to an integer id.

        ``"O"`` always receives id 0; the other labels are numbered in order of
        first appearance. Id 0 is reserved because the rest of this notebook
        pads label sequences with 0 and masks 0 out of the metric.
        """
        label2id = {"O": 0}
        for sublist in tokens:
            for token in sublist:
                if token not in label2id:
                    label2id[token] = len(label2id)
        return label2id

    @staticmethod
    def find_continuous_ranges(data: List[int]): # [0, 1, 23, 24, 25, 26, 27, 28, 29, 39, 40, 41, 42, 43, 44, 45,...
        """Split an ascending list of integers into runs of consecutive values.

        Example: ``[0, 1, 5, 6, 7, 9] -> [[0, 1], [5, 6, 7], [9]]``.
        """
        if not data:
            return []
        ranges = []
        start = data[0]
        prev = data[0]
        for number in data[1:]:
            if number != prev + 1:
                ranges.append(list(range(start, prev + 1)))
                start = number
            prev = number
        ranges.append(list(range(start, prev + 1)))
        return ranges

In [None]:
from transformers import AutoTokenizer

# Tokenizer of the "d4data/biomedical-ner-all" checkpoint; it is used both to
# split the raw text during preprocessing and to convert tokens to ids later.
tokenizer = AutoTokenizer.from_pretrained("d4data/biomedical-ner-all")

# Folder the dataset archive was extracted into.
dataset_folder = "./MACCROBAT2020"

In [None]:
# Read every document and convert it to parallel token / BIO-label sequences.
Maccrobat_builder = Preprocessing_Maccrobat(dataset_folder, tokenizer)
input_texts, input_labels = Maccrobat_builder.process()

# Label <-> integer id mappings derived from the labels that occur in the data.
label2id = Preprocessing_Maccrobat.build_label2id(input_labels)
id2label = {v: k for k, v in label2id.items()}

In [None]:
# Display the label -> id mapping built from the dataset.
label2id

{'O': 0,
 'B-Age': 1,
 'I-Age': 2,
 'B-Sign_symptom': 3,
 'I-Sign_symptom': 4,
 'B-Disease_disorder': 5,
 'I-Disease_disorder': 6,
 'B-Detailed_description': 7,
 'B-Clinical_event': 8,
 'B-History': 9,
 'I-History': 10,
 'B-Medication': 11,
 'B-Family_history': 12,
 'I-Family_history': 13,
 'B-Date': 14,
 'I-Date': 15,
 'B-Biological_structure': 16,
 'I-Detailed_description': 17,
 'B-Diagnostic_procedure': 18,
 'I-Diagnostic_procedure': 19,
 'B-Lab_value': 20,
 'I-Lab_value': 21,
 'B-Severity': 22,
 'I-Biological_structure': 23,
 'B-Qualitative_concept': 24,
 'I-Qualitative_concept': 25,
 'B-Therapeutic_procedure': 26,
 'I-Therapeutic_procedure': 27,
 'I-Medication': 28,
 'B-Quantitative_concept': 29,
 'I-Quantitative_concept': 30,
 'B-Dosage': 31,
 'I-Dosage': 32,
 'B-Duration': 33,
 'I-Duration': 34,
 'B-Frequency': 35,
 'I-Frequency': 36,
 'B-Coreference': 37,
 'B-Sex': 38,
 'B-Nonbiological_location': 39,
 'I-Nonbiological_location': 40,
 'B-Occupation': 41,
 'I-Occupation': 42,
 '

# Dataset loader

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the documents for validation; the fixed random_state makes
# the split repeatable for a given order of input_texts.
inputs_train, inputs_val, labels_train, labels_val = train_test_split(
    input_texts,
    input_labels,
    test_size=0.2,
    random_state=42
)

In [None]:
import torch
from torch.utils.data import Dataset

# Fixed sequence length every sample is padded / truncated to.
MAX_LEN = 512

class NER_Dataset(Dataset):
    """Torch dataset yielding fixed-length tensors for token classification.

    Args:
        input_texts: One list of tokens per sample.
        input_labels: One list of label strings per sample, aligned with the tokens.
        tokenizer: Object exposing ``convert_tokens_to_ids`` and ``pad_token_id``.
        label2id: Mapping from label string to integer id.
        max_len: Length every returned sequence is padded or truncated to.
        label_pad_id: Id written into the padded label positions. The default
            (0) keeps the previous behaviour, where padding is indistinguishable
            from whatever real label has id 0. Pass -100 (the default
            ``ignore_index`` of ``torch.nn.CrossEntropyLoss``) to exclude padded
            positions from a loss that honours that convention.
    """

    def __init__(self, input_texts, input_labels, tokenizer, label2id, max_len=MAX_LEN, label_pad_id=0):
        super().__init__()
        self.tokens = input_texts
        self.labels = input_labels
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_len = max_len
        self.label_pad_id = label_pad_id

    def __len__(self):
        """Number of samples."""
        return len(self.tokens)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``labels`` and ``attention_mask`` tensors of length ``max_len``.

        Raises:
            KeyError: if a label of the sample is missing from ``label2id``.
        """
        # NOTE(review): tokens are converted as-is; no special tokens such as
        # [CLS]/[SEP] are added here. Confirm that this is intended.
        token_ids = self.tokenizer.convert_tokens_to_ids(self.tokens[idx])
        label_ids = [self.label2id[label] for label in self.labels[idx]]
        attention_mask = [1] * len(token_ids)

        return {
            "input_ids": torch.as_tensor(
                self.pad_and_truncate(token_ids, pad_id=self.tokenizer.pad_token_id)
            ),
            "labels": torch.as_tensor(
                self.pad_and_truncate(label_ids, pad_id=self.label_pad_id)
            ),
            "attention_mask": torch.as_tensor(
                self.pad_and_truncate(attention_mask, pad_id=0)
            ),
        }

    def pad_and_truncate(self, inputs: List[int], pad_id: int):
        """Return ``inputs`` right-padded with ``pad_id`` or cut to exactly ``max_len`` items."""
        if len(inputs) < self.max_len:
            padded_inputs = inputs + [pad_id] * (self.max_len - len(inputs))
        else:
            padded_inputs = inputs[:self.max_len]
        return padded_inputs

    def label2id(self, labels: List[str]):
        """Convert label strings to ids.

        NOTE: on instances this method is shadowed by the ``label2id`` mapping
        assigned in ``__init__`` (``instance.label2id`` is the dict), so it is
        only reachable through the class. It is kept so that the class
        attribute remains available.
        """
        return [self.label2id[label] for label in labels]

In [None]:
# Wrap both splits; both use the default max_len (MAX_LEN).
train_set = NER_Dataset(inputs_train, labels_train, tokenizer, label2id)
val_set = NER_Dataset(inputs_val, labels_val, tokenizer, label2id)

In [None]:
# Sanity check: inspect the tensors produced for one training sample.
train_set[5]

{'input_ids': tensor([ 1037,  6421,  1011,  2095,  1011,  2214,  2158,  2040,  2018,  2042,
          4015,  2000,  2256,  5057,  2282,  2349,  2000,  3255,  3108,  5573,
          2001, 11441,  2007,  2358,  1011,  6903,  6678,  2026, 24755, 25070,
          1999, 14971,  7542,  1006,  7872,  2072,  1012,  2002,  2018,  2053,
         21887,  2854,  3891,  5876,  3272,  2005,  1037,  2381,  1997,  9422,
          2005,  2753,  2086, 21887,  2854,  8830,  2566, 12690, 17191,  3078,
          2001,  2864,  1010,  1998,  1037,  5688, 21262,  2001,  5159,  1999,
          1996,  2187, 15099, 15127, 16749,  5164,  1011,  5044,  2146,  1006,
         20965,  1012, 20720,  1007,  1012,  9380,  2522,  5886, 10127,  3419,
          9888,  1006, 13323, 12126,  3936,  2019,  2753,  1011,  3461,  2146,
          4874,  5744,  1011, 15791,  5976,  1006, 20965,  1012, 26314,  1007,
          1012,  1037,  2029,  2001,  5140,  3081, 26721, 27108,  7856,  2854,
          2004, 16781,  4874,  2246,  2

# Model

In [None]:
from transformers import AutoModelForTokenClassification
# Start from the pretrained checkpoint but with this dataset's label set.
# ignore_mismatched_sizes=True allows the classification head to be
# re-initialised when its size differs from the checkpoint's.
model = AutoModelForTokenClassification.from_pretrained(
    "d4data/biomedical-ner-all",
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True
)
model

config.json:   0%|          | 0.00/5.00k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/266M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at d4data/biomedical-ner-all and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([84]) in the checkpoint and torch.Size([83]) in the model instantiated
- classifier.weight: found shape torch.Size([84, 768]) in the checkpoint and torch.Size([83, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForTokenClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
    

# Training

In [None]:
import evaluate
import numpy as np

# Metric object used by compute_metrics below.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Token-level accuracy over the positions that carry a scored label.

    Args:
        eval_pred: ``(predictions, labels)`` pair, where ``predictions`` holds
            per-class scores on its last axis and ``labels`` the reference ids.

    Returns:
        The result of ``accuracy.compute`` on the unmasked positions.

    Positions whose reference id is 0 are excluded. NER_Dataset pads label
    sequences with 0 by default, and 0 is also the id of 'O' in the mapping
    displayed in this notebook, so genuine 'O' tokens are excluded too.
    Positions labelled -100 (the conventional ignore index) are excluded as
    well, so padded positions are never scored if labels are padded that way.
    """
    predictions, labels = eval_pred
    mask = (labels != 0) & (labels != -100)
    predictions = np.argmax(predictions, axis=-1)
    return accuracy.compute(predictions=predictions[mask], references=labels[mask])

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration: evaluate and save a checkpoint after every epoch.
# NOTE(review): `evaluation_strategy` has been renamed in recent transformers
# releases; since the install cell upgrades packages, confirm the argument
# name against the installed version.
training_args = TrainingArguments(
    output_dir="model",
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    optim="adamw_torch"
)

# The tokenizer is handed to the Trainer alongside the model and datasets;
# compute_metrics is evaluated on val_set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    tokenizer = tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model; checkpoints are written below output_dir ("model").
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.742134,0.273221
2,No log,1.087138,0.583934
3,No log,0.806651,0.678778
4,No log,0.698121,0.740226
5,No log,0.65836,0.748152
6,No log,0.638683,0.758839
7,No log,0.625571,0.76623
8,No log,0.636787,0.767121
9,No log,0.628584,0.77193
10,No log,0.625821,0.776472


TrainOutput(global_step=200, training_loss=0.42792247772216796, metrics={'train_runtime': 277.8425, 'train_samples_per_second': 11.517, 'train_steps_per_second': 0.72, 'total_flos': 418702245888000.0, 'train_loss': 0.42792247772216796, 'epoch': 20.0})

In [None]:
import shutil
# Save the model currently held by the trainer (with load_best_model_at_end=True
# this is the best checkpoint, not necessarily the last step) together with the
# tokenizer. Unlike moving 'model/checkpoint-200', this does not depend on the
# number of training steps and can be re-run when the destination already exists.
# The directory name is kept because the inference cell loads from this path.
trainer.save_model(
    '/content/drive/MyDrive/FPT/Semester 5/AIH/assignment 2/checkpoint-200')

'/content/drive/MyDrive/FPT/Semester 5/AIH/assignment 2/checkpoint-200'

# Inference

In [None]:
def replace(text, replacements = [(' - ', '-'), ('.', ''), (',', '')]):
  """Apply each (old, new) substitution to ``text`` in order and return the result.

  With the default substitutions, spaced hyphens are tightened (' - ' -> '-')
  and every '.' and ',' is removed.
  """
  cleaned = text
  for old, new in replacements:
    cleaned = cleaned.replace(old, new)
  return cleaned

In [None]:
def process_data(data, labels):
  """Join tokens into phrases according to their 'B' / 'I' markers.

  A phrase starts at every 'B' marker (or at an 'I' marker in first position)
  and collects the following tokens, separated by single spaces, until the next
  'B' marker or the end of ``labels``. Each finished phrase is passed through
  ``replace`` before it is stored.

  Returns:
    The list of cleaned phrases, in order.
  """
  phrases = []
  buffer = ""
  last_index = len(labels) - 1

  for position, (token, marker) in enumerate(zip(data, labels)):
    opens_phrase = marker == "B" or (position == 0 and marker == 'I')
    if opens_phrase:
      buffer = token
    elif buffer:
      # Continuation token: only appended while a phrase is open.
      buffer += " " + token

    # Flush when the next marker opens a new phrase or the labels are exhausted.
    next_opens = position + 1 < len(labels) and labels[position + 1] == "B"
    if next_opens or position == last_index:
      phrases.append(replace(buffer))
      buffer = ""

  return phrases

In [None]:
# Example: several 'B' markers, some followed by 'I' continuations; tokens carry
# stray punctuation that replace() removes.
data = ['vaginal', 'bleeding', 'abnormal', 'invasive,', 'non', '-', 'keratinizing.', 'SCC', 'salpingo,', '-', 'oophorectomy']
labels = ['B', 'B', 'B', 'B', 'B', 'I', 'I', 'I', 'B', 'I', 'I']

processed_data = process_data(data, labels)
print(processed_data)

['vaginal', 'bleeding', 'abnormal', 'invasive', 'non-keratinizing SCC', 'salpingo-oophorectomy']


In [None]:
# Example: a sequence that begins with 'I' is still treated as the start of a phrase.
data = ['pulmonary', 'tuberculosis']
labels = ['I', 'I']

processed_data = process_data(data, labels)
print(processed_data)

['pulmonary tuberculosis']


In [None]:
# Hard-coded label mapping used to decode predictions at inference time. It
# replaces the `label2id` / `id2label` built during preprocessing.
# NOTE(review): the ids here differ from the mapping displayed earlier in this
# notebook (e.g. 'B-Sign_symptom' is 13 here but 3 there); confirm that this
# mapping is the one the loaded checkpoint was trained with.
label2id= {'O': 0, 'B-Age': 1,'I-Age': 2,'B-History': 3,'I-History': 4,'B-Occupation': 5,'B-Family_history': 6,'I-Family_history': 7,'B-Clinical_event': 8,'B-Nonbiological_location': 9,'I-Nonbiological_location': 10,'B-Time': 11,'I-Time': 12,'B-Sign_symptom': 13,'I-Sign_symptom': 14,'B-Biological_structure': 15,'I-Biological_structure': 16,'B-Detailed_description': 17,'B-Diagnostic_procedure': 18,'I-Detailed_description': 19,'I-Diagnostic_procedure': 20,'B-Therapeutic_procedure': 21,'I-Therapeutic_procedure': 22,'B-Lab_value': 23,'I-Lab_value': 24,'B-Disease_disorder': 25,'I-Disease_disorder': 26,'B-Severity': 27,'B-Coreference': 28,'B-Shape': 29,'I-Shape': 30,'B-Medication': 31,'I-Medication': 32,'B-Date': 33,'I-Date': 34,'B-Other_entity': 35,'I-Other_entity': 36,'I-Clinical_event': 37,'B-Sex': 38,'B-Activity': 39,'I-Activity': 40,'B-Duration': 41,'I-Duration': 42,'B-Distance': 43,'I-Distance': 44,'I-Coreference': 45,'B-Dosage': 46,'I-Dosage': 47,'B-Administration': 48,'I-Administration': 49,'B-Texture': 50,'I-Severity': 51,'B-Personal_background': 52,'I-Occupation': 53,'I-Texture': 54,'B-Frequency': 55,'B-Volume': 56,'I-Volume': 57,'B-Weight': 58,'I-Weight': 59,'B-Subject': 60,'I-Subject': 61,'B-Outcome': 62,'B-Color': 63,'I-Color': 64,'B-Quantitative_concept': 65,'B-Area': 66,'I-Area': 67,'I-Quantitative_concept': 68,'I-Outcome': 69,'I-Frequency': 70,'B-Qualitative_concept': 71,'B-Other_event': 72,'I-Qualitative_concept': 73,'I-Personal_background': 74,'B-Biological_attribute': 75,'I-Biological_attribute': 76,'I-Other_event': 77,'B-Height': 78,'I-Height': 79,'I-Sex': 80,'B-Mass': 81,'I-Mass': 82}
id2label = {v: k for k, v in label2id.items()}

def predict(model, tokenizer, sentence):
    """Tag ``sentence`` with ``model`` and print the phrases found per entity type.

    Args:
        model: Token-classification model returning an object with ``logits``.
        tokenizer: Object exposing ``convert_tokens_to_ids``.
        sentence: Text to analyse; it is split on whitespace and every
            resulting word is looked up in the tokenizer vocabulary as-is.

    Prints one line per entity type, in order of first appearance, listing the
    phrases assembled by ``process_data``. Returns ``None``.
    """
    words = sentence.split()
    if not words:
        return

    # Prefer the mapping stored with the model so that decoding always matches
    # the label set it was trained with; fall back to the notebook-level one.
    config_mapping = getattr(getattr(model, "config", None), "id2label", None)
    mapping = config_mapping or id2label

    model.eval()  # disable dropout for inference
    device = getattr(model, "device", "cpu")
    input_ids = torch.as_tensor([tokenizer.convert_tokens_to_ids(words)]).to(device)
    with torch.no_grad():
        outputs = model(input_ids)
    _, preds = torch.max(outputs.logits, -1)
    preds = preds[0].cpu().tolist()

    infos = {}
    labels = {}
    for token, pred in zip(words, preds):
        # 'B-Age' -> ('B', '-', 'Age'); labels without a prefix (e.g. 'O') are skipped.
        prefix, separator, ent = mapping[pred].partition('-')
        if not separator:
            continue
        infos.setdefault(ent, []).append(token)
        labels.setdefault(ent, []).append(prefix)

    for ent in infos:
        data = process_data(infos[ent], labels[ent])
        print(f'\033[36m{ent}\033[0m: ', end = '')
        for i, info in enumerate(data, 1):
            print(f'{info}', end = ', ' if i < len(data) else '.')
        print()


In [None]:
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Location of the exported fine-tuned checkpoint (requires the Drive path to be available).
model_path = "/content/drive/MyDrive/FPT/Semester 5/AIH/assignment 2/checkpoint-200"

# Model and tokenizer are both restored from the same directory.
model = AutoModelForTokenClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
test_sentence = """A 48 year - old female presented with vaginal bleeding and abnormal Pap smears. Upon diagnosis of invasive non-keratinizing SCC of the cervix,
                   she underwent a radical hysterectomy with salpingo - oophorectomy which demonstrated positive spread to the pelvic lymph nodes and the parametrium .
                   Pathological examination revealed that the tumour also extensively involved the lower uterine segment. """

# Tag the short sample and print the phrases grouped by entity type.
predict(model, tokenizer, test_sentence)

[36mDetailed_description[0m: vaginal, bleeding, invasive, salpingo-oophorectomy, lower uterine segment.
[36mAge[0m: 48 year-old.
[36mClinical_event[0m: presented.
[36mDiagnostic_procedure[0m: Pathological.
[36mLab_value[0m: abnormal, positive spread.
[36mSex[0m: female.


In [None]:
test_sentence = """The patient was a 34-yr-old man who presented with complaints of fever and a chronic cough.
He was a smoker and had a history of pulmonary tuberculosis that had been treated and cured.
A computed tomographic (CT) scan revealed multiple tiny nodules in both lungs.
A thoracoscopic lung biopsy was taken from the right upper lobe.
The microscopic examination revealed a typical LCH.
The tumor cells had vesicular and grooved nuclei, and they formed small aggregations around the bronchioles (Fig.1).
The tumor cells were strongly positive for S-100 protein, vimentin, CD68 and CD1a.
There were infiltrations of lymphocytes and eosinophils around the tumor cells.
With performing additional radiologic examinations, no other organs were thought to be involved.
He quit smoking, but he received no other specific treatment.
He was well for the following one year.
After this, a follow-up CT scan was performed and it showed a 4 cm-sized mass in the left lower lobe, in addition to the multiple tiny nodules in both lungs (Fig.2).
A needle biopsy specimen revealed the possibility of a sarcoma; therefore, a lobectomy was performed.
Grossly, a 4 cm-sized poorly-circumscribed lobulated gray-white mass was found (Fig.3), and there were a few small satellite nodules around the main mass.
Microscopically, the tumor cells were aggregated in large sheets and they showed an infiltrative growth.
The cytologic features of some of the tumor cells were similar to those seen in a typical LCH.
However, many tumor cells showed overtly malignant cytologic features such as pleomorphic/hyperchromatic nuclei and prominent nucleoli (Fig.4), and multinucleated tumor giant cells were also found.
There were numerous mitotic figures ranging from 30 to 60 per 10 high power fields, and some of them were abnormal.
A few foci of typical LCH remained around the main tumor mass.
Immunohistochemically, the tumor cells were strongly positive for S-100 protein (Fig.5) and vimentin; they were also positive for CD68 (Dako N1577, Clone KPI), and focally positive for CD1a (Fig.6), and they were negative for cytokeratin, epithelial membrane antigen, CD3, CD20 and HMB45.
The ultrastructural analysis failed to demonstrate any Birbeck granules in the cytoplasm of the tumor cells.
Now, at five months after lobectomy, the patient is doing well with no significant change in the radiologic findings."""

# Tag the long case report and print the phrases grouped by entity type.
predict(model, tokenizer, test_sentence)

[36mBiological_structure[0m: lung, right upper, other organs, left lower, both lungs, nuclei, giant cells tumor.
[36mDetailed_description[0m: computed, tiny There, additional radiologic examinations, quit, tiny, needle, biopsy, specimen, small, satellite, large sheets, typical, some of them, membrane.
[36mSign_symptom[0m: fever, mass, mass, tumor well.
[36mCoreference[0m: tumor cells, tumor cells, tumor, tumor cells, tumor cells, tumor cells, tumor cells, tumor.
[36mHistory[0m: pulmonary tuberculosis.
[36mDate[0m: five months after lobectomy.
[36mDiagnostic_procedure[0m: microscopic examination, S-100 protein, analysis.
[36mLab_value[0m: cured, A, tomographic (CT), nodules, lungs, A, thoracoscopic, biopsy, lobe, The, LCH The, vesicular, grooved nuclei, small, aggregations, bronchioles, (Fig1), The, strongly positive, S-100, protein, vimentin, CD68, CD1a, infiltrations, lymphocytes, eosinophils, cells, With, involved, He, smoking, well, year, After, this, follow-up, 4 cm