# 1) Install/import important libraries

In [1]:
# Install spacy-transformers to get access to transformer models
!pip install spacy-transformers



In [2]:
import spacy
import numpy as np
import os
import glob
import re
from sklearn.model_selection import train_test_split

# 2) Preparing and processing data

In [3]:
# Mount google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# create a function to process annotation with text data
def process_annotation_file(text_file, ann_file):
    """Read one document and the entity spans from its standoff annotation file.

    Parameters
    ----------
    text_file : str
        Path to the UTF-8 encoded document text.
    ann_file : str
        Path to the UTF-8 encoded annotation file. Lines are tab-separated;
        only lines with exactly three fields whose ID (first field) starts
        with "T" are used. Their second field is read as
        "<tag> <start> <end>[;<start> <end>...]".

    Returns
    -------
    tuple
        ``(texts, labels)`` where ``texts`` is the full content of
        ``text_file`` and ``labels`` is a list of ``(start, end, tag)``
        tuples, one per character range (a discontinuous annotation yields
        one tuple per fragment).

    Raises
    ------
    ValueError
        If a "T" line carries offsets that are not two integers per fragment.
    """
    with open(text_file, 'r', encoding='utf-8') as f:
        texts = f.read()

    with open(ann_file, 'r', encoding='utf-8') as f:
        content = f.readlines()

    labels = []
    for line in content:
        parts = line.strip().split('\t')
        if len(parts) != 3:
            continue
        ann_id, span_info, _ = parts
        # Only text-bound annotations ("T...") carry character offsets.
        # Other three-field lines (e.g. "#1\tAnnotatorNotes T3\t...") hold an
        # annotation ID where the offsets would be, which made int() raise.
        if not ann_id.startswith('T'):
            continue

        label_parts = span_info.split()
        tag = label_parts[0]
        span_ranges = ' '.join(label_parts[1:])

        # Fragments of a discontinuous span are separated by ';'.
        for span in span_ranges.split(';'):
            start, end = map(int, span.split())
            labels.append((start, end, tag))
    return texts, labels

# load the joined list of text/annotation
def load_data_from_directory(directory):
    """Load every ``.txt`` document in ``directory`` that has a sibling ``.ann`` file.

    Parameters
    ----------
    directory : str
        Folder that holds the ``<name>.txt`` / ``<name>.ann`` pairs.

    Returns
    -------
    tuple
        ``(texts, tags_list)``: two parallel lists holding, for each document,
        the two values returned by ``process_annotation_file``. Documents
        without an ``.ann`` file are skipped with a printed warning.
    """
    # glob returns paths in arbitrary order; sorting keeps the document order
    # (and therefore any seeded split made from it) the same on every run.
    text_files = sorted(glob.glob(os.path.join(directory, '*.txt')))
    texts, tags_list = [], []

    for text_file in text_files:
        # Swap only the file extension. str.replace would also rewrite any
        # other ".txt" in the path, e.g. inside a directory name.
        ann_file = os.path.splitext(text_file)[0] + '.ann'
        if os.path.exists(ann_file):
            file_texts, file_tags = process_annotation_file(text_file, ann_file)

            texts.append(file_texts)
            tags_list.append(file_tags)
        else:
            print(f"Warning: No matching .ann file found for {text_file}")

    return texts, tags_list

In [5]:
# Define the data directory
# Absolute path under /content/drive, so it only resolves after the
# drive.mount('/content/drive') call; change it when running elsewhere.
data_directory = '/content/drive/MyDrive/Saniia/MACCROBAT2020_Simplified/'

In [6]:
# Let's load our data now
texts, tags_list = load_data_from_directory(data_directory)

In [29]:
# Just for checking
texts[0]

'A 76-year-old woman presented to our hospital with complaints of epigastralgia since a day prior to admission.\nLaboratory data on admission revealed an elevation of aminotransferase, alanine aminotransferase, ɤ-guanosine triphosphate, and alkaline phosphatase.\nSerum total bilirubin and tumor markers, carcinoembryonic antigen (CEA), carbohydrate antigen 19-9 (CA19-9), SPan-1, and neuron-specific enolase (NSE), were all within normal ranges.\nAbdominal computed tomography (CT) and magnetic resonance cholangiopancreatography (MRCP) showed a mass in an enlarged gallbladder and bulky hepatic lymph nodes surrounding the hepatic hilum (Fig.1a, \u200bb).\nThere were also no apparent lesions in upper and lower gastrointestinal endoscopy.\nEndoscopic ultrasound-guided fine-needle aspiration (EUS-FNA) was performed to obtain tissue from the hilar lymph node.\nImmunohistochemical staining of the specimen identified diffuse positivity for keratin, CD56, and synaptophysin in the tumor cells, whic

In [9]:
# Create the dataset
# One record per document: its raw text paired with its (start, end, tag) spans
data = [
    {'text': doc_text, 'entities': doc_entities}
    for doc_text, doc_entities in zip(texts, tags_list)
]

In [11]:
# Create train/test dataset
# 20% of the documents are held out as test_data; the fixed random_state
# makes the shuffle repeatable for a given ordering of `data`.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [12]:
# Just for checking
train_data[:5]

[{'text': 'A 62-year-old male presented with a 15-day history of dyspnea on exertion, associated with both lower extremity edema.\nBefore this admission, he also had suffered from abdominal bloating and tasteless for a year with noticeable body weight loss at the same time (up to 20 kg).\nOver the past 6 months, he developed a multiple system disorder, which included painless paresthesias in the lower limbs, erectile dysfunction, and chronic diarrhea.\nHe had an average stool frequency of up to ten times per day, with no obvious blood or mucus and no abdominal pain or tenesmus.\nUnfortunately, previous stomach and rectum biopsy did not examine for accumulations of amyloid fibril protein.\nHis family history was unremarkable.\nOn physical examination, his blood pressure was 82/56 mmHg and heart rate was 52 bpm.\nSignificant jugular venous distention, moderate hepatomegaly, and lower extremity edema were noted.\nA neurological examination revealed weakness and muscular atrophy in the bil

# 3) Modeling

In [13]:
from spacy.tokens import DocBin
from tqdm import tqdm

nlp = spacy.blank("en") # blank English pipeline; no trained model is loaded here
doc_bin = DocBin()  # container that collects Doc objects for serialization

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [24]:
# Create train.spacy file

from spacy.util import filter_spans

# Start from an empty DocBin every time this cell runs. Appending to a DocBin
# created in an earlier cell adds the same documents again on each re-run,
# which silently duplicates the training set written to train.spacy.
doc_bin = DocBin()

for training_example in tqdm(train_data):
    text = training_example['text']
    labels = training_example['entities']
    doc = nlp.make_doc(text)  # tokenize only
    ents = []
    for start, end, label in labels:
        # Character offsets must line up with token boundaries; "contract"
        # shrinks the span to the tokens fully inside it, and char_span
        # returns None when no token fits.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    # doc.ents cannot hold overlapping spans, so drop the overlaps first.
    doc.ents = filter_spans(ents)
    doc_bin.add(doc)

doc_bin.to_disk("train.spacy")

 29%|██▉       | 46/160 [00:00<00:00, 213.90it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity


 56%|█████▌    | 89/160 [00:00<00:00, 193.56it/s]

Skipping entity
Skipping entity


100%|██████████| 160/160 [00:00<00:00, 204.42it/s]

Skipping entity
Skipping entity
Skipping entity





In [15]:
# Download pretrained model
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [16]:
# Just for help command
!python -m spacy init fill-config --help

[1m                                                                                                    [0m
[1m [0m[1;33mUsage: [0m[1mpython [0m[1;32m-m[0m[1m spacy init fill-config [OPTIONS] BASE_PATH [OUTPUT_FILE][0m[1m                         [0m[1m [0m
[1m                                                                                                    [0m
 Fill partial config file with default values. Will add all missing settings from the default       
 config and will create all objects, check the registered functions for their default values and    
 update the base config. This command can be used with a config generated via the training          
 quickstart widget: https://spacy.io/usage/training#quickstart                                      
 [2mDOCS: https://spacy.io/api/cli#init-fill-config[0m                                                    
                                                                                                    
[2m╭

In [25]:
# Initialize the config file downloaded from https://spacy.io/usage/training#quickstart (make sure "ner" is checked)
!python -m spacy init fill-config 'base_config.cfg' 'config.cfg'

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [28]:
# Train the model
# NOTE(review): --paths.dev is given the same file as --paths.train, so the
# ENTS_F / ENTS_P / ENTS_R / SCORE columns are measured on the training
# documents. For a meaningful evaluation, point --paths.dev at a .spacy file
# built from documents that are not in train.spacy.
!python -m spacy train config.cfg --output ./ --paths.train ./train.spacy --paths.dev ./train.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    171.39    0.34    0.37    0.31    0.00
  0     200       8104.31  13131.66   50.94   71.18   39.66    0.51
  1     400        714.31   5183.67   72.37   76.80   68.43    0.72
  1     600        460.24   4071.24   76.78   73.61   80.24    0.77
  2     800       1421.74   3256.98   82.55   82.99   82.11    0.83
  3    1000        539.70   2939.41   87.27   86.83   87.71    0.87
  3    1200        547.93   2295.86   89.55   91.11   88.04    0.90
  4    1400        574.17   2042.30   91.70   9

# 4) Inference

In [30]:
# Load the best model for inference
nlp_ner = spacy.load("model-best")



In [36]:
# Inference

# Run the trained pipeline on one document taken from the test split
doc = nlp_ner(test_data[10]['text'])

# Highlight colour for each entity label in the rendered output
colors = {"Age": "#F67DE3", "Sex": "#7DF6D9", "Symptoms": "#a6e22d", "Disease Disorder": "#8A2BE2"}
options = dict(colors=colors)

# Render the recognised entities inline in the notebook
spacy.displacy.render(doc, style="ent", options=options, jupyter=True)