### 0. Initializing convenience functions

In [1]:
import spacy
from spacy import displacy

def display_ner(nlp_pipe=None, test_data='./data/CoNLL2003/spacy/test.spacy'):
    """Render the entities of the first document stored in a ``.spacy`` file.

    Parameters
    ----------
    nlp_pipe : callable, str or None
        ``None`` renders the document exactly as it was read from
        ``test_data``. A string is passed to ``spacy.load`` first; any other
        non-``None`` value is called on the document's text and the result of
        that call is rendered instead.
    test_data : str
        Path to a serialized ``DocBin``.

    Raises
    ------
    ValueError
        If ``test_data`` contains no documents.
    """
    base_nlp = spacy.blank('en')
    doc_bin = spacy.tokens.DocBin().from_disk(test_data)
    # Only the first document is shown; the default avoids leaking a bare
    # StopIteration when the file is empty.
    doc = next(doc_bin.get_docs(base_nlp.vocab), None)
    if doc is None:
        raise ValueError(f'No documents found in {test_data!r}')
    if nlp_pipe is not None:
        if isinstance(nlp_pipe, str):
            nlp_pipe = spacy.load(nlp_pipe)
        # Re-run on the raw text so the rendered entities are the pipeline's
        # own output rather than the annotations read from disk.
        doc = nlp_pipe(doc.text)
    displacy.render(doc, style='ent')

2023-06-01 23:18:45.837873: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-01 23:18:47.301413: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-06-01 23:18:47.301639: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-06-01 23:18:47.301664: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been bu

### 1. Converting data from CoNLL-2003 format to spaCy format

In [2]:
# Convert each CoNLL-2003 split to a .spacy file unless it is already there,
# then list the three resulting files.
# The backslashes join everything into ONE shell command, so explanatory
# comments must stay up here (a shell `#` would swallow the rest of the line).
# POSIX `[ ]` is used instead of `[[ ]]` because IPython runs `!` under $SHELL,
# which is not guaranteed to be bash.
!\
cd ./data/CoNLL2003 || exit 1; \
for SPLIT in train valid test; do \
    SPACY_FILE=./spacy/$SPLIT.spacy; \
    if [ ! -f "$SPACY_FILE" ]; then \
        echo "$SPACY_FILE does not exist, generating..."; \
        python -m spacy convert "./$SPLIT.txt" ./spacy/ -c conll -n 10; \
    else \
        echo "$SPACY_FILE already exists."; \
    fi; \
done; \
for SPLIT in train valid test; do \
    ls -al "./spacy/$SPLIT.spacy"; \
done

./spacy/train.spacy already exists.
./spacy/valid.spacy already exists.
./spacy/test.spacy already exists.
-rwxrwxrwx 1 alexsis alexsis 1906793 May 31 20:44 ./spacy/train.spacy
-rwxrwxrwx 1 alexsis alexsis 502856 May 31 20:44 ./spacy/valid.spacy
-rwxrwxrwx 1 alexsis alexsis 458202 May 31 20:44 ./spacy/test.spacy


In [3]:
# No pipeline is applied: this shows the first test document as stored on disk.
display_ner(nlp_pipe=None)

### 2. Establishing the baseline performance

In [4]:
# !python -m spacy benchmark accuracy en_core_web_sm ./data/CoNLL2003/spacy/test.spacy

Output example:

In [5]:
# !python -m spacy benchmark speed en_core_web_sm ./data/CoNLL2003/spacy/test.spacy

Output example:

In [6]:
# Same document, annotated by the pretrained baseline pipeline.
display_ner(nlp_pipe='en_core_web_sm')

### 3. Initializing configs

In [7]:
# Create the two training configs if they are missing, then list them.
# NOTE(review): both files are generated with identical options, so any
# difference between the SM and XSM configs has to come from manual edits made
# afterwards - confirm that is intended.
# The backslashes join everything into ONE shell command, so shell comments
# cannot be placed between the lines below.
!\
cd ./config || exit 1; \
SM_CFG_FILE=ner_conll2003_sm.cfg; \
XSM_CFG_FILE=ner_conll2003_xsm.cfg; \
for CFG_FILE in "$SM_CFG_FILE" "$XSM_CFG_FILE"; do \
    if [ ! -f "$CFG_FILE" ]; then \
        echo "$CFG_FILE does not exist, initializing..."; \
        python -m spacy init config "$CFG_FILE" -p "tagger,ner" -o "efficiency" -G; \
    else \
        echo "$CFG_FILE already exists."; \
    fi; \
done; \
ls -al "$SM_CFG_FILE" "$XSM_CFG_FILE"

ner_conll2003_sm.cfg already exists.
ner_conll2003_xsm.cfg already exists.
-rwxrwxrwx 1 alexsis alexsis 3309 Jun  1 16:22 ner_conll2003_sm.cfg
-rwxrwxrwx 1 alexsis alexsis 3162 Jun  2  2023 ner_conll2003_xsm.cfg


### 4. Training

#### 4.1. The lightweight model

In [8]:
# Train the lightweight model unless a best checkpoint is already present.
# The check looks for model-best (the directory the later cells load) rather
# than for "output directory is non-empty", so an interrupted run is not
# mistaken for a finished one and a missing directory produces no ls error.
# The backslashes join everything into ONE shell command, so shell comments
# cannot be placed between the lines below.
!\
MODEL_PATH=model/NER_POS_CoNLL2003_SM; \
if [ ! -d "$MODEL_PATH/model-best" ]; then \
    echo "Model not found, training"; \
    python -m spacy train ./config/ner_conll2003_sm.cfg \
        --output "$MODEL_PATH" \
        --paths.train ./data/CoNLL2003/spacy/train.spacy \
        --paths.dev ./data/CoNLL2003/spacy/valid.spacy \
        --gpu-id 0 \
        --verbose; \
else \
    echo "Model already exists"; \
fi; \
ls -al "$MODEL_PATH"

Model already exists
total 0
drwxrwxrwx 1 alexsis alexsis 4096 May 31 21:51 .
drwxrwxrwx 1 alexsis alexsis 4096 Jun  1 14:46 ..
drwxrwxrwx 1 alexsis alexsis 4096 May 31 20:52 model-best
drwxrwxrwx 1 alexsis alexsis 4096 May 31 20:52 model-last


Output example:

In [9]:
# !python -m spacy benchmark accuracy ./model/NER_POS_CoNLL2003_SM/model-best/ ./data/CoNLL2003/spacy/test.spacy --gpu-id 0

Output example:

In [10]:
# !python -m spacy benchmark accuracy ./model/NER_POS_CoNLL2003_SM/model-last/ ./data/CoNLL2003/spacy/test.spacy --gpu-id 0

Output example:

In [11]:
# !python -m spacy benchmark speed ./model/NER_POS_CoNLL2003_SM/model-best/ ./data/CoNLL2003/spacy/test.spacy

Output example:

In [12]:
# Same document, annotated by the best checkpoint of the lightweight model.
display_ner(nlp_pipe='./model/NER_POS_CoNLL2003_SM/model-best/')

#### 4.2. The extra lightweight model

In [13]:
# Train the extra lightweight model (on CPU, --gpu-id -1) unless a best
# checkpoint is already present.
# The check looks for model-best (the directory the later cells load) rather
# than for "output directory is non-empty", so an interrupted run is not
# mistaken for a finished one and a missing directory produces no ls error.
# The backslashes join everything into ONE shell command, so shell comments
# cannot be placed between the lines below.
!\
MODEL_PATH=model/NER_POS_CoNLL2003_XSM; \
if [ ! -d "$MODEL_PATH/model-best" ]; then \
    echo "Model not found, training"; \
    python -m spacy train ./config/ner_conll2003_xsm.cfg \
        --output "$MODEL_PATH" \
        --paths.train ./data/CoNLL2003/spacy/train.spacy \
        --paths.dev ./data/CoNLL2003/spacy/valid.spacy \
        --gpu-id -1 \
        --verbose; \
else \
    echo "Model already exists"; \
fi; \
ls -al "$MODEL_PATH"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Model already exists
total 0
drwxrwxrwx 1 alexsis alexsis 4096 Jun  2  2023 .
drwxrwxrwx 1 alexsis alexsis 4096 Jun  1 14:46 ..
drwxrwxrwx 1 alexsis alexsis 4096 Jun  2  2023 model-best
drwxrwxrwx 1 alexsis alexsis 4096 Jun  2  2023 model-last


Output example:

In [14]:
# !python -m spacy benchmark accuracy ./model/NER_POS_CoNLL2003_XSM/model-best/ ./data/CoNLL2003/spacy/test.spacy --gpu-id -1

Output example:

In [15]:
# !python -m spacy benchmark speed ./model/NER_POS_CoNLL2003_XSM/model-best/ ./data/CoNLL2003/spacy/test.spacy

Output example:

In [16]:
# Same document, annotated by the best checkpoint of the extra lightweight model.
display_ner(nlp_pipe='./model/NER_POS_CoNLL2003_XSM/model-best/')