## This notebook runs NER with scispaCy models and evaluates the predictions

In [29]:
from pathlib import Path

import pandas as pd
import scispacy
import spacy
from seqeval.metrics import classification_report
from spacy.tokens import DocBin

# Convert iob2 to spacy

In [41]:
## Input file in IOB2 format: the gold-standard annotations, read line by line
## as space-separated "token tag" pairs by the conversion cell below
infile_true = "../../test_prediction/data/gs_iob2_sonja_biober_preprocessed/disease/test.txt"
## Output file with suffix: the same tokens, with B/I tags rewritten to carry
## the entity-type suffix; this is the file later handed to `spacy convert`
outfile_true = "../../test_prediction/data/gs_iob2_sonja_spacy/temp_disease_test1.txt"

In [20]:
## Here we convert the IOB2 file to include the following suffix
## For example, if it is a disease annotation, we include the suffix DISEASE
## Therefore B, I, O terms are converted to B-DISEASE, I-DISEASE and O
## This is necessary for spacy NER

suffix = "DISEASE"

## The result is written to `outfile_true` (defined in the path cell above), so
## the later cells that read `outfile_true` see exactly this file.
with open(infile_true, encoding="utf8") as f, open(outfile_true, "w", encoding="utf8") as out_f:
    for line in f:
        fields = line.strip().split(" ")
        if len(fields) == 2:
            token, tag = fields
            if tag in ("B", "I"):
                # Bare B/I tags get the entity type appended.
                out_f.write(f"{token} {tag}-{suffix}\n")
            else:
                # Every other tag is written out as plain "O".
                out_f.write(f"{token} O\n")
        else:
            # Anything that is not a "token tag" pair (e.g. a blank line) is
            # copied through unchanged.
            out_f.write(line)
            

## Once the file is created, use the spaCy `convert` command to turn it into a DocBin
### The destination file will have the same name as the source, but with the .spacy extension
### The format of the command is:
```python
!python -m spacy convert source destination -c ner
```


In [28]:
# Convert the suffixed IOB2 file to a binary DocBin; the output directory
# receives a file named after the input with a .spacy extension.
!python -m spacy convert "../../test_prediction/data/gs_iob2_sonja_spacy/temp_disease_test1.txt" "../../test_prediction/data/gs_iob2_sonja_spacy/" -c ner

[i] Auto-detected token-per-line NER format
[i] Grouping every 1 sentences into a document.
[!] To generate better training data, you may want to group sentences into
documents with `-n 10`.
[+] Generated output file (27 documents):
..\..\test_prediction\data\gs_iob2_sonja_spacy\temp_disease_test1.spacy


## Load and apply seqeval
Now run the model and compare

In [30]:
# Load the NER pipeline once and expose it under both names used below.
# Alternative model that was tried here: spacy.load("en_ner_craft_md")
nlp = nlp_sc = spacy.load("en_ner_bionlp13cg_md")

In [31]:
# Load the gold-standard DocBin produced by `spacy convert`. The CLI names its
# output after the input file with a `.spacy` extension, so the path is derived
# from `outfile_true` instead of being hard-coded a second time.
infile = Path(outfile_true).with_suffix(".spacy")
doc = DocBin().from_disk(path=infile)
doc

<spacy.tokens._serialize.DocBin at 0x1fc3758aee0>

In [32]:
## Get entities
ents = []
for d in doc.get_docs(nlp_sc.vocab):
    for txt in nlp(d.text_with_ws):
#         print(txt, txt.ent_iob_)
        ents.append([txt,txt.ent_iob_])
    #for sent in d.text_with_ws:
#         print(sent)

In [33]:
# Check results
len(ents), ents

(2442,
 [[Return, 'O'],
  [of, 'O'],
  [the, 'O'],
  [Coronavirus, 'O'],
  [:, 'O'],
  [2019-nCoV, 'O'],
  [The, 'O'],
  [emergence, 'O'],
  [of, 'O'],
  [a, 'O'],
  [novel, 'O'],
  [coronavirus, 'O'],
  [(, 'O'],
  [2019-nCoV, 'O'],
  [), 'O'],
  [has, 'O'],
  [awakened, 'O'],
  [the, 'O'],
  [echoes, 'O'],
  [of, 'O'],
  [SARS-CoV, 'B'],
  [from, 'O'],
  [nearly, 'O'],
  [two, 'O'],
  [decades, 'O'],
  [ago, 'O'],
  [Yet, 'O'],
  [,, 'O'],
  [with, 'O'],
  [technological, 'O'],
  [advances, 'O'],
  [and, 'O'],
  [important, 'O'],
  [lessons, 'O'],
  [gained, 'O'],
  [from, 'O'],
  [previous, 'O'],
  [outbreaks, 'O'],
  [,, 'O'],
  [perhaps, 'O'],
  [the, 'O'],
  [world, 'O'],
  [is, 'O'],
  [better, 'O'],
  [equipped, 'O'],
  [to, 'O'],
  [deal, 'O'],
  [with, 'O'],
  [the, 'O'],
  [most, 'O'],
  [recent, 'O'],
  [emergent, 'O'],
  [group, 'O'],
  [2B, 'O'],
  [coronavirus, 'O'],
  [Mast, 'B'],
  [cells, 'I'],
  [contribute, 'O'],
  [to, 'O'],
  [coronavirus-induced, 'O'],
  [inflamm

In [42]:
## Run predictions


def _align_gold_with_predictions(gold_lines, pred_ents):
    """Pair every gold token with the tag predicted for the same piece of text.

    The gold file and the model's tokenizer do not always split text the same
    way (e.g. gold ``96-2`` vs. predicted ``96``, ``-``, ``2``), so consecutive
    predicted tokens are concatenated until they spell out the gold token.

    Args:
        gold_lines: iterable of space-separated ``"token tag"`` lines; lines
            whose first field is empty (blank lines) are skipped.
        pred_ents: sequence of ``[token, iob_tag]`` pairs in document order.

    Returns:
        ``(y_true, y_pred)``: two lists of tags with one entry per gold token.

    Raises:
        ValueError: if a gold line has no tag, if the predicted tokens run out,
            or if the predicted tokens cannot be concatenated into the gold
            token (the two token streams have diverged).
    """
    y_true, y_pred = [], []
    idx = 0
    for raw_line in gold_lines:
        fields = raw_line.strip().split(" ")
        gold_token = fields[0]
        if gold_token == "":
            # Blank line: consumes no predicted token. Checked before touching
            # `pred_ents` so a trailing blank line cannot index past the end.
            continue
        if len(fields) < 2:
            raise ValueError(f"gold line has no tag: {raw_line!r}")
        if idx >= len(pred_ents):
            raise ValueError(
                f"ran out of predicted tokens at gold token {gold_token!r}"
            )

        first_piece = str(pred_ents[idx][0])
        first_tag = str(pred_ents[idx][1])

        if gold_token == first_piece:
            # Tokenizations agree: take the predicted tag as-is.
            pred_tag = first_tag
        else:
            # The model split the gold token into several pieces: glue pieces
            # together until they reproduce the gold token exactly.
            merged = first_piece
            piece_tags = [first_tag]
            while merged != gold_token:
                if not gold_token.startswith(merged):
                    raise ValueError(
                        f"cannot align gold token {gold_token!r} with "
                        f"predicted text {merged!r} (prediction index {idx})"
                    )
                idx += 1
                if idx >= len(pred_ents):
                    raise ValueError(
                        f"ran out of predicted tokens while rebuilding "
                        f"gold token {gold_token!r}"
                    )
                merged += str(pred_ents[idx][0])
                piece_tags.append(str(pred_ents[idx][1]))

            # Collapse the pieces' tags into one: any B wins, then any I.
            if "B" in piece_tags:
                pred_tag = "B"
            elif "I" in piece_tags:
                pred_tag = "I"
            else:
                pred_tag = "O"
            # Report each tokenization difference for manual inspection.
            print(gold_token, first_piece)

        y_true.append(fields[1])
        y_pred.append(pred_tag)
        idx += 1

    return y_true, y_pred


with open(outfile_true, encoding="utf8") as f:
    y_true, y_pred = _align_gold_with_predictions(f, ents)

# The predictions are bare B/I/O tags, so drop the entity-type suffix from the
# gold tags (e.g. "B-DISEASE" -> "B") to make both sides comparable.
y_true = [tag[0] if tag[:1] in ("B", "I") else tag for tag in y_true]

len(y_true), len(y_pred)

96-2 96
89-4 89
2002-2003 2002
decade's decade
'Public '
Concern' Concern
9-32 9
health1-3 health1
'WH-Human '
1' 1
'2019-nCoV' '
41·0-58·0 41·0
5·0-13·0 5·0
"2019 "
world's world


(2419, 2419)

In [43]:
## Predictions
# Sanity check: show which tag values occur on each side before scoring.
print(set(y_true), set(y_pred))
# seqeval takes a list of tag sequences, so the flat lists are wrapped as a
# single sequence. The tags carry no entity type, which is why the report
# lists them under the placeholder type "_".
print(classification_report([y_true], [y_pred], digits=5))

              precision    recall  f1-score   support

           _    0.00862   0.03333   0.01370        30

   micro avg    0.00862   0.03333   0.01370        30
   macro avg    0.00862   0.03333   0.01370        30
weighted avg    0.00862   0.03333   0.01370        30

