In [None]:
import spacy
import pandas as pd
import json
import os
import re
import random
import spacy.cli
from spacy.tokens import DocBin
from tqdm import tqdm

from spacy.pipeline import EntityRuler
from spacy.lang.en import English


# Blank English pipeline; later cells use it to tokenise the training texts.
nlp = spacy.blank("en")
# Handle on the (still empty) entity_ruler component added to the blank pipeline.
ruler = nlp.add_pipe("entity_ruler")

In [None]:
# Convert the UBIAI IOB-format annotations into the input format spaCy expects for training.
# For the files to be read correctly, delete the first row of each annotation file.
import re
import pandas as pd


# Shared parser options for the three IOB files: one or more tabs separate the
# columns, and the files have no header row. (skiprows=1 is left disabled.)
IOB_READ_OPTIONS = dict(sep='\t+', header=None, engine='python')

train_all = pd.read_csv("training_IOBall.tsv", **IOB_READ_OPTIONS)
dev_all = pd.read_csv("development_IOBall.tsv", **IOB_READ_OPTIONS)
eval_all = pd.read_csv("evaluation_IOBall.tsv", **IOB_READ_OPTIONS)

def _merge_iob_spans(token_spans):
    """Collapse per-token ``(start, end, tag)`` spans into whole-entity spans.

    A token continues the entity being built only when its tag does not start
    with "B", its label matches the current entity's label, and it starts
    exactly one character (the joining space) after the current entity ends.
    Any other tagged token starts a new entity, so an orphan ``I-`` tag becomes
    an entity of its own instead of being glued onto an unrelated span.

    Returns a list of ``(start, end, label)`` tuples, where ``label`` is the
    tag with its two-character ``B-`` / ``I-`` prefix removed.
    """
    merged = []
    current = None  # [start, end, label] of the entity currently being built
    for start, end, tag in token_spans:
        label = tag[2:]  # strip the "B-" / "I-" prefix
        continues_current = (
            current is not None
            and not tag.startswith("B")
            and label == current[2]
            and start == current[1] + 1
        )
        if continues_current:
            current[1] = end
        else:
            if current is not None:
                merged.append(tuple(current))
            current = [start, end, label]
    if current is not None:
        merged.append(tuple(current))
    return merged


def iob_to_spacy(df):
    """Convert a two-column IOB table into spaCy-style training examples.

    Parameters
    ----------
    df : pandas.DataFrame
        Column ``0`` holds one word per row, column ``1`` the matching IOB tag
        (``'O'`` for words outside any entity, otherwise ``B-<LABEL>`` /
        ``I-<LABEL>``).

    Returns
    -------
    list of tuple
        One ``(sentence_text, {"entities": [(start, end, label), ...]})`` pair
        per sentence. Words are joined with single spaces; offsets are
        character positions inside ``sentence_text`` and never include the
        joining space.

    Notes
    -----
    A sentence ends at a word whose only '.' is its last character and which is
    followed by a word starting with an upper-case letter. Words with a '.'
    elsewhere (e.g. "3.5") are kept as ordinary words, and whatever remains
    after the last detected boundary is emitted as a final sentence.
    """
    words = df[0].tolist()
    tags = df[1].tolist()
    total_words = len(words)
    data = []

    sentence_words = []  # words of the sentence being assembled
    token_spans = []     # (start, end, tag) for every tagged word in it
    offset = 0           # character position where the next word will start

    for i, word in enumerate(words):
        end = offset + len(word)
        if tags[i] != 'O':
            token_spans.append((offset, end, tags[i]))
        sentence_words.append(word)
        offset = end + 1  # +1 for the space that will join the next word

        dot_index = word.find('.')
        closes_sentence = (
            dot_index != -1
            and dot_index == len(word) - 1
            and i + 1 < total_words
            and words[i + 1][:1].isupper()
        )
        if closes_sentence:
            data.append((" ".join(sentence_words),
                         {"entities": _merge_iob_spans(token_spans)}))
            sentence_words, token_spans, offset = [], [], 0

    # Flush the trailing sentence (the corpus rarely ends on a detected boundary).
    if sentence_words:
        data.append((" ".join(sentence_words),
                     {"entities": _merge_iob_spans(token_spans)}))
    return data

# Convert each split from the IOB table into (text, {"entities": [...]}) examples.
train_fulltext = iob_to_spacy(train_all)
dev_fulltext = iob_to_spacy(dev_all)
eval_fulltext = iob_to_spacy(eval_all)


In [None]:
# Dictionaries: one headerless CSV file per entity type.
# Paths are built with os.path.join because a literal such as
# "dictionaries-annot\Distribution_descriptors.csv" contains invalid escape
# sequences ("\D", "\L", ...) and only resolves on Windows.
DICTIONARY_DIR = "dictionaries-annot"

df=pd.read_csv(os.path.join(DICTIONARY_DIR, "Distribution_descriptors.csv"),header=None)
df1=pd.read_csv(os.path.join(DICTIONARY_DIR, "Life_stages.csv"),header=None)
df2=pd.read_csv(os.path.join(DICTIONARY_DIR, "Body_size.csv"),header=None)
# This file is read as cp1252 rather than with the default encoding.
df3=pd.read_csv(os.path.join(DICTIONARY_DIR, "Sampling_devices.csv"),header=None,encoding='cp1252')

# Only the first column of each dictionary is used as the list of terms.
df_distr_descr=df.iloc[:,0]
df_life_stages=df1.iloc[:,0]
df_body_size=df2.iloc[:,0]
df_sampl_devices=df3.iloc[:,0]


In [None]:
# Make sure the pipeline has an "ner" component and keep a handle on it.
if 'ner' not in nlp.pipe_names:
    # add_pipe builds the component from its factory name and returns the
    # instance that is actually part of the pipeline. The previous
    # create_pipe + add_pipe pair produced two separate components, so the
    # labels below were added to one that never ran.
    ner = nlp.add_pipe("ner", last=True)
else:
    ner = nlp.get_pipe('ner')

# Register the custom entity labels on the NER component.
for entity_label in (
    "DISTRIBUTION_DESCRIPTOR",
    "LIFE_STAGE",
    "BODY_SIZE",
    "SAMPLING_DEVICE",
):
    ner.add_label(entity_label)
ner.labels


In [None]:
from spacy.util import filter_spans

#converting data into .spacy file
def training_data_format(train_data,name):
    """Serialise annotated examples into ``train_<name>.spacy``.

    ``train_data`` is an iterable of ``(text, {"entities": [(start, end, label), ...]})``
    pairs. Each text is tokenised with the module-level ``nlp`` pipeline and
    every entity is mapped onto tokens with ``alignment_mode="contract"``.
    Entities that cannot be mapped are reported with a printed message and
    left out. The remaining spans go through ``filter_spans`` before being
    assigned to the document.

    Returns whatever ``DocBin.to_disk`` returns.
    """
    output_path = "train_" + name + ".spacy"
    collected_docs = DocBin()
    for example in tqdm(train_data):
        raw_text = example[0]
        annotated = example[1]['entities']
        document = nlp.make_doc(raw_text)
        aligned_spans = []
        for start, end, label in annotated:
            candidate = document.char_span(start, end, label=label, alignment_mode="contract")
            if candidate is not None:
                aligned_spans.append(candidate)
            else:
                print("Skipping entity")
        document.ents = filter_spans(aligned_spans)
        collected_docs.add(document)

    return collected_docs.to_disk(output_path)


# Write one .spacy file per split (train_train_full / train_dev_full / train_eval_full).
for split_examples, split_name in (
    (train_fulltext, "train_full"),
    (dev_fulltext, "dev_full"),
    (eval_fulltext, "eval_full"),
):
    training_data_format(split_examples, split_name)


In [None]:
# Configuration file initialization: fill base_config.cfg and write the result to config.cfg
!python -m spacy init fill-config base_config.cfg config.cfg

In [None]:
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True' #for error fixing

#training the model
!python -m spacy train config.cfg --output ./ --paths.train ./train_train_full.spacy --paths.dev ./train_eval_full.spacy

In [None]:
from spacy import displacy

# Load the trained model and run it on a sample passage as a quick sanity check.
nlp_full = spacy.load("model-bestfulltxt")

sample_passage = "Atherinids are small marine, estuarine and freshwater fishes not exceeding 120 mm SL (a soon to be described species of Craterocephalus may reach 300 mm SL), occurring predominantly in the Old World, with only Alepidomus evermanni (freshwaters of Cuba) and two marine species, Atherinomorus stipes and Hypoatherina harringtonensis (predominantly in the shore waters of the Caribbean) known from the New World. I"
doc = nlp_full(sample_passage)

# One tuple per predicted entity: text, label, character offsets and entity id.
predicted_entities = [
    (ent.text, ent.label_ ,ent.start_char, ent.end_char, ent.ent_id_)
    for ent in doc.ents
]
print(predicted_entities)
print("\n")

displacy.render(doc, style="ent")


In [None]:
from spacy.tokenizer import Tokenizer
# Kept for backward compatibility; entity_ruler() tokenises with the model it is given.
tokenizer = nlp.tokenizer

# Put a dictionary-driven entity_ruler into a trained pipeline.
# The ruler is added with the default position, so its patterns only add
# entities that do not overlap with entities already predicted by the
# statistical model.
def entity_ruler(nlp_model,model):
    """Add dictionary patterns to ``nlp_model`` and save the result to ``model``.

    Parameters
    ----------
    nlp_model : loaded spaCy pipeline
        Gets an ``entity_ruler`` component if it does not already have one.
    model : str
        Directory the extended pipeline is written to with ``to_disk``.

    The terms come from the module-level dictionary series
    (``df_distr_descr``, ``df_life_stages``, ``df_body_size``,
    ``df_sampl_devices``). Each term is tokenised with ``nlp_model`` itself and
    turned into a case-insensitive pattern with one ``LOWER`` entry per token.
    A multi-word term therefore matches, where a single ``LOWER`` entry holding
    the whole phrase never could. Two-token terms also get a variant that
    allows one punctuation token between the two words.
    """
    if "entity_ruler" not in nlp_model.pipe_names:
        ruler=nlp_model.add_pipe("entity_ruler")
    else:
        ruler=nlp_model.get_pipe("entity_ruler")

    def dict_func(terms,linkdf,label):
        # Build and register the patterns of one dictionary.
        # `linkdf` is currently unused: it belonged to an "id" lookup that is disabled.
        patterns=[]
        for term in terms:
            if pd.isna(term):
                continue  # empty CSV cell
            text=str(term).strip()
            if not text:
                continue
            token_texts=[tok.text.lower() for tok in nlp_model.make_doc(text) if not tok.is_space]
            if not token_texts:
                continue
            patterns.append({
                "label": label,
                "pattern": [{"LOWER": piece} for piece in token_texts],
            })
            if len(token_texts) == 2:
                # also accept the two words separated by a punctuation token
                patterns.append({
                    "label": label,
                    "pattern": [{"LOWER": token_texts[0]}, {"IS_PUNCT": True}, {"LOWER": token_texts[1]}],
                })
        ruler.add_patterns(patterns)

    dict_func(df_distr_descr,df,"DISTRIBUTION_DESCRIPTOR")
    dict_func(df_life_stages,df1,"LIFE_STAGE")
    dict_func(df_body_size,df2,"BODY_SIZE")
    dict_func(df_sampl_devices,df3,"SAMPLING_DEVICE")

    # Persist the pipeline, now including the entity ruler.
    nlp_model.to_disk(model)

# Reload the trained model, attach the dictionary ruler and save it under a new name.
nlp_full = spacy.load("model-bestfulltxt")
entity_ruler(nlp_full, "model-bestfulltxt_ruler")



In [None]:
# Evaluate the trained model with the entity ruler (model-bestfulltxt_ruler) on train_eval_full.spacy
!python -m spacy evaluate model-bestfulltxt_ruler/ train_eval_full.spacy

In [None]:
# Load the trained model on its own ...
nlp_ner = spacy.load("model-bestfulltxt")
nlp_ner.pipe_names

# ... and the same model extended with the entity ruler.
nlp_ner_ruler = spacy.load("model-bestfulltxt_ruler")
nlp_ner_ruler.pipe_names

# Run both pipelines on the same passage so their predictions can be compared.
comparison_text = "They are inhabitants of shallow reef areas, usually encountered in less than 10 m depth. During the day they are mainly sedentary, frequently seen resting on the bottom under rock or coral outcrops on substrata containing substantial amounts of sand, silt, mud, or algae."
paper2 = nlp_ner(comparison_text)
doc = nlp_ner_ruler(comparison_text)

ruler_entities = [
    (ent.text, ent.label_ ,ent.start_char, ent.end_char, ent.ent_id_)
    for ent in doc.ents
]
model_entities = [
    (ent.text, ent.label_ ,ent.start_char, ent.end_char, ent.ent_id_)
    for ent in paper2.ents
]

# Ruler-extended predictions first, then the plain model's predictions.
print(ruler_entities)
print("\n")
print(model_entities)

displacy.render(paper2, style="ent" )
displacy.render(doc, style="ent" )
nlp_ner_ruler.pipe_names

In [None]:
# #combining 2 ner from trained models (roberta and spacy model)

# # nlp_roberta=spacy.load("model-roberta_dev_ruler")
# # nlp_roberta.rename_pipe("ner", "ner_roberta")
# # nlp_roberta.to_disk("model-roberta_dev_ruler")

# nlp1 = spacy.load("model-corpusIOB_dev_ruler")

# # Load the second model
# nlp2 = spacy.load("model-roberta_dev_ruler")
# # Add the components from the second model to the first
# for name, component in nlp2.pipeline:
#     if name  not in nlp1.pipe_names:
#         nlp1.add_pipe(name=name,source=nlp2,factory_name=name)
    
# nlp1.to_disk("combined_IOBspacy_roberta_dev_ruler")

# nlp1=spacy.load("combined_IOBspacy_roberta_dev_ruler")
# doc = nlp1("Most marine fish and invertebrate species produce free and small early-stages which are part of the plankton. These incompletely developed individuals are highly vulnerable to unsuitable conditions like starvation and environmental variability, and it was early recognized that survival during these stages often regulates recruitment and adult population size (Cowan and Shaw, 2002, Pineda et al., 2007). Recruitment theories have thus focused on the environmental modulation of larval survival, and they generally assume that while spawning occurs within relatively fixed time-frames along the year cycle, hydrographic conditions and plankton production show higher inter-annual variability.")

# displacy.render(doc, style="ent" )