# Terminology and Ontology Project

By 

---- Henri Joël Azemena   azemehenri@gmail.com / azemena1u@etu.univ-lorraine.fr

---- Guillaume Richez    guillaume.richez9@etu.univ-lorraine.fr / guillaume.richez9@etu.univ-lorraine.fr

---- Goal : develop a term identification system for a specific domain.

---- Chosen domain: Natural Language Generation

---- Tools: Linux, Python 3.9, Spacy, textract, os, string, nltk, __future__,plac, random, pathlib, tqdm


### Importing the libraries

In [None]:
from os import listdir
import textract
import string 
from nltk import word_tokenize
import spacy
from spacy.matcher import Matcher
from nltk import word_tokenize
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import spacy
from tqdm import tqdm 
from spacy.training.example import Example

## Function that will extract automatically the text  from the set of articles

In [None]:

def extract_articles_content(path):
    texts = ""
    for file in listdir(path):
        paper = path + file
        if paper != path + '.DS_Store':
            text = textract.process(paper, method='pdfminer',encoding='UTF-8')
            text = text.decode("utf-8") 
            texts = texts + text
        
    return texts

## Creation of our Corpora from the  20  articles in a NLG domain

In [None]:
corpora = extract_articles_content("path_to_the_scientific_articles_folder")

# Feature engineering 

## Function we use to  clean the corpora 

In [None]:

def clean_corpora(text_set):
    text_set = text_set.replace('\n' , ' ')
    text_set = text_set.replace('- ' , '')
    text_set = text_set.lower()
    
    #remove punctuation
    
    tokens = word_tokenize(text_set)
    for i,word in enumerate(tokens):
        if 'https' in word or '\\uf' in word:
            del tokens[i]
        if 'hal' in word:
            del tokens[i]
        if 'cid' in word or '¥' in word or '¢' in word:
            del tokens[i]
        if len(word)==1 and word != 'a':
            del tokens[i]
    
    text_set = ' '.join(tokens)
    return text_set

## Clean and save the corpora 

In [None]:
corpora = clean_corpora(corpora)

with open('corpora.txt','w') as file:
    file.write(corpora)

# Implementation of our SILVER and definition of patterns 

In [None]:

nlp = spacy.load("en_core_web_sm")
corpora = nlp(corpora)
matcher = Matcher(nlp.vocab)
terms = []
patterns = [[{"POS": "NOUN"}],[{"POS": "NOUN"},{"POS": "NOUN"}],[{"POS": "ADJ"},{"POS": "NOUN"}],[{"POS": "NOUN"},{"POS": "ADJ"}],[{"POS": "NOUN"},{"POS": "ADP"},{"POS": "NOUN"}],
           [{"POS": "NOUN"},{"POS": "ADP"},{"POS": "DET"},{"POS": "NOUN"}],[{"POS": "NOUN"},{"POS": "NOUN"},{"POS": "NOUN"}]]
matcher.add("Compound nouns", patterns)
matches = matcher(corpora)
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  
    span = corpora[start:end] 
    terms.append(span.text)


## Clean and save the Silver after manual filterring

In [None]:
def remove_useless(text_set):
    result = x
    for i,word in enumerate(x):
        if len(word_tokenize(word)) <= 4 or 'CSV' in word or '%' in word:
            del result[i]
        #remove all word with numbers
        if any(char.isdigit() for char in word):
            del result[i]
        if 'CSV Q' in word:
            del result[i]
            
    return result

silver = remove_useless(terms)
silver = set(silver)

In [None]:
def cleaning_terms(l):
    new_l = [term for term in l if len(term) >= 4 if 'csv' not in term.lower() if '%' not in term.lower()]
    new_l = [term for term in new_l if any(i.isdigit() for i in term) == False]
    new_l = [term for term in new_l if any(token for token in term.split() if len(token) < 2 if len(term) < 10) == False]
    return new_l

len(cleaning_terms(silver))

with open('silver.txt','w') as file:
    for i in silver:
        file.write(i+'\n')


## Load the filter SILVER

In [47]:
silver = []
with open('silver.txt') as file:
    lines = file.readlines()
    
    for line in lines :
        silver.append(line)
silver = [x[:-1] for x in silver]

## Load the copora

In [48]:
with open('corpora.txt') as file:
    corpora = file.readlines()

corpus = corpus[0]

## Creation of IBO finction for automatic ananotation

In [50]:
def annot(text, silver):
    tokens = word_tokenize(text)
    #print(tokens)
    offset = 0
    entities = []
    i = 0
    while i < (len(tokens)):
        offset = text.find(tokens[i], offset)
        if i < len(tokens) - 3 and " ".join(tokens[i:i+3]) in silver:
            entities.append((offset,offset+len(tokens[i]),'B'))
            entities.append((offset+len(tokens[i])+1,offset+len(tokens[i])+len(tokens[i+1])+1,'I'))
            entities.append((offset+len(tokens[i])+len(tokens[i+1])+2,offset+len(tokens[i])+len(tokens[i+1])+len(tokens[i+2])+2,'I'))
            i = i+3
            #offset=offset+len(tokens[i])+len(tokens[i+1])+len(tokens[i+2])
        elif i < len(tokens) - 2 and " ".join(tokens[i:i+2]) in silver:
            entities.append((offset,offset+len(tokens[i]),'B'))
            entities.append((offset+len(tokens[i])+1,offset+len(tokens[i])+len(tokens[i+1])+2,'I'))
            i = i+2 
            #offset=offset+len(tokens[i])+len(tokens[i+1])
        elif i < len(tokens) - 1 and tokens[i] in silver:
            entities.append((offset,offset+len(tokens[i]),'B'))
            i = i+1
            #offset=offset+len(tokens[i])
        else:
            entities.append((offset,offset+len(tokens[i]),'O'))
            i = i+1
        offset = entities[-1][1]
    return {'entities': entities}


## Annotation of the corpora using the filter SILVER

In [51]:
tokens = word_tokenize(corpora)
TRAIN_DATA = []
for i in range(0,len(tokens),20):
    text = " ".join(tokens[i:i+20])
    TRAIN_DATA.append((text, annot(text, silver)))


## Saving the training set

In [52]:
with open('traindata.txt','w') as file:
    for a, annotations in TRAIN_DATA:
        record = "'"+a + "'," + str(annotations) +'\n'
        file.write(record)

# NER setting using Spacy

## Creation of our Model 

In [54]:
model_ner = None
output_dir=Path("path_to_the_folder_where_we want_to_save_the_model")
n_iter=150

In [55]:
if model_ner is not None:
    nlp = spacy.load(model_ner)  
    print("Loaded model '%s'" % model_ner)
else:
    nlp = spacy.blank('en')  
    print("Created blank 'en' model_ner")

Created blank 'en' model


In [56]:
if "ner" not in nlp.pipe_names:
        #ner = nlp.create_pipe("ner")
        ner = nlp.add_pipe("ner", last=True)
    # otherwise, get it so we can add labels
else:
        ner = nlp.get_pipe("ner")

## Training of our model

In [None]:

for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in tqdm(TRAIN_DATA):
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            nlp.update(
                [example],  
                drop=0.5,  
                sgd=optimizer,
                losses=losses)
        print(losses)

  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_

{'ner': 10673.924832065906}


100%|██████████| 2209/2209 [01:51<00:00, 19.85it/s]
  0%|          | 2/2209 [00:00<02:01, 18.16it/s]

{'ner': 6273.9614340944545}


100%|██████████| 2209/2209 [01:51<00:00, 19.86it/s]
  0%|          | 2/2209 [00:00<02:17, 16.07it/s]

{'ner': 5290.906506937412}


100%|██████████| 2209/2209 [01:45<00:00, 20.94it/s]
  0%|          | 2/2209 [00:00<02:02, 17.99it/s]

{'ner': 4980.222842961371}


100%|██████████| 2209/2209 [01:41<00:00, 21.67it/s]
  0%|          | 3/2209 [00:00<01:52, 19.61it/s]

{'ner': 4424.863197846076}


100%|██████████| 2209/2209 [01:41<00:00, 21.83it/s]
  0%|          | 3/2209 [00:00<01:42, 21.55it/s]

{'ner': 4137.504696947721}


100%|██████████| 2209/2209 [01:42<00:00, 21.62it/s]
  0%|          | 2/2209 [00:00<02:02, 18.03it/s]

{'ner': 3944.081666478946}


100%|██████████| 2209/2209 [01:45<00:00, 20.88it/s]
  0%|          | 2/2209 [00:00<02:13, 16.53it/s]

{'ner': 3793.0215544042867}


100%|██████████| 2209/2209 [01:37<00:00, 22.61it/s]
  0%|          | 2/2209 [00:00<02:17, 16.07it/s]

{'ner': 3632.2598790434517}


100%|██████████| 2209/2209 [01:35<00:00, 23.21it/s]
  0%|          | 3/2209 [00:00<01:34, 23.45it/s]

{'ner': 3454.291831511156}


100%|██████████| 2209/2209 [01:35<00:00, 23.09it/s]
  0%|          | 3/2209 [00:00<01:43, 21.40it/s]

{'ner': 3324.6059806620833}


100%|██████████| 2209/2209 [01:39<00:00, 22.25it/s]
  0%|          | 3/2209 [00:00<01:34, 23.31it/s]

{'ner': 3132.6354799212972}


100%|██████████| 2209/2209 [01:35<00:00, 23.04it/s]
  0%|          | 3/2209 [00:00<01:33, 23.53it/s]

{'ner': 3008.337826971755}


100%|██████████| 2209/2209 [01:35<00:00, 23.17it/s]
  0%|          | 3/2209 [00:00<01:35, 23.12it/s]

{'ner': 2999.6819350341384}


100%|██████████| 2209/2209 [01:35<00:00, 23.09it/s]
  0%|          | 3/2209 [00:00<01:32, 23.80it/s]

{'ner': 2827.67689145681}


100%|██████████| 2209/2209 [01:35<00:00, 23.23it/s]
  0%|          | 3/2209 [00:00<01:38, 22.35it/s]

{'ner': 2783.528827411639}


100%|██████████| 2209/2209 [01:35<00:00, 23.17it/s]
  0%|          | 3/2209 [00:00<01:37, 22.66it/s]

{'ner': 2708.6090359373134}


100%|██████████| 2209/2209 [01:35<00:00, 23.06it/s]
  0%|          | 3/2209 [00:00<01:34, 23.43it/s]

{'ner': 2623.9558293887076}


100%|██████████| 2209/2209 [01:35<00:00, 23.03it/s]
  0%|          | 2/2209 [00:00<01:55, 19.11it/s]

{'ner': 2397.5073310528087}


100%|██████████| 2209/2209 [01:35<00:00, 23.10it/s]
  0%|          | 3/2209 [00:00<01:31, 24.13it/s]

{'ner': 2375.9285068143204}


100%|██████████| 2209/2209 [01:36<00:00, 22.95it/s]
  0%|          | 3/2209 [00:00<01:33, 23.70it/s]

{'ner': 2372.1277097473735}


100%|██████████| 2209/2209 [01:35<00:00, 23.18it/s]
  0%|          | 2/2209 [00:00<01:54, 19.19it/s]

{'ner': 2258.790973002139}


100%|██████████| 2209/2209 [01:35<00:00, 23.08it/s]
  0%|          | 3/2209 [00:00<01:36, 22.90it/s]

{'ner': 2165.6616456000256}


100%|██████████| 2209/2209 [01:35<00:00, 23.19it/s]
  0%|          | 3/2209 [00:00<01:31, 24.23it/s]

{'ner': 2087.1514674381065}


100%|██████████| 2209/2209 [01:35<00:00, 23.21it/s]
  0%|          | 3/2209 [00:00<01:34, 23.44it/s]

{'ner': 1987.941106880923}


100%|██████████| 2209/2209 [01:35<00:00, 23.12it/s]
  0%|          | 3/2209 [00:00<01:32, 23.78it/s]

{'ner': 2024.7600668281443}


100%|██████████| 2209/2209 [01:35<00:00, 23.02it/s]
  0%|          | 3/2209 [00:00<01:33, 23.53it/s]

{'ner': 2001.254650743472}


100%|██████████| 2209/2209 [01:35<00:00, 23.01it/s]
  0%|          | 3/2209 [00:00<01:37, 22.66it/s]

{'ner': 1923.5555158444035}


100%|██████████| 2209/2209 [01:35<00:00, 23.14it/s]
  0%|          | 3/2209 [00:00<01:30, 24.27it/s]

{'ner': 1883.3334529966503}


 22%|██▏       | 492/2209 [00:21<01:12, 23.73it/s]

## Saving of the Model 

In [None]:
if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)    

# Loading and using the created model on new data 

In [None]:
corpora_validation = corpora = extract_articles_content("path_to_the_scientific_articles_test_folder")

In [None]:
nlp_validation = spacy.load("the value of output_dir = Pathpath_to_the_folder_where_we_save_the_model")
doc_validation = nlp_validation(corpora_validation)
spacy.displacy.render(doc_validation, style="ent", jupyter=True)

In [None]:
annot(corpora_validation, silver)

In [None]:
nlp_validation = spacy.load("en_core_web_sm")
doc_validation = nlp_validation(corpora_validation[0])

for token in doc_validation:
   print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop + len(token.text) - 1)

# Conclusion

In this project, we learned to recognize from a set of documents, the entities belonging to a specific domain by using the basic principles of terminology. To optimize the result, we can increase the number of epochs.
