In [51]:
import pandas as pd
from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER
from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from spacy.util import compile_infix_regex

import spacy
import glob
from tqdm import tqdm

## Named entities extraction

In [None]:
# Small English pipeline; its entity recogniser provides the named entities used below.
nlp = spacy.load("en_core_web_sm")

# Custom infix rules for the tokenizer. The list has no rule that splits on a
# hyphen between letters, so hyphenated words stay in one token
# (several terms are hyphenated).
punctuation_infixes = [
    # arithmetic operator directly after a digit
    r"(?<=[0-9])[+\-\*^](?=[0-9-])",
    # period between a lower-case letter/quote and an upper-case letter/quote
    r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
        al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
    ),
    # comma squeezed between two letters
    r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
    # one of : < > = / after a letter or digit and before a letter
    r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
]
infixes = LIST_ELLIPSES + LIST_ICONS + punctuation_infixes

# Compile the rules and plug them into the pipeline's tokenizer.
infix_re = compile_infix_regex(infixes)
nlp.tokenizer.infix_finditer = infix_re.finditer

# Maximum text length (in characters) the pipeline accepts for a single document.
nlp.max_length = 1_500_000

In [None]:
# Collect the lower-cased text of every named entity spaCy finds in the corpus.
ners = set()

for path in tqdm(glob.iglob("../corpus_txt/*.txt")):
    # Only the first line of each file is analysed.
    # NOTE(review): presumably every corpus file stores its text on a single
    # line -- confirm, otherwise the remaining lines are never seen by spaCy.
    # readline() returns "" for an empty file (readlines()[0] raised IndexError)
    # and avoids loading the whole file just to keep one line.
    with open(path, "r", encoding="utf-8") as f:
        first_line = f.readline()

    # The file is already closed while the (slow) pipeline runs.
    doc = nlp(first_line)
    ners.update(ent.text.lower() for ent in doc.ents)

In [147]:
# Seed passed as `random_state` when sampling candidates for manual evaluation,
# so that re-running the notebook draws the same sample.
seed = 20

## Checking process

Give the whole corpus to spaCy -> extract the named entities (NER)

For each term extraction:
- Filter the terms (remove the terms which are named entities AND the ones which are variations of a primary term) using the list of named entities
- Remove the duplicates
- Randomly sample 200 terms and check them (measure precision and recall)


1st extraction : Basic stuff of termsuite 

## Extractions

### 1st extraction : Basic 

In [148]:
# Load the candidate terms of the first extraction (tab-separated file).
extract_1 = pd.read_csv('../extractions/1.tsv', sep='\t')

# Keep only what follows the first ':' of each key, minus the single character
# right after it. Splitting once (maxsplit=1) preserves any further ':' that is
# part of the term itself instead of truncating the term there.
# NOTE(review): presumably keys look like "<prefix>: <term>" -- confirm against the TSV.
extract_1["key"] = extract_1["key"].apply(lambda x: x.split(':', 1)[1][1:])

# Keep the first row of each distinct key (reassigned rather than mutated in place).
extract_1 = extract_1.drop_duplicates(subset="key", keep='first')

In [149]:
# Drop the candidates whose key is one of the named entities found in the corpus.
# The entities in `ners` are stored lower-cased, so the keys are lower-cased for
# the comparison only (the `key` column itself is left untouched).
not_ner_extract_1 = extract_1[~extract_1["key"].str.lower().isin(ners)]

In [150]:
# Add a placeholder column for the manual evaluation
# ('FALSE' corresponds to an unticked box in Excel).
# assign() returns a new frame, so nothing is written into a filtered slice of
# another frame (the in-place column assignment triggers pandas'
# SettingWithCopyWarning); re-running the cell gives the same result.
not_ner_extract_1 = not_ner_extract_1.assign(evaluation='FALSE')

In [153]:
# Export a reproducible random sample of candidates for manual evaluation.
# The sample size is capped at the number of available rows: sample() raises
# ValueError when asked for more rows than the frame holds.
# NOTE(review): the "Checking process" notes mention a sample of 200 while this
# cell draws 400 -- confirm which size is intended.
sample_size = min(400, len(not_ner_extract_1))
(
    not_ner_extract_1
    .sample(sample_size, random_state=seed)[['key', 'evaluation']]
    # 'key' is written out under the column name 'candidate'.
    .to_csv('../evaluation/extract_1.csv', index=False, header=['candidate', 'evaluation'])
)