In [1]:
import pandas as pd

import spacy
import glob

from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER
from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from spacy.util import compile_infix_regex

from tqdm import tqdm

## Cleaning process of extracted terms

In [2]:
# Load spaCy's small English pipeline; later cells use it to extract named entities.
nlp = spacy.load("en_core_web_sm")

# Override the tokenizer's infix patterns so hyphenated words stay whole
# (several terms are hyphenated): none of the rules listed here splits on a
# hyphen between letters.
infixes = [
    *LIST_ELLIPSES,
    *LIST_ICONS,
    # +, -, * or ^ preceded by a digit and followed by a digit or hyphen
    r"(?<=[0-9])[+\-\*^](?=[0-9-])",
    # period between a lowercase letter/quote and an uppercase letter/quote
    r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
        al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
    ),
    # comma directly between two letters
    r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
    # one of : < > = / preceded by a letter or digit and followed by a letter
    r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
]

# Compile the patterns and plug the resulting matcher into the tokenizer.
infix_re = compile_infix_regex(infixes)
nlp.tokenizer.infix_finditer = infix_re.finditer

# Raise the maximum text length (in characters) the pipeline accepts per document.
nlp.max_length = 1_500_000

In [3]:
# We let spaCy extract named entities (NEs) from every corpus file and collect
# their lower-cased text in a set that is later used to clean the extracted terms.

ners = set()

for path in tqdm(glob.iglob("../corpus_txt/*.txt")):
    # Read the complete file: taking only `readlines()[0]` would silently ignore
    # everything after the first line and raise IndexError on an empty file.
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()

    # Run the pipeline after the file is closed so the handle is not held open
    # during the (slow) NLP step.
    doc = nlp(text)

    # Lower-case the entity text before storing it in the set.
    ners.update(ent.text.lower() for ent in doc.ents)

69it [00:26,  2.65it/s]


In [4]:
# Load the tab-separated list of extracted term candidates.
extract = pd.read_csv('../extraction/swimming-terms_spec_top3k.tsv', sep='\t')

# Keep the segment that follows the first ':' of each key (up to the next ':',
# if any) and drop that segment's first character.
# NOTE(review): presumably keys look like "<prefix>: <term>" - confirm against the TSV.
extract["key"] = extract["key"].map(lambda raw_key: raw_key.split(':')[1][1:])

# Remove duplicate keys, keeping the first occurrence of each.
extract = extract.drop_duplicates(subset="key", keep='first')

In [5]:
# Remove candidates whose key is one of the named entities found by spaCy.
# `.copy()` makes the result an independent DataFrame, so adding columns to it
# later does not trigger pandas' SettingWithCopyWarning.
clean_extract = extract[~extract["key"].isin(ners)].copy()

In [6]:
# Display the filtered candidate table for a quick visual check.
clean_extract

Unnamed: 0,#,type,key,freq
0,1,T,asymmetry,409
1,2,T,muscle fiber,329
2,2,V[s],ftx muscle fiber,24
3,2,V[s],st muscle fiber,7
4,2,V[s],muscle fiber type,18
...,...,...,...,...
3318,2996,T,sum of reading,2
3319,2997,T,olympic-gold-medal -,2
3320,2998,T,energetically efficient manner,2
3321,2999,T,usa cdepartment,2


In [7]:
# Add a placeholder column for the evaluation
# ('FALSE' means an unticked box in Excel);
# it is used later for manual checking.
# `.assign` builds a new DataFrame instead of writing into a filtered selection,
# which avoids pandas' SettingWithCopyWarning and keeps the cell safe to re-run.

clean_extract = clean_extract.assign(evaluation='FALSE')

In [9]:
# Write the candidates and their evaluation placeholder to a CSV file that will
# be validated manually; the 'key' column is exported under the header 'candidate'.

(
    clean_extract
    .loc[:, ['key', 'evaluation']]
    .to_csv(
        '../evaluation/extraction.csv',
        index=False,
        header=['candidate', 'evaluation'],
    )
)