In [1]:
import spacy
import csv
import pandas as pd
import glob

from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER
from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from spacy.util import compile_infix_regex

from numpy import nan

In [2]:
nlp = spacy.load("en_core_web_sm")

# Several lexicon terms are hyphenated, so the tokenizer must not split words
# on hyphens. The infix rules are rebuilt here from an explicit list of
# patterns (none of which matches a hyphen between letters) and installed on
# the tokenizer.

# arithmetic-style operator sitting between digits
operator_between_digits = r"(?<=[0-9])[+\-\*^](?=[0-9-])"
# period between a lower-case letter/quote and an upper-case letter/quote
period_between_words = r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
    al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
)
# comma directly between two letters
comma_between_letters = r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA)
# one of : < > = / preceded by a letter or digit and followed by a letter
symbol_before_letter = r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA)

infixes = LIST_ELLIPSES + LIST_ICONS + [
    operator_between_digits,
    period_between_words,
    comma_between_letters,
    symbol_before_letter,
]

infix_re = compile_infix_regex(infixes)
nlp.tokenizer.infix_finditer = infix_re.finditer

In [3]:
def read_tsv(link):
    """Load the term lexicon from a tab-separated file.

    The first row is treated as a header and skipped. For every remaining
    non-blank row, the third column is split on ": " and the piece right
    after the first ": " is kept as the term.

    Parameters
    ----------
    link : str
        Path to the TSV file (read as UTF-8).

    Returns
    -------
    set of str
        The distinct terms found in the file; an empty set if the file is
        empty or contains only a header.

    Raises
    ------
    IndexError
        If a non-blank row has fewer than three columns, or its third column
        does not contain ": ".
    """
    # newline="" lets the csv module do its own newline handling, as its
    # documentation recommends for file objects passed to csv.reader.
    with open(link, "r", encoding="utf-8", newline="") as tsvfile:
        reader = csv.reader(tsvfile, delimiter="\t")
        next(reader, None)  # skip header; the default avoids StopIteration on an empty file
        # csv.reader yields [] for blank lines; skip those instead of crashing on row[2]
        return {row[2].split(": ")[1] for row in reader if row}

# Set of extracted terms, read from the validated term list.
terms_path = "../extraction/swimming-terms_spec_top3k_VALIDATED.tsv"
terms_lexicon = read_tsv(terms_path)

In [4]:
# Load the corpus: one string per text file, taken from the file's first line.
# NOTE(review): only the first line of each file is used, as in the original
# code -- presumably every corpus file holds its text on a single line; confirm.

corpus = []

# glob returns paths in a file-system-dependent order. Sorting makes the
# concatenated corpus (and therefore the positional train/dev/test split
# performed at the end of the notebook) reproducible across machines.
for txt in sorted(glob.glob("../corpus_txt/*.txt")):
    with open(txt, "r", encoding="utf-8") as f:
        # readline() avoids loading the whole file just to keep its first line
        first_line = f.readline()
    if first_line:  # an empty file contributes nothing instead of raising IndexError
        corpus.append(first_line)

In [5]:
# Allow texts of up to 1.5M characters, since the whole corpus is processed
# as one string below.
nlp.max_length = 1500000

# Sentence boundaries are needed later (doc.sents) but the parser is disabled
# for this run, so a rule-based sentencizer is added to the pipeline.
# The has_pipe guard makes the cell re-runnable: adding a component whose name
# is already in the pipeline raises an error.
if not nlp.has_pipe('sentencizer'):
    nlp.add_pipe(nlp.create_pipe('sentencizer'))

# Process the whole corpus as one document, skipping NER and parsing.
doc = nlp(" ".join(corpus), disable=['ner', 'parser'])

In [6]:
# Flatten the sentence-segmented document into two parallel columns, one row
# per token, with an '<eos>' sentinel row closing every sentence.
tokens = []
lemmas = []
for sent in doc.sents:
    for tok in sent:
        tokens.append(tok.text)
        # '-PRON-' is a placeholder lemma, not a real word form: use the
        # lower-cased token text for those tokens instead.
        if tok.lemma_ != '-PRON-':
            lemmas.append(tok.lemma_.lower())
        else:
            lemmas.append(tok.lower_)
    tokens.append('<eos>')
    lemmas.append('<eos>')

df = pd.DataFrame({"token": tokens, "lemma": lemmas})

In [7]:
# Preview the token/lemma table (one row per token; '<eos>' rows mark sentence ends).
df

Unnamed: 0,token,lemma
0,J.,j.
1,Swimming,swimming
2,Research,research
3,",",","
4,Vol,vol
...,...,...
291036,http://www.wasa.asn.au/articles/article.asp?Gr...,http://www.wasa.asn.au/articles/article.asp?gr...
291037,",",","
291038,1999,1999
291039,.,.


In [8]:
# Initialise the third column ("tag") with "O" for every token; the B/I tags
# for lexicon terms are written in a later cell.
# Sentence-boundary rows ('<eos>') get NaN in both "token" and "tag" so that
# they are written as empty fields when the frame is exported.
#
# Rows whose token is already NaN are treated as boundaries too. On a first
# run there are none, so the result is the same as testing for '<eos>' alone;
# on a re-run it keeps the boundary rows as NaN instead of tagging them "O".

is_boundary = df["token"].eq("<eos>") | df["token"].isna()

df["tag"] = "O"
df.loc[is_boundary, "tag"] = nan
df.loc[is_boundary, "token"] = nan

In [9]:
# Preview the table again, now with the "tag" column.
df

Unnamed: 0,token,lemma,tag
0,J.,j.,O
1,Swimming,swimming,O
2,Research,research,O
3,",",",",O
4,Vol,vol,O
...,...,...,...
291036,http://www.wasa.asn.au/articles/article.asp?Gr...,http://www.wasa.asn.au/articles/article.asp?gr...,O
291037,",",",",O
291038,1999,1999,O
291039,.,.,O


In [10]:
# Length, in space-separated words, of the longest term in the lexicon.
term_word_counts = [len(term.split(" ")) for term in terms_lexicon]
max_term_len = max(term_word_counts)
print(max_term_len)

6


In [11]:
# Collect every lemma n-gram (n from 1 to max_term_len) that is a lexicon term,
# together with the row index of its first word in df and its length n.
# Each entry is [starting_index, ngram, ngram_len]; entries are produced in
# order of increasing n, then increasing starting index.
#
# The lemmas are pulled into a plain list once: slicing a list is far cheaper
# than slicing the DataFrame with .iloc on every iteration.
#
# Only full-length windows are generated (start + n <= number of rows).
# Previously a window running past the end of df was silently truncated, so a
# shorter match could be recorded with a too-large ngram_len and the tagging
# step would then write beyond the last row of df.

lemmas = df["lemma"].tolist()
ngram_df_entries = []

for n in range(1, max_term_len + 1):
    for start in range(len(lemmas) - n + 1):
        n_gram = ' '.join(lemmas[start:start + n])
        if n_gram in terms_lexicon:
            ngram_df_entries.append([start, n_gram, n])

In [12]:
# Tabulate the matches: row index of the first word, the matched n-gram, and its length.
ngram_columns = ["starting_index", "ngram", "ngram_len"]
ngram_df = pd.DataFrame(ngram_df_entries, columns=ngram_columns)

In [13]:
# Preview the matched n-grams (all lengths, before duplicate filtering).
ngram_df

Unnamed: 0,starting_index,ngram,ngram_len
0,1,swimming,1
1,80,swimmer,1
2,92,swimmer,1
3,112,swimming,1
4,188,perfectionism,1
...,...,...,...
29571,151181,computation of lift and drag force,6
29572,172105,computation of lift and drag force,6
29573,230736,journal of strength and conditioning research,6
29574,272348,journal of strength and conditioning research,6


In [15]:
# Several n-grams can start at the same row. Flag all but the last match for
# each starting index; rows are ordered by increasing n-gram length, so the
# last one is the longest and is the one left unflagged.

shadowed_by_later_match = ngram_df.duplicated(subset=["starting_index"], keep="last")
ngram_df["is_duplicate"] = shadowed_by_later_match

In [16]:
# Keep only the rows that were not flagged as duplicates.
ngram_df = ngram_df[ngram_df["is_duplicate"] == False]

In [17]:
# Preview the matches that remain after duplicate filtering.
ngram_df

Unnamed: 0,starting_index,ngram,ngram_len,is_duplicate
1,12,breathing,1,False
2,16,issue,1,False
3,23,frank,1,False
4,25,abrahamsen,1,False
5,30,norwegian,1,False
...,...,...,...,...
93018,225862,ratio of shoulder and hip flexor,6,False
93019,230736,journal of strength and conditioning research,6,False
93020,253970,epidemiology of injury and prevention strategy,6,False
93021,272348,journal of strength and conditioning research,6,False


In [18]:
# Write the BIO tags into df: the first row of each matched n-gram gets "B",
# every following row of the same n-gram gets "I".
for start, length in zip(ngram_df["starting_index"], ngram_df["ngram_len"]):
    for offset in range(length):
        df.at[start + offset, "tag"] = "B" if offset == 0 else "I"

## Exploration, saving, splitting dataset

In [19]:
# Count how many rows carry each tag (NaN boundary rows are not counted).
df.tag.value_counts()

O    186547
B     62935
I     27956
Name: tag, dtype: int64

In [20]:
# Inspect the rows tagged "B", i.e. the first token of each matched term.
df.query('tag=="B"')

Unnamed: 0,token,lemma,tag
1,Swimming,swimming,B
12,Breathing,breathing,B
16,Issue,issue,B
23,Frank,frank,B
25,Abrahamsen,abrahamsen,B
...,...,...,...
291011,October,october,B
291019,Richards,richards,B
291022,Reduction,reduction,B
291024,Swimming,swimming,B


In [57]:
# Positional 90 / 5 / 5 split of the rows into train, dev and test, each
# written as a space-separated "token tag" file without header or index.
n_rows = len(df)
train_len = int(n_rows/100*90)
dev_len = train_len+int(n_rows/100*5)

splits = [
    ("../corpus_bio/train.txt", df.iloc[:train_len]),
    ("../corpus_bio/dev.txt", df.iloc[train_len:dev_len]),
    ("../corpus_bio/test.txt", df.iloc[dev_len:]),
]

for out_path, part in splits:
    part.to_csv(out_path, sep=" ", header=False, index=False, columns=["token", "tag"])