In [26]:
import spacy
import csv
import pandas as pd

from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER
from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from spacy.util import compile_infix_regex

from numpy import nan

In [2]:
nlp = spacy.load("en_core_web_sm")

# Custom tokenizer infix rules. None of the patterns below splits on a hyphen
# between letters, so hyphenated words (several lexicon terms are hyphenated)
# stay single tokens.

# arithmetic-style symbol directly after a digit
symbol_after_digit = r"(?<=[0-9])[+\-\*^](?=[0-9-])"
# period between a lowercase-letter/quote and an uppercase-letter/quote
period_lower_upper = r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
    al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
)
# comma squeezed between two letters
comma_between_letters = r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA)
# one of : < > = / between a letter-or-digit and a letter
separator_before_letter = r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA)

infixes = LIST_ELLIPSES + LIST_ICONS + [
    symbol_after_digit,
    period_lower_upper,
    comma_between_letters,
    separator_before_letter,
]

# Swap the tokenizer's infix matcher for one compiled from the list above.
infix_re = compile_infix_regex(infixes)
nlp.tokenizer.infix_finditer = infix_re.finditer

In [3]:
def read_tsv(link):
    """Load the term lexicon from a tab-separated file.

    The first row is treated as a header and skipped. For every remaining
    non-blank row, the third column is split on ": " and the second piece
    is kept as the term.

    Args:
        link: Path of the TSV file to read (opened as UTF-8).

    Returns:
        A set of term strings; empty when the file has no data rows.

    Raises:
        IndexError: If a non-blank row has fewer than three columns, or its
            third column does not contain the ": " separator.
    """
    # newline="" hands newline handling to the csv module, as its docs recommend.
    with open(link, "r", encoding="utf-8", newline="") as tsvfile:
        reader = csv.reader(tsvfile, delimiter="\t")
        next(reader, None)  # skip header; the default keeps an empty file from raising StopIteration
        # csv.reader yields [] for blank lines (e.g. a trailing empty line): skip those.
        return {row[2].split(": ")[1] for row in reader if row}

# Load the term lexicon used for matching below.
terms_lexicon = read_tsv("../swimming-terms.tsv") 
# set (not a list) of extracted term strings, as returned by read_tsv

In [4]:
# Open one corpus document and run it through the spaCy pipeline.
# Only the first line of the file is parsed (same as the previous `readlines()[0]`).
# The encoding is explicit, matching read_tsv, so the result does not depend on
# the platform's default locale.
with open("../corpus_txt/manuscript-maglischo-vol22.txt", encoding="utf-8") as f: # corpus here
    raw_text = f.readline()  # reads just the first line instead of loading every line

if not raw_text:
    raise ValueError("corpus file is empty: nothing to parse")

# Parse after the file is closed; keep the raw text and the parsed Doc under separate names.
doc = nlp(raw_text)


In [49]:
# Build the token table: one row per token, with an '<eos>' marker row closing
# every sentence. Lemmas are lower-cased; when the lemma is the '-PRON-'
# placeholder, the lower-cased surface form is used instead.
token_rows = []
for sentence in doc.sents:
    for tok in sentence:
        if tok.lemma_ != '-PRON-':
            lemma = tok.lemma_.lower()
        else:
            lemma = tok.lower_
        token_rows.append((tok.text, lemma))
    token_rows.append(('<eos>', '<eos>'))

df = pd.DataFrame(token_rows, columns=["token", "lemma"])


In [51]:
# Preview the token/lemma table (one row per token, plus '<eos>' sentence markers).
df

Unnamed: 0,token,lemma
0,Swimming,swimming
1,Research,research
2,",",","
3,Vol,vol
4,.,.
...,...,...
9869,Swimming,swimming
9870,.,.
9871,<eos>,<eos>
9872,23,23


In [53]:
# Populate the 3rd column with "O" tags and blank out the sentence-boundary rows.
# A boundary row holds '<eos>' on the first run and NaN once this cell has run,
# so testing for both keeps the cell idempotent: a re-run no longer tags the
# boundary rows as "O".
is_eos = df["token"].isna() | (df["token"] == "<eos>")
df["tag"] = is_eos.map({True: nan, False: "O"})
df["token"] = df["token"].mask(is_eos, nan)

In [54]:
# Same table with the new "tag" column; '<eos>' rows now have NaN in token and tag.
df

Unnamed: 0,token,lemma,tag
0,Swimming,swimming,O
1,Research,research,O
2,",",",",O
3,Vol,vol,O
4,.,.,O
...,...,...,...
9869,Swimming,swimming,O
9870,.,.,O
9871,,<eos>,
9872,23,23,O


In [55]:
# Length, in space-separated words, of the longest lexicon term.
words_per_term = (len(term.split(" ")) for term in terms_lexicon)
max_term_len = max(words_per_term)
print(max_term_len)

6


# Attempts

## Approach: a second DataFrame of n-grams

In [56]:
# Collect the lexicon terms found in the lemma sequence, for n from 1 to max_term_len.
# Each entry is [index of the n-gram's first word in df, n-gram text, n].
# Entries are appended in order of increasing n; the later de-duplication
# (keep='last' on starting_index) depends on that order to keep the longest match.

lemmas = df["lemma"].tolist()  # plain list: avoids one pandas slice per candidate
ngram_df_entries = []

for i in range(1, max_term_len + 1):
    # Stop at the last start position where a full i-gram still fits. The old
    # `if j+i > len(df): pass` guard did nothing, so truncated slices at the end
    # of the table could match a shorter term and be recorded with length i.
    for j in range(len(lemmas) - i + 1):
        n_gram = ' '.join(lemmas[j:j + i])
        if n_gram in terms_lexicon:
            ngram_df_entries.append([j, n_gram, i])

In [57]:
# One row per lexicon match: where it starts in df, its text, and its length in tokens.
ngram_columns = ["starting_index", "ngram", "ngram_len"]
ngram_df = pd.DataFrame(ngram_df_entries, columns=ngram_columns)

In [58]:
# All lexicon matches before de-duplication (one row per start position and n-gram length).
ngram_df

Unnamed: 0,starting_index,ngram,ngram_len
0,0,swimming,1
1,14,time,1
2,21,swimming,1
3,22,backstroke,1
4,25,ernest,1
...,...,...,...
3180,2958,middle and right hand photo,5
3181,7438,period of trial and error,5
3182,7901,short and long backstroke event,5
3183,7919,male and female backstroke swimmer,5


In [59]:
# Mark every match that shares its start position with a later row. Rows are
# ordered by increasing n-gram length, so the unmarked row is the biggest n-gram.
ngram_df["is_duplicate"] = ngram_df["starting_index"].duplicated(keep="last")

In [60]:
# Keep only the rows that were not flagged as duplicates.
ngram_df = ngram_df.loc[~ngram_df["is_duplicate"]]

In [61]:
# Matches remaining after dropping the rows flagged as duplicates.
ngram_df

Unnamed: 0,starting_index,ngram,ngram_len,is_duplicate
1,14,time,1,False
2,21,swimming,1,False
3,22,backstroke,1,False
4,25,ernest,1,False
5,27,maglischo,1,False
...,...,...,...,...
3180,2958,middle and right hand photo,5,False
3181,7438,period of trial and error,5,False
3182,7901,short and long backstroke event,5,False
3183,7919,male and female backstroke swimmer,5,False


In [62]:
# Write BIO tags into df: "B" on the first token of each matched n-gram and "I"
# on the tokens that follow it. Matches are applied in ngram_df order, so a
# later row overwrites tags written by an earlier one.
n_rows = len(df)
for match in ngram_df.itertuples(index=False):
    first = match.starting_index
    # Clamp to the table length: assigning through .at to a label that does not
    # exist would silently append a new row instead of failing.
    end = min(first + match.ngram_len, n_rows)
    for position in range(first, end):
        df.at[position, "tag"] = "B" if position == first else "I"

## Exploration

In [63]:
# How many tokens carry each tag (NaN sentence-boundary rows are not counted).
df["tag"].value_counts()

O    6524
B    2029
I     848
Name: tag, dtype: int64

In [64]:
# Rows where a matched term begins.
df[df["tag"] == "B"]

Unnamed: 0,token,lemma,tag
0,Swimming,swimming,B
14,Time,time,B
21,Swimming,swimming,B
22,Backstroke,backstroke,B
25,Ernest,ernest,B
...,...,...,...
9841,Mark,mark,B
9849,Backstroke,backstroke,B
9852,Online,online,B
9855,Feb.,february,B


In [65]:
# Spot check: is the bigram "swimming backstroke" itself a lexicon entry?
# (Presumably checking why rows 21 and 22 are tagged B, B rather than B, I.)
"swimming backstroke" in terms_lexicon

False

In [66]:
# Export "token tag" pairs, space-separated, without header or index.
# Sentence-boundary rows have NaN in both columns and are written as empty fields.
export_columns = ["token", "tag"]
df[export_columns].to_csv("dataset.txt", sep=" ", header=False, index=False)

In [43]:
# Inspect rows 20-33 of the tagged table.
# NOTE(review): this cell's execution count (43) is lower than the cells above it,
# so the output shown may come from an earlier kernel state; re-run to confirm.
df.iloc[20:34]

Unnamed: 0,token,lemma,tag
20,of,of,O
21,Swimming,swimming,B
22,Backstroke,backstroke,B
23,?,?,O
24,\n,\n,
25,Ernest,ernest,B
26,W.,w.,O
27,Maglischo,maglischo,B
28,1970,1970,O
29,Lazy,lazy,B
