In [None]:
!pip3 install spacy
!python3 -m spacy download en_core_web_sm
!python3 -m spacy download en_core_web_lg

In [2]:
import numpy as np
import pandas as pd
import datasets as ds
import spacy
from spacy.symbols import ORTH
import nltk
from transformers import AutoTokenizer, AutoModelForTokenClassification
from tqdm.notebook import tqdm

from bs4 import BeautifulSoup
import os
from collections import defaultdict

In [3]:
token_length = 64
bert_name = "bert-large-uncased"
nltk_name = 'averaged_perceptron_tagger'

In [4]:
spacy_to_nltk = {
    "ADJ": ["JJ", "JJR", "JJS"],
    "ADP": ["IN", "TO"],
    "ADV": ["RB", "RBR", "RBS"],
    "AUX": ["MD"],
    "CONJ": ["CC"],
    "CCONJ": ["CC"],
    "DET": ["DT", "PDT", "WDT"],
    "INTJ": [],
    "NOUN": ["NN", "NNS"],
    "NUM": [],
    "PART": ["POS", "RP", "TO"],
    "PRON": [],
    "PROPN": ["NNP", "NNPS"],
    "PUNCT": [".", ",", ":", "(", ")", "''", "``"],
    "SCONJ": ["IN"],
    "SYM": [],
    "VERB": ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"],
    "X": [],
    "SPACE": []
}



label2id = defaultdict(int, {
    "ADJ": 0,
    "ADP": 1,
    "ADV": 2,
    "AUX": 3,
    "CONJ": 4,
    "CCONJ": 5,
    "DET": 6,
    "INTJ": 7,
    "NOUN": 8,
    "NUM": 9,
    "PART": 10,
    "PRON": 11,
    "PROPN": 12,
    "PUNCT": 13,
    "SCONJ": 14,
    "SYM": 15,
    "VERB": 16,
    "X": 17,
    "SPACE": 18
})

In [5]:
import time
tokenizer = AutoTokenizer.from_pretrained(bert_name)
model = AutoModelForTokenClassification.from_pretrained(bert_name)
nltk.download(nltk_name)

def extract_data(path: str, filename: str):
    if filename.endswith('.html'):
        data = open(path + "/" + filename, mode="r", encoding="UTF-8").read()
        return data
    return ''

def extract_all_data(dataset: pd.DataFrame, path: str):
    return [extract_data(path, x) for x in dataset['Filename']]

def normalize_text(text: str):
    return (text
            .lower()
            .replace("\n", "")
            )

def split_on_period(arr, math_tokens):
    result_toks = []
    current_toks = []

    result_index = []
    current_index = []

    tok_number = 0

    for i, item in enumerate(arr):
        current_toks.append(item)

        if i in math_tokens:
            current_index.append(tok_number)

        tok_number += 1
        if item.strip() == '.':
            result_toks.append(current_toks)
            result_index.append(current_index)
            current_toks = []
            current_index = []
            tok_number = 0

    if current_toks:
        result_toks.append(current_toks)
        result_index.append(current_index)

    return result_toks, result_index

def tokenize(paragraph):
    toks = []
    math_index = []
    for content in paragraph.contents:
        if content.name is not None:
            if content.name == "mml:math":
                math_index.append(len(toks))
                toks.append("[MATH]")
            if content.name == "ce:display":
                math_index.append(len(toks))
                toks.append("[DISPLAY]")
            elif content.name == "ce:italic":
                if len(content.text) > 2:
                    content = normalize_text(content.text)
                    textToks = nltk.word_tokenize(content)
                    toks.extend(textToks)
                    continue
                math_index.append(len(toks))
                toks.append("[ITALIC]")
            elif content.name == "ce:cross-refs":
                continue
        else:
            content = normalize_text(content)
            textToks = nltk.word_tokenize(content)
            toks.extend(textToks)

    return split_on_period(toks, math_index)

def tokenize_file(row):
    filename = row["Filename"]
    complete_text = row["CompleteText"]

    data = {
        "file_name": [],
        "title": [],
        "tokens": [],
        "math_index": [],
    }

    soup = BeautifulSoup(complete_text)
    for section in soup.find_all("ce:section"):
        title = section.find_next("ce:section-title").text
        for paragraph in section.find_all("ce:para"):
            toks_arr, math_toks_arr = tokenize(paragraph)
            for toks, math_toks in zip(toks_arr, math_toks_arr):
                
                if len(toks) == 0 or len(math_toks) == 0:
                    continue
                data["tokens"].append(toks)
                data["title"].append(title)
                data["file_name"].append(filename)
                data["math_index"].append(math_toks)
    return pd.DataFrame(data)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/tomato/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
nltk.word_tokenize(' ')

In [None]:
path = "../data"
fileList = np.array(os.listdir(path))
dataset = pd.DataFrame(fileList, columns=['Filename'])
dataset["CompleteText"] = extract_all_data(dataset, path)

In [None]:
df = pd.concat([tokenize_file(row) for _, row in tqdm(dataset.iterrows(), total=len(dataset), desc='Processing Rows')], ignore_index=True)
df

In [None]:
tqdm.pandas()
df_non_empty = df[df['tokens'].str.len() > 0]

In [None]:
df_non_empty['nltk_annotate'] = df_non_empty.progress_apply(lambda row: [x[1] for x in nltk.pos_tag(row['tokens'])], axis=1)

In [None]:
group_size = len(df_non_empty)
dataset = ds.Dataset.from_pandas(df_non_empty)
dataset.save_to_disk('full_df_non_empty')

In [None]:
#
# SAVE POINT
# SAVE POINT
# SAVE POINT
#

from datasets import load_from_disk
df_non_empty = load_from_disk(f'full_df_non_empty').to_pandas()

tqdm.pandas()

In [None]:
df_non_empty['tokens']

In [None]:
for i, tokens in enumerate(df_non_empty['tokens']):
    if i % 100000 == 0:
        print(i)
    for j, token in enumerate(tokens):
        if token == '':
            print(i, ":", tokens)

In [None]:
spacy.require_gpu()
nlp = spacy.load('en_core_web_lg')
special_case_math = [{ORTH: "[MATH]"}]
special_case_display = [{ORTH: "[DISPLAY]"}]
special_case_italic = [{ORTH: "[ITALIC]"}]
nlp.tokenizer.add_special_case("[MATH]", special_case_math)
nlp.tokenizer.add_special_case("[DISPLAY]", special_case_display)
nlp.tokenizer.add_special_case("[ITALIC]", special_case_italic)
def spacy_annotate_row(row):
    tokens = row['tokens']
    
    if len(tokens) == 0:
        return []
    
    doc = spacy.tokens.Doc(nlp.vocab, words=tokens)
    
    for name, proc in nlp.pipeline:
        doc = proc(doc)
    
    anno = []
    spacy_toks = []
    for token in doc:
        spacy_toks.append(token.text)
        anno.append(token.pos_)
    return anno

df_non_empty['spacy'] = df_non_empty.progress_apply(spacy_annotate_row, axis=1)

In [None]:
group_size = len(df_non_empty)
dataset = ds.Dataset.from_pandas(df_non_empty)
dataset.save_to_disk('full_df_non_empty_spacy')

In [6]:
from tqdm.notebook import tqdm
from datasets import load_from_disk
df_non_empty = load_from_disk(f'full_df_non_empty_spacy').to_pandas()

tqdm.pandas()

In [7]:
df_non_empty

Unnamed: 0,file_name,title,tokens,math_index,nltk_annotate,spacy
0,1375.html,Introduction,"[for, example, ,, it, was, shown, in, that, in...",[21],"[IN, NN, ,, PRP, VBD, VBN, IN, DT, IN, DT, NN,...","[ADP, NOUN, PUNCT, PRON, AUX, VERB, ADP, PRON,..."
1,1375.html,Introduction,"[there, is, also, a, bijection, to, basic, clu...","[16, 20, 38]","[EX, VBZ, RB, DT, NN, TO, VB, NN, VBG, NNS, IN...","[PRON, VERB, ADV, DET, NOUN, ADP, ADJ, NOUN, N..."
2,1375.html,Introduction,"[for, an, integer, [MATH], ,, the, notion, of,...","[3, 8]","[IN, DT, NN, NN, ,, DT, NN, IN, NNP, JJ, NNS, ...","[ADP, DET, NOUN, ADJ, PUNCT, DET, NOUN, ADP, P..."
3,1375.html,Introduction,"[in, a, [ITALIC], -abelian, category, ,, kerne...",[2],"[IN, DT, JJ, JJ, NN, ,, NNS, ,, NNS, CC, NNS, ...","[ADP, DET, NOUN, ADJ, NOUN, PUNCT, NOUN, PUNCT..."
4,1375.html,Introduction,"[the, notion, of, wide, subcategories, was, ge...",[8],"[DT, NN, IN, JJ, NNS, VBD, VBN, TO, VB, JJ, NN...","[DET, NOUN, ADP, ADJ, NOUN, AUX, VERB, ADP, NO..."
...,...,...,...,...,...,...
2029492,4226.html,Second order energy estimates,"[then, ,, we, regard, [MATH], as, the, initial...","[4, 19, 27]","[RB, ,, PRP, VBP, PRP, IN, DT, JJ, NN, CC, NN,...","[ADV, PUNCT, PRON, VERB, ADJ, ADP, DET, ADJ, N..."
2029493,4226.html,Second order energy estimates,"[this, is, a, contradiction, to, the, definiti...","[8, 12]","[DT, VBZ, DT, NN, TO, DT, NN, IN, NNP, ,, RB, ...","[PRON, AUX, DET, NOUN, ADP, DET, NOUN, ADP, AD..."
2029494,4226.html,Second order energy estimates,"[therefore, -, has, a, unique, solution, [MATH...","[6, 8, 10]","[RB, :, VBZ, DT, JJ, NN, NN, IN, NN, IN, NN, .]","[ADV, PUNCT, VERB, DET, ADJ, NOUN, ADJ, ADP, N..."
2029495,4226.html,Second order energy estimates,"[moreover, ,, let, [DISPLAY], differentiating,...","[3, 8, 14, 25, 27]","[RB, ,, VB, NNP, VBG, IN, NN, TO, VB, ,, PRP, ...","[ADV, PUNCT, VERB, X, VERB, ADP, NOUN, ADP, NO..."


In [8]:
for i in range(0, len(df_non_empty)):
    index = i
    spacy_notation = df_non_empty.iloc[index]['spacy']
    nltk_notation = df_non_empty.iloc[index]['nltk_annotate']
    tokens = df_non_empty.iloc[index]['tokens']
    if not(len(tokens) == len(spacy_notation) == len(nltk_notation)):
        print(len(tokens))
        print(len(spacy_notation))
        print(len(nltk_notation))

In [9]:
def check_spacy_to_nltk(row):
    tags = row['spacy']
    nltk_tags = row['nltk_annotate']
    math_ids = row['math_index'].astype(int)
    return [nltk_tags[i] in spacy_to_nltk[tags[i]] for i in math_ids]
df_non_empty['is_valid'] = df_non_empty.progress_apply(check_spacy_to_nltk, axis=1)

  0%|          | 0/2029497 [00:00<?, ?it/s]

In [10]:
y = []

for i, row in tqdm(df_non_empty.iterrows(), total=len(df_non_empty), desc='Processing Rows'):
    y.append(all(row['is_valid']) and 'X' not in row['spacy'])

Processing Rows:   0%|          | 0/2029497 [00:00<?, ?it/s]

In [11]:
df_matching = df_non_empty[y]
df_not_matching = df_non_empty[[not x for x in y]]
print("Starting len:", len(df_non_empty),"\nmatching:", len(df_matching), "\nnot matching:", len(df_not_matching))

Starting len: 2029497 
matching: 76975 
not matching: 1952522


In [12]:
df_matching

Unnamed: 0,file_name,title,tokens,math_index,nltk_annotate,spacy,is_valid
13,1375.html,Introduction,"[recently, ,, higher, auslander, algebras, of,...",[7],"[RB, ,, JJR, NN, NNS, IN, NN, NNS, VBP, VBN, V...","[ADV, PUNCT, ADJ, NOUN, NOUN, ADP, NOUN, NOUN,...",[True]
31,1375.html,Preliminaries,"[we, call, [MATH], krull-schmidt, if, each, ob...",[2],"[PRP, VBP, JJ, JJ, IN, DT, NN, VBZ, IN, DT, JJ...","[PRON, VERB, ADJ, NOUN, SCONJ, DET, NOUN, VERB...",[True]
64,1375.html,Preliminaries,"[moreover, ,, this, d-kernel, appears, as, a, ...",[14],"[RB, ,, DT, JJ, VBZ, IN, DT, JJ, NN, (, IN, DT...","[ADV, PUNCT, DET, PROPN, VERB, ADP, DET, ADJ, ...",[True]
67,1375.html,Preliminaries,"[moreover, ,, this, d-cokernel, appears, as, a...",[14],"[RB, ,, DT, JJ, VBZ, IN, DT, JJ, NN, (, IN, DT...","[ADV, PUNCT, DET, PROPN, VERB, ADP, DET, ADJ, ...",[True]
78,1375.html,Preliminaries,"[(, 3, ), existence, of, [MATH], follows, in, ...",[5],"[(, CD, ), NN, IN, NN, VBZ, IN, DT, JJ, NN, IN...","[PUNCT, NUM, PUNCT, NOUN, ADP, NOUN, VERB, ADP...",[True]
...,...,...,...,...,...,...,...
2028828,1024.html,Proof of Theorem 4.4\n,"[let, d, be, a, primitive, centralizer, of, u,...",[16],"[VB, NN, VB, DT, JJ, NN, IN, JJ, CC, JJ, VB, D...","[VERB, NOUN, AUX, DET, ADJ, NOUN, ADP, PROPN, ...",[True]
2028833,1024.html,Proof of Theorem 4.4\n,"[as, [ITALIC], is, invertible, ,, this, can, n...",[1],"[IN, NN, VBZ, JJ, ,, DT, MD, RB, VB, DT, NN, .]","[SCONJ, NOUN, AUX, ADJ, PUNCT, PRON, AUX, PART...",[True]
2028847,1024.html,Proof of Theorem 4.4\n,"[finally, ,, a, square, matrix, [ITALIC], is, ...",[5],"[RB, ,, DT, JJ, NN, NN, VBZ, VBN, DT, NN, IN, ...","[ADV, PUNCT, DET, ADJ, NOUN, NOUN, AUX, VERB, ...",[True]
2028864,1024.html,Proof of Theorem 4.4\n,"[assuming, (, 2, ), and, taking, [MATH], yield...",[6],"[VBG, (, CD, ), CC, VBG, JJ, NNS, (, CD, ), .]","[VERB, PUNCT, NUM, PUNCT, CCONJ, VERB, ADJ, NO...",[True]


In [13]:
def convert_labels_to_ids(row):
    labels  = [label2id[x] for x in row['spacy']]
    return [-100] + labels + [-100] * (token_length - (len(labels) + 1))

df_matching['labels'] = df_matching.progress_apply(convert_labels_to_ids, axis=1)

  0%|          | 0/76975 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_matching['labels'] = df_matching.progress_apply(convert_labels_to_ids, axis=1)


In [14]:
df_matching

Unnamed: 0,file_name,title,tokens,math_index,nltk_annotate,spacy,is_valid,labels
13,1375.html,Introduction,"[recently, ,, higher, auslander, algebras, of,...",[7],"[RB, ,, JJR, NN, NNS, IN, NN, NNS, VBP, VBN, V...","[ADV, PUNCT, ADJ, NOUN, NOUN, ADP, NOUN, NOUN,...",[True],"[-100, 2, 13, 0, 8, 8, 1, 8, 8, 3, 3, 16, 1, 1..."
31,1375.html,Preliminaries,"[we, call, [MATH], krull-schmidt, if, each, ob...",[2],"[PRP, VBP, JJ, JJ, IN, DT, NN, VBZ, IN, DT, JJ...","[PRON, VERB, ADJ, NOUN, SCONJ, DET, NOUN, VERB...",[True],"[-100, 11, 16, 0, 8, 14, 6, 8, 16, 1, 6, 0, 8,..."
64,1375.html,Preliminaries,"[moreover, ,, this, d-kernel, appears, as, a, ...",[14],"[RB, ,, DT, JJ, VBZ, IN, DT, JJ, NN, (, IN, DT...","[ADV, PUNCT, DET, PROPN, VERB, ADP, DET, ADJ, ...",[True],"[-100, 2, 13, 6, 12, 16, 1, 6, 0, 8, 13, 1, 6,..."
67,1375.html,Preliminaries,"[moreover, ,, this, d-cokernel, appears, as, a...",[14],"[RB, ,, DT, JJ, VBZ, IN, DT, JJ, NN, (, IN, DT...","[ADV, PUNCT, DET, PROPN, VERB, ADP, DET, ADJ, ...",[True],"[-100, 2, 13, 6, 12, 16, 1, 6, 0, 8, 13, 1, 6,..."
78,1375.html,Preliminaries,"[(, 3, ), existence, of, [MATH], follows, in, ...",[5],"[(, CD, ), NN, IN, NN, VBZ, IN, DT, JJ, NN, IN...","[PUNCT, NUM, PUNCT, NOUN, ADP, NOUN, VERB, ADP...",[True],"[-100, 13, 9, 13, 8, 1, 8, 16, 1, 6, 0, 8, 1, ..."
...,...,...,...,...,...,...,...,...
2028828,1024.html,Proof of Theorem 4.4\n,"[let, d, be, a, primitive, centralizer, of, u,...",[16],"[VB, NN, VB, DT, JJ, NN, IN, JJ, CC, JJ, VB, D...","[VERB, NOUN, AUX, DET, ADJ, NOUN, ADP, PROPN, ...",[True],"[-100, 16, 8, 3, 6, 0, 8, 1, 12, 5, 3, 3, 6, 8..."
2028833,1024.html,Proof of Theorem 4.4\n,"[as, [ITALIC], is, invertible, ,, this, can, n...",[1],"[IN, NN, VBZ, JJ, ,, DT, MD, RB, VB, DT, NN, .]","[SCONJ, NOUN, AUX, ADJ, PUNCT, PRON, AUX, PART...",[True],"[-100, 14, 8, 3, 0, 13, 11, 3, 10, 3, 6, 8, 13..."
2028847,1024.html,Proof of Theorem 4.4\n,"[finally, ,, a, square, matrix, [ITALIC], is, ...",[5],"[RB, ,, DT, JJ, NN, NN, VBZ, VBN, DT, NN, IN, ...","[ADV, PUNCT, DET, ADJ, NOUN, NOUN, AUX, VERB, ...",[True],"[-100, 2, 13, 6, 0, 8, 8, 3, 16, 6, 8, 14, 6, ..."
2028864,1024.html,Proof of Theorem 4.4\n,"[assuming, (, 2, ), and, taking, [MATH], yield...",[6],"[VBG, (, CD, ), CC, VBG, JJ, NNS, (, CD, ), .]","[VERB, PUNCT, NUM, PUNCT, CCONJ, VERB, ADJ, NO...",[True],"[-100, 16, 13, 9, 13, 5, 16, 0, 8, 13, 9, 13, ..."


In [15]:
print(len(df_matching))
group_size = len(df_matching)
dataset = ds.Dataset.from_pandas(df_matching)

train_dataset = dataset.train_test_split(test_size=0.2, seed=42)
test_dataset = train_dataset['test'].train_test_split(test_size=0.3, seed=42)
d = {'train': train_dataset['train'], 'test': test_dataset['train'], 'eval': test_dataset['test']}
for key in d:
    d[key].save_to_disk(f'full_valid_dataset_sentence-{group_size}/{key}')


76975


Saving the dataset (0/1 shards):   0%|          | 0/61580 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10776 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4619 [00:00<?, ? examples/s]

In [16]:
print(len(df_not_matching))
group_size = len(df_not_matching)
dataset = ds.Dataset.from_pandas(df_not_matching)
dataset.save_to_disk(f'full_invalid_dataset_sentence-{group_size}')

1952522


Saving the dataset (0/3 shards):   0%|          | 0/1952522 [00:00<?, ? examples/s]

In [None]:
def get_senetence_with_label(d, label):
    for i, row in d.iterrows():
        for l, t in zip(row['spacy'], row['tokens']):
            if l == label:
                print(row['tokens'])
                print(row['nltk_annotate'])
                print(row['spacy'])
                return row
    print("None found")
get_senetence_with_label(df_matching, 'X')

In [None]:
def check_spacy_to_nltk(row):
    tags = row['spacy']
    nltk_tags = row['nltk_annotate']
    math_ids = row['math_index']
    return [nltk_tags[i] in spacy_to_nltk[tags[i]] for i in math_ids]

row = {
    'spacy': ['ADV', 'ADV', 'PUNCT', 'X', 'PUNCT'],
    'nltk_annotate': ['RB', 'IN', ',', 'FW', 'NN'],
    'math_index': [3]
}

check_spacy_to_nltk(row)