In [1]:
from collections import defaultdict

import nltk
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from datasets import load_from_disk, concatenate_datasets
import torch
import lib
import numpy as np
import evaluate
import seqeval
from transformers import GenerationConfig

In [2]:

dataset_full = load_from_disk('full_invalid_dataset_sentence-1952522')

In [2]:
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
    print('Cuda')

Cuda


In [3]:
num_labels = len(lib.id2label)
max_length = 64

gen_config = GenerationConfig(max_length=max_length)

tokenizer = BertTokenizerFast.from_pretrained('bert-large-uncased')
model = BertForTokenClassification.from_pretrained('trained_bert_model_full_data_modified', num_labels=num_labels)

new_tokens = ['[MATH]', '[DISPLAY]', '[ITALIC]']
num_added_tokens = tokenizer.add_tokens(new_tokens)

model.resize_token_embeddings(len(tokenizer))

model.to(device)
batch_size = 256

In [5]:
def create_pos_batch(token_lists):
    padded_input_ids = [tokenizer.convert_tokens_to_ids(['[CLS]'] + tokens[0:62] + ['[SEP]']) + [0] * (max_length - (len(tokens) +2)) for tokens in token_lists]
    input_ids_tensor = torch.tensor(padded_input_ids, device=device)
    attention_mask = torch.tensor([[1]*(min(len(tokens) + 2, 64)) + [0]*(max_length - (len(tokens) + 2)) for tokens in token_lists], device=device)
    
    inputs = {
        "input_ids": input_ids_tensor,
        "attention_mask": attention_mask
    }
    
    with torch.no_grad():
        logits = model(**inputs).logits
    
    predicted_labels_batch = torch.argmax(logits, dim=2)
    
    return [[lib.id2label[label.item()] for label in sequence if label != -100] for sequence in predicted_labels_batch]

labels_list = []
tokens_list = []

from tqdm.notebook import tqdm

batches_per_batch = 40

iterator2 = range(0, len(dataset_full['tokens']), batch_size*batches_per_batch)

In [6]:
for j in tqdm(iterator2, total=len(iterator2), desc='Resizing'):
    print(j, j + batches_per_batch*batch_size - 1)
    dataset = dataset_full.select(range(j, min(j + batches_per_batch*batch_size - 1, len(dataset_full))))
    iterator1 = range(0, len(dataset['tokens']), batch_size)
    
    for i in tqdm(iterator1, total=len(iterator1), desc='Processing Rows'):
        batch_tokens = dataset['tokens'][i:i + batch_size]
        labels_list.extend(create_pos_batch(batch_tokens))
        tokens_list.extend(batch_tokens)

Resizing:   0%|          | 0/191 [00:00<?, ?it/s]

0 10239


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

10240 20479


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

20480 30719


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

30720 40959


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

40960 51199


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

51200 61439


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

61440 71679


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

71680 81919


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

81920 92159


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

92160 102399


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

102400 112639


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

112640 122879


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

122880 133119


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

133120 143359


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

143360 153599


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

153600 163839


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

163840 174079


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

174080 184319


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

184320 194559


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

194560 204799


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

204800 215039


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

215040 225279


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

225280 235519


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

235520 245759


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

245760 255999


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

256000 266239


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

266240 276479


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

276480 286719


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

286720 296959


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

296960 307199


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

307200 317439


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

317440 327679


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

327680 337919


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

337920 348159


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

348160 358399


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

358400 368639


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

368640 378879


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

378880 389119


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

389120 399359


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

399360 409599


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

409600 419839


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

419840 430079


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

430080 440319


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

440320 450559


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

450560 460799


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

460800 471039


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

471040 481279


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

481280 491519


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

491520 501759


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

501760 511999


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

512000 522239


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

522240 532479


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

532480 542719


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

542720 552959


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

552960 563199


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

563200 573439


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

573440 583679


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

583680 593919


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

593920 604159


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

604160 614399


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

614400 624639


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

624640 634879


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

634880 645119


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

645120 655359


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

655360 665599


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

665600 675839


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

675840 686079


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

686080 696319


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

696320 706559


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

706560 716799


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

716800 727039


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

727040 737279


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

737280 747519


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

747520 757759


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

757760 767999


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

768000 778239


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

778240 788479


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

788480 798719


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

798720 808959


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

808960 819199


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

819200 829439


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

829440 839679


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

839680 849919


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

849920 860159


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

860160 870399


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

870400 880639


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

880640 890879


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

890880 901119


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

901120 911359


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

911360 921599


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

921600 931839


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

931840 942079


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

942080 952319


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

952320 962559


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

962560 972799


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

972800 983039


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

983040 993279


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

993280 1003519


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1003520 1013759


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1013760 1023999


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1024000 1034239


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1034240 1044479


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1044480 1054719


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1054720 1064959


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1064960 1075199


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1075200 1085439


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1085440 1095679


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1095680 1105919


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1105920 1116159


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1116160 1126399


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1126400 1136639


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1136640 1146879


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1146880 1157119


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1157120 1167359


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1167360 1177599


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1177600 1187839


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1187840 1198079


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1198080 1208319


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1208320 1218559


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1218560 1228799


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1228800 1239039


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1239040 1249279


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1249280 1259519


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1259520 1269759


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1269760 1279999


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1280000 1290239


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1290240 1300479


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1300480 1310719


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1310720 1320959


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1320960 1331199


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1331200 1341439


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1341440 1351679


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1351680 1361919


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1361920 1372159


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1372160 1382399


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1382400 1392639


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1392640 1402879


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1402880 1413119


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1413120 1423359


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1423360 1433599


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1433600 1443839


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1443840 1454079


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1454080 1464319


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1464320 1474559


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1474560 1484799


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1484800 1495039


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1495040 1505279


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1505280 1515519


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1515520 1525759


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1525760 1535999


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1536000 1546239


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1546240 1556479


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1556480 1566719


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1566720 1576959


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1576960 1587199


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1587200 1597439


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1597440 1607679


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1607680 1617919


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1617920 1628159


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1628160 1638399


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1638400 1648639


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1648640 1658879


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1658880 1669119


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1669120 1679359


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1679360 1689599


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1689600 1699839


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1699840 1710079


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1710080 1720319


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1720320 1730559


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1730560 1740799


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1740800 1751039


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1751040 1761279


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1761280 1771519


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1771520 1781759


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1781760 1791999


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1792000 1802239


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1802240 1812479


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1812480 1822719


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1822720 1832959


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1832960 1843199


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1843200 1853439


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1853440 1863679


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1863680 1873919


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1873920 1884159


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1884160 1894399


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1894400 1904639


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1904640 1914879


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1914880 1925119


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1925120 1935359


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1935360 1945599


Processing Rows:   0%|          | 0/40 [00:00<?, ?it/s]

1945600 1955839


Processing Rows:   0%|          | 0/28 [00:00<?, ?it/s]

In [7]:
token_to_label = []

for token, label in zip(tokens_list, labels_list):
    token_to_label.append([(t, l) for t, l in zip(token, label)])



In [8]:
import pandas as pd
from datasets import Dataset

df = pd.DataFrame({'toks': tokens_list, 'predicted_labels': labels_list, 'token_to_label': token_to_label})
res = Dataset.from_pandas(df)

In [9]:
res.save_to_disk('full_large_model_predicted_data')

Saving the dataset (0/5 shards):   0%|          | 0/1952332 [00:00<?, ? examples/s]

In [4]:
from datasets import load_from_disk
dataset = load_from_disk(f'full_large_model_predicted_data')

In [5]:
len(dataset)

1952332

In [3]:
len(dataset['toks'][0])

34

In [6]:
def update_tokens(row):
    length = len(row['toks']) + 2
    row['toks'] = ['[CLS]'] + row['toks'][0:62] + ['[SEP]']
    return row


dataset = dataset.map(update_tokens)

In [5]:
len(dataset['toks'][0])

36

In [6]:
len(dataset['predicted_labels'][0])

64

In [7]:
def update_token_to_label(row):
    row['token_to_label'] = [(t, l) for t, l in zip(row['toks'], row['predicted_labels'])]
    return row
        
dataset = dataset.map(update_token_to_label)


In [8]:
token_to_label = dataset['token_to_label']

In [None]:
dataset['token_to_label'][0]

In [9]:
dict_math_labels = defaultdict(int)
dict_display_label = defaultdict(int)
dict_italic_label = defaultdict(int)
for token_label_arr in token_to_label:
    for token, label in token_label_arr:
        if token == "[MATH]":
            dict_math_labels[label] += 1
        elif token == "[DISPLAY]":
            dict_display_label[label] += 1
        elif token == "[ITALIC]":
            dict_italic_label[label] += 1


In [10]:
def to_latex(token_list):
    s = str(token_list)
    s = s.replace("[", "")
    s = s.replace("]", "")
    s = s.replace("('", "")
    s = s.replace("),", "\\\\ \n")
    s = s.replace("',", " &")
    s = s.replace(")", "")
    return s

In [11]:
math_labels = dict_math_labels.items()
math_total = 0
for key, value in math_labels:
    math_total += value

print(math_total)
print(to_latex(sorted(math_labels, key=lambda x: x[1], reverse=True)))


4437657
NOUN & 1957137\\ 
 ADJ & 1920963\\ 
 VERB & 493526\\ 
 PROPN & 62691\\ 
 INTJ & 1677\\ 
 ADV & 1439\\ 
 AUX & 146\\ 
 NUM & 67\\ 
 PRON & 4\\ 
 PUNCT & 4\\ 
 PART & 3


In [12]:
display_labels = dict_display_label.items()
display_total = 0
for key, value in display_labels:
    display_total += value

print(display_total)

print(to_latex(sorted(display_labels, key=lambda x: x[1], reverse=True)))

722174
NOUN & 479226\\ 
 ADJ & 127220\\ 
 VERB & 92748\\ 
 PROPN & 19963\\ 
 ADV & 979\\ 
 CCONJ & 775\\ 
 SCONJ & 396\\ 
 PUNCT & 239\\ 
 PRON & 200\\ 
 DET & 131\\ 
 AUX & 127\\ 
 ADP & 85\\ 
 INTJ & 78\\ 
 NUM & 5\\ 
 PART & 2


In [13]:
italic_labels = dict_italic_label.items()
italics_total = 0
for key, value in italic_labels:
    italics_total += value

print(italics_total)

print(to_latex(sorted(italic_labels, key=lambda x: x[1], reverse=True)))

1051831
NOUN & 947115\\ 
 PROPN & 52425\\ 
 ADJ & 45084\\ 
 VERB & 4714\\ 
 ADV & 1499\\ 
 SYM & 901\\ 
 PRON & 53\\ 
 NUM & 13\\ 
 PART & 11\\ 
 INTJ & 5\\ 
 CCONJ & 4\\ 
 AUX & 4\\ 
 SCONJ & 3


In [14]:
def sum_all_labels(standard_dict, other_dict):
    for key, value in other_dict.items():
        standard_dict[key] += value

In [15]:
full_dict = defaultdict(int)
sum_all_labels(full_dict, dict_math_labels)
sum_all_labels(full_dict, dict_display_label)
sum_all_labels(full_dict, dict_italic_label)
full_labels = full_dict.items()
print(to_latex(sorted(full_labels, key=lambda x: x[1], reverse=True)))

NOUN & 3383478\\ 
 ADJ & 2093267\\ 
 VERB & 590988\\ 
 PROPN & 135079\\ 
 ADV & 3917\\ 
 INTJ & 1760\\ 
 SYM & 901\\ 
 CCONJ & 779\\ 
 SCONJ & 399\\ 
 AUX & 277\\ 
 PRON & 257\\ 
 PUNCT & 243\\ 
 DET & 131\\ 
 NUM & 85\\ 
 ADP & 85\\ 
 PART & 16


In [None]:
adjectives = defaultdict(int)

for token_label_arr in token_to_label:
    for token, label in token_label_arr:
        if label in ['ADJ']:
            adjectives[token] += 1
        
            
        

In [None]:
adjectives

In [23]:
adjectives_dict = lambda: defaultdict(list)
adjectives_seperated_math = defaultdict(adjectives_dict)
adjectives_seperated_display = defaultdict(adjectives_dict)
adjectives_seperated_italic = defaultdict(adjectives_dict)

for token_label_arr in token_to_label:
    for i, (token, label) in enumerate(token_label_arr):
        
        if token == '[MATH]' and label == 'NOUN':
            before = token_label_arr[max(i-3, 0):i]
            for t, label in before:
                if label in ['ADJ']:
                    adjectives_seperated_math[i][label].append(t)
                    
        if token == '[DISPLAY]' and label == 'NOUN':
            before = token_label_arr[max(i-3, 0):i]
            for t, label in before:
                if label in ['ADJ']:
                    adjectives_seperated_display[i][label].append(t)
                    
        if token == '[ITALIC]' and label == 'NOUN':
            before = token_label_arr[max(i-3, 0):i]
            for t, label in before:
                if label in ['ADJ']:
                    adjectives_seperated_display[i][label].append(t)

In [24]:
adjectives_seperated_math

defaultdict(<function __main__.<lambda>()>,
            {6: defaultdict(list,
                         {'ADJ': ['-cluster',
                           '[MATH]',
                           'morphism',
                           '[MATH]',
                           '[MATH]',
                           'morphism',
                           '[MATH]',
                           'constant',
                           'simple',
                           'simple',
                           'separable',
                           'invariable',
                           'covariant',
                           'covariant',
                           'pythagorean',
                           'ideal',
                           'ideal',
                           'such',
                           '[MATH]',
                           'such',
                           '[MATH]',
                           'such',
                           'such',
                           'same',
             

In [25]:
adjectives_seperated_display

defaultdict(<function __main__.<lambda>()>,
            {22: defaultdict(list,
                         {'ADJ': ['acyclic',
                           'separable',
                           'non-zero',
                           'proper',
                           'constant',
                           'many',
                           'many',
                           '[MATH]',
                           '[MATH]',
                           'eigenvalue',
                           '[MATH]',
                           '[MATH]',
                           '[MATH]',
                           '[MATH]',
                           '[MATH]',
                           '[MATH]',
                           'linear',
                           'linear',
                           'parabolic',
                           'parabolic',
                           'nonzero',
                           'greater',
                           'greater',
                           'proper',
         