### 0. Import libraries and read data

In [137]:
import copy
import json
import tqdm
from itertools import groupby
from operator import itemgetter
from flair.data import Sentence


In [3]:
def read_data(input_file_path):
    data = []

    with open(input_file_path) as f:
        for line in f:
            data.append(json.loads(line))

    return data

In [4]:
def write_data(output_file_path, data):
    # Open a new JSON file for writing
    with open(output_file_path, "w") as output_file:
        for data_line in data:
            output_file.write(json.dumps(data_line) + "\n")

In [5]:
data = read_data("../../data/squad_data_train_pos_ner_agg.json")

In [237]:
NER_TAGS = ['[LOC]', '[MISC]', '[ORG]', '[PER]']

### 1. Find the token index where the answer starts

In [6]:
def is_subustring_in_string(string, substring, symbols):
    for symbol in symbols:
        if symbol in string and (substring in string or substring.replace(" ", "") in string.replace(" ", "")):
           return True
        
    return False

def is_one_text_in_context(text, context, symbols):
    count = 0

    for symbol in symbols:
        if context.count(text + symbol):
            count += 1

    return count == 1

In [7]:
def find_start_position(text, pos_context, answer, context):
    tokens = []
    sentence = Sentence(text)
    symbols = [" ", ")", "(", "%", "-", ".", ","]

    # get how many spaces are until the start position
    spaces_no = context[:answer].count(" ")

    # get tokens of the answer
    for word in sentence:
        tokens.append(word.text)

    start_position = 0
    token_number = 0
    match = False
    no_match = -1

    # check if the answer appear only once
    one_apperance = is_one_text_in_context(text, context, symbols)

    # iterate over positions in the pos context
    for start_position in range(len(pos_context) - len(tokens) + 1):
        # store in k_words_context_ and k_words_context the span of tokens from pos_context
        k_words_context_ = [
            bytes(list(line.values())[0][0], "utf-8").decode("unicode_escape")
            for line in pos_context[start_position : start_position + len(tokens)]
        ]

        k_words_context = [
            list(line.values())[0][0].replace("\\", "")
            for line in pos_context[start_position : start_position + len(tokens)]
        ]

        # get the next word after the context
        next_word = [
            list(line.values())[0][0]
            for line in pos_context[
                start_position + len(tokens) : start_position + len(tokens) + 1
            ]
        ]


        # join all words in a string
        k_words_context_string = " ".join(k_words_context)
        k_words_context_string_ = " ".join(k_words_context_)

        # define exact mach condition
        exact_match = k_words_context == tokens or k_words_context_ == tokens


        # if there is only one appearence and the answer was found, stop
        if context.count(text) == 1 and exact_match:
            match = True
            break

        # if there is only one appearence with the answer followed by a token, stop
        if one_apperance and (
            exact_match
            or is_subustring_in_string(k_words_context_string, text, symbols)
            or is_subustring_in_string(k_words_context_string_, text, symbols)
        ):
            match = True
            break

        # if the occurence is not the first one, but the start position is correct, stop
        if (
            text in k_words_context_string
            or text in k_words_context_string_
            or exact_match
            or is_subustring_in_string(k_words_context_string, text, symbols)
            or is_subustring_in_string(k_words_context_string_, text, symbols)
        ) and (
            token_number + spaces_no == answer or token_number + spaces_no + 1 == answer or token_number + spaces_no - 1 == answer
        ):
            match = True
            break

        # else consider the next word
        elif next_word in symbols and (token_number + spaces_no == answer or token_number + spaces_no + 1 == answer or token_number + spaces_no - 1 == answer):
            no_match = start_position
            break

        if "'" in k_words_context_:
            token_number += len(k_words_context_[0])
        else:
            token_number += len(k_words_context[0])

    if match:
        return start_position
    
    return no_match


In [8]:
def check_start_answer_detection(data):
    incorrect_lines_indices = []
    
    for i, line in tqdm.tqdm(enumerate(data)):
        original_answer = line["answers"]["text"][0]
        original_answer_start = line["answers"]["answer_start"][0]
        original_answer_list = [token.text for token in list(Sentence(original_answer))]

        index = find_start_position(original_answer, line["POS_context"], original_answer_start, line["context"])
        found_answer = [list(token.values())[0][0] for token in line['POS_context'][index:index+len(original_answer_list)]]

        if not " ".join(found_answer).startswith(" ".join(original_answer_list)):
            incorrect_lines_indices.append(i)

    return incorrect_lines_indices

In [9]:
incorrect_lines_indices = check_start_answer_detection(data)
len(incorrect_lines_indices)

0it [00:00, ?it/s]

87599it [01:13, 1185.53it/s]


398

In [10]:
for incorrect_index in incorrect_lines_indices:
    line = data[incorrect_index]
    
    original_answer = line["answers"]["text"][0]
    original_answer_start = line["answers"]["answer_start"][0] 
    original_answer_list = [token.text for token in list(Sentence(original_answer))]

    index = find_start_position(original_answer, line["POS_context"], original_answer_start, line["context"])
    found_answer = [list(token.values())[0][0] for token in line['POS_context'][index:index+len(original_answer_list)]]

    print(original_answer_list, found_answer, index)

['Father', 'Joseph', 'Carrier', ',', 'C.S.C', '.'] ['Father', 'Joseph', 'Carrier', ',', 'C.S.C.', 'was'] 0
['Rev', '.', 'John', 'J.', 'Cavanaugh', ',', 'C.S.C', '.'] ['The', 'Rev', '.', 'John', 'J.', 'Cavanaugh', ',', 'C.S.C.'] 0
['five', '.'] ['top-five', '.'] 149
['Jay', 'Z', '.'] ['married', 'Jay', 'Z.'] 7
["'", '03', 'Bonnie', '&', 'Clyde'] ['"\'', '03', 'Bonnie', '&', 'Clyde'] 16
['B.I.C', '.'] ['"', 'B.I.C.'] 91
['B.I.C', '.'] ['"', 'B.I.C.'] 91
['50'] [] -1
['J.', 'S.', 'Bach', ',', 'Mozart', 'and', 'Schubert'] [] -1
['J.', 'S.', 'Bach', ',', 'Mozart', 'and', 'Schubert'] [] -1
['J.', 'S.', 'Bach', ',', 'Mozart', 'and', 'Schubert'] [] -1
['7'] [] -1
['7'] [] -1
['7'] [] -1
['Rondo', 'Op', '.', '1', '.'] ['his', 'Rondo', 'Op', '.', '1.'] 162
['the', 'Canuts', '.'] [',', 'the', 'Canuts.'] 42
['J.S', '.', 'Bach', "'s", 'The', 'Well-Tempered', 'Clavier'] ['by', 'J.S.', 'Bach', "'s", 'The', 'Well-Tempered', 'Clavier'] 38
['Op', '.', '58'] [',', 'Op.', '58'] 18
['Tibet'] ['Sino-Tibetan

- 106 answers that were not found at all
- 292 answers that were found but a bit different


### 2. Insert NER in the context

In [270]:
errors = 0

for line_index, line in tqdm.tqdm(enumerate(data)):
    """
    Find start index for answer
    """
    original_answer = line["answers"]["text"][0]
    original_answer_start = line["answers"]["answer_start"][0] 
    original_answer_list = [token.text for token in list(Sentence(original_answer))]

    index = find_start_position(original_answer, line["POS_context"], original_answer_start, line["context"])
    found_answer = [list(token.values())[0][0] for token in line['POS_context'][index:index+len(original_answer_list)]]

    original_answer_list = [token.text for token in list(Sentence(original_answer + "."))[:-1]]

    #print(original_answer_list, found_answer, index, line["context"])
    
    """
    Get spans of continuous indices
    """
    # get tokens
    tokens = [list(token.values())[0][0] for token in line["POS_context"]]

    continuous_spans_indices = [
        list(map(itemgetter(1), g))
        for _, g in groupby(
            enumerate([int(list(token.keys())[0]) for token in line["NER_context"]]),
            lambda i_x: i_x[0] - i_x[1],
        )
    ]

    token_index_pos_map = {int(list(token.keys())[0]):index for index, token in enumerate(line["NER_context"])}


    """
    Insert NE tags and find the new start index for answer
    """
    updated_context = []
    span_index = 0
    token_index = 0
    new_index = 0
    initial_token_index = 0

    while token_index < len(tokens):
        #print(token_index, len(tokens))
        try:
            continuous_spans_indices[span_index][0]
        except:
            updated_context.append(tokens[token_index])
            if initial_token_index == index:
                #print("found on else")
                new_index = len(updated_context) - 1

            token_index += 1
            initial_token_index += 1
            continue

        if token_index + 1 > continuous_spans_indices[span_index][0] and len(continuous_spans_indices[span_index]) == 1:
            ne_tags = [tag[1] for tag in list(line["NER_context"][token_index_pos_map[token_index]].values())[0]]

            for ne_tag in ne_tags:
                updated_context.append("[" + ne_tag + "]")

            updated_context.append(tokens[token_index])

            if initial_token_index == index:
                #print("found one ner")
                
                if updated_context[-1][0] == "[" and updated_context[-1][-1] == "]":
                    new_index = len(updated_context) - 1 - len(ne_tags)
                else:
                    new_index = len(updated_context) - 1

            for ne_tag in ne_tags[::-1]:
                updated_context.append("[" + ne_tag + "]")

            span_index += 1
            token_index += 1
            initial_token_index += 1


        elif token_index + 1 > continuous_spans_indices[span_index][0] and len(continuous_spans_indices[span_index]) > 1:
            ne_tags = [tag[1] for tag in list(line["NER_context"][token_index_pos_map[token_index]].values())[0]]

            for ne_tag in ne_tags:
                updated_context.append("[" + ne_tag + "]")

            for i in range(len(continuous_spans_indices[span_index])):
                updated_context.append(tokens[token_index])

                if initial_token_index == index:
                    #print("found more ner")

                    if updated_context[-1][0] == "[" and updated_context[-1][-1] == "]":
                        new_index = len(updated_context) - 1 - len(ne_tags)
                    else:
                        new_index = len(updated_context) - 1

                token_index += 1
                initial_token_index += 1

            for ne_tag in ne_tags[::-1]:
                updated_context.append("[" + ne_tag + "]")

            span_index += 1

        else:
            updated_context.append(tokens[token_index])
            if initial_token_index == index:
                #print("found on else")
                new_index = len(updated_context) - 1

            token_index += 1
            initial_token_index += 1

        if token_index == len(tokens) - 1:
            updated_context.append(tokens[token_index])
            if initial_token_index == index:
                #print("found at the end")
                new_index = len(updated_context) - 1

    print(tokens[index:index+len(original_answer_list)])
    print(updated_context[new_index:new_index+10])
    print(updated_context[new_index-2:new_index+10])
    print("")

0it [00:00, ?it/s]

48it [00:00, 477.10it/s]

['Saint', 'Bernadette', 'Soubirous']
['Saint', '[PER]', 'Bernadette', 'Soubirous', '[PER]', 'in', '1858', '.', 'At', 'the']
['appeared', 'to', 'Saint', '[PER]', 'Bernadette', 'Soubirous', '[PER]', 'in', '1858', '.', 'At', 'the']

['a', 'copper', 'statue', 'of', 'Christ']
['a', 'copper', 'statue', 'of', '[PER]', 'Christ', '[PER]', 'with', 'arms', 'upraised']
[',', 'is', 'a', 'copper', 'statue', 'of', '[PER]', 'Christ', '[PER]', 'with', 'arms', 'upraised']

['the', 'Main', 'Building']
['the', '[LOC]', 'Main', 'Building', '[LOC]', "'s", 'gold', 'dome', 'is', 'a']
['.', 'Atop', 'the', '[LOC]', 'Main', 'Building', '[LOC]', "'s", 'gold', 'dome', 'is', 'a']

['a', 'Marian', 'place', 'of', 'prayer', 'and', 'reflection']
['a', '[MISC]', 'Marian', '[MISC]', 'place', 'of', 'prayer', 'and', 'reflection', '.']
['[MISC]', ',', 'a', '[MISC]', 'Marian', '[MISC]', 'place', 'of', 'prayer', 'and', 'reflection', '.']

['a', 'golden', 'statue', 'of', 'the', 'Virgin', 'Mary']
['a', 'golden', 'statue', 'of',

139it [00:00, 392.30it/s]

['Rev', '.', 'William', 'Corby']
['Rev', '.', '[PER]', 'William', 'Corby', '[PER]', ',', 'immediately', 'planned', 'for']
[',', 'the', 'Rev', '.', '[PER]', 'William', 'Corby', '[PER]', ',', 'immediately', 'planned', 'for']

['17th', 'of', 'May']
['17th', 'of', 'May', 'and', 'by', 'the', 'incredible', 'zeal', 'of', 'administrator']
['on', 'the', '17th', 'of', 'May', 'and', 'by', 'the', 'incredible', 'zeal', 'of', 'administrator']

['LaFortune', 'Student', 'Center']
['LaFortune', 'Student', 'Center', '[LOC]', '[ORG]', ')', 'was', 'built', 'in', '1883']
['[ORG]', '[LOC]', 'LaFortune', 'Student', 'Center', '[LOC]', '[ORG]', ')', 'was', 'built', 'in', '1883']

['scholastic', 'and', 'classical']
['scholastic', 'and', 'classical', 'emphasis', '.', 'By', 'contrast', ',', 'the', '[MISC]']
["'s", 'traditional', 'scholastic', 'and', 'classical', 'emphasis', '.', 'By', 'contrast', ',', 'the', '[MISC]']

['College', 'of', 'Commerce']
['College', 'of', 'Commerce', '[ORG]', ',', '[ORG]', '[PER]', '[L

239it [00:00, 448.16it/s]

['over', '1,200']
['over', '1,200', 'undergraduates', 'in', 'six', 'departments', 'of', 'study', '–', 'biology']
[',', 'includes', 'over', '1,200', 'undergraduates', 'in', 'six', 'departments', 'of', 'study', '–', 'biology']

['School', 'of', 'Architecture']
['School', '[ORG]', 'of', 'Architecture', '[ORG]', 'was', 'established', 'in', '1899', ',']
[]

['Bond', 'Hall']
['Bond', 'Hall', '[LOC]', '[ORG]', ',', 'offers', 'a', 'five-year', 'undergraduate', 'program']
['[ORG]', '[LOC]', 'Bond', 'Hall', '[LOC]', '[ORG]', ',', 'offers', 'a', 'five-year', 'undergraduate', 'program']

['five-year']
['five-year', 'undergraduate', 'program', 'leading', 'to', 'the', 'Bachelor', '[MISC]', 'of', 'Architecture']
['offers', 'a', 'five-year', 'undergraduate', 'program', 'leading', 'to', 'the', 'Bachelor', '[MISC]', 'of', 'Architecture']

['Rome']
['Rome', '[LOC]', '.', 'The', 'university', 'is', 'globally', 'recognized', 'for', 'its']
['in', '[LOC]', 'Rome', '[LOC]', '.', 'The', 'university', 'is', 'gl

362it [00:00, 526.92it/s]

['Dangerously', 'in', 'Love']
['Dangerously', 'in', 'Love', '[MISC]', '(', '2003', ')', ',', 'which', 'established']
[',', '[MISC]', 'Dangerously', 'in', 'Love', '[MISC]', '(', '2003', ')', ',', 'which', 'established']

['2003']
['2003', ')', ',', 'which', 'established', 'her', 'as', 'a', 'solo', 'artist']
['[MISC]', '(', '2003', ')', ',', 'which', 'established', 'her', 'as', 'a', 'solo', 'artist']

['five']
['five', '[MISC]', 'Grammy', 'Awards', '[MISC]', 'and', 'featured', 'the', '[MISC]', 'Billboard']
[',', 'earned', 'five', '[MISC]', 'Grammy', 'Awards', '[MISC]', 'and', 'featured', 'the', '[MISC]', 'Billboard']

['lead', 'singer']
['lead', 'singer', 'of', 'R', '&', 'B', 'girl-group', '[PER]', '[PER]', 'Destiny']
['1990s', 'as', 'lead', 'singer', 'of', 'R', '&', 'B', 'girl-group', '[PER]', '[PER]', 'Destiny']

['Dangerously', 'in', 'Love']
['Dangerously', 'in', 'Love', '[MISC]', '(', '2003', ')', ',', 'which', 'established']
[',', '[MISC]', 'Dangerously', 'in', 'Love', '[MISC]', '('

483it [00:00, 541.30it/s]

['Dangerously', 'in', 'Love']
['Dangerously', 'in', 'Love', '[MISC]', 'was', 'released', 'on', 'June', '24', ',']
['album', '[MISC]', 'Dangerously', 'in', 'Love', '[MISC]', 'was', 'released', 'on', 'June', '24', ',']

['number', 'four']
['number', 'four', 'on', 'the', '[MISC]', 'U.S.', 'Billboard', 'Hot', '100', '[MISC]']
['peaking', 'at', 'number', 'four', 'on', 'the', '[MISC]', 'U.S.', 'Billboard', 'Hot', '100', '[MISC]']

['Luther', 'Vandross']
['Luther', 'Vandross', '[PER]', '.', '.']
['with', '[PER]', 'Luther', 'Vandross', '[PER]', '.', '.']

['Jay', 'Z']
['Jay', 'Z', '[PER]', "'s", '"\'', '03', '[MISC]', 'Bonnie', '&', 'Clyde']
['on', '[PER]', 'Jay', 'Z', '[PER]', "'s", '"\'', '03', '[MISC]', 'Bonnie', '&', 'Clyde']

['June', '24', ',', '2003']
['June', '24', ',', '2003', ',', 'after', '[PER]', 'Michelle', 'Williams', '[PER]']
['released', 'on', 'June', '24', ',', '2003', ',', 'after', '[PER]', 'Michelle', 'Williams', '[PER]']

['Crazy', 'in', 'Love']
['Crazy', 'in', 'Love', '[MI

619it [00:01, 601.37it/s]

['Life', 'Is', 'But', 'a', 'Dream']
['Life', 'Is', 'But', 'a', 'Dream', '[MISC]', ',', 'first', 'aired', 'on']
[',', '[MISC]', 'Life', 'Is', 'But', 'a', 'Dream', '[MISC]', ',', 'first', 'aired', 'on']

['global', 'publishing', 'agreement']
['global', 'publishing', 'agreement', 'with', '[ORG]', 'Warner', '/', 'Chappell', 'Music', '[ORG]']
['signed', 'a', 'global', 'publishing', 'agreement', 'with', '[ORG]', 'Warner', '/', 'Chappell', 'Music', '[ORG]']

['January', '2013']
['January', '2013', ',', '[PER]', '[ORG]', 'Destiny', '[ORG]', '[PER]', "'s", 'Child']
[]

['Nuclear']
['Nuclear', '[MISC]', '"', '.', '[PER]', 'Beyoncé', '[PER]', 'performed', 'the', '[MISC]']
['"', '[MISC]', 'Nuclear', '[MISC]', '"', '.', '[PER]', 'Beyoncé', '[PER]', 'performed', 'the', '[MISC]']

['President', 'Obama']
['President', '[PER]', 'Obama', '[PER]', "'s", 'second', 'inauguration', 'in', '[LOC]', 'Washington']
['track', 'at', 'President', '[PER]', 'Obama', '[PER]', "'s", 'second', 'inauguration', 'in', '[LO

744it [00:01, 608.26it/s]

['2011']
['2011', ',', '[ORG]', 'Beyoncé', '[ORG]', 'earned', 'an', 'average', 'of', '$']
['2009', 'to', '2011', ',', '[ORG]', 'Beyoncé', '[ORG]', 'earned', 'an', 'average', 'of', '$']

['115', 'million']
['115', 'million', 'throughout', 'June', '2013', '–', 'June', '2014', '.', 'This']
['estimated', '$', '115', 'million', 'throughout', 'June', '2013', '–', 'June', '2014', '.', 'This']

['250', 'million']
['250', 'million', '.']
['be', '$', '250', 'million', '.']

['Forbes']
['Forbes', 'magazine', '[ORG]', 'began', 'reporting', 'on', '[ORG]', '[PER]', 'Beyoncé', '[PER]']
[]

['April', '2014']
['April', '2014', '.', 'In', 'June', '2014', ',', '[ORG]', '[PER]', 'Beyoncé']
['so', 'in', 'April', '2014', '.', 'In', 'June', '2014', ',', '[ORG]', '[PER]', 'Beyoncé']

['MTV']
['MTV', '[ORG]', 'estimated', 'that', 'by', 'the', 'end', 'of', '2014', ',']
['.', '[ORG]', 'MTV', '[ORG]', 'estimated', 'that', 'by', 'the', 'end', 'of', '2014', ',']

['2013']
['2013', ',', '[PER]', '[ORG]', 'Beyoncé', 

896it [00:01, 560.33it/s]

['January', '2013']
['January', '2013', ',', '[ORG]', 'GQ', '[ORG]', 'placed', 'her', 'on', 'its']
['.', 'In', 'January', '2013', ',', '[ORG]', 'GQ', '[ORG]', 'placed', 'her', 'on', 'its']

['VH1']
['VH1', '[ORG]', 'listed', 'her', 'at', 'number', '1', 'on', 'its', '100']
['.', '[ORG]', 'VH1', '[ORG]', 'listed', 'her', 'at', 'number', '1', 'on', 'its', '100']

['People']
['People', '[MISC]', '[MISC]', 'and', 'the', '"', 'Hottest', '[MISC]', 'Female', 'Singer']
['[MISC]', '[MISC]', 'People', '[MISC]', '[MISC]', 'and', 'the', '"', 'Hottest', '[MISC]', 'Female', 'Singer']

['Complex']
['Complex', '[ORG]', 'in', '2012', '.', 'In', 'January', '2013', ',', '[ORG]']
['by', '[ORG]', 'Complex', '[ORG]', 'in', '2012', '.', 'In', 'January', '2013', ',', '[ORG]']

['2013']
['2013', ',', '[ORG]', 'GQ', '[ORG]', 'placed', 'her', 'on', 'its', 'cover']
['In', 'January', '2013', ',', '[ORG]', 'GQ', '[ORG]', 'placed', 'her', 'on', 'its', 'cover']

['number', '1']
['number', '1', 'on', 'its', '100', 'Sex

971it [00:01, 607.17it/s]

['The', 'Recording', 'Industry', 'Association', 'of', 'America']
['The', '[ORG]', 'Recording', 'Industry', 'Association', 'of', 'America', '[ORG]', '(', '[ORG]']
['time', '.', 'The', '[ORG]', 'Recording', 'Industry', 'Association', 'of', 'America', '[ORG]', '(', '[ORG]']

['64']
['64', 'certifications', '.', 'Her', 'songs', '"', '[MISC]', 'Crazy', 'in', 'Love']
['total', 'of', '64', 'certifications', '.', 'Her', 'songs', '"', '[MISC]', 'Crazy', 'in', 'Love']

['the', '2008', 'World', 'Music', 'Awards']
['the', '2008', '[MISC]', 'World', 'Music', 'Awards', '[MISC]', 'and', 'the', '[MISC]']
['[MISC]', 'at', 'the', '2008', '[MISC]', 'World', 'Music', 'Awards', '[MISC]', 'and', 'the', '[MISC]']

['20']
['20', '[MISC]', 'Grammy', 'Awards', '[MISC]', ',', 'both', 'as', 'a', 'solo']
['has', 'won', '20', '[MISC]', 'Grammy', 'Awards', '[MISC]', ',', 'both', 'as', 'a', 'solo']

['Alison', 'Krauss']
['Alison', 'Krauss', '[PER]', 'and', 'the', 'most', 'nominated', 'woman', 'in', '[MISC]']
['behind

1099it [00:01, 574.12it/s]

['her', 'mother']
['her', 'mother', '[PER]', 'Tina', 'Knowles', '[PER]', 'that', 'her', 'gift', 'was']
['said', 'about', 'her', 'mother', '[PER]', 'Tina', 'Knowles', '[PER]', 'that', 'her', 'gift', 'was']

['Sandy', 'Hook', 'Elementary', 'School', 'shooting']
['Sandy', 'Hook', 'Elementary', 'School', '[ORG]', 'shooting', '.', '[PER]', 'Beyoncé', '[PER]']
['the', '[ORG]', 'Sandy', 'Hook', 'Elementary', 'School', '[ORG]', 'shooting', '.', '[PER]', 'Beyoncé', '[PER]']

['spread', 'female', 'empowerment']
['spread', 'female', 'empowerment', '.', 'The', 'campaign', ',', 'which', 'aired', 'on']
['aims', 'to', 'spread', 'female', 'empowerment', '.', 'The', 'campaign', ',', 'which', 'aired', 'on']

['Catapult']
['Catapult', '[MISC]', ',', 'visitors', 'of', 'the', 'concert', 'could', 'choose', 'between']
['platform', '[MISC]', 'Catapult', '[MISC]', ',', 'visitors', 'of', 'the', 'concert', 'could', 'choose', 'between']

['Demand', 'A', 'Plan']
['Demand', 'A', 'Plan', '[MISC]', '"', ',', 'a', 'bi

1240it [00:02, 583.93it/s]

['1875']
['1875', ',', 'General', '[PER]', 'Philip', 'Sheridan', '[PER]', 'pleaded', 'to', 'a']
['.', 'In', '1875', ',', 'General', '[PER]', 'Philip', 'Sheridan', '[PER]', 'pleaded', 'to', 'a']

['1881']
['1881', 'and', 'from', 'the', 'east', 'in', '1882', '.', 'However', ',']
['west', 'in', '1881', 'and', 'from', 'the', 'east', 'in', '1882', '.', 'However', ',']

['1882']
['1882', '.', 'However', ',', 'the', 'railroad', 'played', 'a', 'major', 'role']
['east', 'in', '1882', '.', 'However', ',', 'the', 'railroad', 'played', 'a', 'major', 'role']

['1871', ',', '1872', 'and', '1873']
['1871', ',', '1872', 'and', '1873', 'which', 'were', 'challenged', 'forcefully', 'by']
['[LOC]', 'in', '1871', ',', '1872', 'and', '1873', 'which', 'were', 'challenged', 'forcefully', 'by']

['1876']
['1876', '[MISC]', '.', 'The', 'transcontinental', '[ORG]', 'NPR', '[ORG]', 'was', 'completed']
['War', 'of', '1876', '[MISC]', '.', 'The', 'transcontinental', '[ORG]', 'NPR', '[ORG]', 'was', 'completed']

['1

1364it [00:02, 594.90it/s]

['preventing', 'births']
['preventing', 'births', 'or', 'forcibly', 'transferring', 'children', 'out', 'of', 'the', 'group']
['such', 'as', 'preventing', 'births', 'or', 'forcibly', 'transferring', 'children', 'out', 'of', 'the', 'group']

['Adrian', 'Gallagher']
['Adrian', 'Gallagher', '[PER]', 'defined', 'genocide', 'as', "'", 'When', 'a', 'source']
[',', '[PER]', 'Adrian', 'Gallagher', '[PER]', 'defined', 'genocide', 'as', "'", 'When', 'a', 'source']

['collective', 'power']
['collective', 'power', '(', 'usually', 'a', 'state', ')', 'intentionally', 'uses', 'its']
['source', 'of', 'collective', 'power', '(', 'usually', 'a', 'state', ')', 'intentionally', 'uses', 'its']

['intent']
['intent', ',', 'the', 'multidimensional', 'understanding', 'of', 'destroy', ',', 'broadens', 'the']
['centrality', 'of', 'intent', ',', 'the', 'multidimensional', 'understanding', 'of', 'destroy', ',', 'broadens', 'the']

['group', 'identity']
['group', 'identity', 'beyond', 'that', 'of', 'the', '1948', '

1481it [00:02, 522.01it/s]

['subtherapeutic', 'antibiotic', 'treatment']
['subtherapeutic', 'antibiotic', 'treatment', '(', '[MISC]', 'STAT', '[MISC]', ')–', 'with', 'either']
['exposed', 'to', 'subtherapeutic', 'antibiotic', 'treatment', '(', '[MISC]', 'STAT', '[MISC]', ')–', 'with', 'either']

['penicillin', ',', 'vancomycin', ',', 'penicillin', 'and', 'vancomycin', ',', 'or', 'chlortetracycline']
['penicillin', ',', 'vancomycin', ',', 'penicillin', 'and', 'vancomycin', ',', 'or', 'chlortetracycline']
['with', 'either', 'penicillin', ',', 'vancomycin', ',', 'penicillin', 'and', 'vancomycin', ',', 'or', 'chlortetracycline']

['unclear']
['unclear', 'whether', 'or', 'not', 'antibiotics', 'cause', 'obesity', 'in', 'humans', '.']
['it', 'is', 'unclear', 'whether', 'or', 'not', 'antibiotics', 'cause', 'obesity', 'in', 'humans', '.']

['weighed', 'against', 'the', 'beneficial', 'effects']
['weighed', 'against', 'the', 'beneficial', 'effects', 'of', 'clinically', 'indicated', 'treatment', 'with']
['to', 'be', 'weighe

1535it [00:02, 474.26it/s]

['superbugs']
['superbugs', "'", '.', 'Under', 'this', 'Act', ',', '[ORG]', 'FDA', '[ORG]']
['of', "'", 'superbugs', "'", '.', 'Under', 'this', 'Act', ',', '[ORG]', 'FDA', '[ORG]']

['Allan', 'Coukell', ',']
['Allan', 'Coukell', '[PER]', ',', 'senior', 'director', 'for', 'health', 'programs', 'at']
['to', '[PER]', 'Allan', 'Coukell', '[PER]', ',', 'senior', 'director', 'for', 'health', 'programs', 'at']

['Polish', 'and', 'French']
['Polish', '[MISC]', 'and', '[MISC]', 'French', '[MISC]', '(', 'by', 'citizenship', 'and']
['a', '[MISC]', 'Polish', '[MISC]', 'and', '[MISC]', 'French', '[MISC]', '(', 'by', 'citizenship', 'and']

['Romantic', 'era']
['Romantic', '[MISC]', 'era', ',', 'who', 'wrote', 'primarily', 'for', 'the', 'solo']
['the', '[MISC]', 'Romantic', '[MISC]', 'era', ',', 'who', 'wrote', 'primarily', 'for', 'the', 'solo']

['solo', 'piano']
['solo', 'piano', '.', 'He', 'gained', 'and', 'has', 'maintained', 'renown', 'worldwide']
['for', 'the', 'solo', 'piano', '.', 'He', 'gain

1714it [00:03, 643.92it/s]

['Kazimierz', 'Palace']
['Kazimierz', 'Palace', '[LOC]', '(', 'today', 'the', 'rectorate', 'of', '[ORG]', 'Warsaw']
['the', '[LOC]', 'Kazimierz', 'Palace', '[LOC]', '(', 'today', 'the', 'rectorate', 'of', '[ORG]', 'Warsaw']

['Belweder', 'Palace']
['Belweder', 'Palace', '[LOC]', 'as', 'playmate', 'to', 'the', 'son', 'of', 'the']
['the', '[LOC]', 'Belweder', 'Palace', '[LOC]', 'as', 'playmate', 'to', 'the', 'son', 'of', 'the']

['Nasze', 'Przebiegi']
['Nasze', 'Przebiegi', '[MISC]', '"', '("', '[MISC]', 'Our', 'Discourses', '[MISC]', '"']
['"', '[MISC]', 'Nasze', 'Przebiegi', '[MISC]', '"', '("', '[MISC]', 'Our', 'Discourses', '[MISC]', '"']

['1817']
['1817', 'the', '[LOC]', 'Saxon', 'Palace', '[LOC]', 'was', 'requisitioned', 'by', '[LOC]']
[]

['Kazimierz', 'Palace']
['Kazimierz', 'Palace', '[LOC]', '(', 'today', 'the', 'rectorate', 'of', '[ORG]', 'Warsaw']
['the', '[LOC]', 'Kazimierz', 'Palace', '[LOC]', '(', 'today', 'the', 'rectorate', 'of', '[ORG]', 'Warsaw']

['Grand', 'Duke', 'C

1865it [00:03, 698.98it/s]

['Maurice', 'Schlesinger']
['Maurice', 'Schlesinger', '[PER]', ',', 'who', 'arranged', 'for', 'it', 'to', 'be']
['with', '[PER]', 'Maurice', 'Schlesinger', '[PER]', ',', 'who', 'arranged', 'for', 'it', 'to', 'be']

['Adolphe', 'Gutmann']
['Adolphe', 'Gutmann', '[PER]', ',', '[PER]', 'Charles-Valentin', 'Alkan', '[PER]', ',', 'and']
['pupil', '[PER]', 'Adolphe', 'Gutmann', '[PER]', ',', '[PER]', 'Charles-Valentin', 'Alkan', '[PER]', ',', 'and']

['apartment']
['apartment', 'for', 'small', 'groups', 'of', 'friends', '.', 'The', 'musicologist', '[PER]']
['Paris', '[LOC]', 'apartment', 'for', 'small', 'groups', 'of', 'friends', '.', 'The', 'musicologist', '[PER]']

['Liszt', 'and', 'Hiller']
['Liszt', '[PER]', 'and', '[PER]', 'Hiller', '[PER]', 'performed', '(', 'on', 'pianos']
[',', '[PER]', 'Liszt', '[PER]', 'and', '[PER]', 'Hiller', '[PER]', 'performed', '(', 'on', 'pianos']

['Maurice', 'Schlesinger']
['Maurice', 'Schlesinger', '[PER]', ',', 'who', 'arranged', 'for', 'it', 'to', 'be']


2048it [00:03, 796.00it/s]

['Auguste', 'Franchomme']
['Auguste', 'Franchomme', '[PER]', ',', 'he', 'gave', 'his', 'last', '[LOC]', 'Paris']
['cellist', '[PER]', 'Auguste', 'Franchomme', '[PER]', ',', 'he', 'gave', 'his', 'last', '[LOC]', 'Paris']

['Cello']
['Cello', 'Sonata', 'Op', '.', '65', '[MISC]', '.', '.']
['the', '[MISC]', 'Cello', 'Sonata', 'Op', '.', '65', '[MISC]', '.', '.']

['BBC']
['BBC', 'TV', '[ORG]', 'documentary', '[MISC]', 'Chopin', '–', 'The', 'Women', 'Behind']
['a', '[ORG]', 'BBC', 'TV', '[ORG]', 'documentary', '[MISC]', 'Chopin', '–', 'The', 'Women', 'Behind']

['Angelo', 'Bozzolini', 'and', 'Roberto', 'Prosseda']
['Angelo', 'Bozzolini', '[PER]', 'and', '[PER]', 'Roberto', 'Prosseda', '[PER]', 'for', '[MISC]']
['by', '[PER]', 'Angelo', 'Bozzolini', '[PER]', 'and', '[PER]', 'Roberto', 'Prosseda', '[PER]', 'for', '[MISC]']

['The', 'Women', 'Behind', 'The', 'Music']
['The', 'Women', 'Behind', 'The', 'Music', '[MISC]', '(', '2010', ')', ',']
['Chopin', '–', 'The', 'Women', 'Behind', 'The', 'M

2256it [00:03, 868.85it/s]

['his', 'flexible', 'handling', 'of', 'the', 'four-bar', 'phrase', 'as', 'a', 'structural', 'unit']
['his', 'flexible', 'handling', 'of', 'the', 'four-bar', 'phrase', 'as', 'a', 'structural']
['individuality', 'is', 'his', 'flexible', 'handling', 'of', 'the', 'four-bar', 'phrase', 'as', 'a', 'structural']

['the', 'Barcarolle', 'Op', '.', '60']
['the', '[MISC]', 'Barcarolle', 'Op', '.', '60', '[MISC]', 'stands', 'apart', 'as']
['that', '"', 'the', '[MISC]', 'Barcarolle', 'Op', '.', '60', '[MISC]', 'stands', 'apart', 'as']

['the', 'four', 'ballades', 'and', 'four', 'scherzos']
['the', 'four', 'ballades', 'and', 'four', 'scherzos', 'stand', 'supreme', '"', ',']
['use', ',', 'the', 'four', 'ballades', 'and', 'four', 'scherzos', 'stand', 'supreme', '"', ',']

['departure', 'and', 'return']
['departure', 'and', 'return', '"', 'form', ';', '"', 'the', 'more', 'the']
['extended', '"', 'departure', 'and', 'return', '"', 'form', ';', '"', 'the', 'more', 'the']

['folk', 'features']
['folk', 'f

2344it [00:03, 867.98it/s]

['Wang', 'and', 'Nyima']
['Wang', '[PER]', 'and', '[PER]', 'Nyima', '[PER]', 'state', 'that', 'after', 'the']
[]

['The', 'Tai', 'Situpa']
['The', '[PER]', 'Tai', 'Situpa', '[PER]', 'is', 'even', 'supposed', 'to', 'have']
['.', '"', 'The', '[PER]', 'Tai', 'Situpa', '[PER]', 'is', 'even', 'supposed', 'to', 'have']

['Changchub', 'Gyaltsen']
['Changchub', 'Gyaltsen', '[PER]', "'s", 'aims', 'were', 'to', 'recreate', 'the', 'old']
['that', '[PER]', 'Changchub', 'Gyaltsen', '[PER]', "'s", 'aims', 'were', 'to', 'recreate', 'the', 'old']

['Van', 'Praag']
['Van', 'Praag', '[PER]', 'asserts', 'that', '[PER]', 'Changchub', 'Gyaltsen', '[PER]', "'s"]
['.', '[PER]', 'Van', 'Praag', '[PER]', 'asserts', 'that', '[PER]', 'Changchub', 'Gyaltsen', '[PER]', "'s"]

['the', 'University', 'of', 'Washington']
['the', '[ORG]', 'University', 'of', 'Washington', '[ORG]', ',', 'writes', 'that', '[PER]']
['history', 'at', 'the', '[ORG]', 'University', 'of', 'Washington', '[ORG]', ',', 'writes', 'that', '[PER]']

2432it [00:03, 777.00it/s]

['the', 'Ming']
['the', '[MISC]', 'Ming', 'Chinese', '[MISC]', 'frontier', 'and', 'even', 'as', 'far']
[',', 'raiding', 'the', '[MISC]', 'Ming', 'Chinese', '[MISC]', 'frontier', 'and', 'even', 'as', 'far']

['1571']
['1571', ',', 'he', 'invited', 'the', 'third', 'hierarch', 'of', 'the', '[PER]']
['dynasty', 'in', '1571', ',', 'he', 'invited', 'the', 'third', 'hierarch', 'of', 'the', '[PER]']

['the', 'third', 'hierarch', 'of', 'the', 'Gelug', '—', 'Sönam', 'Gyatso']
['the', 'third', 'hierarch', 'of', 'the', '[PER]', 'Gelug', '—', 'Sönam', 'Gyatso']
['he', 'invited', 'the', 'third', 'hierarch', 'of', 'the', '[PER]', 'Gelug', '—', 'Sönam', 'Gyatso']

['the', 'native', 'Mongol', 'practices', 'of', 'shamanism', 'and', 'blood', 'sacrifice']
['the', 'native', '[MISC]', 'Mongol', '[MISC]', 'practices', 'of', 'shamanism', 'and', 'blood']
['[PER]', 'abolished', 'the', 'native', '[MISC]', 'Mongol', '[MISC]', 'practices', 'of', 'shamanism', 'and', 'blood']

['the', 'Mongol', 'princes', 'and', 'su

2638it [00:04, 775.00it/s]

['Toshiba']
['Toshiba', '[ORG]', 'disk', 'drive', 'when', 'meeting', 'with', 'an', '[ORG]', 'Apple']
['the', '[ORG]', 'Toshiba', '[ORG]', 'disk', 'drive', 'when', 'meeting', 'with', 'an', '[ORG]', 'Apple']

['Pixo']
['Pixo', '[ORG]', ',', 'to', 'help', 'design', 'and', 'implement', 'the', 'user']
[',', '[ORG]', 'Pixo', '[ORG]', ',', 'to', 'help', 'design', 'and', 'implement', 'the', 'user']

['Chicago']
['Chicago', '[LOC]', 'font', 'was', 'replaced', 'with', '[ORG]', 'Espy', 'Sans', '[ORG]']
['the', '[LOC]', 'Chicago', '[LOC]', 'font', 'was', 'replaced', 'with', '[ORG]', 'Espy', 'Sans', '[ORG]']

['2007']
['2007', ',', '[ORG]', 'Apple', '[ORG]', 'modified', 'the', '[MISC]', 'iPod', '[MISC]']
['.', 'In', '2007', ',', '[ORG]', 'Apple', '[ORG]', 'modified', 'the', '[MISC]', 'iPod', '[MISC]']

['Helvetica']
['Helvetica', '[MISC]', 'and', ',', 'in', 'most', 'cases', ',', 'splitting', 'the']
['to', '[MISC]', 'Helvetica', '[MISC]', 'and', ',', 'in', 'most', 'cases', ',', 'splitting', 'the']



2832it [00:04, 804.27it/s]

['WMA']
['WMA', '[ORG]', '.', '.']
['than', '[ORG]', 'WMA', '[ORG]', '.', '.']

['PC', 'World']
['PC', 'World', '[ORG]', 'says', 'that', '[MISC]', 'iPod', '[MISC]', 'line', 'has']
['.', '[ORG]', 'PC', 'World', '[ORG]', 'says', 'that', '[MISC]', 'iPod', '[MISC]', 'line', 'has']

['Sony', 'Ericsson', 'and', 'Nokia']
['Sony', 'Ericsson', '[ORG]', 'and', '[ORG]', 'Nokia', '[ORG]', ',', 'which', 'play']
['from', '[ORG]', 'Sony', 'Ericsson', '[ORG]', 'and', '[ORG]', 'Nokia', '[ORG]', ',', 'which', 'play']

['PC', 'World']
['PC', 'World', '[ORG]', 'says', 'that', '[MISC]', 'iPod', '[MISC]', 'line', 'has']
['.', '[ORG]', 'PC', 'World', '[ORG]', 'says', 'that', '[MISC]', 'iPod', '[MISC]', 'line', 'has']

['business']
['business', 'device', '.', 'Government', 'departments', ',', 'major', 'institutions', 'and', 'international']
['as', 'a', 'business', 'device', '.', 'Government', 'departments', ',', 'major', 'institutions', 'and', 'international']

['communication', 'and', 'training']
['communica

3000it [00:04, 746.90it/s]

['Hyrule', 'Castle']
['Hyrule', 'Castle', '[LOC]', ',', 'with', 'a', 'lifeless', '[PER]', 'Zelda', '[PER]']
['in', '[LOC]', 'Hyrule', 'Castle', '[LOC]', ',', 'with', 'a', 'lifeless', '[PER]', 'Zelda', '[PER]']

['Zelda', "'s"]
['Zelda', '[PER]', "'s", 'body', 'and', 'eventually', 'by', 'transforming', 'into', 'a']
['possessing', '[PER]', 'Zelda', '[PER]', "'s", 'body', 'and', 'eventually', 'by', 'transforming', 'into', 'a']

['Shadow', 'Crystal']
['Shadow', 'Crystal', '[MISC]', '.', 'Now', 'able', 'to', 'use', 'it', 'to']
['the', '[MISC]', 'Shadow', 'Crystal', '[MISC]', '.', 'Now', 'able', 'to', 'use', 'it', 'to']

['Gerudo', 'Desert']
['Gerudo', 'Desert', '[LOC]', ',', 'the', 'only', 'known', 'gateway', 'between', 'the']
['the', '[LOC]', 'Gerudo', 'Desert', '[LOC]', ',', 'the', 'only', 'known', 'gateway', 'between', 'the']

['Midna']
['Midna', '[LOC]', '[PER]', 'to', 'the', '[MISC]', 'Mirror', 'of', 'Twilight', '[MISC]']
['[PER]', '[LOC]', 'Midna', '[LOC]', '[PER]', 'to', 'the', '[MIS

3183it [00:04, 793.12it/s]

['Best', 'Original', 'Song']
['Best', 'Original', 'Song', '[MISC]', 'and', 'was', 'nominated', 'for', 'the', '[MISC]']
['Globe', 'for', 'Best', 'Original', 'Song', '[MISC]', 'and', 'was', 'nominated', 'for', 'the', '[MISC]']

['Skyfall']
['Skyfall', '[ORG]', '[MISC]', ',', 'although', 'it', 'was', 'not', 'filmed', 'with']
['[MISC]', '[ORG]', 'Skyfall', '[ORG]', '[MISC]', ',', 'although', 'it', 'was', 'not', 'filmed', 'with']

['Writing', "'s", 'on', 'the', 'Wall']
['Writing', "'s", 'on', 'the', 'Wall', '[MISC]', '"', ',', 'received', 'mixed']
['"', '[MISC]', 'Writing', "'s", 'on', 'the', 'Wall', '[MISC]', '"', ',', 'received', 'mixed']

['Day', 'of', 'the', 'Dead']
['Day', 'of', 'the', 'Dead', '[MISC]', 'and', 'gives', 'chase', 'to', '[PER]']
['the', '[MISC]', 'Day', 'of', 'the', 'Dead', '[MISC]', 'and', 'gives', 'chase', 'to', '[PER]']

['Mallory']
['Mallory', '[PER]', "'s", 'promotion', 'to', '[MISC]', 'M', '[MISC]', ',', 'on']
['[PER]', 'Garreth', 'Mallory', '[PER]', "'s", 'promotio

3264it [00:05, 741.90it/s]

['night']
['night', ',', 'with', 'filming', 'temporarily', 'closing', 'both', '[LOC]', 'Westminster', '[LOC]']
['shot', 'at', 'night', ',', 'with', 'filming', 'temporarily', 'closing', 'both', '[LOC]', 'Westminster', '[LOC]']

['Oujda', ',', 'Tangier', 'and', 'Erfoud']
['Oujda', '[LOC]', ',', '[LOC]', 'Tangier', '[LOC]', 'and', '[LOC]', 'Erfoud', '[LOC]']
['in', '[LOC]', 'Oujda', '[LOC]', ',', '[LOC]', 'Tangier', '[LOC]', 'and', '[LOC]', 'Erfoud', '[LOC]']

['Largest', 'film', 'stunt', 'explosion']
['Largest', 'film', 'stunt', 'explosion', '"', 'in', 'cinematic', 'history', ',', 'with']
['the', '"', 'Largest', 'film', 'stunt', 'explosion', '"', 'in', 'cinematic', 'history', ',', 'with']

['Chris', 'Corbould']
['Chris', 'Corbould', '[PER]', '.', 'Principal', 'photography', 'concluded', 'on', '5', 'July']
['designer', '[PER]', 'Chris', 'Corbould', '[PER]', '.', 'Principal', 'photography', 'concluded', 'on', '5', 'July']

['128', 'days']
['128', 'days', '.']
['Filming', 'took', '128', 'da

3428it [00:05, 761.50it/s]

['almost', '80', '%']
['almost', '80', '%', 'of', 'buildings', 'were', 'destroyed', '.']
['the', 'quake', 'almost', '80', '%', 'of', 'buildings', 'were', 'destroyed', '.']

['Longmenshan', 'fault']
['Longmenshan', '[LOC]', 'fault', ',', 'a', 'thrust', 'structure', 'along', 'the', 'border']
['the', '[LOC]', 'Longmenshan', '[LOC]', 'fault', ',', 'a', 'thrust', 'structure', 'along', 'the', 'border']

['along', 'the', 'border', 'of', 'the', 'Indo-Australian', 'Plate', 'and', 'Eurasian', 'Plate']
['along', 'the', 'border', 'of', 'the', '[LOC]', 'Indo-Australian', 'Plate', '[LOC]', 'and']
['thrust', 'structure', 'along', 'the', 'border', 'of', 'the', '[LOC]', 'Indo-Australian', 'Plate', '[LOC]', 'and']

['120', 'sec']
['120', 'sec', ',', 'with', 'the', 'majority', 'of', 'energy', 'released', 'in']
['close', 'to', '120', 'sec', ',', 'with', 'the', 'majority', 'of', 'energy', 'released', 'in']

['10', 'km']
['10', 'km', '.']
['deeper', 'than', '10', 'km', '.']

['Longmenshan', 'fault']
['Longm

3675it [00:05, 953.37it/s]

['rural', 'part']
['rural', 'part', 'of', '[LOC]', 'China', '[LOC]', '.', 'Presumably', ',', 'many']
['in', 'the', 'rural', 'part', 'of', '[LOC]', 'China', '[LOC]', '.', 'Presumably', ',', 'many']

['not', 'designed']
['not', 'designed', ',', 'so', 'to', 'speak', '.', '"', '[PER]', 'Swaminathan']
['they', 'were', 'not', 'designed', ',', 'so', 'to', 'speak', '.', '"', '[PER]', 'Swaminathan']

['very', 'strong']
['very', 'strong', 'building', 'codes', 'in', '[LOC]', 'China', '[LOC]', ',', 'which']
['There', 'are', 'very', 'strong', 'building', 'codes', 'in', '[LOC]', 'China', '[LOC]', ',', 'which']

['regulations']
['regulations', 'overseeing', 'them', '.', '"']
['with', 'any', 'regulations', 'overseeing', 'them', '.', '"']

['$', '75', 'billion']
['$', '75', 'billion', ',', 'making', 'the', 'earthquake', 'one', 'of', 'the']
['US', '[LOC]', '$', '75', 'billion', ',', 'making', 'the', 'earthquake', 'one', 'of', 'the']

['minor', 'damage']
['minor', 'damage', 'from', 'the', 'quake', ',', '

3772it [00:05, 839.80it/s]

['34']
['34', 'lakes', 'had', 'formed', 'due', 'to', 'earthquake', 'debris', 'blocking', 'and']
['2008', ',', '34', 'lakes', 'had', 'formed', 'due', 'to', 'earthquake', 'debris', 'blocking', 'and']

['28']
['28', 'of', 'them', 'were', 'still', 'of', 'potential', 'danger', 'to', 'the']
['estimated', 'that', '28', 'of', 'them', 'were', 'still', 'of', 'potential', 'danger', 'to', 'the']

['7.9']
['7.9', 'earthquake', 'and', 'the', 'many', 'strong', 'aftershocks', ',', 'many', 'rivers']
['the', 'magnitude', '7.9', 'earthquake', 'and', 'the', 'many', 'strong', 'aftershocks', ',', 'many', 'rivers']

['large', 'landslides']
['large', 'landslides', ',', 'which', 'resulted', 'in', 'the', 'formation', 'of', '"']
['blocked', 'by', 'large', 'landslides', ',', 'which', 'resulted', 'in', 'the', 'formation', 'of', '"']

['quake', 'lakes']
['quake', 'lakes', '"', 'behind', 'the', 'blockages', ';', 'these', 'massive', 'amounts']
['of', '"', 'quake', 'lakes', '"', 'behind', 'the', 'blockages', ';', 'the

4025it [00:05, 997.45it/s]

['1674']
['1674', '.']
['[LOC]', 'in', '1674', '.']

['Anthonio', 'Colve']
['Anthonio', 'Colve', '[PER]', 'took', 'over', 'the', 'colony', 'of', '[LOC]', 'New']
['captain', '[PER]', 'Anthonio', 'Colve', '[PER]', 'took', 'over', 'the', 'colony', 'of', '[LOC]', 'New']

['England']
['England', '[LOC]', 'and', 'rechristened', 'it', '"', '[MISC]', 'New', 'Orange', '[MISC]']
['from', '[LOC]', 'England', '[LOC]', 'and', 'rechristened', 'it', '"', '[MISC]', 'New', 'Orange', '[MISC]']

['King', 'William', 'III']
['King', '[PER]', 'William', 'III', '[PER]', '.', 'However', ',', 'facing', 'defeat']
['[PER]', ',', 'King', '[PER]', 'William', 'III', '[PER]', '.', 'However', ',', 'facing', 'defeat']

['Second', 'Anglo-Dutch', 'War']
['Second', 'Anglo-Dutch', 'War', '[MISC]', ',', 'the', '[MISC]', 'English', '[MISC]', 'gained']
['the', '[MISC]', 'Second', 'Anglo-Dutch', 'War', '[MISC]', ',', 'the', '[MISC]', 'English', '[MISC]', 'gained']

['200']
['200', '.']
['diminished', 'to', '200', '.']

['Run'

4225it [00:06, 923.58it/s]

['National', 'Park', 'Service']
['National', 'Park', 'Service', '[ORG]', 'and', 'are', 'in', 'both', 'the', 'states']
['the', '[ORG]', 'National', 'Park', 'Service', '[ORG]', 'and', 'are', 'in', 'both', 'the', 'states']

['New', 'Jersey']
['New', 'Jersey', '[LOC]', '.', 'They', 'are', 'joined', 'in', 'the', 'harbor']
['and', '[LOC]', 'New', 'Jersey', '[LOC]', '.', 'They', 'are', 'joined', 'in', 'the', 'harbor']

['New', 'Jersey']
['New', 'Jersey', '[LOC]', '.', 'They', 'are', 'joined', 'in', 'the', 'harbor']
['and', '[LOC]', 'New', 'Jersey', '[LOC]', '.', 'They', 'are', 'joined', 'in', 'the', 'harbor']

['Stonewall', 'Inn']
['Stonewall', 'Inn', '[ORG]', 'in', '[LOC]', 'Greenwich', 'Village', '[LOC]', 'as', 'the']
['the', '[ORG]', 'Stonewall', 'Inn', '[ORG]', 'in', '[LOC]', 'Greenwich', 'Village', '[LOC]', 'as', 'the']

['Grant', "'s", 'Tomb']
['Grant', "'s", 'Tomb', '")', '[LOC]', ';', '[ORG]', 'African', 'Burial', 'Ground']
['("', '[LOC]', 'Grant', "'s", 'Tomb', '")', '[LOC]', ';', '[

4413it [00:06, 865.06it/s]

['Summerstage']
['Summerstage', '[LOC]', '.', 'Major', 'attractions', 'in', 'the', 'boroughs', 'outside', '[LOC]']
['at', '[LOC]', 'Summerstage', '[LOC]', '.', 'Major', 'attractions', 'in', 'the', 'boroughs', 'outside', '[LOC]']

['Queens']
['Queens', '[LOC]', ';', 'the', '[LOC]', 'Bronx', 'Zoo', '[LOC]', ';', '[LOC]']
['in', '[LOC]', 'Queens', '[LOC]', ';', 'the', '[LOC]', 'Bronx', 'Zoo', '[LOC]', ';', '[LOC]']

['90,000']
['90,000', 'hotel', 'rooms', 'at', 'the', 'end', 'of', '2014', ',', 'a']
['an', 'estimated', '90,000', 'hotel', 'rooms', 'at', 'the', 'end', 'of', '2014', ',', 'a']

['10', '%']
['10', '%', 'increase', 'from', '2013', '.', 'In', 'October', '2014', ',']
[',', 'a', '10', '%', 'increase', 'from', '2013', '.', 'In', 'October', '2014', ',']

['Anbang', 'Insurance', 'Group']
['Anbang', 'Insurance', 'Group', '[ORG]', ',', 'based', 'in', '[LOC]', 'China', '[LOC]']
['the', '[ORG]', 'Anbang', 'Insurance', 'Group', '[ORG]', ',', 'based', 'in', '[LOC]', 'China', '[LOC]']

['US'

4646it [00:06, 973.28it/s]

['1930', 'and', '1939']
['1930', 'and', '1939', '[MISC]', 'PGA', 'Championships', '[MISC]', ',', 'and', 'has']
['and', 'the', '1930', 'and', '1939', '[MISC]', 'PGA', 'Championships', '[MISC]', ',', 'and', 'has']

['Madison', 'Square', 'Garden']
['Madison', 'Square', 'Garden', '[LOC]', 'each', 'year', '.', 'The', 'city', 'is']
['at', '[LOC]', 'Madison', 'Square', 'Garden', '[LOC]', 'each', 'year', '.', 'The', 'city', 'is']

['Queens']
['Queens', '[LOC]', '.', 'The', '[ORG]', '[LOC]', 'New', 'York', 'Marathon', '[LOC]']
[',', '[LOC]', 'Queens', '[LOC]', '.', 'The', '[ORG]', '[LOC]', 'New', 'York', 'Marathon', '[LOC]']

['United', 'States', 'Open', 'Tennis', 'Championships']
['United', 'States', 'Open', 'Tennis', 'Championships', '[MISC]', 'is', 'one', 'of', 'the']
['annual', '[MISC]', 'United', 'States', 'Open', 'Tennis', 'Championships', '[MISC]', 'is', 'one', 'of', 'the']

['37,866']
['37,866', 'finishers', 'in', '2006', '.', 'The', '[MISC]', 'Millrose', 'Games', '[MISC]']
[',', 'inclu

4838it [00:06, 899.74it/s]

['Scout', ',', 'Jem', ',', 'and', 'Dill']
['Scout', '[PER]', ',', '[PER]', 'Jem', '[PER]', ',', 'and', '[PER]', 'Dill']
['when', '[PER]', 'Scout', '[PER]', ',', '[PER]', 'Jem', '[PER]', ',', 'and', '[PER]', 'Dill']

['Jem', 'and', 'Scout']
['Jem', '[PER]', '[PER]', 'and', '[PER]', 'Scout', '[PER]', 'to', 'be', 'present']
['[PER]', '[PER]', 'Jem', '[PER]', '[PER]', 'and', '[PER]', 'Scout', '[PER]', 'to', 'be', 'present']

['balcony']
['balcony', '.', '[MISC]', '[PER]', 'Atticus', '[PER]', '[MISC]', 'establishes', 'that', 'the']
['the', 'colored', 'balcony', '.', '[MISC]', '[PER]', 'Atticus', '[PER]', '[MISC]', 'establishes', 'that', 'the']

['the', 'colored', 'balcony']
['the', 'colored', 'balcony', '.', '[MISC]', '[PER]', 'Atticus', '[PER]', '[MISC]', 'establishes']
['watch', 'from', 'the', 'colored', 'balcony', '.', '[MISC]', '[PER]', 'Atticus', '[PER]', '[MISC]', 'establishes']

['shot', 'and', 'killed']
['shot', 'and', 'killed', 'while', 'trying', 'to', 'escape', 'from', 'prison', '

4929it [00:06, 797.45it/s]

['The', 'Chicago', 'Sunday', 'Tribune']
['The', '[ORG]', 'Chicago', 'Sunday', 'Tribune', '[ORG]', 'noted', 'the', 'even-handed', 'approach']
['"', '.', 'The', '[ORG]', 'Chicago', 'Sunday', 'Tribune', '[ORG]', 'noted', 'the', 'even-handed', 'approach']

['Granville', 'Hicks']
['Granville', 'Hicks', '[PER]', 'labeled', 'the', 'book', '"', 'melodramatic', 'and', 'contrived']
['and', '[PER]', 'Granville', 'Hicks', '[PER]', 'labeled', 'the', 'book', '"', 'melodramatic', 'and', 'contrived']

['Flannery', "O'Connor"]
['Flannery', "O'Connor", '[PER]', 'commented', ',', '"', 'I', 'think', 'for', 'a']
['writer', '[PER]', 'Flannery', "O'Connor", '[PER]', 'commented', ',', '"', 'I', 'think', 'for', 'a']

['William', 'Faulkner']
['William', 'Faulkner', '[PER]', ',', 'who', 'wrote', 'about', 'racism', 'as', 'an']
['to', '[PER]', 'William', 'Faulkner', '[PER]', ',', 'who', 'wrote', 'about', 'racism', 'as', 'an']

['Jane', 'Austen']
['Jane', 'Austen', '[PER]', ',', 'stating', 'the', 'book', "'s", 'cen

5011it [00:06, 757.31it/s]

['the', 'Mojave', 'Desert']
['the', '[LOC]', 'Mojave', 'Desert', '[LOC]', 'of', '[LOC]', 'California', '[LOC]', ',']
[',', 'in', 'the', '[LOC]', 'Mojave', 'Desert', '[LOC]', 'of', '[LOC]', 'California', '[LOC]', ',']

['2013']
['2013', 'solar', 'generated', 'less', 'than', '1', '%', 'of', 'the', 'worlds']
['.', 'In', '2013', 'solar', 'generated', 'less', 'than', '1', '%', 'of', 'the', 'worlds']

['354', 'MW', 'SEGS', 'CSP']
['354', 'MW', '[MISC]', 'SEGS', 'CSP', '[MISC]', 'installation', ',', 'in', 'the']
['the', 'eventually', '354', 'MW', '[MISC]', 'SEGS', 'CSP', '[MISC]', 'installation', ',', 'in', 'the']

['Mojave', 'Desert', 'of', 'California']
['Mojave', 'Desert', '[LOC]', 'of', '[LOC]', 'California', '[LOC]', ',', 'is', 'the']
['the', '[LOC]', 'Mojave', 'Desert', '[LOC]', 'of', '[LOC]', 'California', '[LOC]', ',', 'is', 'the']

['The', '250', 'MW', 'Agua', 'Caliente', 'Solar', 'Project', ',', 'in', 'the', 'United', 'States', ',', 'and', 'the', '221', 'MW', 'Charanka', 'Solar', 'P

5155it [00:07, 598.67it/s]

['photovoltaic', 'systems', ',', 'concentrated', 'solar', 'power', 'and', 'solar', 'water', 'heating']
['photovoltaic', 'systems', ',', 'concentrated', 'solar', 'power', 'and', 'solar', 'water', 'heating']
['use', 'of', 'photovoltaic', 'systems', ',', 'concentrated', 'solar', 'power', 'and', 'solar', 'water', 'heating']

['orienting', 'a', 'building', 'to', 'the', 'Sun']
['orienting', 'a', 'building', 'to', 'the', '[MISC]', '[ORG]', 'Sun', '[ORG]', '[MISC]']
['techniques', 'include', 'orienting', 'a', 'building', 'to', 'the', '[MISC]', '[ORG]', 'Sun', '[ORG]', '[MISC]']

['559.8', 'EJ']
['559.8', '[MISC]', 'EJ', '[MISC]', 'in', '2012', '.']
['which', 'was', '559.8', '[MISC]', 'EJ', '[MISC]', 'in', '2012', '.']

['1,575', '–', '49,837', 'exajoules', '(', 'EJ', ')']
['1,575', '–', '49,837', 'exajoules', '(', '[MISC]', 'EJ', '[MISC]', ')', '.']
['energy', 'was', '1,575', '–', '49,837', 'exajoules', '(', '[MISC]', 'EJ', '[MISC]', ')', '.']

['The', 'large', 'magnitude', 'of', 'solar', 'ene

5330it [00:07, 693.78it/s]

['The', 'government', 'insisted', 'that', 'only', 'minor', 'violations', 'had', 'occurred', ',', 'which', 'would', 'not', 'affect', 'the', 'will', 'of', 'the', 'Tajik', 'people']
['The', 'government', 'insisted', 'that', 'only', 'minor', 'violations', 'had', 'occurred', ',']
['.', '"', 'The', 'government', 'insisted', 'that', 'only', 'minor', 'violations', 'had', 'occurred', ',']

['in', 'Central', 'Asia']
['in', '[LOC]', 'Central', 'Asia', '[LOC]', 'with', 'an', 'estimated', '8', 'million']
['landlocked', 'country', 'in', '[LOC]', 'Central', 'Asia', '[LOC]', 'with', 'an', 'estimated', '8', 'million']

['8', 'million', 'people']
['8', 'million', 'people', 'in', '2013', ',', 'and', 'an', 'area', 'of']
['an', 'estimated', '8', 'million', 'people', 'in', '2013', ',', 'and', 'an', 'area', 'of']

['area', 'of', '143,100', 'km2', '(', '55,300', 'sq', 'mi', ')']
['area', 'of', '143,100', 'km2', '(', '55,300', 'sq', 'mi', ')', '.']
['and', 'an', 'area', 'of', '143,100', 'km2', '(', '55,300', '

5529it [00:07, 809.52it/s]

['neurosurgeon']
['neurosurgeon', ',', 'had', 'taken', 'an', 'interest', 'in', 'the', 'pathology', 'of']
['called', 'a', 'neurosurgeon', ',', 'had', 'taken', 'an', 'interest', 'in', 'the', 'pathology', 'of']

['the', 'pathology', 'of', 'speech']
['the', 'pathology', 'of', 'speech', '.', 'He', 'wanted', 'to', 'localize', 'the']
['interest', 'in', 'the', 'pathology', 'of', 'speech', '.', 'He', 'wanted', 'to', 'localize', 'the']

['speech', 'center']
['speech', 'center', 'of', 'the', 'human', 'brain', ',', 'today', 'called', '[PER]']
['discovered', 'the', 'speech', 'center', 'of', 'the', 'human', 'brain', ',', 'today', 'called', '[PER]']

['psychology']
['psychology', ',', '[PER]', 'Theodor', 'Waitz', '[PER]', ',', 'took', 'up', 'the']
['specializing', 'in', 'psychology', ',', '[PER]', 'Theodor', 'Waitz', '[PER]', ',', 'took', 'up', 'the']

['six-volume']
['six-volume', 'work', ',', 'entitled', '[MISC]', 'Die', 'Anthropologie', 'der', 'Naturvölker', '[MISC]']
['in', 'his', 'six-volume', '

5626it [00:07, 826.09it/s]

['Galician']
['Galician', '[MISC]', 'and', '[MISC]', 'Portuguese', '[MISC]', 'versions', 'of', 'the', 'language']
['The', '[MISC]', 'Galician', '[MISC]', 'and', '[MISC]', 'Portuguese', '[MISC]', 'versions', 'of', 'the', 'language']

['Spanish']
['Spanish', '[MISC]', ')', 'slowly', 'over', 'the', 'centuries', 'began', 'influencing', 'the']
['as', '[MISC]', 'Spanish', '[MISC]', ')', 'slowly', 'over', 'the', 'centuries', 'began', 'influencing', 'the']

['Castilian', '(', 'Spanish', 'Language', ')']
['Castilian', '[MISC]', '(', '[MISC]', 'Spanish', 'Language', '[MISC]', ')', '.']
['the', '[MISC]', 'Castilian', '[MISC]', '(', '[MISC]', 'Spanish', 'Language', '[MISC]', ')', '.']

['the', 'daughter', 'of', 'the', 'Austrian', 'Field', 'Marshal', 'Leopold', 'Josef', ',', 'Count', 'von', 'Daun']
['the', 'daughter', 'of', 'the', '[MISC]', 'Austrian', '[MISC]', 'Field', 'Marshal', '[PER]']
['marriage', 'to', 'the', 'daughter', 'of', 'the', '[MISC]', 'Austrian', '[MISC]', 'Field', 'Marshal', '[PER]

5796it [00:08, 767.69it/s]

['cork']
['cork', '(', '[LOC]', 'Portugal', '[LOC]', 'is', 'the', 'world', "'s", 'leading']
['footwear', 'and', 'cork', '(', '[LOC]', 'Portugal', '[LOC]', 'is', 'the', 'world', "'s", 'leading']

['European', 'Commission', ',', 'European', 'Central', 'Bank', 'and', 'International', 'Monetary', 'Fund']
['European', 'Commission', '[ORG]', ',', '[ORG]', 'European', 'Central', 'Bank', '[ORG]', 'and']
['the', '[ORG]', 'European', 'Commission', '[ORG]', ',', '[ORG]', 'European', 'Central', 'Bank', '[ORG]', 'and']

['2011']
['2011', ',', 'required', '[LOC]', 'Portugal', '[LOC]', 'to', 'enter', 'into', 'a']
['to', 'in', '2011', ',', 'required', '[LOC]', 'Portugal', '[LOC]', 'to', 'enter', 'into', 'a']

['€', '78', 'billion']
['€', '78', 'billion', '.', 'In', 'May', '2014', 'the', 'country', 'exited']
['support', 'of', '€', '78', 'billion', '.', 'In', 'May', '2014', 'the', 'country', 'exited']

['May', '2014']
['May', '2014', 'the', 'country', 'exited', 'the', 'bailout', 'but', 'reaffirmed', 'it

5953it [00:08, 713.30it/s]

['public', 'money']
['public', 'money', '.', 'However', ',', 'with', 'the', 'increasing', 'tuition', 'fees']
['supported', 'with', 'public', 'money', '.', 'However', ',', 'with', 'the', 'increasing', 'tuition', 'fees']

['over', '23', 'years', 'old']
['over', '23', 'years', 'old', ')', ',', 'international', 'students', ',', 'foreign']
['applicants', '(', 'over', '23', 'years', 'old', ')', ',', 'international', 'students', ',', 'foreign']

['developing', 'health', 'policy', 'as', 'well', 'as', 'managing', 'the', 'SNS']
['developing', 'health', 'policy', 'as', 'well', 'as', 'managing', 'the', '[ORG]', 'SNS']
['responsible', 'for', 'developing', 'health', 'policy', 'as', 'well', 'as', 'managing', 'the', '[ORG]', 'SNS']

['Five']
['Five', 'regional', 'health', 'administrations', 'are', 'in', 'charge', 'of', 'implementing', 'the']
['[ORG]', '.', 'Five', 'regional', 'health', 'administrations', 'are', 'in', 'charge', 'of', 'implementing', 'the']

['implementing', 'the', 'national', 'health',

6026it [00:08, 589.93it/s]

['2000']
['2000', ',', 'when', 'he', 'began', 'to', 'produce', 'for', 'artists', 'on']
['the', 'year', '2000', ',', 'when', 'he', 'began', 'to', 'produce', 'for', 'artists', 'on']

['The', 'Blueprint']
['The', 'Blueprint', '[MISC]', '.', 'The', '[MISC]', 'Blueprint', '[MISC]', 'is', 'consistently']
['album', '[MISC]', 'The', 'Blueprint', '[MISC]', '.', 'The', '[MISC]', 'Blueprint', '[MISC]', 'is', 'consistently']

['rapper']
['rapper', '.', 'Though', 'he', 'had', 'developed', 'his', 'rapping', 'long', 'before']
['be', 'a', 'rapper', '.', 'Though', 'he', 'had', 'developed', 'his', 'rapping', 'long', 'before']

['Capitol', 'Records']
['Capitol', 'Records', '[ORG]', ',', '[LOC]', '[PER]', 'West', '[PER]', '[LOC]', 'was']
['with', '[ORG]', 'Capitol', 'Records', '[ORG]', ',', '[LOC]', '[PER]', 'West', '[PER]', '[LOC]', 'was']

['rapper']
['rapper', '.', 'Though', 'he', 'had', 'developed', 'his', 'rapping', 'long', 'before']
['be', 'a', 'rapper', '.', 'Though', 'he', 'had', 'developed', 'his

6162it [00:08, 624.47it/s]

['Lady', 'Gaga']
['Lady', 'Gaga', '[PER]', 'was', 'cancelled', 'in', 'response', 'to', 'the', 'controversy']
['with', '[PER]', 'Lady', 'Gaga', '[PER]', 'was', 'cancelled', 'in', 'response', 'to', 'the', 'controversy']

['52nd', 'Grammy', 'Awards']
['52nd', 'Grammy', 'Awards', '[MISC]', '.', '.']
['the', '[MISC]', '52nd', 'Grammy', 'Awards', '[MISC]', '.', '.']

['Hawaii']
['Hawaii', '[LOC]', 'for', 'the', 'next', 'few', 'months', 'writing', 'and', 'recording']
['in', '[LOC]', 'Hawaii', '[LOC]', 'for', 'the', 'next', 'few', 'months', 'writing', 'and', 'recording']

['fashion']
['fashion', ',', 'only', 'to', 'hole', 'up', 'in', '[LOC]', 'Hawaii', '[LOC]']
['himself', 'into', 'fashion', ',', 'only', 'to', 'hole', 'up', 'in', '[LOC]', 'Hawaii', '[LOC]']

['Hawaii']
['Hawaii', '[LOC]', 'for', 'the', 'next', 'few', 'months', 'writing', 'and', 'recording']
['in', '[LOC]', 'Hawaii', '[LOC]', 'for', 'the', 'next', 'few', 'months', 'writing', 'and', 'recording']

['My', 'Beautiful', 'Dark', 'Twi

6227it [00:08, 609.21it/s]

['The', 'Dr.', 'Donda', 'West', 'Foundation']
['The', 'Dr.', 'Donda', 'West', 'Foundation', '[ORG]', '.', '"', 'The', 'foundation']
['"', '[ORG]', 'The', 'Dr.', 'Donda', 'West', 'Foundation', '[ORG]', '.', '"', 'The', 'foundation']

['2011']
['2011', '.']
['operations', 'in', '2011', '.']

['2008']
['2008', ',', 'following', 'the', 'death', 'of', '[LOC]', '[PER]', 'West', '[PER]']
[]

['The', 'Dr.', 'Donda', 'West', 'Foundation']
['The', 'Dr.', 'Donda', 'West', 'Foundation', '[ORG]', '.', '"', 'The', 'foundation']
['"', '[ORG]', 'The', 'Dr.', 'Donda', 'West', 'Foundation', '[ORG]', '.', '"', 'The', 'foundation']

['2008']
['2008', ',', 'following', 'the', 'death', 'of', '[LOC]', '[PER]', 'West', '[PER]']
[]

['100', 'Black', 'Men', 'of', 'America', ',', 'a', 'Live', 'Earth', 'concert', 'benefit', ',', 'World', 'Water', 'Day', 'rally']
['100', 'Black', 'Men', 'of', 'America', '[ORG]', ',', 'a', '[MISC]', 'Live']
[',', '[ORG]', '100', 'Black', 'Men', 'of', 'America', '[ORG]', ',', 'a', '

6363it [00:09, 596.36it/s]

['November', '2007']
['November', '2007', ',', '[PER]', 'Knievel', '[PER]', 'amicably', 'settled', 'the', 'suit']
['death', 'in', 'November', '2007', ',', '[PER]', 'Knievel', '[PER]', 'amicably', 'settled', 'the', 'suit']

['felony', 'vandalism']
['felony', 'vandalism', 'after', 'an', 'altercation', 'with', 'the', 'paparazzi', 'in', 'which']
['charges', 'of', 'felony', 'vandalism', 'after', 'an', 'altercation', 'with', 'the', 'paparazzi', 'in', 'which']

['$', '20,000']
['$', '20,000', 'bail', 'bond', '.', 'On', 'September', '26', ',', '2008']
['[LOC]', 'on', '$', '20,000', 'bail', 'bond', '.', 'On', 'September', '26', ',', '2008']

['one', 'count', 'of', 'misdemeanor', 'vandalism', ',', 'one', 'count', 'of', 'grand', 'theft', 'and', 'one', 'count', 'of', 'battery']
['one', 'count', 'of', 'misdemeanor', 'vandalism', ',', 'one', 'count', 'of', 'grand']
['[LOC]', 'with', 'one', 'count', 'of', 'misdemeanor', 'vandalism', ',', 'one', 'count', 'of', 'grand']

['September', '11', ',', '2008'

6562it [00:09, 772.17it/s]

['prolonged', 'fasting', ',', 'breath-holding', ',', 'and', 'exposure', 'to', 'pain']
['prolonged', 'fasting', ',', 'breath-holding', ',', 'and', 'exposure', 'to', 'pain', '.']
['[PER]', 'underwent', 'prolonged', 'fasting', ',', 'breath-holding', ',', 'and', 'exposure', 'to', 'pain', '.']

['meditation']
['meditation', ',', 'through', 'which', 'he', 'discovered', 'what', '[MISC]', 'Buddhists', '[MISC]']
['to', 'anapanasati', 'meditation', ',', 'through', 'which', 'he', 'discovered', 'what', '[MISC]', 'Buddhists', '[MISC]']

['asceticism']
['asceticism', ',', 'which', 'was', 'a', 'religious', 'pursuit', 'common', 'among', 'the']
['an', 'extreme', 'asceticism', ',', 'which', 'was', 'a', 'religious', 'pursuit', 'common', 'among', 'the']

['milk', 'and', 'rice']
['milk', 'and', 'rice', 'from', 'a', 'village', 'girl', 'and', 'changed', 'his']
['he', 'accepted', 'milk', 'and', 'rice', 'from', 'a', 'village', 'girl', 'and', 'changed', 'his']

['Middle', 'Way']
['Middle', 'Way', '[LOC]', '(', 

6770it [00:09, 906.42it/s]

['"', 'I', 'have', 'no', 'Self', '"']
['"', 'I', 'have', 'no', '[MISC]', 'Self', '[MISC]', '"', 'as', 'ontological']
['"', 'and', '"', 'I', 'have', 'no', '[MISC]', 'Self', '[MISC]', '"', 'as', 'ontological']

['skandhas']
['skandhas', ')', 'of', 'a', 'person', 'or', 'object', ',', 'the', 'practitioner']
['constituents', '(', 'skandhas', ')', 'of', 'a', 'person', 'or', 'object', ',', 'the', 'practitioner']

['pratītyasamutpāda', ',', '(', 'Sanskrit', ';', 'Pali', ':', 'paticcasamuppāda', ';', 'Tibetan', 'Wylie', ':', 'rten', 'cing', "'", 'brel', 'bar', "'", 'byung', 'ba', ';', 'Chinese', ':', '緣起)']
['pratītyasamutpāda', ',', '(', '[MISC]', 'Sanskrit', '[MISC]', ';', '[MISC]', 'Pali', '[MISC]']
['doctrine', 'of', 'pratītyasamutpāda', ',', '(', '[MISC]', 'Sanskrit', '[MISC]', ';', '[MISC]', 'Pali', '[MISC]']

['"', 'dependent', 'origination', '"', ',', '"', 'conditioned', 'genesis', '"', ',', '"', 'dependent', 'relationship', '"', ',', '"', 'dependent', 'co-arising', '"', ',', '"', 'inte

6863it [00:09, 746.68it/s]

['Rupa', 'Jhanas']
['Rupa', 'Jhanas', '[MISC]', ',', 'is', 'a', 'later', 'addition', 'to', 'texts']
['the', '[MISC]', 'Rupa', 'Jhanas', '[MISC]', ',', 'is', 'a', 'later', 'addition', 'to', 'texts']

['Majjhima', 'Nikaya']
['Majjhima', 'Nikaya', '36.', '[MISC]', '[', 'page', 'needed', ']']
['as', '[MISC]', 'Majjhima', 'Nikaya', '36.', '[MISC]', '[', 'page', 'needed', ']']

['rebirth']
['rebirth', 'of', 'earliest', 'Buddhism.', '[', 'page', 'needed', '][', 'note', '32']
['theory', 'of', 'rebirth', 'of', 'earliest', 'Buddhism.', '[', 'page', 'needed', '][', 'note', '32']

['karma']
['karma', 'in', 'the', '[MISC]', 'Sutta', 'Pitaka', '[MISC]', ',', 'which', 'may']
['presentation', 'of', 'karma', 'in', 'the', '[MISC]', 'Sutta', 'Pitaka', '[MISC]', ',', 'which', 'may']

['dhyana']
['dhyana', 'was', 'a', '[MISC]', 'Buddhist', '[MISC]', 'invention', ',', 'whereas', '[PER]']
['agrees', 'that', 'dhyana', 'was', 'a', '[MISC]', 'Buddhist', '[MISC]', 'invention', ',', 'whereas', '[PER]']

['meditat

7044it [00:09, 786.94it/s]

['the', 'Theravada']
['the', '[MISC]', '[LOC]', 'Theravada', '[LOC]', '[MISC]', 'school', 'does', 'not', 'include']
[]

['Sri', 'Lanka']
['Sri', 'Lanka', '[LOC]', 'prior', 'to', 'the', 'emergence', 'of', 'the', '[MISC]']
['in', '[LOC]', 'Sri', 'Lanka', '[LOC]', 'prior', 'to', 'the', 'emergence', 'of', 'the', '[MISC]']

['hinayana']
['hinayana', 'designation', ';', 'in', 'the', 'modern', 'era', ',', 'this', 'label']
['in', 'the', 'hinayana', 'designation', ';', 'in', 'the', 'modern', 'era', ',', 'this', 'label']

['Hinayana']
['Hinayana', '[MISC]', '"', 'was', 'used', 'to', 'refer', 'to', 'whomever', 'one']
['"', '[MISC]', 'Hinayana', '[MISC]', '"', 'was', 'used', 'to', 'refer', 'to', 'whomever', 'one']

['Śrāvakayāna']
['Śrāvakayāna', 'was', '"', 'the', 'more', 'politically', 'correct', 'and', 'much', 'more']
['the', 'term', 'Śrāvakayāna', 'was', '"', 'the', 'more', 'politically', 'correct', 'and', 'much', 'more']

['Buddhism']
['Buddhism', '[MISC]', 'provides', 'many', 'opportunities'

7237it [00:10, 823.37it/s]

['eight']
['eight', 'weeks', 'in', 'season', 'one', ',', 'eleven', 'weeks', 'in', 'subsequent']
['finals', 'lasted', 'eight', 'weeks', 'in', 'season', 'one', ',', 'eleven', 'weeks', 'in', 'subsequent']

['CBS', 'Television', 'City']
['CBS', 'Television', 'City', '[ORG]', 'in', '[LOC]', 'Los', 'Angeles', '[LOC]', ',']
['from', '[ORG]', 'CBS', 'Television', 'City', '[ORG]', 'in', '[LOC]', 'Los', 'Angeles', '[LOC]', ',']

['Jimmy', 'Iovine']
['Jimmy', 'Iovine', '[PER]', 'was', 'brought', 'in', 'as', 'a', 'mentor', 'for']
[',', '[PER]', 'Jimmy', 'Iovine', '[PER]', 'was', 'brought', 'in', 'as', 'a', 'mentor', 'for']

['one']
['one', ',', 'eleven', 'weeks', 'in', 'subsequent', 'seasons', 'until', 'seasons', 'ten']
['in', 'season', 'one', ',', 'eleven', 'weeks', 'in', 'subsequent', 'seasons', 'until', 'seasons', 'ten']

['top', 'four', 'or', 'five']
['top', 'four', 'or', 'five', 'onwards', ',', 'then', 'three', 'songs', 'for']
['songs', 'from', 'top', 'four', 'or', 'five', 'onwards', ',', 'th

7455it [00:10, 896.75it/s]

['The', 'American', 'Idol', 'Songwriter', 'contest']
['The', '[MISC]', 'American', 'Idol', 'Songwriter', '[MISC]', 'contest', 'was', 'also', 'held']
[]

['2008']
['2008', '.']
['22', ',', '2008', '.']

['The', 'Time', 'of', 'My', 'Life']
['The', 'Time', 'of', 'My', 'Life', '[MISC]', '"', ',', 'was', 'recorded']
['"', '[MISC]', 'The', 'Time', 'of', 'My', 'Life', '[MISC]', '"', ',', 'was', 'recorded']

['May', '22', ',', '2008']
['May', '22', ',', '2008', '.']
['released', 'on', 'May', '22', ',', '2008', '.']

['2009']
['2009', '.', '[PER]', 'Mike', 'Darnell', '[PER]', ',', 'the', 'president', 'of']
['13', ',', '2009', '.', '[PER]', 'Mike', 'Darnell', '[PER]', ',', 'the', 'president', 'of']

['Danny', 'Gokey.']
['Danny', 'Gokey.', '[PER]', '[', 'citation', 'needed', ']']
['of', '[PER]', 'Danny', 'Gokey.', '[PER]', '[', 'citation', 'needed', ']']

['president', 'of', 'alternative', 'programming']
['president', 'of', 'alternative', 'programming', 'for', '[ORG]', '[PER]', 'Fox', '[PER]', '[

7642it [00:10, 868.58it/s]

['judges']
['judges', '.', 'Eighth', 'season', 'runner-up', '[PER]', 'Adam', 'Lambert', '[PER]', 'filled']
['seasons', 'as', 'judges', '.', 'Eighth', 'season', 'runner-up', '[PER]', 'Adam', 'Lambert', '[PER]', 'filled']

['Randy', 'Jackson']
['Randy', 'Jackson', '[PER]', 'did', 'not', 'return', 'as', 'the', 'in-house', 'mentor']
['.', '[PER]', 'Randy', 'Jackson', '[PER]', 'did', 'not', 'return', 'as', 'the', 'in-house', 'mentor']

['January', '7', ',', '2015']
['January', '7', ',', '2015', '.', '[PER]', 'Ryan', 'Seacrest', '[PER]', 'returned']
['premiered', 'on', 'January', '7', ',', '2015', '.', '[PER]', 'Ryan', 'Seacrest', '[PER]', 'returned']

['Adam', 'Lambert']
['Adam', 'Lambert', '[PER]', 'filled', 'in', 'for', '[ORG]', '[LOC]', 'Urban', '[LOC]']
['runner-up', '[PER]', 'Adam', 'Lambert', '[PER]', 'filled', 'in', 'for', '[ORG]', '[LOC]', 'Urban', '[LOC]']

['one']
['one', 'episode', 'a', 'week', 'during', 'the', 'final', 'ten', '.', '[ORG]']
['only', 'airing', 'one', 'episode', 'a

7731it [00:10, 769.52it/s]

['Newsweek']
['Newsweek', '[LOC]', '[ORG]', 'accused', 'judge', '[PER]', 'Simon', 'Cowell', '[PER]', "'s"]
['[ORG]', '[LOC]', 'Newsweek', '[LOC]', '[ORG]', 'accused', 'judge', '[PER]', 'Simon', 'Cowell', '[PER]', "'s"]

['commercialism']
['commercialism', '"', '.', 'Pop', 'music', 'critic', '[PER]', 'Ann', 'Powers', '[PER]']
['and', 'promotes', 'commercialism', '"', '.', 'Pop', 'music', 'critic', '[PER]', 'Ann', 'Powers', '[PER]']

['Ann', 'Powers']
['Ann', 'Powers', '[PER]', 'nevertheless', 'suggested', 'that', '[PER]', 'Idol', '[PER]', 'has']
['critic', '[PER]', 'Ann', 'Powers', '[PER]', 'nevertheless', 'suggested', 'that', '[PER]', 'Idol', '[PER]', 'has']

['Simon', 'Cowell']
['Simon', 'Cowell', '[PER]', "'s", 'cruel', 'critiques', 'in', 'the', 'show', 'of']
['judge', '[PER]', 'Simon', 'Cowell', '[PER]', "'s", 'cruel', 'critiques', 'in', 'the', 'show', 'of']

['John', 'Mayer']
['John', 'Mayer', '[PER]', 'disparaged', 'the', 'contestants', ',', 'suggesting', 'that', 'those']
['singer

7944it [00:10, 866.69it/s]

['Dogue', 'de', 'Bordeaux']
['Dogue', 'de', 'Bordeaux', '[LOC]', ',', 'with', 'a', 'median', 'longevity', 'of']
['the', '[LOC]', 'Dogue', 'de', 'Bordeaux', '[LOC]', ',', 'with', 'a', 'median', 'longevity', 'of']

['5.2', 'years']
['5.2', 'years', ',', 'but', 'several', 'breeds', ',', 'including', '[ORG]', 'Miniature']
['of', 'about', '5.2', 'years', ',', 'but', 'several', 'breeds', ',', 'including', '[ORG]', 'Miniature']

['14', 'to', '15', 'years']
['14', 'to', '15', 'years', '.', 'The', 'median', 'longevity', 'of', 'mixed-breed']
['longevities', 'of', '14', 'to', '15', 'years', '.', 'The', 'median', 'longevity', 'of', 'mixed-breed']

['Bluey']
['Bluey', '[MISC]', '"', ',', 'who', 'died', 'in', '1939', 'and', 'was']
['"', '[MISC]', 'Bluey', '[MISC]', '"', ',', 'who', 'died', 'in', '1939', 'and', 'was']

['Pusuke']
['Pusuke', '[PER]', ',', 'the', 'world', "'s", 'oldest', 'living', 'dog', 'recognized']
[',', '[PER]', 'Pusuke', '[PER]', ',', 'the', 'world', "'s", 'oldest', 'living', 'dog

8164it [00:11, 845.36it/s]

['2000s']
['2000s', 'this', 'has', 'increased', 'to', '26.', '77', '%', 'of', 'dog']
['in', 'the', '2000s', 'this', 'has', 'increased', 'to', '26.', '77', '%', 'of', 'dog']

['less', 'severe']
['less', 'severe', 'than', 'bites', 'in', 'adults', '.', 'The', 'incidence', 'of']
['children', 'were', 'less', 'severe', 'than', 'bites', 'in', 'adults', '.', 'The', 'incidence', 'of']

['12.9']
['12.9', 'per', '10,000', 'inhabitants', ',', 'but', 'for', 'boys', 'aged', '5']
['[LOC]', 'is', '12.9', 'per', '10,000', 'inhabitants', ',', 'but', 'for', 'boys', 'aged', '5']

['60.7']
['60.7', 'per', '10,000', '.', 'Moreover', ',', 'children', 'have', 'a', 'much']
['rate', 'is', '60.7', 'per', '10,000', '.', 'Moreover', ',', 'children', 'have', 'a', 'much']

['the', 'face', 'or', 'neck']
['the', 'face', 'or', 'neck', '.', 'Sharp', 'claws', 'with', 'powerful', 'muscles']
['bitten', 'in', 'the', 'face', 'or', 'neck', '.', 'Sharp', 'claws', 'with', 'powerful', 'muscles']

['infections']
['infections', '.

8349it [00:11, 851.95it/s]

['24']
['24', 'km', 'torch', 'route', 'in', '[LOC]', 'Taiwan', '[LOC]', '.', 'By']
['along', 'the', '24', 'km', 'torch', 'route', 'in', '[LOC]', 'Taiwan', '[LOC]', '.', 'By']

['March', '24', ',', '2008']
['March', '24', ',', '2008', ',', 'the', '[MISC]', 'Olympic', 'Flame', '[MISC]']
[':', 'On', 'March', '24', ',', '2008', ',', 'the', '[MISC]', 'Olympic', 'Flame', '[MISC]']

['Olympia', ',', 'Greece']
['Olympia', '[LOC]', ',', '[LOC]', 'Greece', '[LOC]', '.', 'The', '[LOC]', 'People']
['in', '[LOC]', 'Olympia', '[LOC]', ',', '[LOC]', 'Greece', '[LOC]', '.', 'The', '[LOC]', 'People']

['Alexandros', 'Nikolaidis']
['Alexandros', 'Nikolaidis', '[PER]', 'from', '[LOC]', 'Greece', '[LOC]', ',', 'who', 'handed']
['taekwondo', '[PER]', 'Alexandros', 'Nikolaidis', '[PER]', 'from', '[LOC]', 'Greece', '[LOC]', ',', 'who', 'handed']

['Maria', 'Nafpliotou']
['Maria', 'Nafpliotou', '[PER]', ',', 'in', 'the', 'role', 'of', 'a', 'High']
['actress', '[PER]', 'Maria', 'Nafpliotou', '[PER]', ',', 'in'

8540it [00:11, 862.02it/s]

['Republic', 'Day', 'celebrations']
['Republic', 'Day', '[MISC]', 'celebrations', ',', 'which', 'are', 'considered', 'terrorist', 'targets']
['with', '[MISC]', 'Republic', 'Day', '[MISC]', 'celebrations', ',', 'which', 'are', 'considered', 'terrorist', 'targets']

['Nirupama', 'Sen']
['Nirupama', 'Sen', '[PER]', '.', 'The', '[MISC]', 'Indian', '[MISC]', 'media', 'responded']
[',', '[PER]', 'Nirupama', 'Sen', '[PER]', '.', 'The', '[MISC]', 'Indian', '[MISC]', 'media', 'responded']

['India', "'s", 'Commerce', 'Minister']
['India', '[LOC]', "'s", 'Commerce', 'Minister', ',', '[PER]', 'Kamal', 'Nath', '[PER]']
['that', '[LOC]', 'India', '[LOC]', "'s", 'Commerce', 'Minister', ',', '[PER]', 'Kamal', 'Nath', '[PER]']

['Kamal', 'Nath']
['Kamal', 'Nath', '[PER]', ',', 'cancelled', 'an', 'official', 'trip', 'to', '[LOC]']
[',', '[PER]', 'Kamal', 'Nath', '[PER]', ',', 'cancelled', 'an', 'official', 'trip', 'to', '[LOC]']

['2', 'am']
['2', 'am', 'local', 'time', ';', 'the', 'news', 'was', 'late

8635it [00:11, 853.21it/s]

['Macau', 'Fisherman', "'s", 'Wharf']
['Macau', 'Fisherman', "'s", 'Wharf', '[LOC]', '.', 'Afterward', ',', 'the', 'torch']
['at', '[LOC]', 'Macau', 'Fisherman', "'s", 'Wharf', '[LOC]', '.', 'Afterward', ',', 'the', 'torch']

['120']
['120', 'torchbearers', 'participated', 'in', 'this', 'event', 'including', 'casino', 'tycoon', '[PER]']
['total', 'of', '120', 'torchbearers', 'participated', 'in', 'this', 'event', 'including', 'casino', 'tycoon', '[PER]']

['Leong', 'Hong', 'Man']
['Leong', 'Hong', 'Man', '[PER]', 'and', '[PER]', 'Leong', 'Heng', 'Teng', '[PER]']
['.', '[PER]', 'Leong', 'Hong', 'Man', '[PER]', 'and', '[PER]', 'Leong', 'Heng', 'Teng', '[PER]']

['non-athletes']
['non-athletes', 'among', 'the', 'torchbearers', '.', '(', 'some', 'of', 'whom', 'had']
['too', 'many', 'non-athletes', 'among', 'the', 'torchbearers', '.', '(', 'some', 'of', 'whom', 'had']

['May', '3']
['May', '3', '.', 'It', 'was', 'the', 'first', 'time', 'that', 'the']
['[LOC]', 'on', 'May', '3', '.', 'It', '

8721it [00:11, 642.38it/s]

['By', 'assuming', 'that', 'enduring', 'objects', 'are', 'the', 'most', 'real', 'and', 'fundamental', 'things', 'in', 'the', 'universe', ',', 'materialists', 'have', 'mistaken', 'the', 'abstract', 'for', 'the', 'concrete']
['By', 'assuming', 'that', 'enduring', 'objects', 'are', 'the', 'most', 'real', 'and']
['events', '.', 'By', 'assuming', 'that', 'enduring', 'objects', 'are', 'the', 'most', 'real', 'and']

['"', 'quality', '"', ',', '"', 'matter', '"', ',', 'and', '"', 'form', '"']
['"', 'quality', '"', ',', '"', 'matter', '"', ',', 'and', '"']
['such', 'as', '"', 'quality', '"', ',', '"', 'matter', '"', ',', 'and', '"']

['These', '"', 'classical', '"', 'concepts', 'fail', 'to', 'adequately', 'account', 'for', 'change', ',', 'and', 'overlook', 'the', 'active', 'and', 'experiential', 'nature', 'of', 'the', 'most', 'basic', 'elements', 'of', 'the', 'world']
['These', '"', 'classical', '"', 'concepts', 'fail', 'to', 'adequately', 'account', 'for']
['problematic', '.', 'These', '"', 'c

8859it [00:12, 558.97it/s]

['phenonena', 'observed', 'locally', 'that', 'largely', 'violate', 'the', 'kind', 'of', 'local', 'flatness', 'of', 'space', 'that', 'Whitehead', 'assumes']
['phenonena', 'observed', 'locally', 'that', 'largely', 'violate', 'the', 'kind', 'of', 'local']
['They', 'are', 'phenonena', 'observed', 'locally', 'that', 'largely', 'violate', 'the', 'kind', 'of', 'local']

['Whitehead', "'s", 'cosmology', 'must', 'be', 'regarded', 'as', 'a', 'local', 'approximation']
['Whitehead', '[PER]', "'s", 'cosmology', 'must', 'be', 'regarded', 'as', 'a', 'local']
[',', '[PER]', 'Whitehead', '[PER]', "'s", 'cosmology', 'must', 'be', 'regarded', 'as', 'a', 'local']

['Grawemeyer', 'Award', 'for', 'Ideas', 'Improving', 'World', 'Order']
['Grawemeyer', 'Award', '[MISC]', 'for', 'Ideas', 'Improving', '[MISC]', 'World', 'Order', '[MISC]']
['the', '[MISC]', 'Grawemeyer', 'Award', '[MISC]', 'for', 'Ideas', 'Improving', '[MISC]', 'World', 'Order', '[MISC]']

['Is', 'It', 'Too', 'Late', '?', 'A', 'Theology', 'of', 

8922it [00:12, 496.81it/s]

['unmediated', 'by', 'the', 'senses']
['unmediated', 'by', 'the', 'senses', '.', 'Presentational', 'immediacy', ',', 'on', 'the']
['environment', ',', 'unmediated', 'by', 'the', 'senses', '.', 'Presentational', 'immediacy', ',', 'on', 'the']

['it', 'is', 'pure', 'appearance', ',', 'which', 'may', 'or', 'may', 'not', 'be', 'delusive']
['it', 'is', 'pure', 'appearance', ',', 'which', 'may', 'or', 'may', 'not']
['words', ',', 'it', 'is', 'pure', 'appearance', ',', 'which', 'may', 'or', 'may', 'not']

['causal', 'efficacy']
['causal', 'efficacy', 'as', '"', 'the', 'experience', 'dominating', 'the', 'primitive', 'living']
['[PER]', 'describes', 'causal', 'efficacy', 'as', '"', 'the', 'experience', 'dominating', 'the', 'primitive', 'living']

['Presentational', 'immediacy']
['Presentational', 'immediacy', ',', 'on', 'the', 'other', 'hand', ',', 'is', 'what']
['senses', '.', 'Presentational', 'immediacy', ',', 'on', 'the', 'other', 'hand', ',', 'is', 'what']

['Presentational', 'immediacy']


8977it [00:12, 421.64it/s]

['Bertrand', 'Russell', ',', 'and', 'he', 'also', 'taught', 'and', 'supervised', 'the', 'dissertation', 'of', 'Willard', 'Van', 'Orman', 'Quine']
['Bertrand', 'Russell', '[PER]', ',', 'and', 'he', 'also', 'taught', 'and', 'supervised']
['of', '[PER]', 'Bertrand', 'Russell', '[PER]', ',', 'and', 'he', 'also', 'taught', 'and', 'supervised']

['"', 'he', 'stands', 'provisionally', 'as', 'the', 'last', 'great', 'Anglo-American', 'philosopher', 'before', 'Wittgenstein', "'s", 'disciples', 'spread', 'their', 'misty', 'confusion', ',', 'sufficiency', ',', 'and', 'terror', '.']
['"', 'he', 'stands', 'provisionally', 'as', 'the', 'last', 'great', '[MISC]', 'Anglo-American']
['[PER]', 'that', '"', 'he', 'stands', 'provisionally', 'as', 'the', 'last', 'great', '[MISC]', 'Anglo-American']

['American', 'progressive', 'theology']
['American', '[MISC]', 'progressive', 'theology', '.', 'The', 'most', 'important', 'early', 'proponent']
['of', '[MISC]', 'American', '[MISC]', 'progressive', 'theology', 

9155it [00:12, 589.66it/s]

['faulty', 'risk-weightings']
['faulty', 'risk-weightings', '.', 'Major', 'banks', 'suffered', 'losses', 'from', '[MISC]', 'AAA-rated']
['problem', 'of', 'faulty', 'risk-weightings', '.', 'Major', 'banks', 'suffered', 'losses', 'from', '[MISC]', 'AAA-rated']

['financial', 'engineering']
['financial', 'engineering', '(', 'which', 'creates', 'apparently', 'risk-free', 'assets', 'out', 'of']
['created', 'by', 'financial', 'engineering', '(', 'which', 'creates', 'apparently', 'risk-free', 'assets', 'out', 'of']

['Basel', 'III', 'regulations']
['Basel', 'III', '[ORG]', 'regulations', 'for', 'banks', '.', 'It', 'increased', 'capital']
['introduced', '[ORG]', 'Basel', 'III', '[ORG]', 'regulations', 'for', 'banks', '.', 'It', 'increased', 'capital']

['Johan', 'Norberg']
['Johan', 'Norberg', '[PER]', 'argues', 'that', 'regulations', '(', '[ORG]', 'Basel', 'III']
['.', '[PER]', 'Johan', 'Norberg', '[PER]', 'argues', 'that', 'regulations', '(', '[ORG]', 'Basel', 'III']

['capital', 'ratios']
[

9324it [00:13, 674.22it/s]

['mainstream', 'economics']
['mainstream', 'economics', 'and', 'within', 'the', 'economics', 'profession', ',', 'and', 'call']
['ideas', 'in', 'mainstream', 'economics', 'and', 'within', 'the', 'economics', 'profession', ',', 'and', 'call']

['feminist', 'economics']
['feminist', 'economics', 'and', 'ecological', 'economics', 'that', 'take', 'as', 'their', 'starting']
['advances', 'within', 'feminist', 'economics', 'and', 'ecological', 'economics', 'that', 'take', 'as', 'their', 'starting']

['a', 'reshaping']
['a', 'reshaping', 'of', 'both', 'the', 'economy', ',', 'economic', 'theory', 'and']
['call', 'for', 'a', 'reshaping', 'of', 'both', 'the', 'economy', ',', 'economic', 'theory', 'and']

['Raghuram', 'Rajan']
['Raghuram', 'Rajan', '[ORG]', 'had', 'predicted', 'the', 'crisis', 'in', '2005', 'when']
['of', 'India', 'Raghuram', 'Rajan', '[ORG]', 'had', 'predicted', 'the', 'crisis', 'in', '2005', 'when']

['2005']
['2005', 'when', 'he', 'became', 'chief', 'economist', 'at', 'the', '[O

9475it [00:13, 658.03it/s]

['May.']
['May.', 'Turtles', 'are', 'a', 'common', 'sight', 'along', 'the', 'coastline', 'of']
['December', 'till', 'May.', 'Turtles', 'are', 'a', 'common', 'sight', 'along', 'the', 'coastline', 'of']

['December']
['December', 'till', 'May.', 'Turtles', 'are', 'a', 'common', 'sight', 'along', 'the']
['period', 'from', 'December', 'till', 'May.', 'Turtles', 'are', 'a', 'common', 'sight', 'along', 'the']

['Turtles']
['Turtles', 'are', 'a', 'common', 'sight', 'along', 'the', 'coastline', 'of', 'the']
['till', 'May.', 'Turtles', 'are', 'a', 'common', 'sight', 'along', 'the', 'coastline', 'of', 'the']

['jellyfish']
['jellyfish', 'is', 'their', 'favourite', 'diet', ')', ';', 'the', 'hawksbill', 'turtles']
['kg', '(', 'jellyfish', 'is', 'their', 'favourite', 'diet', ')', ';', 'the', 'hawksbill', 'turtles']

['amidst', 'tall', 'sea', 'grasses']
['amidst', 'tall', 'sea', 'grasses', '.']
['and', 'live', 'amidst', 'tall', 'sea', 'grasses', '.']

['pearly-pink']
['pearly-pink', 'shells', '.', '

9696it [00:13, 858.28it/s]

['genetic', 'novelty']
['genetic', 'novelty', '.']
['creation', 'of', 'genetic', 'novelty', '.']

['Horizontal', 'gene', 'transfer']
['Horizontal', 'gene', 'transfer', 'is', 'invoked', 'to', 'explain', 'how', 'there', 'is']
[]

['microbes']
['microbes', '.', 'Also', ',', 'eukaryotic', 'cells', 'seem', 'to', 'have', 'experienced']
['among', 'many', 'microbes', '.', 'Also', ',', 'eukaryotic', 'cells', 'seem', 'to', 'have', 'experienced']

['chloroplast', 'and', 'mitochondrial', 'genomes']
['chloroplast', 'and', 'mitochondrial', 'genomes', 'to', 'their', 'nuclear', 'chromosomes', '.']
['from', 'their', 'chloroplast', 'and', 'mitochondrial', 'genomes', 'to', 'their', 'nuclear', 'chromosomes', '.']

['comprehensive', 'school']
['comprehensive', 'school', 'is', 'a', 'state', 'school', 'that', 'does', 'not', 'select']
[]

['selective', 'school', 'system']
['selective', 'school', 'system', ',', 'where', 'admission', 'is', 'restricted', 'on', 'the']
['to', 'the', 'selective', 'school', 'system'

9907it [00:13, 825.73it/s]

['parliamentarians', 'and', 'legal', 'scholars']
['parliamentarians', 'and', 'legal', 'scholars', 'continued', 'to', 'deny', 'that', 'any', 'such']
['18th', 'century', 'parliamentarians', 'and', 'legal', 'scholars', 'continued', 'to', 'deny', 'that', 'any', 'such']

['George', 'II', 'and', 'George', 'III']
['George', 'II', '[PER]', 'and', '[PER]', 'George', 'III', '[PER]', 'made', 'strenuous']
['.', '[PER]', 'George', 'II', '[PER]', 'and', '[PER]', 'George', 'III', '[PER]', 'made', 'strenuous']

['Benjamin', 'Disraeli']
['Benjamin', 'Disraeli', '[PER]', 'but', 'did', 'not', 'appear', 'in', 'the', 'formal']
['of', '[PER]', 'Benjamin', 'Disraeli', '[PER]', 'but', 'did', 'not', 'appear', 'in', 'the', 'formal']

['1905']
['1905', '.']
['precedence', 'until', '1905', '.']

['president']
['president', '.', 'The', 'main', 'exceptions', 'to', 'this', 'system', 'have', 'been']
['a', 'ceremonial', 'president', '.', 'The', 'main', 'exceptions', 'to', 'this', 'system', 'have', 'been']

['Sheikh', 

10125it [00:13, 912.63it/s]

['Bologna', 'Process']
['Bologna', 'Process', '[MISC]', 'in', '2007', ',', 'they', 'have', 'been', 'allowed']
['the', '[MISC]', 'Bologna', 'Process', '[MISC]', 'in', '2007', ',', 'they', 'have', 'been', 'allowed']

['Millennia', 'Institute']
['Millennia', 'Institute', '[ORG]', ',', 'a', 'centralized', 'institute', ')', '.', 'Polytechnic']
['the', '[ORG]', 'Millennia', 'Institute', '[ORG]', ',', 'a', 'centralized', 'institute', ')', '.', 'Polytechnic']

['5']
['5', 'polytechnics', 'in', '[LOC]', 'Singapore', '[LOC]', '.', 'They', 'are', 'namely']
['There', 'are', '5', 'polytechnics', 'in', '[LOC]', 'Singapore', '[LOC]', '.', 'They', 'are', 'namely']

['three-year']
['three-year', 'diploma', 'courses', 'in', 'fields', 'such', 'as', 'information', 'technology', ',']
['Polytechnics', 'offer', 'three-year', 'diploma', 'courses', 'in', 'fields', 'such', 'as', 'information', 'technology', ',']

['queen', 'Maria', 'Theresa']
['queen', '[PER]', 'Maria', 'Theresa', '[PER]', 'in', 'order', 'to', 

10229it [00:14, 941.58it/s]

['James', 'Madison']
['James', 'Madison', '[PER]', ',', 'shows', '.', 'Such', 'influence', 'appears', ',']
['by', '[PER]', 'James', 'Madison', '[PER]', ',', 'shows', '.', 'Such', 'influence', 'appears', ',']

['the', 'Act', 'of', 'Abjuration', ',', 'essentially', 'the', 'declaration', 'of', 'independence', 'of', 'the', 'United', 'Provinces']
['the', '[MISC]', 'Act', 'of', 'Abjuration', '[MISC]', ',', 'essentially', 'the', 'declaration']
['similar', 'to', 'the', '[MISC]', 'Act', 'of', 'Abjuration', '[MISC]', ',', 'essentially', 'the', 'declaration']

['the', 'Union', 'of', 'Utrecht', 'of', '20', 'January', '1579']
['the', '[ORG]', '[LOC]', 'Union', 'of', 'Utrecht', '[LOC]', '[ORG]', 'of', '20']
[]

['personal', 'religion']
['personal', 'religion', 'and', 'that', 'no', 'person', 'should', 'be', 'prosecuted', 'based']
['choice', 'of', 'personal', 'religion', 'and', 'that', 'no', 'person', 'should', 'be', 'prosecuted', 'based']

['William', 'of', 'Orange']
['William', '[PER]', 'of', '[PER]

10416it [00:14, 844.55it/s]

['through', 'the', 'purchase', 'of', 'new', 'equipment', ',', 'improved', 'training', 'and', 'readiness']
['through', 'the', 'purchase', 'of', 'new', 'equipment', ',', 'improved', 'training', 'and']
['[ORG]', ',', 'through', 'the', 'purchase', 'of', 'new', 'equipment', ',', 'improved', 'training', 'and']

['loss', 'of', 'existing', 'members']
['loss', 'of', 'existing', 'members', ',', 'which', 'increased', 'between', '2006', 'and']
['rate', 'of', 'loss', 'of', 'existing', 'members', ',', 'which', 'increased', 'between', '2006', 'and']

['main', 'battle', 'tanks', ',', 'artillery', ',', 'unmanned', 'air', 'vehicles']
['main', 'battle', 'tanks', ',', 'artillery', ',', 'unmanned', 'air', 'vehicles', 'and']
['equipment', '(', 'main', 'battle', 'tanks', ',', 'artillery', ',', 'unmanned', 'air', 'vehicles', 'and']

['C-130', 'Hercules']
['C-130', 'Hercules', '–', '[MISC]', 'and', 'the', 'army', "'s", 'truck', 'and']
['the', '[MISC]', 'C-130', 'Hercules', '–', '[MISC]', 'and', 'the', 'army', 

10614it [00:14, 881.97it/s]

['a', 'Roman', 'priest']
['a', '[MISC]', 'Roman', '[MISC]', 'priest', 'and', 'never', 'a', 'bishop', 'from']
['was', 'customarily', 'a', '[MISC]', 'Roman', '[MISC]', 'priest', 'and', 'never', 'a', 'bishop', 'from']

['To', 'preserve', 'apostolic', 'succession']
['To', 'preserve', 'apostolic', 'succession', 'the', 'rite', 'of', 'consecrating', 'him', 'a']
['elsewhere', '.', 'To', 'preserve', 'apostolic', 'succession', 'the', 'rite', 'of', 'consecrating', 'him', 'a']

['he', 'is', 'consecrated', 'by', 'the', 'Dean', 'of', 'the', 'College', 'of', 'Cardinals', ',', 'the', 'Cardinal', 'Bishop', 'of', 'Ostia']
['he', 'is', 'consecrated', 'by', 'the', '[ORG]', 'Dean', 'of', 'the', 'College']
['bishop', ',', 'he', 'is', 'consecrated', 'by', 'the', '[ORG]', 'Dean', 'of', 'the', 'College']

['he', 'is', 'consecrated', 'by', 'the', 'Dean', 'of', 'the', 'College', 'of', 'Cardinals', ',', 'the', 'Cardinal', 'Bishop', 'of', 'Ostia']
['he', 'is', 'consecrated', 'by', 'the', '[ORG]', 'Dean', 'of', 'th

10704it [00:14, 691.88it/s]

['cable', 'lighting']
['cable', 'lighting', ',', 'where', 'lights', 'are', 'hung', 'from', 'or', 'clipped']
['this', 'is', 'cable', 'lighting', ',', 'where', 'lights', 'are', 'hung', 'from', 'or', 'clipped']

['12', 'or', '24', 'volts']
['12', 'or', '24', 'volts', ',', 'instead', 'of', 'each', 'light', 'fixture']
['rod', 'with', '12', 'or', '24', 'volts', ',', 'instead', 'of', 'each', 'light', 'fixture']

['torchiere']
['torchiere', 'is', 'an', 'uplight', 'intended', 'for', 'ambient', 'lighting', '.', 'It']
['.', 'A', 'torchiere', 'is', 'an', 'uplight', 'intended', 'for', 'ambient', 'lighting', '.', 'It']

['table', 'lamp']
['table', 'lamp', 'is', 'probably', 'the', 'most', 'common', 'fixture', ',', 'found']
['portable', 'or', 'table', 'lamp', 'is', 'probably', 'the', 'most', 'common', 'fixture', ',', 'found']

['task', 'lighting']
['task', 'lighting', '.', 'Magnifier', 'lamps', 'are', 'also', 'task', 'lighting', '.']
['is', 'considered', 'task', 'lighting', '.', 'Magnifier', 'lamps', 

10955it [00:14, 919.20it/s]

['1983']
['1983', ')', ',', 'the', '[ORG]', 'Supreme', 'Court', '[ORG]', 'decided', '(']
['[PER]', '(', '1983', ')', ',', 'the', '[ORG]', 'Supreme', 'Court', '[ORG]', 'decided', '(']

['two-thirds']
['two-thirds', 'of', 'the', '[ORG]', 'Senate', '[ORG]', 'and', '[ORG]', 'House', '—']
['repassed', 'by', 'two-thirds', 'of', 'the', '[ORG]', 'Senate', '[ORG]', 'and', '[ORG]', 'House', '—']

['Judicial']
['Judicial', 'power', '—', 'the', 'power', 'to', 'decide', 'cases', 'and', 'controversies']
[]

['president']
['president', 'with', 'the', 'advice', 'and', 'consent', 'of', 'the', '[ORG]', 'Senate']
['by', 'the', 'president', 'with', 'the', 'advice', 'and', 'consent', 'of', 'the', '[ORG]', 'Senate']

['Senate']
['Senate', '[ORG]', ',', 'hold', 'office', 'during', 'good', 'behavior', 'and', 'receive']
['the', '[ORG]', 'Senate', '[ORG]', ',', 'hold', 'office', 'during', 'good', 'behavior', 'and', 'receive']

['constitutional', 'courts']
['constitutional', 'courts', '.', '"']
['called', '"', '

11199it [00:15, 1050.39it/s]

['Amartya', 'Sen']
['Amartya', 'Sen', '[PER]', ',', 'they', 'worked', 'on', 'capabilities', 'and', 'functions']
['laureate', '[PER]', 'Amartya', 'Sen', '[PER]', ',', 'they', 'worked', 'on', 'capabilities', 'and', 'functions']

['people-centered', 'policies']
['people-centered', 'policies', '"', '.', 'To', 'produce', 'the', '[ORG]', 'Human', 'Development']
['accounting', 'to', 'people-centered', 'policies', '"', '.', 'To', 'produce', 'the', '[ORG]', 'Human', 'Development']

['1990']
['1990', 'and', 'had', 'the', 'explicit', 'purpose', '"', 'to', 'shift', 'the']
['[PER]', 'in', '1990', 'and', 'had', 'the', 'explicit', 'purpose', '"', 'to', 'shift', 'the']

['Life', 'expectancy', 'at', 'birth']
['Life', 'expectancy', 'at', 'birth', 'MYS', ':', 'Mean', 'years', 'of', 'schooling']
['[ORG]', ':', 'Life', 'expectancy', 'at', 'birth', 'MYS', ':', 'Mean', 'years', 'of', 'schooling']

['Mean', 'years', 'of', 'schooling']
['Mean', 'years', 'of', 'schooling', '(', 'Years', 'that', 'a', 'person', '

11309it [00:15, 949.29it/s] 

['405-line', 'interlaced', 'image']
['405-line', 'interlaced', 'image', 'on', '[MISC]', 'VHF.', '[MISC]', '[', 'original', 'research']
['transmitting', 'a', '405-line', 'interlaced', 'image', 'on', '[MISC]', 'VHF.', '[MISC]', '[', 'original', 'research']

['VHF.']
['VHF.', '[MISC]', '[', 'original', 'research', '?', ']']
['on', '[MISC]', 'VHF.', '[MISC]', '[', 'original', 'research', '?', ']']

['BBC', 'tv']
['BBC', 'tv', '[ORG]', '"', 'in', '1960', ')', 'showed', 'popular', 'programming']
['"', '[ORG]', 'BBC', 'tv', '[ORG]', '"', 'in', '1960', ')', 'showed', 'popular', 'programming']

['ITV']
['ITV', '[ORG]', 'to', 'become', 'the', 'channel', 'with', 'the', 'highest', 'ratings']
['with', '[ORG]', 'ITV', '[ORG]', 'to', 'become', 'the', 'channel', 'with', 'the', 'highest', 'ratings']

['Doctor', 'Who']
['Doctor', 'Who', '[MISC]', 'on', '23', 'November', '1963', '-', 'at', '17:16']
['show', '[MISC]', 'Doctor', 'Who', '[MISC]', 'on', '23', 'November', '1963', '-', 'at', '17:16']

['Alexan

11499it [00:15, 801.76it/s]

['Conan', 'the', 'Republican']
['Conan', 'the', 'Republican', '[MISC]', '"', '.', 'He', 'later', 'served', 'as']
['"', '[MISC]', 'Conan', 'the', 'Republican', '[MISC]', '"', '.', 'He', 'later', 'served', 'as']

['1999']
['1999', ',', '[PER]', 'Schwarzenegger', '[PER]', 'was', 'asked', 'if', 'he', 'thought']
['in', 'late', '1999', ',', '[PER]', 'Schwarzenegger', '[PER]', 'was', 'asked', 'if', 'he', 'thought']

['The', 'Hollywood', 'Reporter']
['The', '[ORG]', 'Hollywood', 'Reporter', '[ORG]', 'claimed', 'shortly', 'after', 'that', '[PER]']
['.', '"', 'The', '[ORG]', 'Hollywood', 'Reporter', '[ORG]', 'claimed', 'shortly', 'after', 'that', '[PER]']

['August', '6', ',', '2003']
['August', '6', ',', '2003', 'episode', 'of', '[MISC]', 'The', 'Tonight', 'Show']
['on', 'the', 'August', '6', ',', '2003', 'episode', 'of', '[MISC]', 'The', 'Tonight', 'Show']

['one']
['one', 'of', 'his', 'films', ')', ',', 'and', 'calling', 'the', 'recall']
['of', 'another', 'one', 'of', 'his', 'films', ')', ','

11672it [00:15, 816.40it/s]

['1512']
['1512', 'an', '[MISC]', 'Act', 'of', 'Parliament', '[MISC]', 'was', 'passed', 'for']
['.', 'In', '1512', 'an', '[MISC]', 'Act', 'of', 'Parliament', '[MISC]', 'was', 'passed', 'for']

['Sir', 'John', 'Hawkins']
['Sir', '[PER]', 'John', 'Hawkins', '[PER]', ',', 'who', 'led', '[LOC]', 'England']
['among', 'them', 'Sir', '[PER]', 'John', 'Hawkins', '[PER]', ',', 'who', 'led', '[LOC]', 'England']

['Sir', 'Francis', 'Drake']
['Sir', '[PER]', 'Francis', 'Drake', '[PER]', ',', 'Mayor', 'of', '[LOC]', 'Plymouth']
['well', 'as', 'Sir', '[PER]', 'Francis', 'Drake', '[PER]', ',', 'Mayor', 'of', '[LOC]', 'Plymouth']

['1588']
['1588', '.', 'In', '1620', 'the', '[ORG]', '[MISC]', 'Pilgrim', 'Fathers', '[MISC]']
['[MISC]', 'in', '1588', '.', 'In', '1620', 'the', '[ORG]', '[MISC]', 'Pilgrim', 'Fathers', '[MISC]']

['1620']
['1620', 'the', '[ORG]', '[MISC]', 'Pilgrim', 'Fathers', '[MISC]', '[ORG]', 'set', 'sail']
['.', 'In', '1620', 'the', '[ORG]', '[MISC]', 'Pilgrim', 'Fathers', '[MISC]', '

11858it [00:15, 861.93it/s]

['78.3', 'years']
['78.3', 'years', 'for', 'men', 'and', '82.1', 'for', 'women', ',', 'was']
[',', 'at', '78.3', 'years', 'for', 'men', 'and', '82.1', 'for', 'women', ',', 'was']

['82.1']
['82.1', 'for', 'women', ',', 'was', 'the', 'lowest', 'of', 'any', 'region']
['men', 'and', '82.1', 'for', 'women', ',', 'was', 'the', 'lowest', 'of', 'any', 'region']

['lowest']
['lowest', 'of', 'any', 'region', 'in', 'the', '[LOC]', 'South', 'West', 'of']
['was', 'the', 'lowest', 'of', 'any', 'region', 'in', 'the', '[LOC]', 'South', 'West', 'of']

['12,000']
['12,000', 'people', 'employed', 'and', 'approximately', '7,500', 'in', 'the', 'armed', 'forces']
['with', 'over', '12,000', 'people', 'employed', 'and', 'approximately', '7,500', 'in', 'the', 'armed', 'forces']

['7,500']
['7,500', 'in', 'the', 'armed', 'forces', '.', 'The', '[ORG]', 'Plymouth', 'Gin']
['and', 'approximately', '7,500', 'in', 'the', 'armed', 'forces', '.', 'The', '[ORG]', 'Plymouth', 'Gin']

['1793']
['1793', ',', 'which', 'wa

11946it [00:16, 718.20it/s]

['Lyndon', 'B.', 'Johnson']
['Lyndon', 'B.', 'Johnson', '[PER]', 'in', 'the', 'film', '[MISC]', 'The', 'Right']
['President', '[PER]', 'Lyndon', 'B.', 'Johnson', '[PER]', 'in', 'the', 'film', '[MISC]', 'The', 'Right']

['Clear', 'and', 'Present', 'Danger']
['Clear', 'and', 'Present', 'Danger', '[MISC]', ',', 'was', 'born', 'in', '[LOC]']
['in', '[MISC]', 'Clear', 'and', 'Present', 'Danger', '[MISC]', ',', 'was', 'born', 'in', '[LOC]']

['comedian']
['comedian', '[PER]', 'Dawn', 'French', '[PER]', '.', '[MISC]', 'Canadian', '[MISC]', 'politician']
['[PER]', 'and', 'comedian', '[PER]', 'Dawn', 'French', '[PER]', '.', '[MISC]', 'Canadian', '[MISC]', 'politician']

['established', 'beliefs', 'or', 'customs']
['established', 'beliefs', 'or', 'customs', '.', 'A', 'heretic', 'is', 'a', 'proponent']
['variance', 'with', 'established', 'beliefs', 'or', 'customs', '.', 'A', 'heretic', 'is', 'a', 'proponent']

['A', 'heretic']
['A', 'heretic', 'is', 'a', 'proponent', 'of', 'such', 'claims', 'or',

12031it [00:16, 749.82it/s]

['Materialism']
['Materialism', 'is', 'closely', 'related', 'to', 'physicalism', ',', 'the', 'view', 'that']
[]

['monist', 'ontology']
['monist', 'ontology', '.', 'As', 'such', ',', 'it', 'is', 'different', 'from']
['class', 'of', 'monist', 'ontology', '.', 'As', 'such', ',', 'it', 'is', 'different', 'from']

[':', '"', 'what', 'does', 'reality', 'consist', 'of']
[':', '"', 'what', 'does', 'reality', 'consist', 'of', '?"', 'and', '"']
['fundamental', 'questions', ':', '"', 'what', 'does', 'reality', 'consist', 'of', '?"', 'and', '"']

['"', 'how', 'does', 'it', 'originate', '?']
['"', 'how', 'does', 'it', 'originate', '?', '"', 'To', 'idealists', ',']
['?"', 'and', '"', 'how', 'does', 'it', 'originate', '?', '"', 'To', 'idealists', ',']

['spirit', 'or', 'mind', 'or', 'the', 'objects', 'of', 'mind', '(', 'ideas', ')']
['spirit', 'or', 'mind', 'or', 'the', 'objects', 'of', 'mind', '(', 'ideas']
['idealists', ',', 'spirit', 'or', 'mind', 'or', 'the', 'objects', 'of', 'mind', '(', 'ideas

12178it [00:16, 608.89it/s]

['the', 'Chief', 'Designer']
['the', 'Chief', 'Designer', '.', '"', 'In', 'the', '[LOC]', '[LOC]', 'West']
['as', '"', 'the', 'Chief', 'Designer', '.', '"', 'In', 'the', '[LOC]', '[LOC]', 'West']

['Peenemünde']
['Peenemünde', '[LOC]', 'was', 'located', 'in', 'the', 'eastern', 'part', 'of', '[LOC]']
['in', '[LOC]', 'Peenemünde', '[LOC]', 'was', 'located', 'in', 'the', 'eastern', 'part', 'of', '[LOC]']

['Sergei', 'Korolev']
['Sergei', 'Korolev', '[PER]', '.', 'He', 'had', 'been', 'involved', 'in', 'space']
['by', '[PER]', 'Sergei', 'Korolev', '[PER]', '.', 'He', 'had', 'been', 'involved', 'in', 'space']

['1938']
['1938', 'during', '[PER]', 'Joseph', 'Stalin', '[PER]', "'s", 'Great', '[MISC]', 'Purge']
['arrested', 'in', '1938', 'during', '[PER]', 'Joseph', 'Stalin', '[PER]', "'s", 'Great', '[MISC]', 'Purge']

['1966']
['1966', '.']
['died', 'in', '1966', '.']

['USSR', "'s", 'chief', 'rocket', 'and', 'spacecraft', 'engineer']
['USSR', '[LOC]', "'s", 'chief', 'rocket', 'and', 'spacecra

12315it [00:16, 592.12it/s]

['July', '24', ',', '1969']
['July', '24', ',', '1969', '.', 'When', 'the', 'spacecraft', 'splashed', 'down']
['[LOC]', 'on', 'July', '24', ',', '1969', '.', 'When', 'the', 'spacecraft', 'splashed', 'down']

['Pacific', 'Ocean']
['Pacific', 'Ocean', '[LOC]', 'on', 'July', '24', ',', '1969', '.', 'When']
['the', '[LOC]', 'Pacific', 'Ocean', '[LOC]', 'on', 'July', '24', ',', '1969', '.', 'When']

['November', '1969']
['November', '1969', '.', '[ORG]', 'NASA', '[ORG]', 'had', 'achieved', 'its', 'first']
['[MISC]', 'in', 'November', '1969', '.', '[ORG]', 'NASA', '[ORG]', 'had', 'achieved', 'its', 'first']

['February', '1971']
['February', '1971', ')', ',', '[MISC]', 'Apollo', '15', '[MISC]', '(', 'July']
['[MISC]', '(', 'February', '1971', ')', ',', '[MISC]', 'Apollo', '15', '[MISC]', '(', 'July']

['July', '1971']
['July', '1971', ')', ',', '[MISC]', 'Apollo', '16', '[MISC]', '(', 'April']
['[MISC]', '(', 'July', '1971', ')', ',', '[MISC]', 'Apollo', '16', '[MISC]', '(', 'April']

['Apri

12484it [00:17, 689.86it/s]

['1869']
['1869', 'the', 'growth', 'had', 'to', 'be', 'checked', 'by', 'magisterial', 'control']
['Finally', 'in', '1869', 'the', 'growth', 'had', 'to', 'be', 'checked', 'by', 'magisterial', 'control']

['pubs']
['pubs', '.', 'These', 'usually', 'small', 'establishments', 'can', 'still', 'be', 'identified']
['became', 'full', 'pubs', '.', 'These', 'usually', 'small', 'establishments', 'can', 'still', 'be', 'identified']

['19th']
['19th', 'century', '.', 'A', 'very', 'small', 'number', 'remained', 'into', 'the']
['of', 'the', '19th', 'century', '.', 'A', 'very', 'small', 'number', 'remained', 'into', 'the']

['corners', 'or', 'road', 'junctions']
['corners', 'or', 'road', 'junctions', '.', 'Many', 'of', 'today', "'s", 'respected']
['found', 'on', 'corners', 'or', 'road', 'junctions', '.', 'Many', 'of', 'today', "'s", 'respected']

['real', 'ale', 'micro-brewers']
['real', 'ale', 'micro-brewers', 'in', 'the', '[LOC]', 'UK', '[LOC]', 'started', 'as']
["'s", 'respected', 'real', 'ale', 'm

12704it [00:17, 869.12it/s]

['Sunday', 'League', 'Football']
['Sunday', 'League', 'Football', '[MISC]', '"', '.', '[ORG]', 'Bowling', '[ORG]', 'is']
['"', '[MISC]', 'Sunday', 'League', 'Football', '[MISC]', '"', '.', '[ORG]', 'Bowling', '[ORG]', 'is']

['Bowling']
['Bowling', '[ORG]', 'is', 'found', 'in', 'association', 'with', 'pubs', 'in', 'some']
['.', '[ORG]', 'Bowling', '[ORG]', 'is', 'found', 'in', 'association', 'with', 'pubs', 'in', 'some']

['Pub', 'rock']
['Pub', '[ORG]', 'rock', 'that', 'was', 'a', 'precursor', 'to', '[MISC]', '[MISC]']
['called', '[ORG]', 'Pub', '[ORG]', 'rock', 'that', 'was', 'a', 'precursor', 'to', '[MISC]', '[MISC]']

['Punk', 'music']
['Punk', '[MISC]', '[MISC]', 'music', '.']
['[MISC]', '[MISC]', 'Punk', '[MISC]', '[MISC]', 'music', '.']

['the', '1970s']
['the', '1970s', 'pubs', 'provided', 'an', 'outlet', 'for', 'a', 'number', 'of']
['.', 'During', 'the', '1970s', 'pubs', 'provided', 'an', 'outlet', 'for', 'a', 'number', 'of']

['bar', 'snacks']
['bar', 'snacks', '"', ',', 'suc

12891it [00:17, 785.49it/s]

['"', 'Time', 'of', 'the', 'Season', '"', 'by', 'the', 'Zombies']
['"', '[MISC]', 'Time', 'of', 'the', 'Season', '[MISC]', '"', 'by', 'the']
['success', 'was', '"', '[MISC]', 'Time', 'of', 'the', 'Season', '[MISC]', '"', 'by', 'the']

['#', '2']
['#', '2', 'in', '1969', '.', 'The', 'label', 'was', 'discontinued', 'in']
['peaking', 'at', '#', '2', 'in', '1969', '.', 'The', 'label', 'was', 'discontinued', 'in']

['1970']
['1970', '.']
['discontinued', 'in', '1970', '.']

['Leiberson']
['Leiberson', '[PER]', 'promoted', 'to', 'head', 'the', 'new', '"', '[ORG]', 'CBS-Columbia']
['with', '[PER]', 'Leiberson', '[PER]', 'promoted', 'to', 'head', 'the', 'new', '"', '[ORG]', 'CBS-Columbia']

['1968']
['1968', ',', '[ORG]', 'CBS', '[ORG]', 'and', '[ORG]', 'Sony', '[ORG]', 'formed']
['In', 'March', '1968', ',', '[ORG]', 'CBS', '[ORG]', 'and', '[ORG]', 'Sony', '[ORG]', 'formed']

['1983']
['1983', '.']
['market', 'in', '1983', '.']

['1972']
['1972', ',', 'after', 'it', 'was', 'discovered', 'that'

12973it [00:17, 738.06it/s]

['2013']
['2013', '.']
['[LOC]', 'in', '2013', '.']

['300']
['300', 'real', 'animal', 'skeletons', '.', 'Focusing', 'on', 'the', 'form', 'and']
['more', 'than', '300', 'real', 'animal', 'skeletons', '.', 'Focusing', 'on', 'the', 'form', 'and']

['2009']
['2009', '(', 'although', 'completion', 'of', 'the', 'facility', 'has', 'been', 'held']
['construction', 'in', '2009', '(', 'although', 'completion', 'of', 'the', 'facility', 'has', 'been', 'held']

['The', 'Oklahoma', 'City', 'National', 'Memorial']
['The', '[ORG]', '[LOC]', 'Oklahoma', 'City', 'National', 'Memorial', '[LOC]', '[ORG]', 'in']
[]

['Institute', 'for', 'the', 'Prevention', 'of', 'Terrorism']
['Institute', 'for', 'the', 'Prevention', 'of', 'Terrorism', '[ORG]', ',', 'a', 'non-partisan']
['National', 'Memorial', 'Institute', 'for', 'the', 'Prevention', 'of', 'Terrorism', '[ORG]', ',', 'a', 'non-partisan']

['the', 'banjo']
['the', 'banjo', '.', 'With', 'a', 'collection', 'valued', 'at', '$', '3.5']
['instrument', '–', 'the

13143it [00:17, 671.67it/s]

['agricultural', 'societies']
['agricultural', 'societies', 'increased', ',', 'they', 'expanded', 'into', 'lands', 'traditionally', 'used']
['size', 'of', 'agricultural', 'societies', 'increased', ',', 'they', 'expanded', 'into', 'lands', 'traditionally', 'used']

['first', 'forms', 'of', 'government']
['first', 'forms', 'of', 'government', 'in', 'agricultural', 'centers', ',', 'such', 'as']
['of', 'the', 'first', 'forms', 'of', 'government', 'in', 'agricultural', 'centers', ',', 'such', 'as']

['agricultural', 'centers']
['agricultural', 'centers', ',', 'such', 'as', 'the', '[LOC]', 'Fertile', 'Crescent', '[LOC]']
['government', 'in', 'agricultural', 'centers', ',', 'such', 'as', 'the', '[LOC]', 'Fertile', 'Crescent', '[LOC]']

['agriculture-driven']
['agriculture-driven', 'expansion', 'led', 'to', 'the', 'development', 'of', 'the', 'first', 'forms']
['process', 'of', 'agriculture-driven', 'expansion', 'led', 'to', 'the', 'development', 'of', 'the', 'first', 'forms']

['agricultural',

13339it [00:18, 802.29it/s]

['United', 'Nations', 'Development', 'Fund']
['United', 'Nations', 'Development', 'Fund', '[ORG]', '.', 'In', '1971', 'it', 'was']
['the', '[ORG]', 'United', 'Nations', 'Development', 'Fund', '[ORG]', '.', 'In', '1971', 'it', 'was']

['1971']
['1971', 'it', 'was', 'placed', 'under', 'the', 'authority', 'of', 'the', '[ORG]']
['.', 'In', '1971', 'it', 'was', 'placed', 'under', 'the', 'authority', 'of', 'the', '[ORG]']

['2015']
['2015', ',', 'the', '193', 'member', 'states', 'of', 'the', '[ORG]', 'United']
['In', 'September', '2015', ',', 'the', '193', 'member', 'states', 'of', 'the', '[ORG]', 'United']

['193']
['193', 'member', 'states', 'of', 'the', '[ORG]', 'United', 'Nations', '[ORG]', 'unanimously']
[',', 'the', '193', 'member', 'states', 'of', 'the', '[ORG]', 'United', 'Nations', '[ORG]', 'unanimously']

['17']
['17', 'goals', 'aiming', 'to', 'transform', 'the', 'world', 'over', 'the', 'next']
['set', 'of', '17', 'goals', 'aiming', 'to', 'transform', 'the', 'world', 'over', 'the',

13506it [00:18, 787.97it/s]

['the', 'Kazakh', 'and', 'Kirghiz', 'Soviet', 'Socialist', 'Republics']
['the', '[MISC]', 'Kazakh', '[MISC]', 'and', '[ORG]', 'Kirghiz', 'Soviet', 'Socialist', 'Republics']
['transformed', 'into', 'the', '[MISC]', 'Kazakh', '[MISC]', 'and', '[ORG]', 'Kirghiz', 'Soviet', 'Socialist', 'Republics']

['the', 'Uzbek', 'SSR']
['the', '[LOC]', 'Uzbek', 'SSR', '[LOC]', '.', '.']
['transferred', 'to', 'the', '[LOC]', 'Uzbek', 'SSR', '[LOC]', '.', '.']

['reduced']
['reduced', '.', 'The', '[MISC]', 'Kazakh', 'ASSR', '[MISC]', 'and', '[ORG]', '[MISC]']
['was', 'significantly', 'reduced', '.', 'The', '[MISC]', 'Kazakh', 'ASSR', '[MISC]', 'and', '[ORG]', '[MISC]']

['the', 'Russian', 'Soviet', 'Federative', 'Socialist', 'Republic']
['the', '[ORG]', 'Russian', 'Soviet', 'Federative', 'Socialist', 'Republic', '[ORG]', '.', '.']
['renamed', 'it', 'the', '[ORG]', 'Russian', 'Soviet', 'Federative', 'Socialist', 'Republic', '[ORG]', '.', '.']

['the', 'Russian', 'Constitution', 'of', '1937']
['the', '[MI

13587it [00:18, 656.18it/s]

['1938', '–', '42']
['1938', '–', '42', ')', 'and', '[ORG]', '[PER]', '[MISC]', 'The', 'Ritz']
['[PER]', '(', '1938', '–', '42', ')', 'and', '[ORG]', '[PER]', '[MISC]', 'The', 'Ritz']

['1932', '–', '33']
['1932', '–', '33', ')', ',', '[PER]', 'Buck', 'Jones', '[PER]', '(']
['[PER]', '(', '1932', '–', '33', ')', ',', '[PER]', 'Buck', 'Jones', '[PER]', '(']

['1946', '–', '47']
['1946', '–', '47', ')', '.']
['[PER]', '(', '1946', '–', '47', ')', '.']

['Bud', 'Abbott', 'and', 'Lou', 'Costello']
['Bud', 'Abbott', '[PER]', 'and', '[PER]', 'Lou', 'Costello', '[PER]', ')', '.']
['(', '[PER]', 'Bud', 'Abbott', '[PER]', 'and', '[PER]', 'Lou', 'Costello', '[PER]', ')', '.']

['Buck', 'Privates']
['Buck', 'Privates', '[MISC]', '(', '1941', ')', 'gave', 'the', 'former', 'burlesque']
['comedy', '[MISC]', 'Buck', 'Privates', '[MISC]', '(', '1941', ')', 'gave', 'the', 'former', 'burlesque']

['W.', 'C.', 'Fields']
['W.', 'C.', 'Fields', '[PER]', ',', 'and', 'the', 'comedy', 'team', 'of']
[',', '[PE

13740it [00:18, 704.50it/s]

['Focus', 'Features']
['Focus', 'Features', '[ORG]', 'as', 'a', 'genre', 'label', ',', 'that', 'concentrated']
['by', '[ORG]', 'Focus', 'Features', '[ORG]', 'as', 'a', 'genre', 'label', ',', 'that', 'concentrated']

['action', ',', 'sci-fi', ',', 'and', 'horror']
['action', ',', 'sci-fi', ',', 'and', 'horror', 'films', '.']
['concentrated', 'on', 'action', ',', 'sci-fi', ',', 'and', 'horror', 'films', '.']

['2015']
['2015', ',', '[ORG]', 'Universal', '[ORG]', 'is', 'the', 'only', 'studio', 'to']
['As', 'of', '2015', ',', '[ORG]', 'Universal', '[ORG]', 'is', 'the', 'only', 'studio', 'to']

['Minions']
['Minions', '[ORG]', '.', '.']
['World', 'and', 'Minions', '[ORG]', '.', '.']

['Universal', 'is', 'the', 'only', 'studio']
['Universal', '[ORG]', 'is', 'the', 'only', 'studio', 'to', 'have', 'released', 'three']
[',', '[ORG]', 'Universal', '[ORG]', 'is', 'the', 'only', 'studio', 'to', 'have', 'released', 'three']

[',', 'Universal', 'Productions', 'France']
[',', '[ORG]', 'Universal', 'P

13954it [00:18, 839.66it/s]

['reading', 'and', 'singing']
['reading', 'and', 'singing', '.', 'These', 'experiments', 'clearly', 'proved', 'that', 'the']
['Brantford', '[LOC]', 'reading', 'and', 'singing', '.', 'These', 'experiments', 'clearly', 'proved', 'that', 'the']

['$', '100,000']
['$', '100,000', '.', 'The', 'president', 'of', '[ORG]', '[LOC]', 'Western', 'Union']
['[ORG]', 'for', '$', '100,000', '.', 'The', 'president', 'of', '[ORG]', '[LOC]', 'Western', 'Union']

['Western', 'Union']
['Western', 'Union', '[LOC]', '[ORG]', 'for', '$', '100,000', '.', 'The', 'president']
['[ORG]', '[LOC]', 'Western', 'Union', '[LOC]', '[ORG]', 'for', '$', '100,000', '.', 'The', 'president']

['a', 'toy']
['a', 'toy', '.', 'Two', 'years', 'later', ',', 'he', 'told', 'colleagues']
['nothing', 'but', 'a', 'toy', '.', 'Two', 'years', 'later', ',', 'he', 'told', 'colleagues']

['$', '25', 'million']
['$', '25', 'million', 'he', 'would', 'consider', 'it', 'a', 'bargain', '.']
['patent', 'for', '$', '25', 'million', 'he', 'would'

14135it [00:19, 847.07it/s]

['the', 'First', 'Amendment']
['the', '[MISC]', 'First', 'Amendment', '[MISC]', 'is', 'a', 'plan', 'to', 'regulate']
['[MISC]', 'than', 'the', '[MISC]', 'First', 'Amendment', '[MISC]', 'is', 'a', 'plan', 'to', 'regulate']

['free', 'speech']
['free', 'speech', '.', 'They', 'both', 'stand', 'for', 'the', 'same', 'concept']
['to', 'regulate', 'free', 'speech', '.', 'They', 'both', 'stand', 'for', 'the', 'same', 'concept']

['13', 'April', '2015']
['13', 'April', '2015', ',', 'the', '[ORG]', 'FCC', '[ORG]', 'published', 'the']
['.', 'On', '13', 'April', '2015', ',', 'the', '[ORG]', 'FCC', '[ORG]', 'published', 'the']

['Internet', 'access']
['Internet', '[MISC]', 'access', ',', 'employing', 'a', 'range', 'of', 'technologies', 'to']
['provide', '[MISC]', 'Internet', '[MISC]', 'access', ',', 'employing', 'a', 'range', 'of', 'technologies', 'to']

['a', 'range', 'of', 'technologies', 'to', 'connect', 'users', 'to', 'their', 'network']
['a', 'range', 'of', 'technologies', 'to', 'connect', 'us

14277it [00:19, 968.95it/s]

['periodicals']
['periodicals', '.', 'It', 'is', 'common', 'in', '[MISC]', 'English', '[MISC]', 'to']
['but', 'rather', 'periodicals', '.', 'It', 'is', 'common', 'in', '[MISC]', 'English', '[MISC]', 'to']

['manga']
['manga', 'for', '[MISC]', 'Japanese', '[MISC]', 'comics', ',', 'or', 'bandes', 'dessinées']
['such', 'as', 'manga', 'for', '[MISC]', 'Japanese', '[MISC]', 'comics', ',', 'or', 'bandes', 'dessinées']

['bandes', 'dessinées']
['bandes', 'dessinées', 'for', '[MISC]', 'French-language', 'Franco-Belgian', '[MISC]', 'comics', '.']
[',', 'or', 'bandes', 'dessinées', 'for', '[MISC]', 'French-language', 'Franco-Belgian', '[MISC]', 'comics', '.']

['specialists']
['specialists', '.', 'There', 'may', 'be', 'separate', 'writers', 'and', 'artists', ',']
['number', 'of', 'specialists', '.', 'There', 'may', 'be', 'separate', 'writers', 'and', 'artists', ',']

['penciller']
['penciller', ',', 'who', 'lays', 'out', 'the', 'artwork', 'in', 'pencil', ';']
['between', 'a', 'penciller', ',', '

14477it [00:19, 950.84it/s]

['8', '%']
['8', '%', 'between', '2000', 'and', '2004', ',', 'implying', 'a', 'decline']
['%', 'to', '8', '%', 'between', '2000', 'and', '2004', ',', 'implying', 'a', 'decline']

['22']
['22', '%', 'of', 'the', 'population', 'claimed', 'social', 'security', 'benefit', 'in']
['Nevertheless', ',', '22', '%', 'of', 'the', 'population', 'claimed', 'social', 'security', 'benefit', 'in']

['20', '%']
['20', '%', 'of', 'the', 'population', '.']
['that', 'represents', '20', '%', 'of', 'the', 'population', '.']

['70,560']
['70,560', 'copper', 'tokens', 'worth', 'a', 'halfpenny', 'each', 'Payable', 'at', '[LOC]']
['issued', 'a', '70,560', 'copper', 'tokens', 'worth', 'a', 'halfpenny', 'each', 'Payable', 'at', '[LOC]']

['Saul', 'Solomon']
['Saul', 'Solomon', '[PER]', 'issued', 'a', '70,560', 'copper', 'tokens', 'worth', 'a']
[',', '[PER]', 'Saul', 'Solomon', '[PER]', 'issued', 'a', '70,560', 'copper', 'tokens', 'worth', 'a']

['a', 'halfpenny']
['a', 'halfpenny', 'each', 'Payable', 'at', '[LOC]

14677it [00:19, 947.04it/s]

['⟨', 'b', '̤⟩', ',', 'with', 'the', 'diacritic']
['⟨', 'b', '̤⟩', ',', 'with', 'the', 'diacritic', 'for', 'breathy', 'voice']
['transcribed', 'as', '⟨', 'b', '̤⟩', ',', 'with', 'the', 'diacritic', 'for', 'breathy', 'voice']

['murmured', 'sonorants']
['murmured', 'sonorants', ',', 'such', 'as', 'vowels', 'and', 'nasals', ',', 'which']
['⟨◌̤⟩', 'to', 'murmured', 'sonorants', ',', 'such', 'as', 'vowels', 'and', 'nasals', ',', 'which']

['vowels', 'and', 'nasals']
['vowels', 'and', 'nasals', ',', 'which', 'are', 'murmured', 'throughout', 'their', 'duration']
['such', 'as', 'vowels', 'and', 'nasals', ',', 'which', 'are', 'murmured', 'throughout', 'their', 'duration']

['breathy-voiced', 'release', 'of', 'obstruents']
['breathy-voiced', 'release', 'of', 'obstruents', '.']
['for', 'the', 'breathy-voiced', 'release', 'of', 'obstruents', '.']

['Hydrogen']
['Hydrogen', 'is', 'a', 'chemical', 'element', 'with', 'chemical', 'symbol', 'H', 'and']
[]

['1']
['1', '.', 'With', 'an', 'atomic', 'wei

14866it [00:19, 868.22it/s]

['May', '2012']
['May', '2012', ',', '[ORG]', 'Chrome', '[ORG]', "'s", 'usage', 'passed', 'the']
['and', 'in', 'May', '2012', ',', '[ORG]', 'Chrome', '[ORG]', "'s", 'usage', 'passed', 'the']

['Internet', 'Explorer']
['Internet', 'Explorer', '[MISC]', '[ORG]', ',', 'on', 'the', 'other', 'hand', ',']
['[ORG]', '[MISC]', 'Internet', 'Explorer', '[MISC]', '[ORG]', ',', 'on', 'the', 'other', 'hand', ',']

['sales', 'of', 'Windows', 'to', 'computer', 'manufacturers', 'and', 'direct', 'to', 'users']
['sales', 'of', '[MISC]', 'Windows', '[MISC]', 'to', 'computer', 'manufacturers', 'and', 'direct']
['by', 'the', 'sales', 'of', '[MISC]', 'Windows', '[MISC]', 'to', 'computer', 'manufacturers', 'and', 'direct']

['Mac']
['Mac', '[MISC]', '[ORG]', '.', 'It', 'is', 'likely', 'that', 'releasing', 'IE']
['[ORG]', '[MISC]', 'Mac', '[MISC]', '[ORG]', '.', 'It', 'is', 'likely', 'that', 'releasing', 'IE']

['the', 'European', 'Commission']
['the', '[ORG]', 'European', 'Commission', '[ORG]', 'announced', 

14954it [00:20, 706.07it/s]

['89.6', 'square', 'miles']
['89.6', 'square', 'miles', '(', '232.1', 'km2', ')—', '48.4', 'square', 'miles']
['area', 'of', '89.6', 'square', 'miles', '(', '232.1', 'km2', ')—', '48.4', 'square', 'miles']

['48.4', 'square', 'miles']
['48.4', 'square', 'miles', '(', '125.4', 'km2', ')', '(', '54.0', '%)']
['km2', ')—', '48.4', 'square', 'miles', '(', '125.4', 'km2', ')', '(', '54.0', '%)']

['41.2', 'square', 'miles']
['41.2', 'square', 'miles', '(', '106.7', 'km2', ')', '(', '46.0', '%)']
['land', 'and', '41.2', 'square', 'miles', '(', '106.7', 'km2', ')', '(', '46.0', '%)']

['19', 'ft']
['19', 'ft', '(', '5.8', 'm', ')', 'above', 'sea', 'level', '.']
[',', 'is', '19', 'ft', '(', '5.8', 'm', ')', 'above', 'sea', 'level', '.']

['oceanic', 'coastline']
['oceanic', 'coastline', '.']
['with', 'an', 'oceanic', 'coastline', '.']

['Greater', 'Boston']
['Greater', 'Boston', '[LOC]', '"', 'region', 'and', 'is', 'contiguously', 'bordered', 'by']
['"', '[LOC]', 'Greater', 'Boston', '[LOC]', 

15106it [00:20, 708.45it/s]

['south', 'of', 'Boston', 'Common']
['south', 'of', '[LOC]', 'Boston', 'Common', '[LOC]', ',', 'including', 'the', '[ORG]']
['District', '[LOC]', 'south', 'of', '[LOC]', 'Boston', 'Common', '[LOC]', ',', 'including', 'the', '[ORG]']

['First', 'Night']
['First', 'Night', '[MISC]', ',', 'which', 'occurs', 'on', '[MISC]', 'New', 'Year']
['as', '[MISC]', 'First', 'Night', '[MISC]', ',', 'which', 'occurs', 'on', '[MISC]', 'New', 'Year']

['Christopher', 'Columbus', 'Waterfront', 'Park']
['Christopher', 'Columbus', 'Waterfront', 'Park', '[LOC]', '[ORG]', ',', 'and', '[MISC]', 'Italian']
['[ORG]', '[LOC]', 'Christopher', 'Columbus', 'Waterfront', 'Park', '[LOC]', '[ORG]', ',', 'and', '[MISC]', 'Italian']

['the', 'North', 'End']
['the', '[LOC]', 'North', 'End', '[LOC]', 'honoring', '[MISC]', 'Catholic', '[MISC]', 'saints']
['feasts', 'in', 'the', '[LOC]', 'North', 'End', '[LOC]', 'honoring', '[MISC]', 'Catholic', '[MISC]', 'saints']

['Catholic', 'saints']
['Catholic', '[MISC]', 'saints', '.

15267it [00:20, 702.63it/s]

['mainly', 'for', 'customers', 'in', 'China', 'and', 'neighboring', 'regions']
['mainly', 'for', 'customers', 'in', '[LOC]', 'China', '[LOC]', 'and', 'neighboring', 'regions']
['services', ',', 'mainly', 'for', 'customers', 'in', '[LOC]', 'China', '[LOC]', 'and', 'neighboring', 'regions']

['BeiDou-1']
['BeiDou-1', '[MISC]', ',', 'consists', 'of', 'three', 'satellites', 'and', 'offers', 'limited']
['as', '[MISC]', 'BeiDou-1', '[MISC]', ',', 'consists', 'of', 'three', 'satellites', 'and', 'offers', 'limited']

['three']
['three', 'satellites', 'and', 'offers', 'limited', 'coverage', 'and', 'applications', '.', 'It']
['consists', 'of', 'three', 'satellites', 'and', 'offers', 'limited', 'coverage', 'and', 'applications', '.', 'It']

['since', '2000']
['since', '2000', '.']
['regions', ',', 'since', '2000', '.']

['BeiDou', 'Navigation', 'Satellite', 'System']
['BeiDou', 'Navigation', 'Satellite', 'System', '[MISC]', '[ORG]', '(', '[ORG]', 'BDS', '[ORG]']
['[ORG]', '[MISC]', 'BeiDou', 'Nav

15432it [00:20, 744.88it/s]

['cane']
['cane', '")', '.']
['word', '"', 'cane', '")', '.']

['eighty-five']
['eighty-five', 'in', 'the', '[MISC]', 'Eastern', '[MISC]', ',', 'fifty', 'in', 'the']
['decrees', '(', 'eighty-five', 'in', 'the', '[MISC]', 'Eastern', '[MISC]', ',', 'fifty', 'in', 'the']

['fifty']
['fifty', 'in', 'the', '[LOC]', 'Western', 'Church', '[LOC]', ')', 'concerning', 'the']
['[MISC]', ',', 'fifty', 'in', 'the', '[LOC]', 'Western', 'Church', '[LOC]', ')', 'concerning', 'the']

['Ecclesiastical', 'Canons']
['Ecclesiastical', 'Canons', 'of', 'the', 'Same', 'Holy', 'Apostles', '[MISC]', 'is', 'a']
['or', '[MISC]', 'Ecclesiastical', 'Canons', 'of', 'the', 'Same', 'Holy', 'Apostles', '[MISC]', 'is', 'a']

['325']
['325', ')', 'calls', 'canons', 'the', 'disciplinary', 'measures', 'of', 'the', '[ORG]']
['[ORG]', '(', '325', ')', 'calls', 'canons', 'the', 'disciplinary', 'measures', 'of', 'the', '[ORG]']

['a', 'rule']
['a', 'rule', '.', 'There', 'is', 'a', 'very', 'early', 'distinction', 'between']
['[

15697it [00:20, 1032.61it/s]

['gradually', 'began', 'to', 'fall']
['gradually', 'began', 'to', 'fall', 'as', 'broadband', 'wireless', 'access', 'rose', '.']
['satellite', 'services', 'gradually', 'began', 'to', 'fall', 'as', 'broadband', 'wireless', 'access', 'rose', '.']

['Tri-Band', '3G']
['Tri-Band', '3G', '[MISC]', '[ORG]', 'service', 'for', 'internet', 'and', 'mobile', 'clients']
['[ORG]', '[MISC]', 'Tri-Band', '3G', '[MISC]', '[ORG]', 'service', 'for', 'internet', 'and', 'mobile', 'clients']

['internet', 'and', 'mobile', 'clients']
['internet', 'and', 'mobile', 'clients', '.', 'The', 'first', 'of', 'its', 'kind']
['service', 'for', 'internet', 'and', 'mobile', 'clients', '.', 'The', 'first', 'of', 'its', 'kind']

['Tri-Band', '3G']
['Tri-Band', '3G', '[MISC]', '[ORG]', 'service', 'for', 'internet', 'and', 'mobile', 'clients']
['[ORG]', '[MISC]', 'Tri-Band', '3G', '[MISC]', '[ORG]', 'service', 'for', 'internet', 'and', 'mobile', 'clients']

['faster', 'and', 'more', 'secure']
['faster', 'and', 'more', 'secu

15909it [00:21, 949.55it/s] 

['Valencian', 'parliament']
['Valencian', '[MISC]', 'parliament', ',', 'is', 'in', 'charge', 'of', 'dictating', 'the']
['the', '[MISC]', 'Valencian', '[MISC]', 'parliament', ',', 'is', 'in', 'charge', 'of', 'dictating', 'the']

['use', 'of', 'Valencian']
['use', 'of', '[MISC]', 'Valencian', '[MISC]', ',', 'and', 'its', 'standard', 'is']
['governing', 'the', 'use', 'of', '[MISC]', 'Valencian', '[MISC]', ',', 'and', 'its', 'standard', 'is']

['Norms', 'of', 'Castelló']
['Norms', 'of', 'Castelló', '[MISC]', '(', '[ORG]', '[LOC]', 'Normes', 'de', 'Castelló']
['the', '[MISC]', 'Norms', 'of', 'Castelló', '[MISC]', '(', '[ORG]', '[LOC]', 'Normes', 'de', 'Castelló']

['everyone', 'who', 'writes']
['everyone', 'who', 'writes', 'in', '[MISC]', 'Valencian', '[MISC]', 'uses', 'this', 'standard']
['Currently', ',', 'everyone', 'who', 'writes', 'in', '[MISC]', 'Valencian', '[MISC]', 'uses', 'this', 'standard']

['Royal', 'Academy', 'of', 'Valencian', 'Culture']
['Royal', 'Academy', 'of', 'Valencian'

16131it [00:21, 1006.68it/s]

['Estonian']
['Estonian', '[MISC]', ',', 'nouns', 'and', 'pronouns', 'do', 'not', 'have', 'grammatical']
['In', '[MISC]', 'Estonian', '[MISC]', ',', 'nouns', 'and', 'pronouns', 'do', 'not', 'have', 'grammatical']

['adjectives']
['adjectives', 'decline', 'in', 'fourteen', 'cases', ':', 'nominative', ',', 'genitive', ',']
['nouns', 'and', 'adjectives', 'decline', 'in', 'fourteen', 'cases', ':', 'nominative', ',', 'genitive', ',']

['Finnish']
['Finnish', '[MISC]', '.', 'This', 'is', 'a', 'rough', 'equivalent', 'of', 'the']
['in', '[MISC]', 'Finnish', '[MISC]', '.', 'This', 'is', 'a', 'rough', 'equivalent', 'of', 'the']

['partitive']
['partitive', '(', 'for', 'partial', 'objects', ')', '.', 'The', 'accusative', 'coincides']
['in', 'the', 'partitive', '(', 'for', 'partial', 'objects', ')', '.', 'The', 'accusative', 'coincides']

['total', 'objects']
['total', 'objects', ')', 'or', 'in', 'the', 'partitive', '(', 'for', 'partial']
['(', 'for', 'total', 'objects', ')', 'or', 'in', 'the', 'p

16234it [00:21, 954.54it/s] 

['June', '20', ',', '1987']
['June', '20', ',', '1987', ',', 'at', 'the', '[ORG]', 'Rosemont', 'Horizon']
[',', 'on', 'June', '20', ',', '1987', ',', 'at', 'the', '[ORG]', 'Rosemont', 'Horizon']

['the', 'Rosemont', 'Horizon']
['the', '[ORG]', 'Rosemont', 'Horizon', '[ORG]', 'in', 'suburban', '[LOC]', 'Chicago', '[LOC]']
[',', 'at', 'the', '[ORG]', 'Rosemont', 'Horizon', '[ORG]', 'in', 'suburban', '[LOC]', 'Chicago', '[LOC]']

['September', '30', ',', '1987']
['September', '30', ',', '1987', ',', '[PER]', '[ORG]', 'Foster', '[ORG]', '[PER]']
[]

['March', '27', ',', '1990']
['March', '27', ',', '1990', '.', 'The', 'patent', 'expired', 'on', 'September']
['granted', 'on', 'March', '27', ',', '1990', '.', 'The', 'patent', 'expired', 'on', 'September']

['September', '30', ',', '2007']
['September', '30', ',', '2007', '.']
['expired', 'on', 'September', '30', ',', '2007', '.']

['the', 'United', 'States', 'Patent', 'and', 'Trademark', 'Office']
['the', '[ORG]', 'United', 'States', 'Patent

16415it [00:21, 781.79it/s]

['Terry', 'Emmert']
['Terry', 'Emmert', '[PER]', '(', 'similar', 'to', 'the', '[PER]', '[MISC]', 'Jerry']
['owner', '[PER]', 'Terry', 'Emmert', '[PER]', '(', 'similar', 'to', 'the', '[PER]', '[MISC]', 'Jerry']

['Jerry', 'Jones']
['Jerry', 'Jones', '[MISC]', '[PER]', 'move', 'with', 'the', '[MISC]', '[ORG]', 'Desperados']
['[PER]', '[MISC]', 'Jerry', 'Jones', '[MISC]', '[PER]', 'move', 'with', 'the', '[MISC]', '[ORG]', 'Desperados']

['Scott', 'Butera']
['Scott', 'Butera', '[PER]', 'announced', 'that', 'a', 'new', 'identity', 'will', 'be']
['commissioner', '[PER]', 'Scott', 'Butera', '[PER]', 'announced', 'that', 'a', 'new', 'identity', 'will', 'be']

['December', '10', ',', '2015']
['December', '10', ',', '2015', ',', 'shows', 'an', 'eight-team', 'league', 'playing']
['website', 'on', 'December', '10', ',', '2015', ',', 'shows', 'an', 'eight-team', 'league', 'playing']

['eight-team']
['eight-team', 'league', 'playing', 'a', '16-game', 'regular', 'season', 'over', '18', 'weeks']
['sho

16606it [00:21, 839.72it/s]

['"', 'All', 'My', 'Life', '"']
['"', '[MISC]', 'All', 'My', 'Life', '[MISC]', '"', ',', 'were', 'both']
['"', 'and', '"', '[MISC]', 'All', 'My', 'Life', '[MISC]', '"', ',', 'were', 'both']

['1990s']
['1990s', 'marked', 'the', 'softening', 'of', 'urban', 'R', '&', 'B', 'at']
['The', 'early', '1990s', 'marked', 'the', 'softening', 'of', 'urban', 'R', '&', 'B', 'at']

['softening']
['softening', 'of', 'urban', 'R', '&', 'B', 'at', 'the', 'same', 'time']
['marked', 'the', 'softening', 'of', 'urban', 'R', '&', 'B', 'at', 'the', 'same', 'time']

['widening', 'of', 'the', 'market']
['widening', 'of', 'the', 'market', ',', 'not', 'only', 'allowing', 'to', 'cater']
['to', 'a', 'widening', 'of', 'the', 'market', ',', 'not', 'only', 'allowing', 'to', 'cater']

['1980s']
['1980s', 'mainstream', 'singers', ',', 'the', '1990s', 'mainstream', 'pop', '/', 'R']
['majority', 'of', '1980s', 'mainstream', 'singers', ',', 'the', '1990s', 'mainstream', 'pop', '/', 'R']

['Savage', 'Garden']
['Savage', 'Ga

16694it [00:22, 816.94it/s]

['a', 'Bishop', 'of', 'London']
['a', 'Bishop', 'of', '[LOC]', 'London', '[LOC]', '.', 'Construction', 'of', 'the']
['[MISC]', ',', 'a', 'Bishop', 'of', '[LOC]', 'London', '[LOC]', '.', 'Construction', 'of', 'the']

['1245']
['1245', ',', 'on', 'the', 'orders', 'of', '[PER]', 'King', 'Henry', 'III']
['began', 'in', '1245', ',', 'on', 'the', 'orders', 'of', '[PER]', 'King', 'Henry', 'III']

['King', 'Henry', 'III']
['King', 'Henry', 'III', '[PER]', '.', '.']
['of', '[PER]', 'King', 'Henry', 'III', '[PER]', '.', '.']

['Aldrich']
['Aldrich', '[PER]', 'on', 'the', '[LOC]', 'River', 'Thames', '[LOC]', 'saw', 'a']
['called', '[PER]', 'Aldrich', '[PER]', 'on', 'the', '[LOC]', 'River', 'Thames', '[LOC]', 'saw', 'a']

['salmon']
['salmon', 'from', '[LOC]', 'Thames', '[LOC]', 'fishermen', 'that', 'the', 'abbey', 'received']
['gifts', 'of', 'salmon', 'from', '[LOC]', 'Thames', '[LOC]', 'fishermen', 'that', 'the', 'abbey', 'received']

['Benedictine']
['Benedictine', '[MISC]', 'monks', 'here', '.

16856it [00:22, 663.34it/s]

['John', 'Loughborough', 'Pearson']
['John', 'Loughborough', 'Pearson', '[PER]', ',', 'were', 're-instated', 'and', 'coloured', 'in']
['by', '[PER]', 'John', 'Loughborough', 'Pearson', '[PER]', ',', 'were', 're-instated', 'and', 'coloured', 'in']

['the', 'Celestial', 'Organ']
['the', '[MISC]', 'Celestial', 'Organ', '[MISC]', ',', 'is', 'currently', 'not', 'connected']
['instrument', ',', 'the', '[MISC]', 'Celestial', 'Organ', '[MISC]', ',', 'is', 'currently', 'not', 'connected']

['1971']
['1971', '.', 'The', 'ring', 'is', 'now', 'made', 'up', 'of', 'ten']
['overhauled', 'in', '1971', '.', 'The', 'ring', 'is', 'now', 'made', 'up', 'of', 'ten']

['Tenor']
['Tenor', 'bell', 'in', 'D', '(', '588.5', 'Hz', ')', 'has', 'a']
['.', 'The', 'Tenor', 'bell', 'in', 'D', '(', '588.5', 'Hz', ')', 'has', 'a']

['1971']
['1971', '.', 'The', 'ring', 'is', 'now', 'made', 'up', 'of', 'ten']
['overhauled', 'in', '1971', '.', 'The', 'ring', 'is', 'now', 'made', 'up', 'of', 'ten']

['Whitechapel', 'Bell',

16996it [00:22, 593.27it/s]

['it', 'was', 'impossible', 'to', 'move', 'or', 'complete', 'it']
['it', 'was', 'impossible', 'to', 'move', 'or', 'complete', 'it', '.']
['was', 'understood', 'it', 'was', 'impossible', 'to', 'move', 'or', 'complete', 'it', '.']

['the', 'Yongle', 'Emperor']
['the', '[LOC]', '[MISC]', 'Yongle', 'Emperor', '[MISC]', '[LOC]', ',', 'lies', 'abandoned']
['orders', 'of', 'the', '[LOC]', '[MISC]', 'Yongle', 'Emperor', '[MISC]', '[LOC]', ',', 'lies', 'abandoned']

['admiral', 'Zheng', 'He']
['admiral', '[PER]', 'Zheng', 'He', '[PER]', ',', 'who', 'went', 'to', 'sail']
['of', 'the', 'admiral', '[PER]', 'Zheng', 'He', '[PER]', ',', 'who', 'went', 'to', 'sail']

['(', 'Boni', '渤泥)']
['(', '[PER]', 'Boni', '[PER]', '渤泥)', ',', 'who', 'died', 'during', 'his']
['Borneo', '[LOC]', '(', '[PER]', 'Boni', '[PER]', '渤泥)', ',', 'who', 'died', 'during', 'his']

['Borneo']
['Borneo', '[LOC]', '(', '[PER]', 'Boni', '[PER]', '渤泥)', ',', 'who', 'died']
['from', '[LOC]', 'Borneo', '[LOC]', '(', '[PER]', 'Boni'

17143it [00:22, 653.64it/s]

['RMB', '801', 'billion']
['RMB', '[MISC]', '801', 'billion', '(', '3rd', 'in', '[LOC]', 'Jiangsu', '[LOC]']
['was', '[MISC]', 'RMB', '[MISC]', '801', 'billion', '(', '3rd', 'in', '[LOC]', 'Jiangsu', '[LOC]']

['RMB', '98,174', '(', 'US', '$', '16041', ')']
['RMB', '98,174', '(', '[LOC]', 'US', '[LOC]', '$', '16041', ')', ',']
[')', 'was', 'RMB', '98,174', '(', '[LOC]', 'US', '[LOC]', '$', '16041', ')', ',']

['increase']
['increase', 'from', '2012', '.', 'The', 'average', 'urban', 'resident', "'s", 'disposable']
['11', 'percent', 'increase', 'from', '2012', '.', 'The', 'average', 'urban', 'resident', "'s", 'disposable']

['lower', 'than', 'the', 'national', 'average']
['lower', 'than', 'the', 'national', 'average', '(', '4.3', 'percent', ')', '.']
['percent', ',', 'lower', 'than', 'the', 'national', 'average', '(', '4.3', 'percent', ')', '.']

['12th']
['12th', 'in', '2013', 'in', '[LOC]', 'China', '[LOC]', ',', 'and', 'its']
['Product', 'ranked', '12th', 'in', '2013', 'in', '[LOC]', 

17225it [00:22, 697.56it/s]

['second']
['second', 'most', 'populous', 'of', '[LOC]', 'Switzerland', '[LOC]', "'s", 'cantons', '.']
[',', 'the', 'second', 'most', 'populous', 'of', '[LOC]', 'Switzerland', '[LOC]', "'s", 'cantons', '.']

['German']
['German', '[MISC]', ',', 'but', 'the', 'main', 'spoken', 'language', 'is', 'the']
[')', '[MISC]', 'German', '[MISC]', ',', 'but', 'the', 'main', 'spoken', 'language', 'is', 'the']

['Bernese', 'German']
['Bernese', 'German', '[MISC]', '.', '.']
['called', '[MISC]', 'Bernese', 'German', '[MISC]', '.', '.']

['Alemannic', 'Swiss']
['Alemannic', 'Swiss', 'German', '[MISC]', 'dialect', 'called', '[MISC]', 'Bernese', 'German', '[MISC]']
['the', '[MISC]', 'Alemannic', 'Swiss', 'German', '[MISC]', 'dialect', 'called', '[MISC]', 'Bernese', 'German', '[MISC]']

['German']
['German', '[MISC]', ',', 'but', 'the', 'main', 'spoken', 'language', 'is', 'the']
[')', '[MISC]', 'German', '[MISC]', ',', 'but', 'the', 'main', 'spoken', 'language', 'is', 'the']

['1983']
['1983', 'the', 'hi

17405it [00:23, 766.03it/s]

['106', 'm']
['106', 'm', '(', '348', 'ft', ')', ',', 'the', 'second', 'shortest']
['length', 'of', '106', 'm', '(', '348', 'ft', ')', ',', 'the', 'second', 'shortest']

['Marzilibahn']
['Marzilibahn', '[LOC]', 'funicular', 'is', ',', 'with', 'a', 'length', 'of', '106']
['The', '[LOC]', 'Marzilibahn', '[LOC]', 'funicular', 'is', ',', 'with', 'a', 'length', 'of', '106']

['Zagreb', 'funicular']
['Zagreb', '[LOC]', 'funicular', '.']
['the', '[LOC]', 'Zagreb', '[LOC]', 'funicular', '.']

['Bern', 'Airport']
['Bern', 'Airport', '[LOC]', '[ORG]', ',', 'located', 'outside', 'the', 'city', 'near']
['[ORG]', '[LOC]', 'Bern', 'Airport', '[LOC]', '[ORG]', ',', 'located', 'outside', 'the', 'city', 'near']

['Bern-Belp', 'or', 'Belpmoos']
['Bern-Belp', '[MISC]', '[LOC]', 'or', '[LOC]', '[MISC]', 'Belpmoos', '[MISC]', '[LOC]', ',']
['[LOC]', '[MISC]', 'Bern-Belp', '[MISC]', '[LOC]', 'or', '[LOC]', '[MISC]', 'Belpmoos', '[MISC]', '[LOC]', ',']

['summer', 'time']
['summer', 'time', 'is', 'the', 'pra

17483it [00:23, 668.85it/s]

['sporting', 'goods']
['sporting', 'goods', 'makers', ',', 'and', 'other', 'businesses', 'benefit', 'from', 'extra']
['Retailers', ',', 'sporting', 'goods', 'makers', ',', 'and', 'other', 'businesses', 'benefit', 'from', 'extra']

['the', 'National', 'Golf', 'Foundation']
['the', '[ORG]', 'National', 'Golf', 'Foundation', '[ORG]', 'estimated', 'the', 'extension', 'would']
[',', 'and', 'the', '[ORG]', 'National', 'Golf', 'Foundation', '[ORG]', 'estimated', 'the', 'extension', 'would']

['1984']
['1984', ',', '[ORG]', 'Fortune', 'magazine', '[ORG]', 'estimated', 'that', 'a', 'seven-week']
['.', 'In', '1984', ',', '[ORG]', 'Fortune', 'magazine', '[ORG]', 'estimated', 'that', 'a', 'seven-week']

['3', '%']
['3', '%', '.']
['by', 'about', '3', '%', '.']

['$', '30', 'million']
['$', '30', 'million', 'for', '[ORG]', '7-Eleven', '[ORG]', 'stores', ',', 'and']
['an', 'additional', '$', '30', 'million', 'for', '[ORG]', '7-Eleven', '[ORG]', 'stores', ',', 'and']

['after']
['after', 'dew', 'evap

17616it [00:23, 541.19it/s]

['date', '/', 'time']
['date', '/', 'time', 'calculations', 'from', 'data', 'derived', 'from', 'the', '[ORG]']
['base', 'their', 'date', '/', 'time', 'calculations', 'from', 'data', 'derived', 'from', 'the', '[ORG]']

['zoneinfo']
['zoneinfo', '.']
['known', 'as', 'zoneinfo', '.']

['historical', 'and', 'predicted', 'clock', 'shifts']
['historical', 'and', 'predicted', 'clock', 'shifts', '.', 'This', 'database', 'is', 'used']
['location', "'s", 'historical', 'and', 'predicted', 'clock', 'shifts', '.', 'This', 'database', 'is', 'used']

['system', 'maintenance']
['system', 'maintenance', '.', 'In', '[MISC]', 'Unix-like', '[MISC]', 'systems', 'the', '[MISC]']
['of', 'ordinary', 'system', 'maintenance', '.', 'In', '[MISC]', 'Unix-like', '[MISC]', 'systems', 'the', '[MISC]']

['TZ']
['TZ', '[MISC]', 'environment', 'variable', 'specifies', 'the', 'location', 'name', ',', 'as']
['the', '[MISC]', 'TZ', '[MISC]', 'environment', 'variable', 'specifies', 'the', 'location', 'name', ',', 'as']

['

17729it [00:23, 512.85it/s]

['an', 'approved', 'external', 'examiner']
['an', 'approved', 'external', 'examiner', ',', 'and', 'whose', 'standard', 'of', 'attainment']
['conducted', 'by', 'an', 'approved', 'external', 'examiner', ',', 'and', 'whose', 'standard', 'of', 'attainment']

['Visiting', 'Board']
['Visiting', '[ORG]', '[ORG]', 'Board', '[ORG]', '[ORG]', '"', 'from', 'the', '[ORG]']
['a', '"', 'Visiting', '[ORG]', '[ORG]', 'Board', '[ORG]', '[ORG]', '"', 'from', 'the', '[ORG]']

['European', 'Union']
['European', 'Union', 'Directives', '[ORG]', 'concerning', 'mutual', 'recognition', 'of', 'professional', 'qualifications']
['with', '[ORG]', 'European', 'Union', 'Directives', '[ORG]', 'concerning', 'mutual', 'recognition', 'of', 'professional', 'qualifications']

['1996']
['1996', '[ORG]', 'and', 'reenacted', 'as', 'the', '[MISC]', 'Architects', 'Act', '1997']
['Regeneration', 'Act', '1996', '[ORG]', 'and', 'reenacted', 'as', 'the', '[MISC]', 'Architects', 'Act', '1997']

['the', 'Housing', 'Grants', ',', 'Co

17902it [00:24, 676.67it/s]

['Congress']
['Congress', '[ORG]', '.', 'As', 'a', 'result', 'of', 'a', 'first', '[ORG]']
['by', '[ORG]', 'Congress', '[ORG]', '.', 'As', 'a', 'result', 'of', 'a', 'first', '[ORG]']

['Hoover', 'Commission']
['Hoover', 'Commission', '[ORG]', 'recommendation', ',', 'in', '1949', 'the', '[ORG]', 'National']
['first', '[ORG]', 'Hoover', 'Commission', '[ORG]', 'recommendation', ',', 'in', '1949', 'the', '[ORG]', 'National']

['subordinate', 'official']
['subordinate', 'official', 'to', 'the', '[ORG]', 'GSA', '[ORG]', 'Administrator', 'until', 'the']
['as', 'a', 'subordinate', 'official', 'to', 'the', '[ORG]', 'GSA', '[ORG]', 'Administrator', 'until', 'the']

['2007']
['2007', '.']
['end', 'in', '2007', '.']

['public', 'hearing']
['public', 'hearing', 'that', 'a', 'memorandum', 'of', 'understanding', 'between', '[ORG]', 'NARA']
['in', 'a', 'public', 'hearing', 'that', 'a', 'memorandum', 'of', 'understanding', 'between', '[ORG]', 'NARA']

['2010']
['2010', ',', 'Executive', '[MISC]', 'Order

17973it [00:24, 650.01it/s]

['2009']
['2009', 'forming', 'the', 'current', 'School', 'of', '[ORG]', 'Architecture', '[ORG]', ',']
['school', 'in', '2009', 'forming', 'the', 'current', 'School', 'of', '[ORG]', 'Architecture', '[ORG]', ',']

['DesignIntelligence']
['DesignIntelligence', '[ORG]', ',', 'which', 'annually', 'publishes', '"', '[ORG]', 'America', "'s"]
['journal', '[ORG]', 'DesignIntelligence', '[ORG]', ',', 'which', 'annually', 'publishes', '"', '[ORG]', 'America', "'s"]

['America', "'s", 'Best', 'Architecture', 'and', 'Design', 'Schools']
['America', "'s", '[ORG]', 'Best', '[MISC]', 'Architecture', '[MISC]', 'and', 'Design', 'Schools']
['"', '[ORG]', 'America', "'s", '[ORG]', 'Best', '[MISC]', 'Architecture', '[MISC]', 'and', 'Design', 'Schools']

['best', 'in', 'the', 'Midwest']
['best', 'in', 'the', '[LOC]', 'Midwest', '[LOC]', 'and', 'ranked', '11th', 'among']
['named', 'the', 'best', 'in', 'the', '[LOC]', 'Midwest', '[LOC]', 'and', 'ranked', '11th', 'among']

['11th']
['11th', 'among', 'all', 'un

18131it [00:24, 707.45it/s]

['bought', 'and', 'sold']
['bought', 'and', 'sold', '.', 'Corruption', 'undermines', 'the', 'legitimacy', 'of', 'government']
['offices', 'are', 'bought', 'and', 'sold', '.', 'Corruption', 'undermines', 'the', 'legitimacy', 'of', 'government']

['accountability']
['accountability', 'of', 'decision-makers', '.', 'Evidence', 'from', 'fragile', 'states', 'also', 'shows']
['level', 'of', 'accountability', 'of', 'decision-makers', '.', 'Evidence', 'from', 'fragile', 'states', 'also', 'shows']

['private']
['private', 'sector', ',', 'corruption', 'increases', 'the', 'cost', 'of', 'business', 'through']
['In', 'the', 'private', 'sector', ',', 'corruption', 'increases', 'the', 'cost', 'of', 'business', 'through']

['bureaucracy']
['bureaucracy', ',', 'the', 'availability', 'of', 'bribes', 'can', 'also', 'induce', 'officials']
['by', 'cutting', 'bureaucracy', ',', 'the', 'availability', 'of', 'bribes', 'can', 'also', 'induce', 'officials']

['Openly', 'removing', 'costly', 'and', 'lengthy', 're

18272it [00:24, 611.64it/s]

['privatized', 'less']
['privatized', 'less', '.']
['countries', 'that', 'privatized', 'less', '.']

['principle', 'of', 'subsidiarity']
['principle', 'of', 'subsidiarity', 'is', 'applied', ':', 'a', 'government', 'service', 'should']
[',', 'the', 'principle', 'of', 'subsidiarity', 'is', 'applied', ':', 'a', 'government', 'service', 'should']

['embezzlement']
['embezzlement', ',', 'because', 'even', 'small', 'sums', 'missing', 'will', 'be', 'noticed']
['instances', 'discourages', 'embezzlement', ',', 'because', 'even', 'small', 'sums', 'missing', 'will', 'be', 'noticed']

['centralized']
['centralized', 'authority', ',', 'even', 'minute', 'proportions', 'of', 'public', 'funds', 'can']
['in', 'a', 'centralized', 'authority', ',', 'even', 'minute', 'proportions', 'of', 'public', 'funds', 'can']

['kleptocracy']
['kleptocracy', '.', 'Members', 'of', 'the', 'government', 'can', 'take', 'advantage', 'of']
['the', 'neologism', 'kleptocracy', '.', 'Members', 'of', 'the', 'government', 'can',

18406it [00:24, 623.07it/s]

['Gallo-Italic']
['Gallo-Italic', '[MISC]', 'language', ',', 'an', '[ORG]', '[MISC]', 'Eastern', 'Lombard', '[MISC]']
['a', '[MISC]', 'Gallo-Italic', '[MISC]', 'language', ',', 'an', '[ORG]', '[MISC]', 'Eastern', 'Lombard', '[MISC]']

['Neapolitan']
['Neapolitan', '[MISC]', 'language', ',', 'but', 'far', 'less', 'mutual', 'intelligibility', 'with']
['related', '[MISC]', 'Neapolitan', '[MISC]', 'language', ',', 'but', 'far', 'less', 'mutual', 'intelligibility', 'with']

['Sicilian', 'Gallo-Italic']
['Sicilian', 'Gallo-Italic', '[MISC]', ',', 'a', 'language', 'that', 'developed', 'in', 'isolated']
['speaking', '[MISC]', 'Sicilian', 'Gallo-Italic', '[MISC]', ',', 'a', 'language', 'that', 'developed', 'in', 'isolated']

['Florentine', 'Tuscan']
['Florentine', 'Tuscan', '[MISC]', 'language', '.', 'The', '[MISC]', 'Tuscan-based', '[MISC]', 'language']
['[MISC]', 'Latin-derived', 'Florentine', 'Tuscan', '[MISC]', 'language', '.', 'The', '[MISC]', 'Tuscan-based', '[MISC]', 'language']

['Latin

18542it [00:25, 638.02it/s]

['instrumental', 'performers']
['instrumental', 'performers', 'would', 'improvise', 'musical', 'ornaments', '.', '[MISC]', '[MISC]', 'J.S']
['vocal', 'and', 'instrumental', 'performers', 'would', 'improvise', 'musical', 'ornaments', '.', '[MISC]', '[MISC]', 'J.S']

['J.S', '.', 'Bach']
['J.S', '[MISC]', '[MISC]', '.', '[MISC]', '[PER]', 'Bach', '[PER]', '[MISC]', 'was']
['[MISC]', '[MISC]', 'J.S', '[MISC]', '[MISC]', '.', '[MISC]', '[PER]', 'Bach', '[PER]', '[MISC]', 'was']

['his', 'ability', 'to', 'improvise', 'melodies', 'in', 'different', 'styles']
['his', 'ability', 'to', 'improvise', 'melodies', 'in', 'different', 'styles', '.', 'During']
['noted', 'for', 'his', 'ability', 'to', 'improvise', 'melodies', 'in', 'different', 'styles', '.', 'During']

['the', 'mid-19th', 'century']
['the', 'mid-19th', 'century', '(', 'often', 'much', 'earlier', ')', 'and', 'codified']
['invented', 'before', 'the', 'mid-19th', 'century', '(', 'often', 'much', 'earlier', ')', 'and', 'codified']

['The'

18625it [00:25, 691.69it/s]

['The', 'common', 'practice', 'period']
['The', 'common', 'practice', 'period', 'is', 'when', 'many', 'of', 'the', 'ideas']
[]

['The', 'common', 'practice', 'period']
['The', 'common', 'practice', 'period', 'is', 'when', 'many', 'of', 'the', 'ideas']
[]

['Baroque', 'era']
['Baroque', '[MISC]', 'era', ',', 'running', 'from', 'roughly', '1600', 'to', 'the']
['the', '[MISC]', 'Baroque', '[MISC]', 'era', ',', 'running', 'from', 'roughly', '1600', 'to', 'the']

['around', '1820']
['around', '1820', '.', 'The', '[MISC]', 'Romantic', '[MISC]', 'era', 'ran', 'through']
['ending', 'roughly', 'around', '1820', '.', 'The', '[MISC]', 'Romantic', '[MISC]', 'era', 'ran', 'through']

['about', '1910']
['about', '1910', '.']
[',', 'ending', 'about', '1910', '.']

['Baroque']
['Baroque', '[MISC]', 'music', 'is', 'characterized', 'by', 'the', 'use', 'of', 'complex']
[]

['a', 'continuous', 'bass', 'line']
['a', 'continuous', 'bass', 'line', '.', 'Music', 'became', 'more', 'complex', 'in']
['continuo',

18762it [00:25, 639.82it/s]

['Kurt', 'Weill']
['Kurt', 'Weill', '[PER]', "'s", '[MISC]', 'The', 'Threepenny', 'Opera', '[MISC]', ',']
['by', '[PER]', 'Kurt', 'Weill', '[PER]', "'s", '[MISC]', 'The', 'Threepenny', 'Opera', '[MISC]', ',']

['popular']
['popular', 'music', 'of', 'the', 'composer', "'s", 'time', '.', 'Examples', 'include']
['material', 'from', 'popular', 'music', 'of', 'the', 'composer', "'s", 'time', '.', 'Examples', 'include']

['the', '1970s']
['the', '1970s', ',', 'and', 'the', 'musical', 'crossover', 'phenomenon', ',', 'where']
['put', 'since', 'the', '1970s', ',', 'and', 'the', 'musical', 'crossover', 'phenomenon', ',', 'where']

['the', 'musical', 'crossover', 'phenomenon']
['the', 'musical', 'crossover', 'phenomenon', ',', 'where', 'classical', 'musicians', 'have', 'achieved']
[',', 'and', 'the', 'musical', 'crossover', 'phenomenon', ',', 'where', 'classical', 'musicians', 'have', 'achieved']

['heavy', 'metal']
['heavy', 'metal', ',', 'a', 'number', 'of', 'lead', 'guitarists', '(', 'playing'

18913it [00:25, 684.16it/s]

['westernmost', 'Croatia']
['westernmost', '[LOC]', 'Croatia', '[LOC]', ',', '[LOC]', 'Dalmatinci', '[LOC]', 'in', 'southern']
['[LOC]', 'in', 'westernmost', '[LOC]', 'Croatia', '[LOC]', ',', '[LOC]', 'Dalmatinci', '[LOC]', 'in', 'southern']

['Adriatic', 'islands']
['Adriatic', 'islands', '[LOC]', ',', '[LOC]', 'Vlaji', '[LOC]', 'in', 'hinterland', 'of']
['in', '[LOC]', 'Adriatic', 'islands', '[LOC]', ',', '[LOC]', 'Vlaji', '[LOC]', 'in', 'hinterland', 'of']

['hinterland', 'of', 'Dalmatia']
['hinterland', 'of', '[LOC]', 'Dalmatia', '[LOC]', ',', '[LOC]', 'Slavonci', '[LOC]', 'in']
['[LOC]', 'in', 'hinterland', 'of', '[LOC]', 'Dalmatia', '[LOC]', ',', '[LOC]', 'Slavonci', '[LOC]', 'in']

['R1a1a', '[', 'M17', ']', 'and', 'I2a2a']
['R1a1a', '[MISC]', '[', 'M17', ']', 'and', '[MISC]', 'I2a2a', '[MISC]', '[']
[':', '[MISC]', 'R1a1a', '[MISC]', '[', 'M17', ']', 'and', '[MISC]', 'I2a2a', '[MISC]', '[']

['63.39', '%']
['63.39', '%', 'in', 'the', '[MISC]', 'Sorbs', '[MISC]', ',', 'through',

19104it [00:25, 723.84it/s]

['Henry', 'VI']
['Henry', 'VI', '[PER]', ',', 'granted', 'on', '9', 'March', '1446', '/']
['of', '[PER]', 'Henry', 'VI', '[PER]', ',', 'granted', 'on', '9', 'March', '1446', '/']

['29', 'June', '1199']
['29', 'June', '1199', '.', 'The', 'definition', 'of', 'the', 'port', 'of']
['[LOC]', 'on', '29', 'June', '1199', '.', 'The', 'definition', 'of', 'the', 'port', 'of']

['Charles', 'I']
['Charles', 'I', '[PER]', 'by', 'at', 'once', 'the', 'formal', 'separation', 'from']
['of', '[PER]', 'Charles', 'I', '[PER]', 'by', 'at', 'once', 'the', 'formal', 'separation', 'from']

['1640']
['1640', 'the', 'formal', 'title', 'of', 'the', 'town', 'became', "'", '[LOC]']
['27', 'June', '1640', 'the', 'formal', 'title', 'of', 'the', 'town', 'became', "'", '[LOC]']

['Victorian', 'period']
['Victorian', '[MISC]', 'period', 'which', 'from', 'about', '1888', 'saw', 'the', 'setting']
['later', '[MISC]', 'Victorian', '[MISC]', 'period', 'which', 'from', 'about', '1888', 'saw', 'the', 'setting']

['Hampshire'

19351it [00:26, 741.79it/s]

['English', 'National', 'Ballet']
['English', 'National', 'Ballet', '[ORG]', '.', 'There', 'is', 'also', 'the', '[ORG]']
['and', '[ORG]', 'English', 'National', 'Ballet', '[ORG]', '.', 'There', 'is', 'also', 'the', '[ORG]']

['Nuffield', 'Theatre']
['Nuffield', 'Theatre', '[ORG]', 'based', 'at', 'the', '[ORG]', 'University', 'of', 'Southampton']
['the', '[ORG]', 'Nuffield', 'Theatre', '[ORG]', 'based', 'at', 'the', '[ORG]', 'University', 'of', 'Southampton']

['The', 'Southampton', 'City', 'Art', 'Gallery']
['The', '[ORG]', 'Southampton', 'City', 'Art', 'Gallery', '[ORG]', 'at', 'the', '[LOC]']
['city', '.', 'The', '[ORG]', 'Southampton', 'City', 'Art', 'Gallery', '[ORG]', 'at', 'the', '[LOC]']

['a', 'space']
['a', 'space', '"', '.', 'A', 'space', 'also', 'run', 'the', 'Art']
['organisation', '"', 'a', 'space', '"', '.', 'A', 'space', 'also', 'run', 'the', 'Art']

['Art', 'Vaults']
['Art', '[ORG]', 'Vaults', '[ORG]', 'project', ',', 'which', 'creatively', 'uses', 'several']
['run', 't




KeyboardInterrupt: 

In [269]:
errors

39963

In [211]:
    """
    Find last index of answer in the updated context
    """
    new_last_index = new_index
    searched_answer = copy.deepcopy(original_answer_list)
    #print(index, new_index)
    #print(updated_context)
    #print(searched_answer)
    #print(updated_context[new_index:new_index+10])

    searching_answer_iter = -1

    while len(searched_answer) != 0 or (updated_context[new_last_index] in NER_TAGS):
        #print(searched_answer)
        searching_answer_iter += 1
        if len(searched_answer) != 0:
            if searched_answer[0] in updated_context[new_last_index]:
                searched_answer.pop(0)
                new_last_index += 1

        try:
            if updated_context[new_last_index] in NER_TAGS:
                new_last_index += 1
        except:
            pass
            
        if new_last_index == len(updated_context) or searching_answer_iter == 100:
            break

    original_answer = tokens[index:index+len(original_answer_list)]
    updated_answer = updated_context[new_index:new_last_index]
    #print(original_answer, updated_answer)

    """
    Count how many errors we have
    """
    if original_answer != [token for token in updated_answer if token[0] not in NER_TAGS]:
        errors +=1

0

In [17]:
special_characters = ["$", ",", "''", "-LRB-", "-RRB-", ".", ":", "``"]


def get_proc_pos_between_item(pos_context, context, answers):
    positions = {}
    copy_answer = []
    i = 0
    # answers["answer_start"].sort()
    # answers["answer_start"].sort()
    
    for answer in answers["answer_start"]:
        answer_position = find_start_position(answers["text"][i], pos_context, answer, context)
        if answer_position != -1:
            positions[answer_position] = answers["answer_start"][i]
            copy_answer.append(answers["answer_start"][i])
        i += 1
    pos = ""
    offset = 0
    offsets = {}
    # print(positions)
    for index in range(len(pos_context)):
        pos_text = pos_context[index][str(index)]

        if pos_text[1] in special_characters:
            pos_text[1] = "sym"
        if index in positions.keys():
            offsets[positions[index]] = offset
        pos += f"{pos_text[0]} [{pos_text[1].lower()}] "
        offset += len(f" [{pos_text[1]}] ")

    answer_starts = []
    # print(copy_answer)
    for index in range(len(copy_answer)):
        answer = copy_answer[index]
        if answer == 0:
            answer_starts.append(answer)
            continue
        spaces_no = context[:answer].count(" ") + 1
        if answer not in offsets.keys():
            continue
        else:
            answer += offsets[answer] - spaces_no
        if pos[answer : answer + 1] == " ":
            answer += 1
        # if len(answers["text"]) > index:
        #     if len(answers["text"][index]) > 1:
        #         if pos[answer][0] == answers["text"][index][1]:
        #             answer -= 1
        answer_starts.append(answer)

    return pos, answer_starts


In [402]:
example_id = 84

pos, answer_starts = get_proc_pos_between_item(data[example_id]['POS_context'], data[example_id]['context'],
                          data[example_id]['answers'])
pos, answer_starts

('The [dt] university [nn] is [vbz] affiliated [vbn] with [in] the [dt] Congregation [nnp] of [in] Holy [nnp] Cross [nnp] ( [sym] Latin [nnp] : [sym] Congregatio [nnp] a [dt] Sancta [nnp] Cruce [nnp] , [sym] abbreviated [vbn] postnominals [nns] : [sym] " [sym] CSC [nnp] ") [sym] . [sym] While [in] religious [jj] affiliation [nn] is [vbz] not [rb] a [dt] criterion [nn] for [in] admission [nn] , [sym] more [jjr] than [in] 93 [cd] % [nn] of [in] students [nns] identify [vbp] as [in] Christian [nnp] , [sym] with [in] over [in] 80 [cd] % [nn] of [in] the [dt] total [nn] being [vbg] Catholic [jj] . [sym] Collectively [rb] , [sym] Catholic [nnp] Mass [nnp] is [vbz] celebrated [vbn] over [in] 100 [cd] times [nns] per [in] week [nn] on [in] campus [nn] , [sym] and [cc] a [dt] large [jj] campus [nn] ministry [nn] program [nn] provides [vbz] for [in] the [dt] faith [nn] needs [nns] of [in] the [dt] community [nn] . [sym] There [ex] are [vbp] multitudes [nns] of [in] religious [jj] statues [nns] a

In [403]:
pos[554:]

'[dt] total [nn] being [vbg] Catholic [jj] . [sym] Collectively [rb] , [sym] Catholic [nnp] Mass [nnp] is [vbz] celebrated [vbn] over [in] 100 [cd] times [nns] per [in] week [nn] on [in] campus [nn] , [sym] and [cc] a [dt] large [jj] campus [nn] ministry [nn] program [nn] provides [vbz] for [in] the [dt] faith [nn] needs [nns] of [in] the [dt] community [nn] . [sym] There [ex] are [vbp] multitudes [nns] of [in] religious [jj] statues [nns] and [cc] artwork [nn] around [in] campus [nn] , [sym] most [rbs] prominent [jj] of [in] which [wdt] are [vbp] the [dt] statue [nn] of [in] Mary [nnp] on [in] the [dt] Main [nnp] Building [nnp] , [sym] the [dt] Notre [nnp] Dame [nnp] Grotto [nnp] , [sym] and [cc] the [dt] Word [nnp] of [in] Life [nnp] mural [nn] on [in] Hesburgh [nnp] Library [nnp] depicting [vbg] Christ [nnp] as [in] a [dt] teacher [nn] . [sym] Additionally [rb] , [sym] every [dt] classroom [nn] displays [vbz] a [dt] crucifix [nn] . [sym] There [ex] are [vbp] many [jj] religious [jj]

In [404]:
for item in tqdm.tqdm(data):
    context = item['context']
    question = item['question']
    item['context'], item['answers']['answer_start'] = get_proc_pos_between_item(item['POS_context'], context, item['answers'])

    item.pop('POS_context', None)
    item.pop('POS_question', None)
    item.pop('NER_question', None)
    item.pop('NER_context', None)

100%|██████████| 87599/87599 [01:00<00:00, 1458.73it/s]


In [405]:
incorrect_answers_no = 0

for index, line in tqdm.tqdm(enumerate(data)):
    try:
        if len(line["answers"]["answer_start"]) == 0:
            continue
        answer_start = line["answers"]["answer_start"][0]
        answer = line["answers"]["text"][0]

        if answer_start == -1:
            continue

        if line["context"][answer_start:][0] != answer[0]:
            # line["answers"]["answer_start"][0] -= 1
            incorrect_answers_no += 1
            #print(index, line["context"][answer_start:], 4*'-', answer, answer_start)

    except Exception as e: 
        print(e)

87599it [00:00, 183267.93it/s]


In [406]:
incorrect_answers_no = 0

for i, line in tqdm.tqdm(enumerate(data)):
    try:
        if(len(line["answers"]["answer_start"]) == 0):
            line["answers"]["answer_start"].append(0)
        answer_start = line["answers"]["answer_start"][0]
        answer =  line["answers"]["text"][0]
        answer_length = len(answer)

        if line["context"][answer_start:][0] != answer[0]:
            incorrect_answers_no += 1

    except Exception as e: 
        # print(e)
        incorrect_answers_no += 1
        # print(i, answer_start, line["context"][answer_start:], 4*'-', answer)

87599it [00:00, 270431.26it/s]


In [407]:
data[292]

{'id': '56bf6e823aeaaa14008c9627',
 'title': 'Beyoncé',
 'context': 'Following [vbg] the [dt] disbandment [nn] of [in] Destiny [nnp] \'s [pos] Child [nnp] in [in] June [nnp] 2005 [cd] , [sym] she [prp] released [vbd] her [prp$] second [jj] solo [jj] album [nn] , [sym] B\'Day [nnp] ( [sym] 2006 [cd] ) [sym] , [sym] which [wdt] contained [vbd] hits [nns] " [sym] Déjà [nnp] Vu [nnp] " [sym] , [sym] " [sym] Irreplaceable [nnp] " [sym] , [sym] and [cc] " [sym] Beautiful [nnp] Liar [nnp] " [sym] . [sym] Beyoncé [nnp] also [rb] ventured [vbd] into [in] acting [nn] , [sym] with [in] a [dt] Golden [nnp] Globe-nominated [nnp] performance [nn] in [in] Dreamgirls [nnps] ( [sym] 2006 [cd] ) [sym] , [sym] and [cc] starring [vbg] roles [nns] in [in] The [dt] Pink [nnp] Panther [nnp] ( [sym] 2006 [cd] ) [sym] and [cc] Obsessed [nnp] ( [sym] 2009 [cd] ) [sym] . [sym] Her [prp$] marriage [nn] to [in] rapper [nn] Jay [nnp] Z [nnp] and [cc] portrayal [nn] of [in] Etta [nnp] James [nnp] in [in] Cadillac [n

In [408]:
incorrect_answers_no

2390

In [409]:
write_data("./squad_data_validation_pos_between.json", data)

Data with 'Test' property added has been saved to ./squad_data_validation_pos_between.json


In [410]:
proc_data = read_data('./squad_data_validation_pos_between.json')

In [411]:
def rreplace(string, substring, replacement):
    k = string.rfind(substring)
    return string[:k] + replacement + string[k+len(substring):]

In [412]:
symbols = ["'", '.', ',', ')']


def get_updated_answer(context, answers):
    new_answers = []
    if len(answers['answer_start']) == 0:
        return

    for index, answer in enumerate(answers['text']):
        new_answer = ''
        if len(answers['answer_start']) < index + 1:
            return new_answers
        answer_start = answers['answer_start'][index]
        if answer_start != -1:
            tokenized_context = Sentence(context[answer_start:])
            no_of_words = len(Sentence(answer))
            max_length = no_of_words * 3 + no_of_words
            if max_length > len(tokenized_context):
                max_length = len(tokenized_context)
            minus_words = 0
            for token_index in range(max_length):
                # if token_index == max_length - minus_words - 1:
                #     break
                if (tokenized_context[token_index].text == '[' and tokenized_context[token_index+1].text != '[') or (len(tokenized_context) > token_index+1 and tokenized_context[token_index].text!=']' and tokenized_context[token_index+1].text==']') or token_index == no_of_words * 4 - 1:
                    new_answer += tokenized_context[token_index].text
                elif tokenized_context[token_index].text in symbols and tokenized_context[token_index+1].text != '[':
                    new_answer += tokenized_context[token_index].text
                elif (tokenized_context[token_index].text.replace('.', '').isalpha() or tokenized_context[token_index].text[-1] in '."') and tokenized_context[token_index+1].text in '.$]':
                    new_answer += tokenized_context[token_index].text
                    # minus_words += 1
                elif len(tokenized_context) > token_index+1 and tokenized_context[token_index].text.replace('.', '').isnumeric() and tokenized_context[token_index+1].text in '.$]':
                    new_answer += tokenized_context[token_index].text
                elif tokenized_context[token_index].text.replace('-', '').isalpha() and tokenized_context[token_index+1].text in '.$]':
                    new_answer += tokenized_context[token_index].text
                elif len(tokenized_context) > token_index+1 and tokenized_context[token_index+1].text[0].isnumeric() and tokenized_context[token_index].text in '-':
                    new_answer += tokenized_context[token_index].text
                else:
                    new_answer += tokenized_context[token_index].text + ' '
        new_answer = new_answer.rstrip()
        if answer_start != -1:
            tokens_new_answer = Sentence(new_answer)
            # print(tokens_new_answer[-2])
            if tokens_new_answer[-2].text == '[':
                # print(tokens_new_answer[-2])
                new_answer += ']'
            if tokens_new_answer[-1].text == '[':
                #print(tokens_new_answer[-1], tokens_new_answer[-2].text)
                new_answer = rreplace(new_answer, tokens_new_answer[-1].text, '')
                new_answer = rreplace(new_answer, tokens_new_answer[-2].text, '')
                new_answer = new_answer.rstrip()
        new_answers.append(new_answer)
    return new_answers

In [413]:
example_id = 4892

In [414]:
get_updated_answer(proc_data[example_id]['context'], proc_data[example_id]['answers'])

['a [dt] U.S. [nnp] inventor [nn] , [sym] engineer [nn] and [cc] solar [jj] energy [nn] pioneer [nn]']

In [None]:
for item in tqdm.tqdm(proc_data):
    item['answers']['text'] = get_updated_answer(item['context'], item['answers'])

 93%|█████████▎| 81573/87599 [03:34<00:16, 362.93it/s]

In [391]:
incorrect_answers_no = 0

for index, line in tqdm.tqdm(enumerate(proc_data)):
    try:
        if len(line["answers"]["text"]) == 0:
            line["answers"]["text"] = ['']
        if len(line["answers"]["answer_start"]) == 0:
            line["answers"]["text"] = [0]
        answer_start = line["answers"]["answer_start"][0]
        answer = line["answers"]["text"][0]
        answer_length = len(answer)

        if answer_start == -1:
            continue

        if line["context"][answer_start: answer_start + answer_length] != answer:
            incorrect_answers_no += 1
            print(index, line["context"][answer_start: answer_start + answer_length],
                  "-" * 4, answer)

        # if answer[-1] != ']':
        #     print(index, line["context"][answer_start: answer_start + answer_length], "-" * 4, answer)
        #     incorrect_answers_no += 1

    except Exception as e: 
        print(e)

10570it [00:00, 284819.05it/s]

717 :28 [cd]  ---- : 28 [cd]
3615 oxygen-18. [nn]  ---- oxygen-18 . [nn]
4006 -Gemini [jj]  ---- - Gemini [jj]





In [392]:
incorrect_answers_no / len(proc_data) * 100

0.028382213812677387

In [393]:
write_data("./squad_data_train_pos_ner_answer_updated.json", proc_data)

Data with 'Test' property added has been saved to ./squad_data_validation_pos_ner_answer_updated.json
