In [3]:
from typing import List
 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import math
import torch
from torch.utils.data import Dataset, DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


### Installing Stanza

Installing and importing Stanza are as simple as running the following commands:

In [4]:
# Install stanza
%%capture
!pip install stanza

# Import stanza
import stanza
stanza.download('en')

### Setting up Stanford CoreNLP


In [5]:
# Download the Stanford CoreNLP package with Stanza's installation command
# This'll take several minutes, depending on the network speed
corenlp_dir = './corenlp'
stanza.install_corenlp(dir=corenlp_dir)

# Set the CORENLP_HOME environment variable to point to the installation location
import os
os.environ["CORENLP_HOME"] = corenlp_dir

## 2. Load CoreNLP Interface

### Constructing CoreNLPClient



In [6]:
# Import client module
from stanza.server import CoreNLPClient

In [7]:
# Construct a CoreNLPClient with some basic annotators, a memory allocation of 4GB, and port number 9001
client = CoreNLPClient(
    annotators=['dcoref'], 
    memory='6G', 
    endpoint='http://localhost:9001',
    outputFormat="json",
    
    be_quiet=False)
print(client)

# Start the background server and wait for some time
# Note that in practice this is totally optional, as by default the server will be started when the first annotation is performed
client.start()
import time; time.sleep(10)

<stanza.server.client.CoreNLPClient object at 0x7f920cd6fb10>


In [8]:
# Initialize tokenizer used by dcoref model
nlp = stanza.Pipeline(lang='en', processors='tokenize', use_gpu= True)

# Load ECB+ Data

In [9]:
import random
import json
import pandas as pd
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

# Make sure to click "Add shortcut to drive" for the "Coref-for-GPT" folder
gdrive_dir_path = "/content/drive/MyDrive/Coref-for-GPT"

Mounted at /content/drive


In [10]:
# Change this path to the root dir where you stored data
root_path = gdrive_dir_path

In [11]:
# Path to the ecb data
ecb_path = f"{root_path}/Data/ECB+/processed/"

In [12]:
file_path = ecb_path + "dev_with_new_index.json"
with open(file_path) as f:
    dev = json.load(f)
print(len(dev))

192


# Parse and Match Data

In [13]:
# Parse gold mentions to get corresponding clusters
def get_gold_clusters(gold_mentions):
    clusters = {}
    for m_id in gold_mentions:
        mention = gold_mentions[m_id]
        cluster_id = mention["cluster_id"]
        if cluster_id in clusters:
            clusters[cluster_id].append(m_id)
        else:
            clusters[cluster_id] = [m_id]
    return clusters

In [14]:
# Parse annotation results from CoreNLP
def get_vocab(annotated_doc):
    vocab = [] 
    sents_divider = []
    idx = 0
    for sent in annotated_doc.sentence:
        for tok in sent.token:
            vocab.append(tok.word)
            idx += 1
        sents_divider.append(idx)
    return vocab, sents_divider

def get_sentence_id(idx, sents_divider):
    # idx starts from 0
    for i, divider in enumerate(sents_divider):
        if idx < divider:
            return i
    raise Exception(f"Error: {idx} not covered by {sents_divider}")

def get_pred_mentions_and_clusters(annotated_doc, vocab, sents_divider, map):
    mentions = {}
    clusters = {}
    
    for chain in annotated_doc.corefChain:
        cluster_id = chain.chainID
        mentions_in_chain = chain.mention
        
        mention_ids_in_cluster = []

        for m in mentions_in_chain:
            s_id = m.sentenceIndex 
            if s_id > 0:
                start = sents_divider[s_id-1]
            else:
                start = 0
            
            m_id = str([start+m.beginIndex, start+m.endIndex])
            tokens_ids = list(range(start+m.beginIndex, start+m.endIndex))
            
            tokens = []
            gold_tokens_ids = []
            for t_id in tokens_ids:
                tokens.append(vocab[t_id])
                gold_tok_id = map[t_id]
                gold_tokens_ids.append(gold_tok_id)

            mentions[m_id] = {"sentence_id": s_id , 
                              "tokens": tokens,
                              "tokens_ids": tokens_ids,
                              "gold_tokens_ids": gold_tokens_ids,
                              }
            mention_ids_in_cluster.append(m_id)

        clusters[cluster_id] = mention_ids_in_cluster

    return mentions,clusters


In [15]:
# Match tokens from CoreNLP with original gold tokens
def get_token_from_nlp(doc):
    tokens = []
    for i, sentence in enumerate(doc.sentences):
        tokens += [token.text for token in sentence.tokens]
    return tokens

def reproduce_tokens_from_gold(gold_tokens, vocab_set, nlp):
    new_pred_vocab = []
    map_pred_to_gold = []
    for i, token in enumerate(gold_tokens):
        if token in vocab_set:
            new_pred_vocab.append(token)
            map_pred_to_gold.append(i)
        else:
            # CoreNLP tokenizer might split original token into multiple tokens
            doc = nlp(token)
            new_tokens = get_token_from_nlp(doc)
            if new_tokens:
                new_pred_vocab += new_tokens
                map_pred_to_gold += [i]*len(new_tokens)
    return new_pred_vocab, map_pred_to_gold

# Sanity Check: make sure the reproduce tokens match the CoreNLP tokens
def check_same_vocabs(new_vocab, old_vocab):
    assert len(new_vocab) == len(old_vocab), "Error: Not in the same length."
    for i, tok in enumerate(new_vocab):
        assert tok == old_vocab[i], f"Error: {tok} doesn't match {old_vocab[i]} at {i}."
    return True

In [16]:
# Match predicted mentions with gold mentions, ignoring mentions that are not gold
def match(m_token_ids, gold_mentions, threshold):
    m_token_ids = set(m_token_ids)
    for m_id in gold_mentions:
        mention = gold_mentions[m_id]
        gold_token_ids = set(mention["tokens_ids"])
        intersec = gold_token_ids.intersection(m_token_ids)
        matched_ratio = len(intersec)/len(gold_token_ids)
        if matched_ratio > threshold:
            return m_id
    return -1

def matched_gold_mentions(mentions, gold_mentions, threshold = 0.5):
    mention_map = {}
    matched_mentions = []
    for m_id in mentions:
        m = mentions[m_id]
        matched_gold_m_id = match(m["gold_tokens_ids"], gold_mentions, threshold)
        mention_map[m_id] = matched_gold_m_id
        if matched_gold_m_id != -1:
            matched_mentions.append(matched_gold_m_id)
    return mention_map, matched_mentions

# Build clusters based on cluster predictions of gold mentions
def pred_clusters_with_gold_mentions(pred_clusters, mention_map):
    new_clusters = {}
    for i, cluster in pred_clusters.items():
        gold_mentions_in_cluster = []
        for m_id in cluster:
            if mention_map[m_id] != -1:
                gold_mentions_in_cluster.append(mention_map[m_id])
        if gold_mentions_in_cluster:
            new_clusters[i] = gold_mentions_in_cluster
    return new_clusters

In [17]:
# # Test case
# doc_name = "35_10ecb.xml"
# dev[doc_name][]

In [18]:
# # Test case
# doc_name = "35_10ecb.xml"
# text, gold_toks, gold_mentions, gold_sents_divider = dev[doc_name]
# gold_clusters = get_gold_clusters(gold_mentions)
# annotated_doc = client.annotate(" ".join(gold_toks))

In [19]:
# pred_vocab, pred_sents_divider = get_vocab(annotated_doc)
# pred_vocab_set = set(pred_vocab)
# new_vocab, map_pred_to_gold = reproduce_tokens_from_gold(gold_toks, pred_vocab_set, nlp)
# check_same_vocabs(new_vocab, pred_vocab)

In [20]:
# pred_mentions, pred_clusters = get_pred_mentions_and_clusters(annotated_doc, pred_vocab, pred_sents_divider, map_pred_to_gold)
# mention_map, matched_mentions = matched_gold_mentions(pred_mentions, gold_mentions, threshold = 0.5)
# print(len(gold_mentions), len(matched_mentions))

# Generate Pairwise Annotation for Gold Mentions

In [21]:
def get_pairwise_labels(mentions_ids, clusters):
    pairs = []
    labels = []
    for i in range(len(mentions_ids)-1):
        for j in range(i+1, len(mentions_ids)):
            m1_id = mentions_ids[i]
            m2_id = mentions_ids[j]
            pair = set([m1_id, m2_id])
            label = 0
            for cluster_mentions in clusters.values():
                if pair.issubset(set(cluster_mentions)):
                    label = 1
                    break
            pairs.append(pair)
            labels.append(label)
            
    pairs, labels = np.array(pairs), np.array(labels)
    return pairs, labels

In [22]:
# # Test case
# pred_clusters_gold_mentions = pred_clusters_with_gold_mentions(pred_clusters, mention_map)
# pairs, labels = get_pairwise_labels(list(gold_mentions.keys()), gold_clusters)
# _, predictions = get_pairwise_labels(list(gold_mentions.keys()), pred_clusters_gold_mentions)

# print("Labels: ", labels)
# print("Preds: ", predictions)

In [23]:
def get_result(gold_clusters, pred_clusters, gold_mentions, text, sentence_restriction = 2):
    # gold
    gold_pairs, gold_labels = get_pairwise_labels(list(gold_mentions.keys()), gold_clusters)
    gold_sents = text.split("[EOS]")
    df = pd.DataFrame([gold_pairs, gold_labels]).T
    df.columns = ["pair", "label"]

    # predictions
    _, pred_labels = get_pairwise_labels(list(gold_mentions.keys()), pred_clusters)
    df["pred"] = pred_labels
    
    # pair information
    df["mention_pair"] = df["pair"].apply(lambda x: [gold_mentions[list(x)[0]], gold_mentions[list(x)[1]]])
    df["pair"] = df["pair"].astype(str)

    # sentence_restriction=2: only keep pairs from consecutive sentences
    df["sent_idx"] = df["mention_pair"].apply(lambda x: [x[0]["sentence_id"], x[1]["sentence_id"]])
    df["sentence"] = df["sent_idx"].apply(lambda x: [gold_sents[x[0]],gold_sents[x[1]]])
    df["sent_filter"] = df["sent_idx"].apply(lambda x: np.abs(x[1]-x[0]))
    if sentence_restriction:
        df = df[df["sent_filter"] < sentence_restriction].reset_index(drop = True)
 
    return df

# Annotate Data 

In [24]:
def annotate(data, matching_threshold = 0.5):
    n_skipped = 0
    for doc_name in tqdm(data):
        print(f"Parsing ----> {doc_name}")
        text, gold_toks, gold_mentions, _, = data[doc_name]
        gold_clusters = get_gold_clusters(gold_mentions)
        try:
            annotated_doc = client.annotate(" ".join(gold_toks))
        except:
            print(f"Skip (client issue) ----> {doc_name}")
            continue
        pred_vocab, pred_sents_divider = get_vocab(annotated_doc)
        pred_vocab_set = set(pred_vocab)
        new_vocab, map_pred_to_gold = reproduce_tokens_from_gold(gold_toks, pred_vocab_set, nlp)
        try:
            check_same_vocabs(new_vocab, pred_vocab)
        except:
            print("Not matching: ", new_vocab, pred_vocab)
            print(f"Skip ----> {doc_name}")
            n_skipped += 1
            continue
        
        pred_mentions, pred_clusters = get_pred_mentions_and_clusters(annotated_doc, pred_vocab, pred_sents_divider, map_pred_to_gold)
        mention_map, matched_mentions = matched_gold_mentions(pred_mentions, gold_mentions, threshold = matching_threshold)

        pred_clusters_gold_mentions = pred_clusters_with_gold_mentions(pred_clusters, mention_map)
        result = get_result(gold_clusters, pred_clusters_gold_mentions, gold_mentions, text, sentence_restriction = 2)
        result["doc_name"] = doc_name

        output_file = f"{root_path}/Results/Multi-pass-Sieve/pairwise_result.csv"
        result.to_csv(output_file, mode="a", index=False, header=None)

    columns = result.columns
    
    print(f"Files saved with headers {columns}")
    return n_skipped

In [25]:
n_skipped = annotate(dev)
print(f"skipped {n_skipped} in total")

  0%|          | 0/192 [00:00<?, ?it/s]

Parsing ----> 35_10ecb.xml


  1%|          | 1/192 [00:31<1:40:12, 31.48s/it]

Parsing ----> 35_11ecbplus.xml


  1%|          | 2/192 [00:38<54:47, 17.30s/it]  

Parsing ----> 35_10ecbplus.xml


  2%|▏         | 3/192 [00:45<39:41, 12.60s/it]

Parsing ----> 35_1ecb.xml


  2%|▏         | 4/192 [00:50<29:43,  9.49s/it]

Parsing ----> 35_1ecbplus.xml


  3%|▎         | 5/192 [01:00<29:58,  9.62s/it]

Parsing ----> 35_2ecb.xml


  3%|▎         | 6/192 [01:10<30:24,  9.81s/it]

Parsing ----> 35_3ecbplus.xml


  4%|▎         | 7/192 [01:22<32:30, 10.54s/it]

Parsing ----> 35_3ecb.xml


  4%|▍         | 8/192 [01:25<25:11,  8.21s/it]

Parsing ----> 35_5ecbplus.xml


  5%|▍         | 9/192 [01:27<18:48,  6.17s/it]

Parsing ----> 35_4ecb.xml


  5%|▌         | 10/192 [01:35<20:33,  6.77s/it]

Parsing ----> 35_5ecb.xml


  6%|▌         | 11/192 [01:37<15:32,  5.15s/it]

Parsing ----> 35_4ecbplus.xml


  6%|▋         | 12/192 [01:43<16:14,  5.41s/it]

Parsing ----> 35_7ecb.xml


  7%|▋         | 13/192 [01:45<13:09,  4.41s/it]

Parsing ----> 35_7ecbplus.xml


  7%|▋         | 14/192 [01:48<11:43,  3.95s/it]

Not matching:  ['http://www.tsn', '.', 'ca', '/', 'mma', '/', 'story', '/', '?', 'id=265672', 'CHARGERS', "'", 'WILLIAMS', 'ARRESTED', 'ON', 'SUSPICION', 'OF', 'DUI', '2/3/2009', '5', ':', '59', ':', '13', 'PM', 'San', 'Diego', 'Chargers', 'defensive', 'tackle', 'Jamal', 'Williams', 'was', 'arrested', 'on', 'suspicion', 'of', 'drunk', 'driving', ',', 'the', 'team', "'s", 'second', 'such', 'arrest', 'in', 'less', 'than', 'a', 'month', '.', 'Williams', 'was', 'pulled', 'over', 'for', 'speeding', 'early', 'Sunday', 'on', 'a', 'freeway', 'outside', 'the', 'city', "'s", 'downtown', 'area', ',', 'the', 'California', 'Highway', 'Patrol', 'said', '.', 'Williams', 'gave', 'a', 'blood', 'sample', 'for', 'a', 'blood', '-', 'alcohol', 'test', ',', 'but', 'the', 'results', 'were', 'not', 'yet', 'known', '.', 'Williams', 'was', 'booked', 'into', 'county', 'jail', 'and', 'later', 'released', 'on', 'bail', '.', 'The', 'City', 'Attorney', "'s", 'office', 'said', 'the', 'case', 'had', "n't", 'been', 'su

  8%|▊         | 15/192 [01:51<11:22,  3.86s/it]

Parsing ----> 35_6ecb.xml


  8%|▊         | 16/192 [01:52<08:45,  2.99s/it]

Parsing ----> 35_8ecbplus.xml


  9%|▉         | 17/192 [01:54<07:37,  2.61s/it]

Parsing ----> 35_9ecb.xml


  9%|▉         | 18/192 [01:56<06:39,  2.30s/it]

Not matching:  ['SAN', 'Diego', 'Chargers', 'receiver', 'Vincent', 'Jackson', 'has', 'been', 'arrested', 'on', 'suspicion', 'of', 'drink', '-', 'driving', 'five', 'days', 'before', 'a', 'key', 'NFL', 'playoff', 'game', '.', 'Police', 'arested', 'him', 'in', 'San', 'Diego', 'at', '2.30am', 'and', 'booked', 'him', 'before', 'releasing', 'him', '.', 'The', 'Chargers', 'are', 'due', 'to', 'meet', 'Pittsburgh', 'Steelers', 'in', 'a', 'playoff', 'match', 'on', 'Sunday', '.', 'Victory', 'in', 'that', 'game', 'would', 'see', 'them', 'meet', 'either', 'Tennessee', 'or', 'Baltimore', 'for', 'a', 'spot', 'in', 'next', 'month', "'s", 'Super', 'Bowl', '.', 'Jackson', 'is', 'enjoying', 'the', 'best', 'of', 'his', 'four', 'NFL', 'seasons', 'in', 'the', 'current', 'campaign', 'with', '59', 'catches', 'for', '1098', 'yards', 'and', 'seven', 'touchdowns', '.', '"', 'We', "'re", 'aware', 'of', 'the', 'off', '-', 'the', '-', 'field', 'issue', 'involving', 'Vincent', ',', '"', 'Chargers', 'general', 'manag

 10%|▉         | 19/192 [01:59<07:28,  2.59s/it]

Parsing ----> 35_9ecbplus.xml


 10%|█         | 20/192 [02:01<07:13,  2.52s/it]

Parsing ----> 34_10ecb.xml


 11%|█         | 21/192 [02:04<07:31,  2.64s/it]

Not matching:  ['I', 'confirmed', 'yesterday', 'that', 'Obama', 'has', 'offered', 'the', 'job', 'and', 'that', 'Gupta', 'had', 'all', 'but', 'officially', 'accepted', '.', 'Got', 'it', 'up', 'on', 'the', 'Web', 'site', 'at', '2:36', 'p.m.', ',', 'mere', 'moments', 'after', 'filing', 'the', 'story', '.', 'Cyberspeed', 'comes', 'in', 'handy', 'sometimes', '.', 'CNN', ',', 'where', 'I', 'host', 'a', 'weekly', 'program', ',', 'did', "n't", 'report', 'it', 'until', '3:59', ',', 'but', 'the', 'blogs', 'went', 'crazy', '--', 'lead', 'story', 'on', 'HuffPost', ',', 'top', 'of', 'Drudge', "'s", 'page', ',', 'then', 'the', 'NYT', 'and', 'others', '.', 'Gupta', 'would', "n't", 'comment', 'to', 'me', '--', 'while', 'not', 'denying', 'the', 'story', '--', 'but', 'confirmed', 'on', 'his', 'Twitter', 'feed', 'that', 'he', '"', 'has', 'been', 'approached', 'by', 'the', 'Obama', 'administration', 'about', 'the', 'U.S.', 'surgeon', 'general', "'s", 'post', '.', '"', 'Everyone', "'s", 'digital', 'these',

 11%|█▏        | 22/192 [02:05<06:11,  2.18s/it]

Parsing ----> 34_10ecbplus.xml


 12%|█▏        | 23/192 [02:17<14:22,  5.10s/it]

Not matching:  ['http://www.scientificamerican.com/blog/post', '.', 'cfm', '?', 'id', '=', 'obama', '-', 'nominates', '-', 'new', '-', 'surgeon', '-', 'general-2009', '-', '07', '-', '13', 'Obama', 'nominates', 'new', 'surgeon', 'general', ':', 'MacArthur', '"', 'genius', 'grant', '"', 'fellow', 'Regina', 'Benjamin', 'Jul', '13', ',', '2009', '01', ':', '40', 'PM', 'As', 'the', 'country', "'s", 'politicos', 'have', 'their', 'sights', 'set', 'on', 'the', 'U.', 'S.', 'Supreme', 'Court', 'confirmation', 'hearings', 'for', 'Sonia', 'Sotomayor', 'today', ',', 'President', 'Barack', 'Obama', 'nominated', 'Regina', 'Benjamin', ',', '52', ',', 'as', 'the', 'country', "'s", 'new', 'surgeon', 'general', ',', 'the', '"', 'leading', 'spokesperson', 'on', 'issues', 'of', 'public', 'health', ',', '"', 'Obama', 'said', '.', 'Benjamin', ',', 'a', 'rural', 'family', 'doctor', 'in', 'Alabama', ',', 'was', 'a', '2008', 'MacArthur', 'Foundation', 'Fellow', ',', 'and', 'in', '1995', 'she', 'was', 'elected'

 12%|█▎        | 24/192 [02:19<11:08,  3.98s/it]

Parsing ----> 34_12ecb.xml


 13%|█▎        | 25/192 [02:20<08:44,  3.14s/it]

Parsing ----> 34_11ecbplus.xml


 14%|█▎        | 26/192 [02:45<26:45,  9.67s/it]

Parsing ----> 34_12ecbplus.xml


 14%|█▍        | 27/192 [03:17<45:29, 16.54s/it]

Parsing ----> 34_15ecb.xml


 15%|█▍        | 28/192 [03:18<32:42, 11.97s/it]

Parsing ----> 34_1ecb.xml


 15%|█▌        | 29/192 [03:19<23:09,  8.52s/it]

Parsing ----> 34_16ecb.xml


 16%|█▌        | 30/192 [03:21<17:26,  6.46s/it]

Parsing ----> 34_14ecb.xml


 16%|█▌        | 31/192 [03:22<13:01,  4.86s/it]

Parsing ----> 34_2ecb.xml


 17%|█▋        | 32/192 [03:23<10:15,  3.85s/it]

Parsing ----> 34_2ecbplus.xml


 17%|█▋        | 33/192 [03:39<19:17,  7.28s/it]

Parsing ----> 34_3ecb.xml


 18%|█▊        | 34/192 [03:39<13:42,  5.21s/it]

Parsing ----> 34_1ecbplus.xml


 18%|█▊        | 35/192 [03:51<19:18,  7.38s/it]

Not matching:  ['http://www.whitehouse', '.', 'gov', '/', 'the', '-', 'press', '-', 'office', '/', 'president', '-', 'obama', '-', 'announces', '-', 'nominee', '-', 'surgeon', '-', 'general-7', '-', '13', '-', '09', 'THE', 'WHITE', 'HOUSE', 'Office', 'of', 'the', 'Press', 'Secretary', 'FOR', 'IMMEDIATE', 'RELEASE', 'July', '13', ',', '2009', 'President', 'Obama', 'Announces', 'Nominee', 'for', 'Surgeon', 'General', 'Today', ',', 'President', 'Barack', 'Obama', 'announced', 'his', 'intent', 'to', 'nominate', 'Regina', 'M.', 'Benjamin', 'as', 'Surgeon', 'General', ',', 'Department', 'of', 'Health', 'and', 'Human', 'Services', '.', 'President', 'Obama', 'said', ',', '"', 'Health', 'care', 'reform', 'is', 'about', 'every', 'family', "'s", 'health', 'and', 'the', 'health', 'of', 'our', 'economy', '.', 'And', 'if', 'there', "'s", 'anyone', 'who', 'understands', 'the', 'urgency', 'of', 'meeting', 'this', 'challenge', 'in', 'a', 'personal', 'and', 'powerful', 'way', ',', 'it', "'s", 'the', 'wo

 19%|█▉        | 36/192 [03:56<17:17,  6.65s/it]

Parsing ----> 34_4ecbplus.xml


 19%|█▉        | 37/192 [04:03<17:08,  6.64s/it]

Not matching:  ['http://abcnews', '.', 'go.com/blogs/politics/2009/07/the-doctor-is-finally-in-obama-to-name-regina-benjamin-for-surgeon-general/', 'The', 'Doctor', 'Is', '(', 'Finally', ')', 'In', ':', 'Obama', 'To', 'Nominate', 'Regina', 'Benjamin', 'as', 'Surgeon', 'General', 'Jul', '13', ',', '2009', '11', ':', '23am', 'President', 'Obama', 'will', 'name', 'Dr.', 'Regina', 'Benjamin', 'as', 'U.', 'S.', 'Surgeon', 'General', 'in', 'a', 'Rose', 'Garden', 'announcement', 'late', 'this', 'morning', '.', 'Benjamin', ',', 'an', 'Alabama', 'family', 'physician', ',', 'runs', 'a', 'rural', 'health', 'clinic', 'in', 'Bayou', 'La', 'Batre', '.', 'She', 'was', 'the', 'first', 'African', '-', 'American', 'woman', 'to', 'head', 'a', 'state', 'medical', 'society', 'and', 'received', 'a', 'MacArthur', 'Foundation', '"', 'genius', 'grant', '"', 'last', 'year', '.', 'She', 'became', 'known', 'nationally', 'for', 'her', 'determination', 'to', 'rebuild', 'her', 'clinic', ',', 'destroyed', 'in', 'Hurr

 20%|█▉        | 38/192 [04:04<12:41,  4.95s/it]

Parsing ----> 34_3ecbplus.xml


 20%|██        | 39/192 [04:15<17:04,  6.69s/it]

Parsing ----> 34_4ecb.xml


 21%|██        | 40/192 [04:16<12:42,  5.02s/it]

Parsing ----> 34_6ecb.xml


 21%|██▏       | 41/192 [04:17<10:00,  3.98s/it]

Parsing ----> 34_6ecbplus.xml


 22%|██▏       | 42/192 [04:26<13:07,  5.25s/it]

Parsing ----> 34_7ecb.xml


 22%|██▏       | 43/192 [04:26<09:34,  3.85s/it]

Parsing ----> 34_7ecbplus.xml


 23%|██▎       | 44/192 [04:30<09:13,  3.74s/it]

Parsing ----> 34_8ecbplus.xml


 23%|██▎       | 45/192 [04:35<10:27,  4.27s/it]

Not matching:  ['http://www.womenmakingmoves', '.', 'org/2009/07', '/', 'surgeon', '-', 'general', '-', 'nominee', '-', 'dr', '-', 'regina', '.', 'html', 'WEDNESDAY', ',', 'JULY', '29', ',', '2009', 'Surgeon', 'General', 'Nominee', 'Dr.', 'Regina', 'Benjamin', 'I', 'was', 'immediately', 'impressed', 'with', 'Dr.', 'Regina', 'Benjamin', "'s", 'background', 'when', 'President', 'Obama', 'nominated', 'her', 'for', 'U.', 'S.', 'Surgeon', 'General', '.', 'She', "'s", 'led', 'an', 'impressive', 'career', ',', 'one', 'that', 'includes', 'deciding', 'to', 'practice', 'in', 'a', 'low', '-', 'income', 'community', 'in', 'the', 'Gulf', 'Coast', 'of', 'Alabama', '.', 'The', 'Bayou', 'La', 'Batre', 'Rural', 'Health', 'Clinic', 'has', 'been', 'a', 'staple', 'for', 'residents', 'since', '1987', 'and', 'often', 'does', "n't", 'charge', 'patients', 'who', 'ca', "n't", 'afford', 'to', 'pay', '.', 'The', 'clinic', 'also', 'focuses', 'heavily', 'on', 'preventative', 'illness', 'since', 'this', 'hits', 'cl

 24%|██▍       | 46/192 [04:37<08:33,  3.52s/it]

Parsing ----> 34_8ecb.xml


 24%|██▍       | 47/192 [04:38<06:31,  2.70s/it]

Parsing ----> 18_10ecb.xml


 25%|██▌       | 48/192 [04:39<05:15,  2.19s/it]

Parsing ----> 18_10ecbplus.xml


 26%|██▌       | 49/192 [04:53<14:09,  5.94s/it]

Parsing ----> 18_11ecb.xml


 26%|██▌       | 50/192 [04:56<11:44,  4.96s/it]

Parsing ----> 18_11ecbplus.xml


 27%|██▋       | 51/192 [05:04<13:40,  5.82s/it]

Not matching:  ['http://www.theage.com', '.', 'au', '/', 'articles/', '2007/04/10/1175971051701', '.', 'html', 'Sacked', 'worker', 'shoots', 'ex', '-', 'colleagues', 'April', '10', ',', '2007', '-', '7', ':', '07AM', 'A', 'man', 'suspected', 'of', 'shooting', 'three', 'people', ',', 'killing', 'one', ',', 'at', 'a', 'suburban', 'Detroit', 'accounting', 'firm', 'from', 'which', 'he', 'was', 'fired', 'last', 'week', 'was', 'arrested', 'after', 'a', 'high', '-', 'speed', 'chase', 'a', 'few', 'hours', 'after', 'the', 'attack', 'this', 'morning', ',', 'authorities', 'said', '.', 'Police', 'said', 'they', 'had', 'located', 'Anthony', 'LaCalamita', ',', '38', ',', 'on', 'the', 'Interstate', '75', 'highway', ',', 'north', 'of', 'the', 'office', 'building', 'where', 'the', 'shootings', 'took', 'place', '.', 'Sheriff', "'s", 'deputies', 'and', 'state', 'police', 'chased', 'him', 'for', '48', 'km', ',', 'including', 'through', 'a', 'construction', 'zone', ',', 'at', 'speeds', 'up', 'to', '193', '

 27%|██▋       | 52/192 [05:08<12:09,  5.21s/it]

Not matching:  ['A', 'former', 'employee', 'who', 'opened', 'fire', 'at', 'an', 'office', 'Christmas', 'party', 'and', 'killed', 'one', 'man', 'has', 'been', 'charged', 'with', 'first', '-', 'degree', 'murder', ',', 'police', 'said', 'Saturday', '.', 'Constable', 'Tim', 'Fanning', 'allege', 'the', 'recently', '-', 'fired', 'man', ',', 'Eric', 'Allen', 'Kirkpatrick', ',', '61', ',', 'opened', 'fire', 'at', 'a', 'Vancouver', 'office', 'Friday', ',', 'killing', 'Benjamin', 'David', 'Banky', ',', '40', '.', 'Fanning', 'said', 'the', 'gunman', 'entered', 'the', 'party', 'of', 'at', 'least', 'a', 'dozen', 'people', 'around', '4', 'p.m.', 'One', 'man', 'was', 'shot', 'and', 'killed', 'but', 'the', 'other', 'partygoers', 'managed', 'to', 'escape', 'and', 'no', 'one', 'else', 'was', 'injured', '.', '"', 'Police', "'", 'locked', 'down', "'", 'the', 'neighborhood', 'and', 'negotiated', 'with', 'the', 'suspect', ',', 'who', 'gave', 'himself', 'up', 'to', 'police', 'just', 'after', '6', 'p.m.', ','

 28%|██▊       | 53/192 [05:17<15:09,  6.54s/it]

Parsing ----> 18_16ecb.xml


 28%|██▊       | 54/192 [05:19<11:31,  5.01s/it]

Parsing ----> 18_1ecbplus.xml


 29%|██▊       | 55/192 [05:27<13:53,  6.08s/it]

Parsing ----> 18_14ecb.xml


 29%|██▉       | 56/192 [05:31<12:11,  5.38s/it]

Parsing ----> 18_1ecb.xml


 30%|██▉       | 57/192 [05:33<09:44,  4.33s/it]

Not matching:  ['A', '61', '-', 'year', '-', 'old', 'man', 'was', 'charged', 'with', 'first', '-', 'degree', 'murder', 'Saturday', 'after', 'allegedly', 'opening', 'fire', 'at', 'his', 'former', 'employers', "'", 'office', 'Christmas', 'party', 'in', 'Vancouver', ',', 'killing', 'one', 'man', ',', 'police', 'said', '.', 'Constable', 'Tim', 'Fanning', 'said', 'Eric', 'Allen', 'Kirkpatrick', ',', 'who', 'had', 'recently', 'been', 'fired', 'from', 'his', 'job', ',', 'is', 'accused', 'of', 'fatally', 'shooting', 'Benjamin', 'David', 'Banky', ',', '40', '.', 'Fanning', 'said', 'at', 'least', 'a', 'dozen', 'people', 'were', 'at', 'the', 'party', 'when', 'the', 'gunman', 'entered', 'about', '4', 'p.m.', 'All', 'but', 'Kirkpatrick', 'escaped', 'without', 'injury', '.', '"', 'Police', 'locked', 'down', 'the', 'neighborhood', 'and', 'negotiated', 'with', 'the', 'suspect', ',', 'who', 'gave', 'himself', 'up', 'to', 'police', 'just', 'after', '6', ':', '00', 'p.m.', ',', '"', 'Fanning', 'said', 'i

 30%|███       | 58/192 [05:38<10:16,  4.60s/it]

Parsing ----> 18_3ecb.xml


 31%|███       | 59/192 [05:46<12:14,  5.53s/it]

Parsing ----> 18_3ecbplus.xml


 31%|███▏      | 60/192 [05:54<13:43,  6.24s/it]

Not matching:  ['http://voices', '.', 'yahoo.com/troy-michigan-office-shooting-follow-1-dead-2-injured-294072', '.', 'html', '?', 'cat=8', 'Troy', ',', 'Michigan', 'Office', 'Shooting', 'Follow', '-', 'Up-1', 'Dead', ',', '2', 'Injured', ',', 'and', 'Suspect', 'Caught', 'Apr', '12', ',', '2007', '"', 'A', 'man', 'suspected', 'of', 'shooting', 'three', 'people', ',', 'killing', 'one', ',', 'at', 'an', 'accounting', 'firm', 'where', 'was', 'fired', 'last', 'week', 'was', 'arrested', 'after', 'a', 'high', '-', 'speed', 'chase', 'a', 'few', 'hours', 'after', 'the', 'Monday', 'morning', 'attack', ',', 'authorities', 'said', '.', 'Police', 'said', 'they', 'had', 'located', 'Anthony', 'LaCalamita', ',', '38', ',', 'on', 'Interstate', '75', ',', 'north', 'of', 'the', 'suburban', 'Detroit', 'office', 'building', 'where', 'the', 'shootings', 'took', 'place', '.', 'Sheriff', "'s", 'deputies', 'and', 'state', 'police', 'chased', 'him', 'for', '30', 'miles', ',', 'including', 'through', 'a', 'const

 32%|███▏      | 61/192 [05:57<11:52,  5.44s/it]

Parsing ----> 18_2ecbplus.xml


 32%|███▏      | 62/192 [06:05<13:11,  6.09s/it]

Not matching:  ['http://usatoday30', '.', 'usatoday.com/news/nation/2007-04-09-office-shooting_N', '.', 'htm', '1', 'dead', ',', '2', 'hurt', 'in', 'Mich.', 'office', 'shooting', ';', 'police', 'say', 'suspect', 'had', 'worked', 'there', 'Updated', '4/9/2007', '6', ':', '04', 'PM', 'A', 'man', 'suspected', 'of', 'shooting', 'three', 'people', 'at', 'an', 'accounting', 'firm', 'where', 'he', 'had', 'worked', 'was', 'arrested', 'a', 'few', 'hours', 'later', 'after', 'a', 'high', '-', 'speed', 'chase', ',', 'authorities', 'said', '.', 'One', 'victim', 'died', 'in', 'the', 'Monday', 'morning', 'attack', '.', 'Police', 'said', 'they', 'had', 'located', 'Anthony', 'LaCalamita', ',', '38', ',', 'of', 'Troy', 'on', 'Interstate', '75', ',', 'north', 'of', 'the', 'suburban', 'Detroit', 'office', 'building', 'where', 'the', 'shootings', 'took', 'place', '.', 'ON', 'DEADLINE', ':', 'Live', 'updates', 'and', 'local', 'coverage', 'Sheriff', "'s", 'deputies', 'and', 'state', 'police', 'chased', 'him'

 33%|███▎      | 63/192 [06:10<12:08,  5.65s/it]

Not matching:  ['A', 'former', 'employee', 'recently', 'let', 'go', 'from', 'his', 'job', 'opened', 'fire', 'at', 'an', 'office', 'Christmas', 'party', 'Friday', ',', 'killing', 'one', 'person', ',', 'police', 'said', '.', '"', 'The', 'suspect', ',', 'a', '61', '-', 'year', '-', 'old', 'man', ',', 'entered', 'the', 'business', ',', 'which', 'had', 'at', 'least', 'a', 'dozen', 'people', 'inside', 'enjoying', 'a', 'Christmas', 'party', ',', '"', 'police', 'said', 'in', 'a', 'statement', '.', 'Police', 'Const', '.', 'Tim', 'Fanning', 'said', 'the', 'gunman', 'entered', 'the', 'party', 'around', '4', 'p.m.', 'One', 'man', 'was', 'shot', 'and', 'killed', 'but', 'the', 'other', 'partygoers', 'managed', 'to', 'escape', 'and', 'no', 'one', 'else', 'was', 'injured', '.', '"', 'Police', "'", 'locked', 'down', "'", 'the', 'neighborhood', 'and', 'negotiated', 'with', 'the', 'suspect', ',', 'who', 'gave', 'himself', 'up', 'to', 'police', 'just', 'after', '6', ':', '00', 'p.m.', ',', '"', 'Fanning',

 33%|███▎      | 64/192 [06:19<14:14,  6.68s/it]

Parsing ----> 18_5ecb.xml


 34%|███▍      | 65/192 [06:22<12:10,  5.75s/it]

Not matching:  ['A', 'former', 'employee', 'who', 'opened', 'fire', 'at', 'an', 'office', 'Christmas', 'party', 'killing', 'one', 'man', ',', 'has', 'been', 'charged', 'with', 'first', '-', 'degree', 'murder', ',', 'police', 'said', 'Saturday', '.', 'Constable', 'Tim', 'Fanning', 'allege', 'the', 'recently', '-', 'fired', 'man', ',', 'Eric', 'Allen', 'Kirkpatrick', ',', '61', ',', 'opened', 'fire', 'at', 'a', 'Vancouver', 'office', 'Friday', ',', 'killing', 'Benjamin', 'David', 'Banky', ',', '40', '.', 'Fanning', 'said', 'the', 'gunman', ',', 'entered', 'the', 'party', 'of', 'at', 'least', 'a', 'dozen', 'people', 'around', '4:00', 'p.m.', 'One', 'man', 'was', 'shot', 'and', 'killed', 'but', 'the', 'other', 'partygoers', 'managed', 'to', 'escape', 'and', 'no', 'one', 'else', 'was', 'injured', '.', '"', 'Police', "'", 'locked', 'down', "'", 'the', 'neighborhood', 'and', 'negotiated', 'with', 'the', 'suspect', ',', 'who', 'gave', 'himself', 'up', 'to', 'police', 'just', 'after', '6:00', '

 34%|███▍      | 66/192 [06:29<12:53,  6.14s/it]

Not matching:  ['http://voices', '.', 'yahoo.com/breaking-news-three-people-shot-office-shooting-292999', '.', 'html', '?', 'cat=8', 'Breaking', 'News', '-', 'Three', 'People', 'Shot', 'in', 'Office', 'Shooting', 'in', 'Suburban', 'Detroit', 'Apr', '9', ',', '2007', 'There', 'is', 'breaking', 'news', 'out', 'of', 'Troy', ',', 'Michigan', ',', 'a', 'suburb', 'of', 'Detroit', '.', '"', 'A', 'gunman', 'shot', 'three', 'people', 'at', 'a', 'suburban', 'Detroit', 'office', 'building', 'Monday', 'morning', ',', 'and', 'officers', 'were', 'searching', 'for', 'the', 'shooter', ',', 'police', 'said', '.', 'The', 'victims', "'", 'conditions', 'were', "n't", 'immediately', 'known', ',', 'said', 'Troy', 'police', 'Lt', '.', 'Gerry', 'Scherlinck', '.', 'A', 'spokeswoman', 'for', 'Beaumont', 'Hospital', 'in', 'Royal', 'Oak', 'said', 'the', 'three', 'victims', 'were', 'taken', 'to', 'the', 'hospital', 'but', 'could', 'not', 'provide', 'more', 'details', '.', 'Witnesses', 'told', 'Detroit', 'broadcast

 35%|███▍      | 67/192 [06:33<11:35,  5.56s/it]

Parsing ----> 18_6ecb.xml


 35%|███▌      | 68/192 [06:47<16:29,  7.98s/it]

Parsing ----> 18_8ecb.xml


 36%|███▌      | 69/192 [06:49<12:35,  6.14s/it]

Not matching:  ['One', 'man', 'is', 'dead', 'after', 'an', 'ex', '-', 'employee', 'opened', 'fire', 'at', 'his', 'company', "'s", 'Christmas', 'party', 'at', 'a', 'Vancouver', ',', 'B.C.', 'business', 'Friday', 'night', '.', 'Just', 'past', '4', 'p.m.', 'Friday', ',', 'Vancouver', 'Police', 'received', 'a', 'call', 'of', 'a', 'man', 'being', 'shot', 'at', '40', 'East', '5th', 'Ave', '.', 'at', 'a', 'business', 'called', 'Tall', 'Grass', 'Distribution', 'Limited', '.', 'Police', 'say', 'a', 'recently', 'laid', 'off', 'employee', 'opened', 'fire', 'shortly', 'after', 'entering', 'the', 'party', ',', 'killing', 'one', 'male', 'employee', '.', 'At', 'least', 'a', 'dozen', 'workers', 'were', 'at', 'the', 'party', 'at', 'the', 'time', '.', 'A', '61', '-', 'year', '-', 'old', 'man', 'surrendered', 'to', 'police', 'shortly', 'after', '6', 'p.m.', 'He', 'is', 'currently', 'in', 'custody', 'and', 'his', 'identity', 'will', 'not', 'be', 'released', 'until', 'charges', 'are', 'formally', 'laid', '

 36%|███▋      | 70/192 [06:59<14:43,  7.24s/it]

Parsing ----> 18_7ecb.xml


 37%|███▋      | 71/192 [07:01<11:43,  5.82s/it]

Not matching:  ['A', 'just', '-', 'fired', 'employee', 'of', 'a', 'Canadian', 'health', 'products', 'company', 'crashed', 'the', 'office', 'Christmas', 'party', 'in', 'Vancouver', ',', 'killing', 'the', 'chief', 'executive', 'officer', ',', 'police', 'said', '.', 'Vancouver', 'police', 'identified', 'the', 'victim', 'of', 'the', 'Friday', 'night', 'shooting', 'at', 'the', 'TallGrass', 'Distributors', 'Ltd.', 'Christmas', 'party', 'as', 'Benjamin', 'Banky', ',', '40', ',', 'the', 'Canadian', 'Broadcasting', 'Corp.', 'reported', '.', 'Police', 'said', 'Eric', 'Allen', 'Kirkpatrick', ',', '61', ',', 'surrendered', 'after', 'two', 'hours', 'of', 'negotiations', 'with', 'police', 'and', 'was', 'charged', 'Saturday', 'with', 'first', '-', 'degree', 'murder', ',', 'the', 'CBC', 'reported', '.', 'Kirkpatrick', 'had', 'been', 'dismissed', 'from', 'his', 'job', 'Thursday', '.', 'Authorities', 'said', 'they', 'did', 'not', 'know', 'how', 'long', 'he', 'had', 'worked', 'at', 'TallGrass', 'or', 'th

 38%|███▊      | 72/192 [07:13<14:56,  7.47s/it]

Not matching:  ['A', 'former', 'employee', 'walked', 'into', 'a', 'staff', 'Christmas', 'party', 'at', 'an', 'east', 'Vancouver', 'health', 'products', 'business', 'Friday', 'and', 'allegedly', 'shot', 'and', 'killed', 'a', 'man', '.', 'Police', 'said', 'they', 'were', 'called', 'to', 'Tallgrass', 'Distributors', 'Ltd.', ',', 'at', '40', 'East', 'Fifth', 'Avenue', '.', ',', 'where', 'a', '61', '-', 'year', '-', 'old', 'man', 'carrying', 'a', 'firearm', 'had', 'confronted', 'the', 'crowd', '.', 'Const', '.', 'Tim', 'Fanning', 'said', 'the', 'man', ',', 'who', 'had', 'recently', 'lost', 'his', 'job', 'at', 'Tallgrass', ',', 'walked', 'into', 'the', 'party', 'and', 'pointed', 'the', 'gun', 'at', 'the', 'victim', '.', 'He', 'then', 'let', 'about', 'a', 'dozen', 'employees', 'leave', 'before', 'holing', 'up', 'inside', 'the', 'office', 'building', 'with', 'the', 'victim', '.', 'Hostage', 'negotiators', 'were', 'called', 'in', 'to', 'get', 'the', 'man', 'out', 'of', 'the', 'building', '.', '

 38%|███▊      | 73/192 [07:22<16:10,  8.16s/it]

Not matching:  ['http://www.nytimes.com/2007/04/10/us/10revenge', '.', 'html?_r=0', 'Shooting', 'at', 'Accounting', 'Firm', 'Leaves', 'Woman', 'Dead', 'and', '2', 'Hurt', 'Published', ':', 'April', '10', ',', '2007', 'A', 'man', 'who', 'was', 'fired', 'last', 'week', 'from', 'his', 'job', 'at', 'an', 'accounting', 'firm', 'in', 'suburban', 'Detroit', 'walked', 'into', 'his', 'former', 'office', 'with', 'a', 'shotgun', 'on', 'Monday', 'and', 'shot', 'three', 'people', ',', 'killing', 'one', ',', 'before', 'fleeing', 'and', 'leading', 'police', 'officers', 'on', 'a', 'high', '-', 'speed', 'chase', ',', 'the', 'authorities', 'said', '.', 'The', 'suspect', ',', 'identified', 'as', 'Anthony', 'LaCalamita', ',', 'entered', 'the', 'second', '-', 'floor', 'office', 'of', 'the', 'accounting', 'firm', ',', 'Gordon', 'Advisors', ',', 'about', '10', 'a.m.', 'and', 'opened', 'fire', ',', 'the', 'police', 'said', '.', 'A', '63', '-', 'year', '-', 'old', 'woman', 'and', 'two', 'men', ',', 'ages', '47

 39%|███▊      | 74/192 [07:24<12:28,  6.34s/it]

Parsing ----> 21_10ecbplus.xml


 39%|███▉      | 75/192 [07:30<11:47,  6.04s/it]

Parsing ----> 21_11ecbplus.xml


 40%|███▉      | 76/192 [07:32<09:31,  4.92s/it]

Parsing ----> 21_12ecb.xml


 40%|████      | 77/192 [07:33<07:25,  3.87s/it]

Parsing ----> 21_11ecb.xml


 41%|████      | 78/192 [07:39<08:04,  4.25s/it]

Parsing ----> 21_12ecbplus.xml


 41%|████      | 79/192 [07:53<13:49,  7.34s/it]

Parsing ----> 21_13ecbplus.xml


 42%|████▏     | 80/192 [08:02<14:46,  7.91s/it]

Parsing ----> 21_1ecbplus.xml


 42%|████▏     | 81/192 [08:04<11:18,  6.11s/it]

Parsing ----> 21_2ecb.xml


 43%|████▎     | 82/192 [08:07<09:03,  4.94s/it]

Parsing ----> 21_14ecbplus.xml


 43%|████▎     | 83/192 [08:09<07:46,  4.28s/it]

Parsing ----> 21_2ecbplus.xml


 44%|████▍     | 84/192 [08:14<07:42,  4.28s/it]

Not matching:  ['http://abclocal', '.', 'go.com/wabc/story', '?', 'section', '=', 'news', '/', 'local', '/', 'new_york&id=9248033', 'Queens', 'hit', 'and', 'run', 'leaves', 'woman', 'dead', 'Saturday', ',', 'September', '14', ',', '2013', 'Police', 'have', 'a', 'suspect', 'in', 'custody', 'after', 'a', 'hit', 'and', 'run', 'accident', 'that', 'killed', 'a', 'woman', 'in', 'Queens', 'Friday', 'night', '.', 'Investigators', 'say', '59', '-', 'year', 'old', 'Raj', 'Chohan', 'was', 'hit', 'while', 'walking', 'along', 'parked', 'cars', 'at', '97th', 'Avenue', 'and', '117th', 'Street', 'in', 'Richmond', 'Hill', 'at', 'about', '7', 'p.m.', 'Police', 'say', 'she', 'was', 'struck', 'by', 'a', 'grey', 'Toyota', 'Camry', 'heading', 'north', 'on', '117th', 'Street', ',', 'which', 'then', 'fled', 'the', 'scene', '.', 'They', 'later', 'apprehended', 'the', 'suspect', ',', 'identified', 'as', '23', '-', 'year', 'old', 'Vishwanand', 'Subryan', 'of', 'Schenectady', ',', 'New', 'York', '.', 'He', 'has',

 44%|████▍     | 85/192 [08:18<07:41,  4.31s/it]

Parsing ----> 21_3ecb.xml


 45%|████▍     | 86/192 [08:23<07:59,  4.52s/it]

Not matching:  ['Charges', 'Filed', 'In', 'Parking', 'Lot', 'Drunken', 'Hit', '-', 'And', '-', 'Run', 'The', 'Hennepin', 'County', 'Attorney', 'filed', 'criminal', 'charges', 'Friday', 'against', 'the', 'man', 'who', 'they', 'say', 'hit', 'and', 'killed', 'a', 'woman', 'walking', 'out', 'of', 'a', 'store', '.', 'Anthony', 'Phillip', 'LaSalle', 'is', 'now', 'in', 'jail', 'for', 'that', 'crime', '.', 'Bloomington', 'Police', 'say', 'he', 'drank', 'so', 'much', 'his', 'blood', '-', 'alcohol', 'level', 'was', '0.41', ',', 'more', 'than', 'five', 'times', 'the', 'legal', 'limit', '.', 'LaSalle', 'faces', 'serious', 'charges', ',', 'including', 'murder', 'and', 'vehicular', 'homicide', '.', 'Both', 'of', 'them', 'are', 'felonies', '.', 'The', 'criminal', 'complaint', 'in', 'this', 'case', 'gives', 'details', 'about', 'exactly', 'what', 'happened', 'Wednesday', 'night', '.', 'LaSalle', 'drank', '1/2', 'liter', 'of', 'Vodka', 'before', 'he', 'drove', 'to', 'the', 'Sam', "'s", 'Club', 'in', 'Bl

 45%|████▌     | 87/192 [08:27<07:52,  4.50s/it]

Parsing ----> 21_4ecb.xml


 46%|████▌     | 88/192 [08:29<06:23,  3.69s/it]

Parsing ----> 21_5ecbplus.xml


 46%|████▋     | 89/192 [08:34<07:05,  4.13s/it]

Not matching:  ['http://www.dnainfo.com/new-york/20130914/richmond-hill/woman-killed-queens-hit-and-run-police-arrest-allegedly-drunk-driver', 'Woman', 'Killed', 'in', 'Queens', 'Hit', '-', 'And', '-', 'Run', ',', 'Police', 'Arrest', 'Allegedly', 'Drunk', 'Driver', 'on', 'September', '14', ',', '2013', '11', ':', '41am', '|', 'Updated', 'on', 'September', '14', ',', '2013', '11', ':', '41am', 'An', 'allegedly', 'intoxicated', 'driver', 'who', 'tried', 'to', 'flee', 'after', 'striking', 'and', 'fatally', 'injuring', 'a', 'woman', 'in', 'Queens', 'has', 'been', 'charged', 'in', 'her', 'murder', ',', 'according', 'to', 'police', '.', 'Police', 'responded', 'to', 'the', 'Richmond', 'Hill', 'crash', 'on', 'the', 'corner', 'of', '117th', 'Street', 'and', '97th', 'Avenue', 'at', '7', ':', '01', 'p.m.', 'on', 'Friday', 'to', 'find', 'the', 'unconscious', 'Raj', 'K.', 'Chohan', ',', '59', ',', 'of', 'College', 'Point', ',', 'lying', 'in', 'the', 'roadway', ',', 'the', 'NYPD', 'said', '.', 'EMS'

 47%|████▋     | 90/192 [08:37<06:16,  3.69s/it]

Parsing ----> 21_6ecbplus.xml


 47%|████▋     | 91/192 [08:42<06:42,  3.99s/it]

Parsing ----> 21_5ecb.xml


 48%|████▊     | 92/192 [08:46<06:55,  4.15s/it]

Parsing ----> 21_7ecb.xml


 48%|████▊     | 93/192 [08:52<07:33,  4.58s/it]

Parsing ----> 21_7ecbplus.xml


 49%|████▉     | 94/192 [08:56<07:03,  4.32s/it]

Parsing ----> 21_9ecb.xml


 49%|████▉     | 95/192 [08:58<05:54,  3.66s/it]

Parsing ----> 21_8ecb.xml


 50%|█████     | 96/192 [08:59<04:41,  2.93s/it]

Parsing ----> 21_8ecbplus.xml


 51%|█████     | 97/192 [09:00<03:37,  2.28s/it]

Parsing ----> 21_9ecbplus.xml


 51%|█████     | 98/192 [09:02<03:46,  2.41s/it]

Not matching:  ['http://cached', '.', 'newslookup.com/cached', '.', 'php', '?', 'ref_id=225&siteid=2182&id=3110701&t=1379163972', 'Woman', 'Killed', 'In', 'Queens', 'Hit', '-', 'And', '-', 'Run', 'Accident', ',', 'Driver', 'Charged', 'Updated', '9', ':', '06', 'AM', 'A', 'woman', 'has', 'died', 'after', 'being', 'hit', 'by', 'a', 'car', 'that', 'fled', 'the', 'scene', 'in', 'Queens', ',', 'and', 'now', ',', 'the', 'driver', 'is', 'facing', 'manslaughter', 'charges', '.', 'Witnesses', 'say', '59', '-', 'year', '-', 'old', 'Raj', 'Chohan', 'was', 'leaving', 'her', 'car', 'on', '117th', 'Street', 'in', 'South', 'Richmond', 'Hill', 'Friday', 'when', 'she', 'was', 'hit', 'by', 'a', 'silver', 'Toyota', 'Camry', 'and', 'sent', 'flying', 'into', 'the', 'air', '.', 'She', 'died', 'at', 'the', 'hospital', '.', 'Police', 'caught', 'up', 'with', 'the', 'driver', ',', 'who', 'is', 'from', 'Schnectady', ',', 'a', 'few', 'blocks', 'away', '.', 'He', 'has', 'been', 'charged', 'with', 'vehicular', 'man

 52%|█████▏    | 99/192 [09:03<02:44,  1.77s/it]

Parsing ----> 23_2ecb.xml


 52%|█████▏    | 100/192 [09:03<02:01,  1.32s/it]

Parsing ----> 23_1ecb.xml


 53%|█████▎    | 101/192 [09:03<01:30,  1.00it/s]

Parsing ----> 23_1ecbplus.xml


 53%|█████▎    | 102/192 [09:05<02:02,  1.36s/it]

Not matching:  ['http://www.stuff', '.', 'co.', 'nz', '/', 'national/9166103', '/', 'Climber', '-', 'dead', '-', 'after', '-', 'Aoraki', '-', 'Mount', '-', 'Cook', '-', 'fall', 'Climber', 'dead', 'after', 'Aoraki', 'Mount', 'Cook', 'fall', 'Last', 'updated', '13', ':', '26', '14/09/2013', 'A', '36', '-', 'year', '-', 'old', 'Australian', 'climber', 'is', 'dead', 'after', 'falling', 'about', '150', 'metres', 'at', 'Aoraki', 'Mount', 'Cook', '.', 'The', 'man', ',', 'from', 'Hampton', 'East', ',', 'fell', 'in', 'the', 'Tasman', 'Glacier', 'area', '.', 'A', 'Westpac', 'Rescue', 'Helicopter', 'from', 'Christchurch', 'was', 'called', 'about', '3', 'pm', 'to', 'assist', 'with', 'search', 'and', 'rescue', 'co', '-', 'ordination', 'at', 'the', 'Mt', 'Cook', 'base', '.', 'When', 'it', 'got', 'there', ',', 'the', 'man', 'had', 'been', 'transported', 'to', 'Mt', 'Cook', 'Village', 'by', 'a', 'local', 'helicopter', 'service', 'and', 'pronounced', 'dead', 'by', 'paramedics', '.', 'The', 'man', 'is',

 54%|█████▎    | 103/192 [09:10<03:18,  2.23s/it]

Parsing ----> 23_11ecbplus.xml


 54%|█████▍    | 104/192 [09:14<04:03,  2.77s/it]

Not matching:  ['http://news', '.', 'sky.com/story/1142198/new-zealand-british-climber-killed-in-fall', 'New', 'Zealand', ':', 'British', 'Climber', 'Killed', 'In', 'Fall', '8', ':', '05am', 'UK', ',', 'Monday', '16', 'September', '2013', 'A', '32', '-', 'year', '-', 'old', 'British', 'climber', ',', 'described', 'as', '"', 'lacking', 'experience', '"', ',', 'dies', 'after', 'falling', 'from', 'a', 'mountain', 'in', 'New', 'Zealand', '.', 'A', 'British', 'man', 'has', 'been', 'killed', 'after', 'falling', 'around', '2', ',', '000', 'ft', 'while', 'climbing', 'in', 'New', 'Zealand', '.', 'Robert', 'Buckley', ',', '32', ',', 'was', 'climbing', 'to', 'a', 'small', 'hut', 'on', 'Mount', 'Sefton', 'in', 'the', 'Aoraki', '-', 'Mt', 'Cook', 'National', 'Park', 'on', 'Saturday', '.', 'He', 'was', 'wearing', 'crampons', 'but', 'was', 'inexperienced', ',', 'local', 'police', 'inspector', 'Dave', 'Gaskin', 'said', '.', 'His', 'body', 'was', 'recovered', 'by', 'a', 'team', 'of', 'rescuers', 'on', 

 55%|█████▍    | 105/192 [09:14<02:56,  2.03s/it]

Parsing ----> 23_2ecbplus.xml


 55%|█████▌    | 106/192 [09:19<04:04,  2.84s/it]

Not matching:  ['http://www.stuff', '.', 'co.', 'nz', '/', 'national/9167416', '/', 'Two', '-', 'climbers', '-', 'killed', '-', 'in', '-', 'Mt', '-', 'Cook', '-', 'falls', 'Two', 'climbers', 'killed', 'in', 'Mt', 'Cook', 'falls', 'Last', 'updated', '15', ':', '48', '15/09/2013', 'The', 'Aoraki', 'Mt', 'Cook', 'Alpine', 'Rescue', 'Team', 'have', 'recovered', 'the', 'body', 'of', 'the', 'second', 'climber', 'to', 'fall', 'to', 'his', 'death', 'at', 'Mount', 'Cook', 'National', 'Park', 'over', 'the', 'weekend', '.', 'Inspector', 'Dave', 'Gaskin', 'said', 'the', 'man', 'fell', 'around', '2000', 'ft', '(', '609', 'metres', ')', 'near', 'the', 'Mount', 'Sefton', 'Bivvy', ',', 'above', 'the', 'Mueller', 'Glacier', '.', '"', 'The', 'fall', 'was', 'unsurvivable', ',', '"', 'Inspector', 'Gaskin', 'said', '.', 'There', 'were', 'four', 'in', 'the', 'climbing', 'party', '.', 'The', 'friends', 'of', 'the', 'deceased', 'had', 'to', 'stay', 'in', 'position', 'overnight', 'until', 'the', 'weather', 'wa

 56%|█████▌    | 107/192 [09:23<04:35,  3.24s/it]

Not matching:  ['http://www.telegraph', '.', 'co.', 'uk', '/', 'news', '/', 'worldnews', '/', 'australiaandthepacific', '/', 'newzealand', '/10311656', '/', 'English', '-', 'and', '-', 'Australian', '-', 'climbers', '-', 'fall', '-', 'to', '-', 'their', '-', 'deaths', '-', 'in', '-', 'New', '-', 'Zealand', '-', 'mountains', '.', 'html', 'English', 'and', 'Australian', 'climbers', 'fall', 'to', 'their', 'deaths', 'in', 'New', 'Zealand', 'mountains', 'One', 'Australian', 'and', 'one', 'English', 'climber', 'have', 'fallen', 'to', 'their', 'deaths', 'in', 'New', 'Zealand', '.', '8', ':', '44AM', 'BST', '16', 'Sep', '2013', 'Police', 'Inspector', 'Dave', 'Gaskin', 'said', 'the', 'two', 'incidents', 'on', 'consecutive', 'days', 'in', 'the', 'Aoraki', '-', 'Mt', '.', 'Cook', 'National', 'Park', 'were', 'not', 'related', 'and', 'were', 'not', 'due', 'to', 'bad', 'weather', '.', 'He', 'said', 'they', 'come', 'as', 'a', 'reminder', 'that', 'climbers', 'need', 'to', 'use', 'extreme', 'caution', 

 56%|█████▋    | 108/192 [09:23<03:23,  2.43s/it]

Parsing ----> 23_5ecbplus.xml


 57%|█████▋    | 109/192 [09:28<04:09,  3.01s/it]

Not matching:  ['http://www.bbc', '.', 'co.', 'uk', '/', 'news', '/', 'world', '-24104939', '16', 'September', '2013', 'Last', 'updated', 'at', '12', ':', '42', 'GMT', 'Briton', 'dies', 'in', 'New', 'Zealand', "'s", 'Aoraki', 'Mount', 'Cook', 'National', 'Park', 'A', 'British', 'climber', 'has', 'fallen', '2', ',', '000', 'ft', 'to', 'his', 'death', 'on', 'a', 'mountain', 'in', 'New', 'Zealand', ',', 'police', 'there', 'have', 'said', '.', 'Robert', 'Buckley', ',', '32', ',', 'died', 'while', 'climbing', 'to', 'a', 'hut', 'on', 'Mount', 'Sefton', 'in', 'the', 'Aoraki', 'Mount', 'Cook', 'National', 'Park', 'on', 'Saturday', '.', 'Police', 'inspector', 'Dave', 'Gaskin', 'said', 'Mr', 'Buckley', 'was', 'well', 'equipped', 'at', 'the', 'time', 'but', 'was', 'an', 'inexperienced', 'climber', '.', 'A', 'Foreign', 'Office', 'spokeswoman', 'said', ':', '"', 'We', 'are', 'aware', 'of', 'the', 'death', 'of', 'a', 'British', 'man', 'in', 'New', 'Zealand', '.', 'We', 'are', 'providing', 'consular'

 57%|█████▋    | 110/192 [09:28<03:03,  2.24s/it]

Parsing ----> 23_5ecb.xml


 58%|█████▊    | 111/192 [09:29<02:17,  1.69s/it]

Parsing ----> 23_4ecbplus.xml


 58%|█████▊    | 112/192 [09:32<02:59,  2.25s/it]

Not matching:  ['http://globalnews', '.', 'ca', '/', 'news/842183', '/', 'australian', '-', 'english', '-', 'climbers', '-', 'fall', '-', 'to', '-', 'their', '-', 'deaths', '-', 'in', '-', 'new', '-', 'zealand', '-', 'mountains', '/', 'September', '15', ',', '2013', '11', ':', '26', 'pm', 'Australian', ',', 'English', 'climbers', 'fall', 'to', 'their', 'deaths', 'in', 'New', 'Zealand', 'mountains', 'One', 'Australian', 'and', 'one', 'English', 'climber', 'have', 'fallen', 'to', 'their', 'deaths', 'in', 'New', 'Zealand', '.', 'Police', 'Inspector', 'Dave', 'Gaskin', 'said', 'Monday', 'the', 'two', 'incidents', 'on', 'consecutive', 'days', 'in', 'the', 'Aoraki', '-', 'Mt', '.', 'Cook', 'National', 'Park', 'were', 'not', 'related', 'and', 'were', 'not', 'due', 'to', 'bad', 'weather', '.', 'He', 'says', 'they', 'come', 'as', 'a', 'reminder', 'that', 'climbers', 'need', 'to', 'use', 'extreme', 'caution', '.', 'On', 'Friday', ',', '36', '-', 'year', '-', 'old', 'Duncan', 'Raite', 'died', 'af

 59%|█████▉    | 113/192 [09:38<04:32,  3.44s/it]

Not matching:  ['http://www.odt', '.', 'co.', 'nz', '/', 'news', '/', 'national/273204', '/', 'second', '-', 'climber', '-', 'dies', '-', 'mt', '-', 'cook', 'Second', 'climber', 'dies', 'at', 'Mt', 'Cook', 'Sun', ',', '15', 'Sep', '2013', 'Two', 'men', 'have', 'died', 'in', 'separate', 'climbing', 'accidents', 'at', 'Mt', 'Cook', 'this', 'weekend', '.', 'The', 'Aoraki', 'Mt', 'Cook', 'Alpine', 'Rescue', 'Team', 'worked', 'today', 'to', 'recover', 'the', 'body', 'of', 'a', 'climber', 'after', 'bad', 'weather', 'hindered', 'efforts', 'yesterday', '.', 'The', 'climber', 'fell', 'in', 'fading', 'light', 'yesterday', 'near', 'the', 'Mt', 'Sefton', 'Bivvy', ',', 'and', 'rescue', 'teams', 'were', 'unable', 'to', 'reach', 'him', 'or', 'his', 'companions', 'due', 'to', 'the', 'weather', '.', 'Police', 'are', 'still', 'at', 'the', 'scene', 'and', 'say', 'more', 'details', 'will', 'be', 'released', 'later', '.', 'The', 'first', 'of', 'the', 'deaths', 'this', 'weekend', 'was', 'that', 'of', 'a', '

 59%|█████▉    | 114/192 [09:42<04:29,  3.46s/it]

Not matching:  ['http://www.theaustralian.com', '.', 'au', '/', 'news', '/', 'world', '/', 'australian', '-', 'english', '-', 'climbers', '-', 'killed', '-', 'in', '-', 'new', '-', 'zealand', '/', 'story', '-', 'e6frg6so-1226720045327', 'Australian', ',', 'English', 'climbers', 'killed', 'in', 'New', 'Zealand', 'SEPTEMBER', '16', ',', '2013', '11', ':', '12AM', 'ONE', 'Australian', 'and', 'one', 'English', 'climber', 'have', 'fallen', 'to', 'their', 'deaths', 'in', 'New', 'Zealand', ',', 'police', 'confirmed', 'today', '.', 'Police', 'Inspector', 'Dave', 'Gaskin', 'said', 'the', 'two', 'incidents', 'on', 'consecutive', 'days', 'in', 'the', 'Aoraki', '-', 'Mt', '.', 'Cook', 'National', 'Park', 'were', 'not', 'related', 'and', 'were', 'not', 'due', 'to', 'bad', 'weather', '.', 'He', 'said', 'they', 'come', 'as', 'a', 'reminder', 'that', 'climbers', 'need', 'to', 'use', 'extreme', 'caution', '.', 'On', 'Friday', ',', '36', '-', 'year', '-', 'old', 'Duncan', 'Rait', 'died', 'after', 'slipp

 60%|█████▉    | 115/192 [09:43<03:24,  2.66s/it]

Parsing ----> 23_8ecb.xml


 60%|██████    | 116/192 [09:43<02:33,  2.03s/it]

Parsing ----> 23_9ecb.xml


 61%|██████    | 117/192 [09:44<01:52,  1.50s/it]

Parsing ----> 23_8ecbplus.xml


 61%|██████▏   | 118/192 [09:49<03:26,  2.79s/it]

Not matching:  ['http://news', '.', 'msn', '.', 'co.', 'nz', '/', 'nationalnews/8723460', '/', 'climber', '-', 'falls', '-', 'on', '-', 'aoraki', '-', 'mount', '-', 'cook', 'Second', 'man', 'dies', 'in', 'Mt', 'Cook', 'National', 'Park', '14', ':', '25', 'Sun', 'Sep', '15', '2013', 'A', 'second', 'man', 'has', 'fallen', 'to', 'his', 'death', 'in', 'Aoraki', 'Mount', 'Cook', 'National', 'Park', '.', 'The', 'man', 'aged', 'in', 'his', 'early', '30s', 'fell', 'about', '600', 'm', 'while', 'climbing', 'near', 'the', 'Mount', 'Sefton', 'Bivvy', 'on', 'Saturday', 'afternoon', '.', 'Rescue', 'personnel', 'were', 'not', 'able', 'to', 'reach', 'him', 'or', 'his', 'three', 'companions', ',', 'who', 'were', 'all', 'from', 'overseas', ',', 'due', 'to', 'low', 'light', 'levels', 'on', 'Saturday', ',', 'Mid', '-', 'South', 'Canterbury', 'Area', 'Commander', 'Inspector', 'Dave', 'Gaskin', 'said', '.', '"', 'They', 'managed', 'to', 'have', 'a', 'look', 'at', 'him', 'and', 'it', 'was', 'pretty', 'clear

 62%|██████▏   | 119/192 [09:55<04:17,  3.53s/it]

Parsing ----> 2_11ecb.xml


 62%|██████▎   | 120/192 [10:07<07:28,  6.23s/it]

Not matching:  ['Even', 'as', 'Oscar', 'organizers', 'on', 'Friday', 'unveiled', 'Hugh', 'Jackman', 'as', 'the', 'host', 'of', 'their', 'gala', 'film', 'awards', ',', 'the', 'prospect', 'of', 'a', 'U.S.', 'actors', 'strike', 'was', 'casting', 'a', 'long', 'shadow', 'over', 'whether', 'Hollywood', "'s", 'big', 'show', 'would', 'go', 'on', 'as', 'usual', '.', 'The', 'Academy', 'Awards', "'", 'February', '22', 'date', 'puts', 'it', 'directly', 'in', 'the', 'path', 'of', 'a', 'potential', 'walkout', 'by', 'Screen', 'Actors', 'Guild', 'members', 'who', 'vote', 'next', 'month', 'on', 'whether', 'to', 'give', 'union', 'leaders', 'permission', 'to', 'call', 'a', 'strike', 'in', 'stalemated', 'contract', 'talks', 'with', 'major', 'studios', '.', 'Movie', 'making', 'by', 'the', 'big', 'studios', 'has', 'wound', 'down', 'since', 'late', 'June', 'in', 'anticipation', 'of', 'labor', 'strife', ',', 'compounding', 'a', 'general', 'slowdown', 'from', 'the', 'U.S.', 'recession', '.', 'The', 'tension', 

 63%|██████▎   | 121/192 [10:09<05:58,  5.05s/it]

Parsing ----> 2_11ecbplus.xml


 64%|██████▎   | 122/192 [10:16<06:28,  5.55s/it]

Parsing ----> 2_3ecb.xml


 64%|██████▍   | 123/192 [10:31<09:25,  8.20s/it]

Parsing ----> 2_2ecbplus.xml


 65%|██████▍   | 124/192 [10:32<07:02,  6.21s/it]

Parsing ----> 2_3ecbplus.xml


 65%|██████▌   | 125/192 [10:34<05:30,  4.93s/it]

Parsing ----> 2_1ecbplus.xml


 66%|██████▌   | 126/192 [10:40<05:46,  5.25s/it]

Not matching:  ['http://www.deadline.com/2013/08/ellen-degeneres-to-host-oscars/', 'Ellen', 'DeGeneres', 'To', 'Host', '86th', 'Oscars', 'Friday', 'August', '2', ',', '2013', '@', '8', ':', '13am', 'PDT', 'Ellen', 'DeGeneres', 'just', 'tweeted', 'she', 'will', 'be', 'hosting', 'the', 'Oscars', 'this', 'year', '.', 'It', 'will', 'be', 'her', 'second', 'stint', 'in', 'the', 'job', ',', 'after', 'hosting', 'the', '2007', 'ceremony', 'and', 'earning', 'an', 'Emmy', 'nomination', 'for', 'it', '.', '“', 'It', "'s", 'official', ':', 'I', '’', 'm', 'hosting', 'the', '#', 'Oscars', '!', 'I', '’', 'd', 'like', 'to', 'thank', '@TheAcademy', ',', 'my', 'wife', 'Portia', 'and', ',', 'oh', 'dear', ',', 'there', 'goes', 'the', 'orchestra', '“', ',', 'the', 'comedian', 'and', 'talk', 'show', 'host', 'said', 'on', 'her', 'Twitter', 'feed', 'just', 'now', '.', 'The', 'announcement', 'of', 'DeGeneres', 'as', 'host', 'comes', 'just', 'a', 'couple', 'of', 'days', 'after', 'the', 'election', 'of', 'Cheryl',

 66%|██████▌   | 127/192 [10:47<06:07,  5.65s/it]

Parsing ----> 2_4ecb.xml


 67%|██████▋   | 128/192 [11:02<09:12,  8.63s/it]

Not matching:  ['He', 'sings', ',', 'dances', ',', 'acts', 'on', 'stage', 'and', 'screen', ',', 'and', 'he', "'s", 'the', '"', 'Sexiest', 'Man', 'Alive', '.', '"', 'Hugh', 'Jackman', 'does', 'everything', 'but', 'standup', 'comedy', '-', 'and', 'that', "'s", 'why', 'the', 'first', '-', 'time', 'Oscars', 'host', 'fulfills', 'the', 'academy', "'s", 'promise', 'to', 'shake', 'up', 'the', 'show', 'in', 'a', 'way', 'that', "'s", 'as', 'tough', 'to', 'predict', 'as', 'the', 'winners', '.', 'Jackman', 'was', 'announced', 'Friday', 'as', 'the', 'host', 'of', 'the', '81st', 'Academy', 'Awards', ',', 'a', 'marked', 'departure', 'from', 'the', 'academy', "'s", 'standard', 'of', 'big', '-', 'name', 'comedians', '.', 'Jon', 'Stewart', ',', 'who', 'hosted', 'in', '2008', 'and', '2006', ',', 'and', 'Ellen', 'DeGeneres', ',', 'the', '2007', 'host', ',', 'were', 'the', 'latest', 'in', 'a', 'line', 'of', 'funny', 'emcees', 'since', '1990', '.', 'Billy', 'Crystal', 'did', 'it', 'eight', 'times', ',', 'Wh

 67%|██████▋   | 129/192 [11:03<06:42,  6.39s/it]

Parsing ----> 2_5ecb.xml


 68%|██████▊   | 130/192 [11:11<07:04,  6.84s/it]

Parsing ----> 2_4ecbplus.xml


 68%|██████▊   | 131/192 [11:15<06:01,  5.93s/it]

Parsing ----> 2_6ecb.xml


 69%|██████▉   | 132/192 [11:25<07:12,  7.21s/it]

Parsing ----> 2_6ecbplus.xml


 69%|██████▉   | 133/192 [11:29<05:58,  6.07s/it]

Not matching:  ['http://www.nytimes.com/2013/08/03/business/media/ellen-degeneres-to-host-next-years-oscars', '.', 'html?_r=0', 'Ellen', 'DeGeneres', 'to', 'Host', 'Next', 'Year', "'s", 'Oscars', 'Published', ':', 'August', '2', ',', '2013', 'Reaching', 'back', 'to', 'television', 'for', 'a', 'comedy', 'star', ',', 'the', 'producers', 'of', 'the', 'Academy', 'Awards', 'telecast', 'announced', 'Friday', 'that', 'Ellen', 'DeGeneres', 'would', 'return', 'next', 'year', 'to', 'host', 'the', 'annual', 'Oscar', 'ceremony', '.', 'The', 'producers', ',', 'Craig', 'Zadan', 'and', 'Neil', 'Meron', ',', 'caused', 'some', 'backlash', 'last', 'year', 'after', 'giving', 'the', 'job', 'to', 'Seth', 'MacFarlane', ',', 'the', 'creator', 'of', 'the', '“', 'Family', 'Guy', '”', 'animated', 'show', 'on', 'the', 'Fox', 'network', '.', 'Ms.', 'DeGeneres', 'has', 'even', 'less', 'to', 'do', 'with', 'the', 'theatrical', 'movie', 'business', 'than', 'Mr.', 'MacFarlane', 'does', '(', 'his', 'movie', '“', 'Ted',

 70%|██████▉   | 134/192 [12:06<14:56, 15.45s/it]

Parsing ----> 2_7ecbplus.xml


 70%|███████   | 135/192 [12:09<11:07, 11.71s/it]

Parsing ----> 2_8ecb.xml


 71%|███████   | 136/192 [12:20<10:46, 11.55s/it]

Not matching:  ['Now', 'that', 'it', "'s", 'been', 'confirmed', 'that', 'Hugh', 'Jackman', 'will', 'be', 'hosting', 'the', '81st', 'Annual', 'Academy', 'Awards', ',', 'instantly', 'upping', 'the', 'annual', 'ceremony', "'s", 'overall', 'attractiveness', ',', 'the', 'Ampersand', 'would', 'like', 'to', 'offer', 'some', 'heartfelt', 'advice', 'to', 'the', 'first', '-', 'time', 'Oscar', 'host', '.', 'True', ',', 'Jackman', 'held', 'his', 'own', 'just', 'fine', 'as', 'the', 'host', 'of', 'both', 'the', '2004', 'and', '2005', 'Tony', 'Awards', ',', 'earning', 'an', 'Emmy', 'for', 'the', 'former', ',', 'but', 'this', 'is', 'a', 'whole', 'new', 'ballgame', '.', 'So', 'without', 'further', 'ado', ',', 'Hugh', ',', 'a', 'list', 'of', 'dos', 'and', "don'ts", 'when', 'hosting', 'the', 'Academy', 'Awards', ':', 'DO', 'Keep', 'jokes', 'tight', ',', 'snappy', 'and', 'to', '-', 'the', '-', 'point', ',', 'a', 'la', 'Jon', 'Stewart', '.', 'Stewart', 'pulled', 'off', 'a', 'television', 'miracle', 'last',

 71%|███████▏  | 137/192 [12:30<10:08, 11.06s/it]

Not matching:  ['http://www.reuters.com/article/2013/08/02/us-oscars-degeneres-idUSBRE9710UQ20130802', 'Comedian', 'Ellen', 'DeGeneres', 'picked', 'to', 'host', '2014', 'Oscars', 'Fri', 'Aug', '2', ',', '2013', '6', ':', '50pm', 'EDT', 'Comedian', 'Ellen', 'DeGeneres', 'will', 'make', 'her', 'second', 'appearance', 'as', 'an', 'Academy', 'Awards', 'host', 'next', 'year', ',', 'organizers', 'said', 'on', 'Friday', ',', 'in', 'a', 'move', 'widely', 'seen', 'as', 'a', 'less', 'provocative', 'choice', 'for', 'Hollywood', "'s", 'highest', 'honors', 'after', 'a', 'ribald', 'performance', 'by', 'Seth', 'MacFarlane', '.', 'DeGeneres', ',', 'the', 'star', 'of', 'her', 'own', 'daytime', 'talk', 'show', '"', 'Ellen', ',', '"', 'first', 'hosted', 'the', 'Oscars', 'in', '2007', ',', 'becoming', 'only', 'the', 'second', 'woman', 'to', 'fill', 'that', 'exalted', 'role', 'alone', ',', 'after', 'Whoopi', 'Goldberg', '.', '"', 'I', 'am', 'so', 'excited', 'to', 'be', 'hosting', 'the', 'Oscars', 'for', 't

 72%|███████▏  | 138/192 [12:36<08:30,  9.45s/it]

Parsing ----> 2_9ecb.xml


 72%|███████▏  | 139/192 [12:42<07:32,  8.53s/it]

Parsing ----> 5_10ecb.xml


 73%|███████▎  | 140/192 [12:52<07:51,  9.07s/it]

Not matching:  ['The', 'Philadelphia', '76ers', 'fired', 'coach', 'Maurice', 'Cheeks', 'on', 'Saturday', ',', 'one', 'day', 'after', 'the', 'team', 'continued', 'its', 'slide', 'with', 'a', 'season', '-', 'worst', 'offensive', 'effort', ',', 'dpa', 'reported', '.', 'Expected', 'by', 'many', 'to', 'be', 'one', 'of', 'the', 'top', 'teams', 'in', 'the', 'Eastern', 'Conference', ',', 'the', '76ers', 'have', 'opened', '9', '-', '14', 'with', 'eight', 'losses', 'in', 'their', 'last', '10', 'games', ',', 'including', 'Friday', "'s", 'dispirited', '88', '-', '72', 'setback', 'at', 'Cleveland', '.', 'Cheeks', 'was', 'replaced', 'on', 'an', 'interim', 'basis', 'by', 'assistant', 'general', 'manager', 'Tony', 'DiLeo', ',', 'a', 'long', '-', 'time', 'member', 'of', 'the', 'team', "'s", 'front', 'office', '.', 'DiLeo', 'will', 'coach', 'the', 'Sixers', 'in', 'Saturday', "'s", 'game', 'at', 'Washington', 'and', 'is', 'expected', 'to', 'remain', 'coach', 'for', 'the', 'rest', 'of', 'the', 'season', '

 73%|███████▎  | 141/192 [12:55<05:59,  7.05s/it]

Parsing ----> 5_13ecb.xml


 74%|███████▍  | 142/192 [12:59<05:16,  6.33s/it]

Parsing ----> 5_11ecb.xml


 74%|███████▍  | 143/192 [13:01<04:00,  4.92s/it]

Parsing ----> 5_10ecbplus.xml


 75%|███████▌  | 144/192 [13:02<02:59,  3.74s/it]

Not matching:  ['http://forums', '.', 'nba-live.com/viewtopic', '.', 'php', '?', 'f=44&t=25677', 'Sixers', 'Hire', 'Maurice', 'Cheeks', 'as', 'Head', 'Coach', 'Maurice', 'Cheeks', 'was', 'officially', 'introduced', 'as', 'the', '21st', 'head', 'coach', 'of', 'the', '76ers', '.', 'Cheeks', ',', 'who', 'played', 'for', 'the', 'Sixers', 'for', '11', 'seasons', 'and', 'served', 'as', 'an', 'assistant', 'coach', 'for', 'seven', ',', 'was', 'named', 'coach', 'after', 'Jim', "O'Brien", 'was', 'relieved', 'of', 'his', 'coaching', 'duties', '.'] ['http://forums', '.', 'nba-live.com/viewtopic', '.', 'php', '?', 'f', '=', '44', '&', 't', '=', '25677', 'Sixers', 'Hire', 'Maurice', 'Cheeks', 'as', 'Head', 'Coach', 'Maurice', 'Cheeks', 'was', 'officially', 'introduced', 'as', 'the', '21st', 'head', 'coach', 'of', 'the', '76ers', '.', 'Cheeks', ',', 'who', 'played', 'for', 'the', 'Sixers', 'for', '11', 'seasons', 'and', 'served', 'as', 'an', 'assistant', 'coach', 'for', 'seven', ',', 'was', 'named', 

 76%|███████▌  | 145/192 [13:07<03:14,  4.14s/it]

Parsing ----> 5_2ecb.xml


 76%|███████▌  | 146/192 [13:14<03:45,  4.89s/it]

Parsing ----> 5_1ecb.xml


 77%|███████▋  | 147/192 [13:21<04:16,  5.71s/it]

Parsing ----> 5_4ecb.xml


 77%|███████▋  | 148/192 [13:44<07:56, 10.83s/it]

Parsing ----> 5_3ecbplus.xml


 78%|███████▊  | 149/192 [13:44<05:28,  7.65s/it]

Not matching:  ['http://query', '.', 'nytimes.com/gst/fullpage', '.', 'html', '?', 'res=9402E6D71239F937A15756C0A9639C8B63', 'PHILADELPHIA', '76ERS', 'Fired', 'Jim', "O'Brien", ',', 'coach', '.', 'Named', 'Maurice', 'Cheeks', 'coach', '.'] ['http://query', '.', 'nytimes.com/gst/fullpage', '.', 'html', '?', 'res', '=', '9402E6D71239F937A15756C0A9639C8B63', 'PHILADELPHIA', '76ERS', 'Fired', 'Jim', "O'Brien", ',', 'coach', '.', 'Named', 'Maurice', 'Cheeks', 'coach', '.']
Skip ----> 5_3ecbplus.xml
Parsing ----> 5_3ecb.xml


 78%|███████▊  | 150/192 [13:57<06:29,  9.27s/it]

Parsing ----> 5_2ecbplus.xml


 79%|███████▊  | 151/192 [14:12<07:18, 10.71s/it]

Not matching:  ['http://sports', '.', 'espn', '.', 'go.com/nba/news/story', '?', 'id=2066717', 'Mo', 'Cheeks', 'will', 'take', 'over', 'as', 'new', 'coach', 'Updated', ':', 'May', '24', ',', '2005', ',', '3', ':', '23', 'PM', 'ET', 'Maurice', 'Cheeks', 'is', 'running', 'the', 'show', 'again', 'for', 'the', 'Philadelphia', '76ers', '.', 'In', 'a', 'surprising', 'move', 'nearly', 'three', 'weeks', 'following', 'the', 'end', 'of', 'their', 'season', ',', 'the', 'Sixers', 'fired', 'Jim', "O'Brien", 'on', 'Monday', 'after', 'one', 'season', 'as', 'coach', 'and', 'replaced', 'him', 'with', 'Cheeks', ',', 'one', 'of', 'the', 'most', 'popular', 'players', 'in', 'franchise', 'history', '.', '"', 'Mo', 'is', 'family', '.', 'Mo', 'bleeds', '76ers', '.', 'He', 'bleeds', 'Philadelphia', ',', '"', 'team', 'president', 'Billy', 'King', 'said', '.', 'Cheeks', 'will', 'be', 'introduced', 'as', 'Philadelphia', "'s", '21st', 'head', 'coach', 'at', 'a', 'Tuesday', 'news', 'conference', '.', 'Philadelphia'

 79%|███████▉  | 152/192 [14:13<05:13,  7.83s/it]

Not matching:  ['http://sportzassassin2', '.', 'blogspot', '.', 'nl/2005_05_01', '_archive', '.', 'html', 'Tuesday', ',', 'May', '24', ',', '2005', 'Up', 'the', 'coast', '...', 'the', 'Philly', 'Sixers', 'canned', 'Jim', "O'Brien", '[', 'who', 'had', 'been', 'under', 'fire', 'all', 'season', ']', 'and', 'quickly', 'hired', 'Mo', 'Cheeks', '.', 'Mo', 'ran', 'the', 'Sixers', 'in', 'the', '1980s', 'as', 'their', 'point', 'guard', '.', 'He', 'looks', 'to', 'be', 'a', 'perfect', 'coach', 'for', 'Allen', 'Iverson', '.', 'Really', ',', 'this', 'move', 'has', 'been', 'in', 'the', 'making', 'for', 'quite', 'some', 'time', '.', 'The', 'Sixers', 'have', 'bugged', 'Portland', 'about', 'talking', 'to', 'him', 'about', 'their', 'head', 'coaching', 'job', 'over', 'the', 'past', '3', 'years', '.', 'After', 'the', 'Blazers', 'fired', 'Cheeks', '...', 'it', 'was', 'only', 'a', 'matter', 'of', 'time', 'before', 'Philly', 'and', 'Cheeks', 'hooked', 'up', 'again', '.'] ['http://sportzassassin2', '.', 'blog

 80%|███████▉  | 153/192 [14:18<04:35,  7.07s/it]

Parsing ----> 5_4ecbplus.xml


 80%|████████  | 154/192 [14:28<04:59,  7.89s/it]

Parsing ----> 5_5ecb.xml


 81%|████████  | 155/192 [14:30<03:53,  6.30s/it]

Parsing ----> 5_6ecbplus.xml


 81%|████████▏ | 156/192 [14:39<04:12,  7.01s/it]

Not matching:  ['http://sports', '.', 'espn', '.', 'go.com/ncaa/news/story', '?', 'id=2061565', 'NCAA', ':', 'Improper', 'gifts', ',', 'assistance', 'given', 'to', 'athletes', 'Updated', ':', 'May', '16', ',', '2005', ',', '6', ':', '03', 'PM', 'ET', 'The', 'NCAA', 'notified', 'Ohio', 'State', 'on', 'Monday', 'of', 'nine', 'alleged', 'rules', 'violations', ',', 'including', 'cash', 'gifts', ',', 'improper', 'academic', 'assistance', 'and', 'failure', 'of', 'the', 'school', 'and', 'coaches', 'to', 'monitor', 'the', 'men', "'s", 'basketball', 'program', '.', 'Seven', 'of', 'the', 'violations', 'involved', 'men', "'s", 'basketball', ',', 'including', 'a', 'school', 'booster', 'allegedly', 'giving', 'cash', 'and', 'academic', 'help', 'to', 'a', 'former', 'player', '.', 'The', 'others', 'involved', 'a', 'booster', 'allegedly', 'giving', '$', '500', 'to', 'a', 'football', 'player', ',', 'and', 'an', 'orthodontist', 'providing', 'free', 'and', 'discounted', 'services', 'to', 'five', 'women', 

 82%|████████▏ | 157/192 [14:39<02:54,  5.00s/it]

Not matching:  ['http://nbasource', '.', 'blogspot', '.', 'nl/2005_05_01', '_archive', '.', 'html', 'MONDAY', ',', 'MAY', '23', ',', '2005', 'FIRED', 'IN', 'PHILLY', '!', '!', 'Sixers', "'", 'Coach', 'Jim', "O'Brien", 'was', 'fired', 'today', 'and', 'taking', 'over', 'will', 'be', 'former', 'Portland', 'Trailblazers', 'Coach', '...'] ['http://nbasource', '.', 'blogspot', '.', 'nl', '/', '2005_05_01_archive', '.', 'html', 'MONDAY', ',', 'MAY', '23', ',', '2005', 'FIRED', 'IN', 'PHILLY', '!', '!', 'Sixers', "'", 'Coach', 'Jim', "O'Brien", 'was', 'fired', 'today', 'and', 'taking', 'over', 'will', 'be', 'former', 'Portland', 'Trailblazers', 'Coach', '...']
Skip ----> 5_7ecbplus.xml
Parsing ----> 5_9ecb.xml


 82%|████████▏ | 158/192 [14:45<03:01,  5.35s/it]

Parsing ----> 5_8ecbplus.xml


 83%|████████▎ | 159/192 [14:47<02:16,  4.14s/it]

Not matching:  ['https://law', '.', 'marquette', '.', 'edu', '/', 'national', '-', 'sports', '-', 'law', '-', 'institute', '/', 'volume-7', '-', '1', 'Jim', "O'Brien", 'was', 'terminated', 'from', 'his', 'position', 'as', 'head', 'men', "'s", 'basketball', 'coach', 'at', 'Ohio', 'State', 'University', 'following', 'the', 'disclosure', 'that', 'he', 'loaned', 'a', 'recruit', "'s", 'mother', 'money', '.', "O'Brien", 'sued', ',', 'claiming', 'he', 'was', 'terminated', 'without', 'cause', ',', 'and', 'both', 'parties', 'filed', 'motions', 'for', 'summary', 'judgment', '.', 'The', 'court', 'of', 'claims', 'denied', 'the', 'motions', 'because', 'there', 'was', 'a', 'factual', 'dispute', 'concerning', 'what', 'constituted', 'a', 'material', 'breach', 'under', 'the', 'employment', 'contract', '.'] ['https://law', '.', 'marquette', '.', 'edu', '/', 'national', '-', 'sports', '-', 'law', '-', 'institute', '/', 'volume', '-', '7', '-', '1', 'Jim', "O'Brien", 'was', 'terminated', 'from', 'his', 'p

 83%|████████▎ | 160/192 [14:51<02:14,  4.21s/it]

Not matching:  ['http://sports', '.', 'espn', '.', 'go.com/nba/news/story', '?', 'id=2115562', 'Agent', 'says', 'Sixers', 'to', 'sign', 'Dalembert', 'to', 'six', '-', 'year', 'deal', 'July', '25', ',', '2005', ',', '12', ':', '18', 'AM', 'ET', 'Samuel', 'Dalembert', 'agreed', 'to', 'a', 'six', 'year', 'deal', 'with', 'the', 'Philadelphia', '76ers', 'on', 'Saturday', ',', 'his', 'agent', 'Marc', 'Cornstein', ',', 'told', 'ESPN.com', '.', 'Cornstein', 'refused', 'to', 'disclose', 'the', 'amount', 'of', 'the', 'contract', ',', 'but', 'it', "'s", 'believed', 'to', 'be', 'worth', 'between', '$', '60', 'million', 'to', '$', '70', 'million', ',', 'based', 'on', 'previous', 'negotiations', '.', 'The', 'signing', 'came', 'just', 'one', 'day', 'before', 'Dalembert', 'was', 'set', 'to', 'get', 'on', 'a', 'plane', 'and', 'visit', 'the', 'Atlanta', 'Hawks', ',', 'who', 'had', 'the', 'money', 'under', 'the', 'cap', 'to', 'pay', 'him', 'a', 'maximum', 'five', '-', 'year', ',', '$', '70', 'million', '

 84%|████████▍ | 161/192 [14:55<02:06,  4.07s/it]

Parsing ----> 5_1ecbplus.xml


 84%|████████▍ | 162/192 [15:15<04:22,  8.76s/it]

Not matching:  ['http://articles', '.', 'philly.com/2005-05-24/news/25441279_1_sixers-samuel-dalembert-10-win-improvement', 'Cheeks', 'returns', 'as', 'Sixers', "'", 'fixer', 'Jim', "O'Brien", 'was', 'shown', 'the', 'door', 'after', 'a', 'single', 'tumultuous', 'season', '.', 'Posted', ':', 'May', '24', ',', '2005', 'Billy', 'King', 'spun', 'the', '76ers', "'", 'head', 'coach', 'revolving', 'door', 'again', 'yesterday', ',', 'pushing', 'out', 'Jim', "O'Brien", 'after', 'just', 'one', 'season', 'and', 'bringing', 'in', 'Maurice', 'Cheeks', ',', 'the', 'popular', 'former', 'Sixers', 'player', 'and', 'assistant', 'coach', ',', 'as', "O'Brien", "'s", 'replacement', '.', 'Nearly', 'three', 'weeks', 'after', 'the', 'Sixers', 'ended', 'their', 'season', ',', 'King', 'decided', 'to', 'fire', "O'Brien", ',', 'who', 'still', 'has', 'two', 'years', 'and', '$', '8', 'million', 'left', 'on', 'the', 'contract', 'he', 'signed', 'when', 'he', 'was', 'hired', 'by', 'the', 'Sixers', 'on', 'April', '21',

 85%|████████▍ | 163/192 [15:16<03:06,  6.42s/it]

Parsing ----> 12_11ecbplus.xml


 85%|████████▌ | 164/192 [15:21<02:49,  6.05s/it]

Parsing ----> 12_13ecb.xml


 86%|████████▌ | 165/192 [15:23<02:13,  4.93s/it]

Parsing ----> 12_10ecbplus.xml


 86%|████████▋ | 166/192 [15:26<01:54,  4.41s/it]

Parsing ----> 12_11ecb.xml


 87%|████████▋ | 167/192 [15:28<01:31,  3.66s/it]

Parsing ----> 12_12ecb.xml


 88%|████████▊ | 168/192 [15:59<04:41, 11.74s/it]

Not matching:  ['In', 'another', 'successful', 'anti', '-', 'piracy', 'operation', ',', 'Navy', 'warship', 'on', 'Saturday', 'repulsed', 'an', 'attack', 'on', 'a', 'merchant', 'vessel', 'in', 'the', 'Gulf', 'of', 'Aden', 'and', 'nabbed', '23', 'Somali', 'and', 'Yemeni', 'sea', 'brigands', ',', 'in', 'a', 'show', 'of', 'resolve', 'to', 'weed', 'out', 'the', 'menace', 'that', 'affected', 'maritime', 'trade', 'in', 'the', 'region', '.', 'The', 'pirates', 'on', 'two', 'speed', 'boats', 'had', 'surrounded', 'the', 'merchant', 'vessel', 'flying', 'the', 'Ethiopian', 'flag', 'around', 'noon', ',', 'when', 'INS', 'Mysore', 'warship', 'intervened', 'and', 'warded', 'off', 'the', 'attack', ',', 'Navy', 'spokesperson', 'said', '.', 'The', 'pirates', 'had', 'fired', 'at', 'the', 'merchant', 'vessel', 'with', 'their', 'small', 'arms', ',', 'when', 'it', 'sent', 'out', 'a', 'rescue', 'call', 'and', 'the', 'Indian', 'warship', ',', 'which', 'was', 'sailing', 'nearby', 'moved', 'its', 'Marine', 'Comma

 88%|████████▊ | 169/192 [16:11<04:36, 12.01s/it]

Skip ----> 12_17ecb.xml
Parsing ----> 12_15ecb.xml


 89%|████████▊ | 170/192 [16:12<03:10,  8.64s/it]

Not matching:  ['MSNBC', 'is', 'reporting', 'that', 'the', 'Indian', 'Navy', 'claims', 'they', 'have', 'captured', '23', 'pirates', 'in', 'the', 'Gulf', 'of', 'Aden', '.', 'This', 'the', 'region', 'where', 'so', 'many', 'ships', 'have', 'recently', 'been', 'commandeered', '.', 'They', 'had', '"', 'a', 'large', 'cache', 'of', 'weapons', 'including', '7', 'AK-47', 'rifles', ',', 'a', 'rocket', 'propelled', 'grenade', 'launcher', ',', 'and', '3', 'machine', 'guns', '.', '"', 'They', 'are', 'taking', 'the', 'vessels', 'to', 'pirate', 'controlled', 'regions', 'in', 'Somalia', ',', 'with', 'an', 'estimated', '1,500', 'pirates', '.'] ['MSNBC', 'is', 'reporting', 'that', 'the', 'Indian', 'Navy', 'claims', 'they', 'have', 'captured', '23', 'pirates', 'in', 'the', 'Gulf', 'of', 'Aden', '.', 'This', 'the', 'region', 'where', 'so', 'many', 'ships', 'have', 'recently', 'been', 'commandeered', '.', 'They', 'had', '"', 'a', 'large', 'cache', 'of', 'weapons', 'including', '7', 'AK', '-', '47', 'rifles

 89%|████████▉ | 171/192 [16:14<02:18,  6.59s/it]

Parsing ----> 12_16ecb.xml


 90%|████████▉ | 172/192 [16:20<02:06,  6.30s/it]

Not matching:  ['Commandos', 'from', 'an', 'Indian', 'warship', 'today', 'caught', 'pirates', 'in', 'the', 'act', 'in', 'the', 'Gulf', 'of', 'Aden', 'and', 'took', '23', 'of', 'them', 'into', 'custody', ',', 'a', 'spokesperson', 'at', 'naval', 'headquarters', 'said', '.', 'Indian', 'naval', 'ship', 'Mysore', ',', 'a', 'destroyer', ',', 'picked', 'up', 'a', 'distress', 'call', 'from', 'merchant', 'vessel', 'Gibe', ',', 'which', 'flies', 'an', 'Ethiopian', 'flag', ',', 'when', 'it', 'was', '13', 'nautical', 'miles', '(', 'about', '24', 'km', ')', 'from', 'the', 'cargo', 'ship', '.', 'Details', 'of', 'what', 'exactly', 'happened', 'are', 'still', 'awaited', '.', 'The', 'Gibe', 'was', 'about', '150', 'nautical', 'miles', '(', 'about', '277', 'km', ')', 'from', 'Aden', 'when', 'the', 'pirates', ',', 'said', 'to', 'be', 'in', 'a', 'dhow', 'or', 'fishing', 'vessel', 'named', 'Salaluddin', ',', 'fired', 'at', 'it', 'and', 'tried', 'to', 'board', 'it', ',', 'according', 'to', 'information', 'av

 90%|█████████ | 173/192 [16:29<02:15,  7.12s/it]

Not matching:  ['The', 'Indian', 'Navy', 'on', 'Saturday', 'prevented', 'pirates', 'from', 'attacking', 'a', 'merchant', 'vessel', 'flying', 'an', 'Ethiopian', 'flag', 'in', 'the', 'Gulf', 'of', 'Aden', 'and', 'took', '23', 'into', 'custody', '.', 'Responding', 'to', 'a', 'distress', 'call', 'by', 'MV', 'Gibe', ',', 'INS', 'Mysore', 'on', 'anti', '-', 'piracy', 'patrol', 'in', 'the', 'area', ',', 'swung', 'into', 'action', 'against', 'the', 'pirates', 'in', 'two', 'boats', ',', 'a', 'Defence', 'Ministry', 'release', 'said', '.', 'The', 'apprehended', 'included', '12', 'Somalis', 'and', '11', 'Yemenis', '.', 'Seven', 'AK-47s', ',', 'three', 'other', 'automatic', 'rifles', ',', '13', 'loaded', 'magazines', ',', 'a', 'rocket', '-', 'propelled', 'grenade', '-', 'launcher', ',', 'rockets', ',', 'cartridges', 'and', 'grenades', 'were', 'found', 'in', 'the', 'boats', '.', 'There', 'were', 'also', 'three', 'outboard', 'motors', 'and', 'a', 'global', 'positioning', 'system', 'receiver', '.', 'T

 91%|█████████ | 174/192 [16:50<03:23, 11.28s/it]

Parsing ----> 12_18ecb.xml


 91%|█████████ | 175/192 [16:59<03:02, 10.71s/it]

Parsing ----> 12_1ecbplus.xml


 92%|█████████▏| 176/192 [17:49<06:00, 22.53s/it]

Not matching:  ['http://www.thehindu.com/news/national/navy-foils-somali-pirate-attack-off-gulf-of-aden/article2618961', '.', 'ece', 'Updated', ':', 'November', '11', ',', '2011', '20', ':', '09', 'IST', 'Navy', 'foils', 'Somali', 'pirate', 'attack', 'off', 'Gulf', 'of', 'Aden', 'The', 'Indian', 'Navy', 'patrolling', 'the', 'Gulf', 'of', 'Aden', 'on', 'Thursday', 'thwarted', 'a', 'multi', '-', 'boat', 'attack', 'by', 'sea', 'brigands', 'on', 'merchant', 'vessels', ',', 'apprehending', '26', 'Somali', 'pirates', 'and', 'confiscating', 'arms', 'and', 'ammunition', ',', 'in', 'the', 'fifth', 'successful', 'anti', '-', 'piracy', 'operation', 'since', 'September', '.', 'At', '9', '.', '25', 'a.m.', 'on', 'Thursday', ',', 'navy', 'personnel', 'aboard', 'warship', 'INS', 'Sukanya', 'spotted', 'a', 'group', 'of', 'five', 'suspicious', 'boats', 'speedily', 'approaching', 'the', 'merchant', 'vessels', 'of', 'her', 'group', '.', '“', 'The', 'warship', 'immediately', 'altered', '(', 'its', 'direct

 92%|█████████▏| 177/192 [17:54<04:17, 17.16s/it]

Parsing ----> 12_3ecbplus.xml


 93%|█████████▎| 178/192 [18:40<06:00, 25.78s/it]

Not matching:  ['http://www.defence', '.', 'pk', '/', 'forums', '/', 'indian', '-', 'defence/140085-indian', '-', 'navy', '-', 'foils', '-', 'attack', '-', 'gulf', '-', 'aden', '-', 'nabs', '-26', '-', 'pirates', '.', 'html', 'Indian', 'Navy', 'foils', 'attack', 'in', 'Gulf', 'of', 'Aden', ',', 'nabs', '26', 'pirates', 'Mumbai', ',', 'Nov', '11', '-', 'Indian', 'Navy', 'patrolling', 'the', 'Gulf', 'of', 'Aden', 'yesterday', 'thwarted', 'a', 'multi', '-', 'boat', 'attack', 'by', 'sea', 'brigands', 'on', 'merchant', 'vessels', ',', 'apprehending', '26', 'Somali', 'pirates', 'and', 'confiscating', 'arms', 'and', 'ammunition', ',', 'in', 'the', 'fifth', 'successful', 'anti', '-', 'piracy', 'operation', 'since', 'September', '.', 'At', 'about', '9', ':', '25', 'Thursday', 'morning', ',', 'navy', 'personnel', 'aboard', 'warship', 'INS', 'Sukanya', 'spotted', 'a', 'group', 'of', 'five', 'suspicious', 'boats', 'speedily', 'approaching', 'the', 'merchant', 'vessels', 'of', 'her', 'group', '.', 

 93%|█████████▎| 179/192 [18:51<04:37, 21.38s/it]

Not matching:  ['The', 'Indian', 'navy', 'captured', '23', 'pirates', 'who', 'threatened', 'a', 'merchant', 'vessel', 'in', 'the', 'lawless', 'waters', 'of', 'the', 'Gulf', 'of', 'Aden', 'and', 'a', 'German', 'naval', 'helicopter', 'thwarted', 'another', 'attack', 'Saturday', 'on', 'a', 'freighter', 'being', 'chased', 'by', 'speedboats', 'off', 'Yemen', '.', 'The', 'successes', 'came', 'days', 'before', 'Secretary', 'of', 'State', 'Condoleezza', 'Rice', 'was', 'to', 'ask', 'the', 'United', 'Nations', 'to', 'authorize', '"', 'all', 'necessary', 'measures', '"', 'against', 'increasingly', 'bold', 'Somalian', 'pirates', 'operating', 'in', 'one', 'of', 'the', 'world', "'s", 'busiest', 'shipping', 'lanes', '.', 'An', 'Indian', 'navy', 'ship', ',', 'the', 'INS', 'Mysore', ',', 'was', 'escorting', 'merchant', 'ships', 'in', 'waters', 'off', 'Somalia', "'s", 'coast', 'Saturday', 'when', 'it', 'received', 'a', 'distress', 'call', 'from', 'seamen', 'on', 'board', 'the', 'MV', 'Gibe', ',', 'who',

 94%|█████████▍| 180/192 [18:58<03:25, 17.13s/it]

Not matching:  ['http://www.ndtv.com/article/india/indian-ship-thwarts-piracy-attempt-in-gulf-of-aden-26-pirates-arrested-148985', 'Indian', 'ship', 'thwarts', 'piracy', 'attempt', 'in', 'Gulf', 'of', 'Aden', ',', '26', 'pirates', 'arrested', 'Updated', ':', 'November', '11', ',', '2011', '18', ':', '50', 'IST', 'Indian', 'Naval', 'ship', ',', 'INS', 'Sukanya', ',', 'thwarted', 'a', 'piracy', 'attack', 'in', 'the', 'Gulf', 'of', 'Aden', 'and', 'captured', 'three', 'boats', 'of', 'the', 'pirates', '.', 'A', 'statement', 'from', 'the', 'Defence', 'PRO', 'says', 'the', 'incident', 'happened', 'yesterday', 'when', 'INS', 'Sukanya', 'was', 'escorting', 'a', 'group', 'of', 'merchant', 'vessels', '.', 'Five', 'pirate', 'boats', 'tried', 'to', 'attack', 'these', 'vessels', 'but', 'were', 'challenged', 'by', 'the', 'Indian', 'Naval', 'ship', '.', 'After', 'a', 'brief', 'skirmish', ',', 'two', 'of', 'them', 'managed', 'to', 'flee', 'the', 'area', 'but', 'three', 'boats', 'were', 'captured', 'by'

 94%|█████████▍| 181/192 [19:07<02:40, 14.61s/it]

Not matching:  ['Striking', 'another', 'blow', 'against', 'the', 'pirates', 'infesting', 'the', 'waters', 'off', 'the', 'Horn', 'of', 'Africa', ',', 'an', 'Indian', 'Navy', 'warship', 'not', 'only', 'repulsed', 'an', 'attack', 'on', 'a', 'merchant', 'vessel', 'in', 'the', 'Gulf', 'of', 'Aden', ',', 'but', 'nabbed', '23', 'Somali', 'and', 'Yemeni', 'sea', 'brigands', '.', 'The', 'pirates', ',', 'in', 'three', 'boats', ',', 'were', 'attempting', 'to', 'hijack', 'the', 'Ethopian', '-', 'flagged', 'merchant', 'vessel', 'around', 'noon', 'today', ',', 'when', 'the', 'destroyer', 'INS', 'Mysore', 'intervened', 'and', 'repulsed', 'the', 'attack', ',', 'a', 'Navy', 'spokesman', 'said', 'here', '.', 'The', 'incident', 'took', 'place', 'about', '150', 'nautical', 'miles', 'off', 'Aden', '.', 'After', 'the', 'Mysore', 'picked', 'up', 'a', 'distress', 'call', 'from', 'the', 'merchant', 'vessel', ',', 'MV', 'Gibe', ',', 'at', 'around', '11', 'a.m.', 'saying', 'it', 'was', 'under', 'attack', ',', 'i

 95%|█████████▍| 182/192 [19:46<03:41, 22.15s/it]

Parsing ----> 12_4ecbplus.xml


 95%|█████████▌| 183/192 [19:57<02:46, 18.53s/it]

Not matching:  ['http://www.pardaphash.com/news/indian-navy-captures-26-somali-pirates-in-gulf-of-aden/683284', '.', 'html', '#', '.', 'UlFH6xaKwlI', 'Indian', 'Navy', 'captures', '26', 'Somali', 'pirates', 'in', 'Gulf', 'of', 'Aden', 'Published', 'on', ':', 'Fri', ',', '11', 'Nov', '2011', 'at', '09', ':', '56', 'IST', 'New', 'Delhi', '/', 'Mumbai', ':', 'An', 'Indian', 'warship', 'foiled', 'a', 'five', '-', 'boat', 'pirate', 'attack', 'on', 'merchant', 'vessels', 'transiting', 'the', 'Gulf', 'of', 'Aden', 'on', 'Friday', 'capturing', '26', 'Somani', 'brigands', 'on', 'three', 'of', 'the', 'skiffs', 'and', 'confiscating', 'their', 'weapons', '.', 'Patrol', 'vessel', 'INS', 'Sukanya', 'was', 'escorting', 'the', 'five', 'merchant', 'vessels', 'through', 'the', 'Internationally', 'Recognised', 'Transit', 'Corridor', 'in', 'the', 'Gulf', 'of', 'Aden', 'on', 'Thursday', 'morning', 'when', 'the', 'warship', "'s", 'crew', 'noticed', 'the', 'five', '-', 'boat', 'pirate', 'formation', 'approac

 96%|█████████▌| 184/192 [20:15<02:27, 18.41s/it]

Not matching:  ['The', 'Indian', 'Navy', 'on', 'Saturday', 'foiled', 'a', 'hijack', 'attempt', 'in', 'the', 'notorious', 'Gulf', 'of', 'Aden', 'and', 'arrested', '23', 'heavily', 'armed', 'Somalian', 'and', 'Yemeni', 'pirates', '.', 'Within', 'minutes', 'of', 'receiving', 'a', 'distress', 'call', 'from', 'an', 'Ethiopian', 'vessel', ',', 'MV', 'Gibe', ',', 'INS', 'Mysore', ',', 'a', 'Delhi', '-', 'class', 'destroyer', ',', 'launched', 'an', 'armed', 'helicopter', 'with', 'marine', 'commandos', 'onboard', 'to', 'stop', 'the', 'pirates', 'from', 'boarding', 'and', 'hijacking', 'the', 'vessel', '.', 'The', 'skipper', 'of', 'the', 'Ethiopian', 'vessel', ',', 'sailing', '150', 'nautical', 'miles', 'west', 'of', 'Aden', ',', 'radioed', 'a', 'message', 'at', 'around', '11', 'am', '(', 'IST', ')', 'saying', 'that', 'pirates', 'had', 'brought', 'his', 'ship', 'under', 'heavy', 'fire', '.', 'The', 'Mysore', ',', 'which', 'replaced', 'stealth', 'frigate', 'INS', 'Tabar', 'in', 'November', '-', 'e

 96%|█████████▋| 185/192 [20:34<02:11, 18.81s/it]

Not matching:  ['Striking', 'yet', 'another', 'blow', 'at', 'the', 'very', 'heart', 'of', 'piracy', 'in', 'the', 'Gulf', 'of', 'Aden', ',', 'Indian', 'warship', 'INS', 'Mysore', 'and', 'its', 'marine', 'commandos', 'thwarted', 'a', 'hijack', 'attempt', 'on', 'an', 'Ethiopian', 'merchant', 'vessel', 'on', 'Saturday', 'and', 'captured', '23', 'pirates', 'and', 'a', 'large', 'arms', 'cache', 'in', 'the', 'operation', '.', 'The', 'arrest', 'of', 'the', '23', 'pirates', '(', '12', 'Somali', 'and', '11', 'Yemeni', ')', 'and', 'the', 'arms', 'haul', 'is', 'the', 'largest', 'such', 'seizure', 'in', 'the', 'ongoing', 'anti', '-', 'piracy', 'operations', 'off', 'Somalia', '.', 'The', 'confiscated', 'arms', 'and', 'equipment', 'included', 'seven', 'AK-47s', ',', 'three', 'other', 'assault', 'rifles', ',', '13', 'loaded', 'magazines', ',', 'a', 'rocket', '-', 'propelled', 'grenade', 'launcher', 'with', 'two', 'rockets', ',', 'several', 'grenades', ',', 'a', 'GPS', 'set', 'and', 'a', 'mobile', 'pho

 97%|█████████▋| 186/192 [20:38<01:25, 14.28s/it]

Parsing ----> 12_8ecb.xml


 97%|█████████▋| 187/192 [20:50<01:08, 13.69s/it]

Not matching:  ['The', 'Indian', 'navy', 'captured', '23', 'piracy', 'suspects', 'who', 'tried', 'to', 'take', 'over', 'a', 'merchant', 'vessel', 'in', 'the', 'Gulf', 'of', 'Aden', ',', 'between', 'the', 'Horn', 'of', 'Africa', 'and', 'the', 'Arabian', 'Peninsula', ',', 'Indian', 'officials', 'said', '.', 'Piracy', 'suspects', 'raise', 'their', 'hands', 'in', 'surrender', 'as', 'an', 'Indian', 'navy', 'boat', 'approaches', '.', 'In', 'addition', 'to', 'the', '12', 'Somali', 'and', '11', 'Yemeni', 'suspects', ',', 'the', 'Indian', 'navy', 'seized', 'two', 'small', 'boats', 'and', '"', 'a', 'substantial', 'cache', 'of', 'arms', 'and', 'equipment', ',', '"', 'the', 'military', 'said', 'in', 'a', 'statement', '.', 'Among', 'the', 'seized', 'items', 'were', 'seven', 'AK-47', 'automatic', 'rifles', ',', 'three', 'other', 'automatic', 'weapons', 'and', '13', 'loaded', 'magazines', ';', 'a', 'rocket', '-', 'propelled', 'grenade', 'launcher', 'along', 'with', 'rockets', ',', 'cartridges', 'and'

 98%|█████████▊| 188/192 [21:00<00:49, 12.34s/it]

Not matching:  ['http://www.bharatwaves', '.', 'co.', 'in', '/', 'news', '/', 'Indian', '-', 'Navy', '-', 'foils', '-', 'another', '-', 'piracy', '-', 'attempt--26046', '.', 'html', 'A', 'five', '-', 'boat', 'pirate', 'attack', 'on', 'merchant', 'vessels', 'transiting', 'the', 'Gulf', 'of', 'Aden', 'was', 'foiled', 'by', 'an', 'Indian', 'warship', 'that', 'was', 'on', 'escort', 'duty', ',', 'capturing', '26', 'Somani', 'brigands', 'on', 'three', 'of', 'the', 'skiffs', 'and', 'confiscating', 'their', 'weapons', ',', 'a', 'naval', 'officer', 'said', 'here', 'Friday', '.', 'Patrol', 'vessel', 'INS', 'Sukanya', 'was', 'escorting', 'the', 'five', 'merchant', 'vessels', 'through', 'the', 'Internationally', 'Recognised', 'Transit', 'Corridor', 'in', 'the', 'Gulf', 'of', 'Aden', 'Thursday', 'morning', 'when', 'the', 'warship', "'s", 'crew', 'noticed', 'the', 'five', '-', 'boat', 'pirate', 'formation', 'approaching', 'the', 'cargo', 'ship', 'group', '.', '"', 'The', 'warship', 'immediately', 'a

 98%|█████████▊| 189/192 [21:04<00:29,  9.84s/it]

Not matching:  ['http://articles', '.', 'timesofindia', '.', 'indiatimes.com/2011-11-12/india/30391127_1_somali-pirate-attack-indian-warship-merchant-vessels', 'Somali', 'pirate', 'attack', 'foiled', 'in', 'Gulf', 'of', 'Aden', 'TNN', 'Nov', '12', ',', '2011', ',', '05', '.', '51AM', 'IST', 'MUMBAI', ':', 'An', 'Indian', 'warship', 'on', 'Thursday', 'intercepted', 'three', 'boats', 'with', '26', 'Somali', 'pirates', 'in', 'the', 'Gulf', 'of', 'Aden', 'and', 'foiled', 'an', 'attack', 'on', 'merchant', 'vessels', '.', 'INS', 'Sukanya', 'detected', 'five', 'suspicious', 'boats', ',', 'speedily', 'approaching', 'the', 'merchant', 'vessels', 'that', 'the', 'warship', 'was', 'escorting', 'through', 'the', 'Internationally', 'Recognized', 'Transit', 'Corridor', '.', '"', 'The', 'warship', 'immediately', 'altered', 'towards', 'the', 'suspicious', 'vessels', 'and', 'challenged', 'them', '.', 'While', 'two', 'of', 'them', 'managed', 'to', 'escape', ',', 'INS', 'Sukanya', 'successfully', 'interce

 99%|█████████▉| 190/192 [22:04<00:49, 24.89s/it]

Skip (client issue) ----> 12_6ecbplus.xml
Parsing ----> 12_9ecb.xml


 99%|█████████▉| 191/192 [22:10<00:19, 19.40s/it]

Not matching:  ['The', 'Indian', 'navy', 'says', 'it', 'has', 'arrested', '23', 'Somali', 'and', 'Yemeni', 'pirates', 'who', 'tried', 'to', 'storm', 'a', 'ship', 'in', 'the', 'Gulf', 'of', 'Aden', '.', 'A', 'navy', 'spokesman', 'said', 'it', 'had', 'responded', 'to', 'a', 'mayday', 'call', 'from', 'MV', 'Gibe', ',', 'flying', 'under', 'the', 'Ethiopian', 'flag', '.', 'Several', 'countries', 'have', 'warships', 'patrolling', 'the', 'gulf', 'amid', 'growing', 'international', 'concern', 'about', 'piracy', '.', 'Meanwhile', ',', 'US', 'Defence', 'Secretary', 'Robert', 'Gates', 'said', 'better', 'intelligence', 'was', 'needed', 'for', 'a', 'land', 'attack', 'on', 'pirate', 'bases', 'to', 'be', 'considered', '.', 'Mr', 'Gates', ',', 'speaking', 'at', 'a', 'security', 'conference', 'in', 'Bahrain', ',', 'also', 'called', 'for', 'shipping', 'companies', 'to', 'do', 'more', 'to', 'protect', 'their', 'vessels', 'travelling', 'through', 'the', 'Arabian', 'Sean', 'and', 'Indian', 'Ocean', '.', 'T

100%|██████████| 192/192 [23:10<00:00,  7.24s/it]

Skip (client issue) ----> 12_9ecbplus.xml
Files saved with headers Index(['pair', 'label', 'pred', 'mention_pair', 'sent_idx', 'sentence',
       'sent_filter', 'doc_name'],
      dtype='object')
skipped 65 in total





# Shut down the server

In [27]:
# Shut down the background CoreNLP server
client.stop()

# time.sleep(10)
!ps -o pid,cmd | grep java

   2212 /bin/bash -c ps -o pid,cmd | grep java
   2214 grep java
