In [None]:
import os, re, json
import spacy
from functools import partial
import pandas as pd
from tqdm import tqdm
import xml.etree.ElementTree as ET
from nltk.tokenize.treebank import TreebankWordDetokenizer

nlp = spacy.load('en_core_web_sm', disable=['textcat'])

class ECBMentionsDoc:
    """One annotated XML document: its tokens, mentions and coreference clusters.

    ``parse_xml`` reads the file and fills the event/entity mention lists and
    the token table; the remaining methods turn those into tagged or plain text.
    """

    # Problematic tokens in the dataset
    # From the CDLM repo
    # Each entry is (document name, t_id); set_doc_texts skips these tokens.
    error_tokens = [('31_10ecbplus.xml', 979),
                  ('9_3ecbplus.xml', 30),
                  ('9_4ecbplus.xml', 32)]


    def __init__(self, doc_path, doc_name, topic_id):
        self.doc_path = doc_path
        self.doc_name = doc_name
        self.topic_id = topic_id
        self.mentions_fields = {}
        self.mention_cluster_info = {}
        self.relation_source_target = {}
        self.relation_ids = {}
        self.relation_tag = {}
        self.event_singleton_idx = int(1E8)
        self.entity_singleton_idx = int(2E8)
        self.entity_mentions = []
        self.event_mentions = []
        self.tagged_event_tokens = {}
        self.tagged_entity_tokens = {}
        self.doc_token_texts = {}
        self.b_open, self.b_close = "{", "}"
        self.prev_wrap, self.prev_tag_id = '', ''
        self.tag_is_opened = False



    def parse_xml(self):
        # Start parsing
        self.root = ET.parse(self.doc_path).getroot()

        # Set all mention ids from the full document for both event and entity mentions
        self.set_all_marked_mentions()

        # Set all cross doc ids
        self.set_cross_doc_mentions()

        # Creates both arrays containing all event and entity mention info
        self.compute_event_entity_mentions()

        # Parses all the actual tokens from the current document into a dict we can use
        self.set_doc_texts()



    def set_doc_texts(self):
        '''
        Example text:
        <token t_id="53" sentence="4" number="1">Williams</token>
        <token t_id="54" sentence="4" number="2">,</token>
        <token t_id="55" sentence="4" number="3">the</token>
        <token t_id="56" sentence="4" number="4">swimming</token>
        <token t_id="57" sentence="4" number="5">champion</token>
        <token t_id="58" sentence="4" number="6">turned</token>
        <token t_id="59" sentence="4" number="7">actress</token>
        '''
        prev_sent_id = -1
        for token in self.root.findall('token'):
            token_id = int(token.get('t_id'))
            
            # A few tokens per should not be used
            if (self.doc_name, token_id) not in self.error_tokens:
                # Parse actual token text in the right format
                token_text = token.text.replace('�', '').strip()
                sent_id = int(token.get('sentence'))

                # word_id_sent = token.get('number') # word index per sentence
                token_info = (token_text, sent_id)
                
                # Write data for sentence reconstruction
                if (prev_sent_id > -1) and (sent_id != prev_sent_id):
                    prev_token_info = self.doc_token_texts[prev_token_id]
                    self.doc_token_texts[prev_token_id] = (prev_token_info[0] + " [EOS]", prev_token_info[1])

                self.doc_token_texts[token_id] = token_info
                prev_sent_id = sent_id
                prev_token_id = token_id


    def compute_event_entity_mentions(self):
        
        # Loop through all mentions of the current document
        for m_id, mention in self.mentions_fields.items():

            # For this specific mention check if's a source by checking if it maps to a target
            # Since the dict containts {source_mention_id: target_mention_id}
            target_id = self.relation_source_target.get(m_id, None)

            # If it's just a source_id with no second target_id in it's cluster;
            # then we know that this event or enntity mention has to be a singleton
            if target_id is None:
                if mention['event']:
                    cluster_id = self.event_singleton_idx
                    self.event_singleton_idx += 1
                else:
                    cluster_id = self.entity_singleton_idx
                    self.entity_singleton_idx += 1

                # cluster_id =  'Singleton_' + file_name + '_' + m_id
                cluster_desc = ''
            else:
                # Relation id is basically the cluster's id to identify a cluser
                r_id = self.relation_ids[target_id]
                tag = self.relation_tag[target_id] # E.g. CROSS_DOC_COREF
                
                # Only within doc link
                if tag.startswith('INTRA'):
                    # Entity and event mentions may have the same intra cluster id 
                    suffix = '1' if mention['event'] else '0' 
                    cluster_id =  int(r_id + suffix)
                else:
                    # Grab the cluster info dict from the mention clusters we created
                    target_cluster_info = self.mention_cluster_info[target_id]

                    # E.g. ACT16236402809085484
                    target_cluster_id_str = target_cluster_info['cluster_id']

                    # We grab all the integers from this string to construct an int we can use
                    cluster_id = int(target_cluster_id_str[3:])

                # e.g. t4_swimming_skills
                cluster_desc = self.mention_cluster_info[target_id]['cluster_desc']


            # Now that we retrieved the cluster id and description for this mention;
            # We can update the mention dict we create before with this and append;
            # it to the entities correpsonding group -> Event or Entity mention
            mention_info = mention.copy()
            mention_info["cluster_id"] = cluster_id
            mention_info["cluster_desc"] = cluster_desc
            event = mention_info.pop("event")
            if event:
                self.event_mentions.append(mention_info)
            else:
                self.entity_mentions.append(mention_info)


    def set_cross_doc_mentions(self):
        '''
        Example part to parse:
        <CROSS_DOC_COREF r_id="22306" note="ACT16195873839112917">
            <source m_id="28" />
            <source m_id="34" />
            <target m_id="60" />
        </CROSS_DOC_COREF>
        '''

        # Relation -> Cross doc relation
        for relation in self.root.find('Relations'):
            
            # Last element of each cluster is 'target'
            target_mention_id = relation[-1].attrib['m_id']
            
            # All the other elements are of type 'source'
            source_tags = relation[:-1]

            # Set a mapping from coref source id to it's master target
            for source_tag in source_tags:
                source_mention_id = source_tag.attrib['m_id']
                self.relation_source_target[source_mention_id] = target_mention_id

            
            # Save tag 'CROSS_DOC_COREF' 
            self.relation_tag[target_mention_id] = relation.tag

            # Save the target mention id to cross doc id entries
            self.relation_ids[target_mention_id] = relation.attrib['r_id']





    def set_all_marked_mentions(self):
        '''
        Example part to parse:
        <ACTION_ASPECTUAL m_id="53">
            <token_anchor t_id="186"/>
        </ACTION_ASPECTUAL>
        <ACTION_OCCURRENCE m_id="50">
            <token_anchor t_id="179"/>
            <token_anchor t_id="180"/>
            <token_anchor t_id="181"/>
        </ACTION_OCCURRENCE>
        '''

        # Store our results
        subtopic = '0' if 'plus' in self.doc_name else '1'

        for mention in self.root.find('Markables'):
            m_id = mention.attrib['m_id']

            if 'RELATED_TO' not in mention.attrib:

                # ACTION or NEG is an event mention 
                is_event_mention = mention.tag.startswith('ACT') or mention.tag.startswith('NEG')
                
                # Grab all token ids under current Markable tag
                tokens_ids = [int(term.attrib['t_id']) for term in mention]

                if len(tokens_ids) == 0:
                    print(ET.tostring(mention, encoding='unicode'))
                    continue

                # print(is_event_mention, tokens_ids)

                # Indexing our sentences also starts at 0
                token_sent_index = tokens_ids[0]
                sent_id = self.root[token_sent_index].attrib['sentence']

                # Construct the actual mention text, e.g. "Barack Obama"
                # NOTE: We -1 the token id itself, since they started indexing at 1 and map starts at 0
                mention_word_tokens = ' '.join(list(map(lambda x: self.root[x-1].text, tokens_ids)))

                lemmas, tags = [], []
                for tok in nlp(mention_word_tokens):
                    lemmas.append(tok.lemma_)
                    tags.append(tok.tag_)
                
                self.mentions_fields[m_id] = {
                    "doc_id": self.doc_name,
                    "topic": self.topic_id,
                    "subtopic": self.doc_name.split('_')[0] + '_' + subtopic,
                    "m_id": m_id,
                    "sentence_id" : int(sent_id),
                    "tokens_ids": tokens_ids,
                    "tokens": mention_word_tokens,
                    "tags": ' '.join(tags),
                    "lemmas": ' '.join(lemmas),
                    "event": is_event_mention
                }
            else:
                self.mention_cluster_info[m_id] = {
                    "cluster_id": mention.attrib.get('instance_id', ''),
                    "cluster_desc": mention.attrib['TAG_DESCRIPTOR']
                }



    def get_word_tags(self, mentions):
        # Format to df for easy indexing
        mentions_df = pd.DataFrame(mentions)

        # To tag each group of words
        tag_id = 0
        tagged_mention_tokens = {}
        matched_cluster_ids = {}

        # E.g. [10] or [111, 112]
        for _, row in mentions_df[['tokens_ids', 'cluster_id']].iterrows():
            cluster_id = row['cluster_id']
            token_ids = row['tokens_ids']

            # This takes of formatting cluster ids to normal tag ids
            # e.g. 15737539387899295 -> 1 and 15743207473194727 -> 2
            if cluster_id not in matched_cluster_ids:
                matched_cluster_ids[cluster_id] = tag_id
                tag_id += 1
            
            word_tag_id = matched_cluster_ids[cluster_id]

            # So all tokens in a tagged part of a string have the same tag_id
            for token_id in token_ids:
                tagged_mention_tokens[token_id] = word_tag_id

        return tagged_mention_tokens



    def get_wrap_span_word(self, word, token_id, token_tag_opened, max_token_id):
        
        tag_id = self.tagged_entity_tokens[token_id]
        next_token_id = token_id + 1
        word_wrap = word
        
        # If span only has 1 token, then fully wrap it -> Check if next has same tag
        if next_token_id not in self.tagged_entity_tokens.keys():
            if self.b_open in self.prev_wrap:
                word_wrap = f"{word} {self.b_close}"
                self.tag_is_opened = False

            else:
                word_wrap = f"{self.b_open}tag_id:{tag_id} {word} {self.b_close}"
            
        else:
            next_tag_id = self.tagged_entity_tokens[next_token_id]
            
            # Check this is the start of a span
            if self.b_open not in self.prev_wrap:
                
                # Previous token is not the start and the tag ends here so self contained
                if tag_id != next_tag_id:
                    word_wrap = f"{self.b_open}tag_id:{tag_id} {word} {self.b_close}"
                
                
                # Check token itself is the start of a span or middle
                if tag_id != self.prev_tag_id and not self.tag_is_opened:
           
                    # This means we are starting a multi-token span
                    word_wrap = f"{self.b_open}tag_id:{tag_id} {word}"
                    self.tag_is_opened = True
         
            
            # Token tag has already opened, so either closing or a middle token
            else:
                
                # print(tag_id, next_tag_id)
                # If the next tag is different it means this tag ends
                if tag_id != next_tag_id:
                    word_wrap = f"{word} {self.b_close}"
                    self.tag_is_opened = False

                
                # print(" -> Next tag is the same as this one!")
                # A middle word so no closing tag
                word_wrap = word
                
            self.prev_tag_id = tag_id
            
        return word_wrap                        
                

    def clear_url(self):
        
        no_space_doc =  self.original_text.replace(" ", "")

        print(no_space_doc)
        

    def compute_formatted_entity_doc(self, b_open = "{", b_close="}"):
        # Maps each tagged token id to the tag_id we want to use
        # e.g. {token_id: tag_id} {14: 3, 15: 3}
        self.tagged_entity_tokens = self.get_word_tags(self.entity_mentions)

        # To store the actual words/characters of the documents
        self.formatted_doc_tokens = []
        self.clean_doc_tokens = []
        clean_tokens_v2 = []
        token_tag_opened = 1
        max_token_id = len(self.doc_token_texts.keys()) + 1 # Starts at 1
        prev_wrap = ''
        self.original_text = ''

        for token_id, (word, sent_id) in  self.doc_token_texts.items():
            clean_tokens_v2.append(word)
            current_wrap = word
            # Check if the current token is part of a tagged span
            if token_id in self.tagged_entity_tokens:
                current_wrap = self.get_wrap_span_word(word, token_id, token_tag_opened, max_token_id)
                
            next_token_id = token_id
            prev_token_id = token_id if token_id == 1 else token_id - 1

            
            if next_token_id != max_token_id:
                token_exceptions =  [",", ".", "'", "’", "\"", "/", "“","www.", "http", ":", "com/","-"]
                
                
                prev_word = self.doc_token_texts[prev_token_id][0]
                next_word = self.doc_token_texts[next_token_id][0]
                
                # print(word)
                # If next token is aplhanumer, then it means it's word so it needs a space
                if word not in token_exceptions and next_word not in token_exceptions:                    
                    self.formatted_doc_tokens.append(' ')
                    self.clean_doc_tokens.append(' ')
                # else:
                    # print(f"Next word '{next_word[:1]}' is NOT alpha")
            
            # print(word, current_wrap)
            self.formatted_doc_tokens.append(current_wrap)
            self.clean_doc_tokens.append(word)
            self.prev_wrap = current_wrap
        
        # Convert list to 1 string
        self.doc_prompt = ''.join(word for word in self.formatted_doc_tokens)
        self.original_text = ''.join(word for word in self.clean_doc_tokens)
        original_textv2 = TreebankWordDetokenizer().detokenize(clean_tokens_v2)
        # print(clean_tokens_v2)
        # print(original_textv2)
        # print("-----clean v2 custom---\n")
        # print(untokenize(clean_tokens_v2))
        # # Remove url
        # if self.original_text.startswith("http"):
        #     self.clear_url()
        

    # http://www.ws.com/May 2, 2013.. -> http://www.ws.com/ May 2, 2013.. or May 2, 2013..
    def split_url_on_month(self, match, keep_start_url):
        matched_url = match.group()
        months = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']

        # Minimum amount of characters for an url plus month string
        if len(matched_url) > 10 and any([x in matched_url for x in months]):
            
            # Compile a regex with each month as option
            regexPattern = '|'.join(map(re.escape, months))
            
            # Split the string by 1 of the months, also keeping the matched month itself
            matches = re.split(f"({regexPattern})", matched_url, 1)
            new_url, month = matches[0], matches[1]   
            
            # So we want to keep the original url            
            if keep_start_url:
            
                # Return the url with the space in between the month
                return f"{new_url} {month}"
            
            return month

        
        # So we want to keep the original url            
        if keep_start_url:
            return matched_url
        
        # We can just skip the url altogether
        return ''            
    

    # https://stackoverflow.com/questions/21948019/python-untokenize-a-sentence
    # https://github.com/commonsense/metanl/blob/master/metanl/token_utils.py
    def untokenize(self, words, keep_start_url=False):
        """
        Untokenizing a text undoes the tokenizing operation, restoring
        punctuation and spaces to the places that people expect them to be.
        Ideally, `untokenize(tokenize(text))` should be identical to `text`,
        except for line breaks.
        """
        text = ' '.join(words)
        text = text.replace("`` ", '"').replace(" ''", '"').replace('. . .',  '...')
        text = text.replace(" ( ", " (").replace(" ) ", ") ")
        text = re.sub(r' ([.,:;?!%]+)([ \'`])', r"\1\2", text)
        text = re.sub(r' ([.,:;?!%]+)$', r"\1", text)
        text = text.replace(" '", "'").replace(" n't", "n't").replace(
            "can not", "cannot")
        text = text.replace(" ` ", " '").replace(" -", "-").replace("- ", "-")
        text = text.replace(" ,", ",").replace(' /',  '/').replace('/ ',  '/')
        text = text.replace(" ’ s", "'s").replace("“ ", "“").replace(" ”", "”")
        text = text.replace(" ’ s", "'s").replace("“ ", "“").replace(" ”", "”")
        text = text.replace("www. ", "www.").replace(". com", ".com").replace(" ”", "”")
        text = text.replace(" _ ", "_")
        text = text.replace("p. m.", "p.m.").replace("a. m.", "a.m.")

        # Regex to match even amount of ", because removing trailing or start space;
        # Will also remove any characters before and after quotes start.
        # So we need to match the even amount, see: https://stackoverflow.com/a/53436792/8970591
        # Inspiration for regex: https://stackoverflow.com/questions/14906492/how-can-whitespace-be-trimmed-from-a-regex-capture-group
        quote_regex = '\\"\s?([^\]]*?)\s?\\"'
        text = re.sub(quote_regex, '\"'+r'\1'+'\"' , text)
                
        # A lot of articles start with an url in the as the source it came from
        # So we can optionally get rid of this to get a cleaner text for text generation
        first_url_regex = '^(http|ftp|https):\/\/([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])'        
        pattern = re.compile(first_url_regex)
        
        # Partials can be used to make new derived functions that have some input parameters pre-assigned
        re_sub_callback = partial(self.split_url_on_month, keep_start_url=keep_start_url)
        
        # Sometime the first word, which is a month, can be captured by our regex
        # So we want to split on this character to we can keep the month
        text = re.sub(pattern, re_sub_callback, text, 1)


        return text.strip()



    def format_doc_and_mentions(self, keep_start_url=True):
        doc_tokens = [el[1][0] for el in self.doc_token_texts.items()]
    
        plain_text = ' '.join(word for word in doc_tokens)
        clean_text = self.untokenize(doc_tokens, keep_start_url)
        # print(doc_tokens)
        return clean_text

    def get_clusters(self, mentions):
        clusters = {}
        
        for i, mention in enumerate(mentions):
            cluster_id = mention['cluster_id']

            # Create empty list entry if not existent
            if cluster_id not in clusters:
                clusters[cluster_id] = []


            clusters[cluster_id] += mention["tokens_ids"]

        return clusters
        


In [None]:
import random
from google.colab import drive
import pickle
# Mount Google Drive at /content/drive so the project folder below is reachable
# (Colab only; `drive` comes from google.colab).
drive.mount('/content/drive',force_remount=True)

# Make sure to click "Add shortcut to drive" for the "Coref-for-GPT" folder
# Project folder inside the mounted drive
gdrive_dir_path = "/content/drive/MyDrive/Coref-for-GPT"


Mounted at /content/drive


In [None]:
# Project root to use when running outside Colab (left empty here)
local_path = ""

# Change this to "local_path" if you run the notebook locally
# Every data path below is built from root_path.
root_path = gdrive_dir_path

In [None]:
# Path to the ecb data
# Both values end with a slash; later cells append to them by concatenation.
ecb_path = f"{root_path}/Data/ECB+/"
# Directory that load_gold_conll reads the gold mention files from
ecb_gold_path = f"{root_path}/Data/ECB+/gold/"

In [None]:
# Load gold train, dev, test entity dataset
def load_gold_conll(ds_type):
    """Load the gold entity mentions of one split from ``ecb_gold_path``.

    Args:
        ds_type: Split name used in the file name, e.g. "dev" or "train".

    Returns:
        The parsed content of ``<ecb_gold_path>/mentions/<ds_type>_entities.json``.
    """
    # A single f-string: combining an f-string with %-formatting fails as soon
    # as the directory itself contains a '%' character.
    file_path = f"{ecb_gold_path}/mentions/{ds_type}_entities.json"
    with open(file_path, encoding="utf-8") as f:
        return json.load(f)

In [None]:
# Gold entity mentions of the dev and train splits, then their sizes
dev = load_gold_conll("dev")
train = load_gold_conll("train")
print(len(dev), len(train))

1476 4758


In [None]:
def get_info_for_docs(ds, dir_path):
    """Parse every document referenced by ``ds`` and collect its text and mentions.

    Consecutive entries with the same ``doc_id`` are processed once. A document
    that cannot be processed is reported together with the error and left out
    of BOTH returned lists, so the two lists always line up index by index
    (callers zip them together).

    Args:
        ds: Iterable of mention dicts that carry a ``doc_id`` such as
            ``35_10ecb.xml``.
        dir_path: Directory prefix (ending with a slash) that contains the
            ``ECB+/<topic>/<doc_id>`` files.

    Returns:
        ``(docs, info_list)`` where ``info_list[i]`` is the
        ``(text, mentions, clusters)`` tuple of ``docs[i]``.
    """
    docs = []
    # info: (text, mentions, clusters)
    info_list = []
    prev_doc = None

    for doc_info in tqdm(ds):
        doc_name = doc_info["doc_id"]

        # Mentions of one document follow each other; handle the document once,
        # whether or not it could be processed.
        if doc_name == prev_doc:
            continue
        prev_doc = doc_name

        topic, _ = doc_name.split("_")
        doc_path = f"{dir_path}ECB+/{topic}/{doc_name}"
        try:
            # NOTE(review): the topic id is the constant 2 for every document,
            # not the topic parsed from the name - confirm this is intended.
            ecb_mention_doc = ECBMentionsDoc(doc_path, doc_name, 2)
            ecb_mention_doc.parse_xml()
            mentions = ecb_mention_doc.entity_mentions

            clusters = ecb_mention_doc.get_clusters(mentions)
            text = ecb_mention_doc.format_doc_and_mentions(keep_start_url=True)
        except Exception as err:
            # Best effort: skip the document, but say why it was skipped
            print(f"{doc_name}: {err!r}")
            continue

        docs.append(doc_name)
        info_list.append((text, mentions, clusters))

    return docs,info_list

In [None]:
# Build (text, mentions, clusters) for every dev document found under ecb_path
dev_docs, dev_info = get_info_for_docs(dev, ecb_path)
print(len(dev_docs))

100%|██████████| 1476/1476 [00:00<00:00, 9562.99it/s] 

35_10ecb.xml
35_11ecbplus.xml
35_10ecbplus.xml
35_1ecb.xml
35_1ecbplus.xml
35_2ecbplus.xml
35_2ecb.xml
35_3ecbplus.xml
35_3ecb.xml
35_5ecbplus.xml
35_4ecb.xml
35_5ecb.xml
35_4ecbplus.xml
35_7ecb.xml
35_7ecbplus.xml
35_6ecbplus.xml
35_6ecb.xml
35_8ecbplus.xml
35_9ecb.xml
35_8ecb.xml
35_9ecbplus.xml
34_10ecb.xml
34_11ecb.xml
34_10ecbplus.xml
34_13ecb.xml
34_12ecb.xml
34_11ecbplus.xml
34_12ecbplus.xml
34_15ecb.xml
34_1ecb.xml
34_16ecb.xml
34_14ecb.xml
34_2ecb.xml
34_2ecbplus.xml
34_3ecb.xml
34_1ecbplus.xml
34_5ecbplus.xml
34_4ecbplus.xml
34_3ecbplus.xml
34_4ecb.xml
34_6ecb.xml
34_6ecbplus.xml
34_7ecb.xml
34_7ecbplus.xml
34_8ecbplus.xml
34_9ecb.xml
34_8ecb.xml
34_9ecbplus.xml
18_10ecb.xml
18_10ecbplus.xml
18_11ecb.xml
18_11ecbplus.xml
18_13ecb.xml
18_12ecb.xml
18_16ecb.xml
18_1ecbplus.xml
18_14ecb.xml
18_1ecb.xml
18_15ecb.xml
18_3ecb.xml
18_3ecbplus.xml
18_2ecb.xml
18_2ecbplus.xml
18_4ecb.xml
18_5ecbplus.xml
18_5ecb.xml
18_4ecbplus.xml
18_6ecbplus.xml
18_6ecb.xml
18_8ecb.xml
18_7ecbplus.xm




In [None]:
# Pair each document name with its info tuple and store the result as JSON
devset = dict(zip(dev_docs, dev_info))
file_path = ecb_path + "processed/dev_with_original_index.json"
with open(file_path, 'w') as f:
    json.dump(devset,f)

In [None]:
# Use ecb_path, the directory the dev split is read from; no cell of this
# notebook defines `ecb_input_path`.
train_docs, train_info = get_info_for_docs(train, ecb_path)


 22%|██▏       | 1035/4758 [00:00<00:00, 10335.84it/s]

20_11ecbplus.xml
20_1ecb.xml
20_2ecb.xml
20_1ecbplus.xml
20_4ecb.xml
20_3ecbplus.xml
20_2ecbplus.xml
20_3ecb.xml
20_4ecbplus.xml
20_5ecbplus.xml
20_5ecb.xml
20_6ecbplus.xml
20_8ecbplus.xml
20_7ecbplus.xml
20_9ecbplus.xml
32_10ecbplus.xml
32_1ecb.xml
32_11ecbplus.xml
32_1ecbplus.xml
32_2ecb.xml
32_3ecb.xml
32_3ecbplus.xml
32_2ecbplus.xml
32_5ecb.xml
32_4ecbplus.xml
32_5ecbplus.xml
32_4ecb.xml
32_7ecb.xml
32_8ecb.xml
32_6ecb.xml
32_6ecbplus.xml
32_7ecbplus.xml
32_9ecbplus.xml
32_8ecbplus.xml
33_11ecbplus.xml
33_1ecb.xml
33_10ecbplus.xml
33_3ecb.xml
33_2ecb.xml
33_1ecbplus.xml
33_2ecbplus.xml
33_3ecbplus.xml
33_4ecb.xml
33_4ecbplus.xml
33_9ecbplus.xml
33_5ecb.xml
33_5ecbplus.xml
33_8ecbplus.xml
33_6ecbplus.xml
33_7ecbplus.xml
7_10ecbplus.xml
7_11ecb.xml
7_11ecbplus.xml
7_10ecb.xml
7_1ecb.xml
7_1ecbplus.xml
7_2ecbplus.xml
7_2ecb.xml
7_4ecbplus.xml
7_3ecb.xml
7_3ecbplus.xml
7_5ecb.xml
7_7ecb.xml
7_6ecb.xml
7_5ecbplus.xml
7_6ecbplus.xml
7_9ecbplus.xml
7_7ecbplus.xml
7_8ecb.xml
7_8ecbplus.xml

 62%|██████▏   | 2953/4758 [00:00<00:00, 8560.37it/s]

1_15ecbplus.xml
1_18ecbplus.xml
1_19ecb.xml
1_18ecb.xml
1_17ecbplus.xml
1_1ecbplus.xml
1_20ecbplus.xml
1_19ecbplus.xml
1_1ecb.xml
1_3ecb.xml
1_21ecbplus.xml
1_2ecb.xml
1_2ecbplus.xml
1_4ecbplus.xml
1_5ecb.xml
1_3ecbplus.xml
1_4ecb.xml
1_5ecbplus.xml
1_7ecb.xml
1_6ecbplus.xml
1_7ecbplus.xml
1_6ecb.xml
1_8ecb.xml
1_9ecb.xml
1_8ecbplus.xml
1_9ecbplus.xml
28_10ecb.xml
28_10ecbplus.xml
28_11ecb.xml
28_12ecbplus.xml
28_11ecbplus.xml
28_13ecb.xml
28_12ecb.xml
28_2ecb.xml
28_1ecbplus.xml
28_1ecb.xml
28_4ecb.xml
28_3ecb.xml
28_2ecbplus.xml
28_3ecbplus.xml
28_5ecb.xml
28_5ecbplus.xml
28_4ecbplus.xml
28_6ecb.xml
28_6ecbplus.xml
28_7ecbplus.xml
28_9ecb.xml
28_8ecb.xml
28_8ecbplus.xml
28_7ecb.xml
28_9ecbplus.xml
26_10ecb.xml
26_10ecbplus.xml
26_11ecbplus.xml
26_12ecb.xml
26_11ecb.xml
26_13ecb.xml
26_1ecbplus.xml
26_2ecb.xml
26_1ecb.xml
26_2ecbplus.xml
26_4ecb.xml
26_3ecb.xml
26_3ecbplus.xml
26_5ecbplus.xml
26_4ecbplus.xml
26_5ecb.xml
26_7ecb.xml
26_6ecb.xml
26_8ecb.xml
26_8ecbplus.xml
26_6ecbplus.x

100%|██████████| 4758/4758 [00:00<00:00, 8375.48it/s]

4_12ecb.xml
4_13ecb.xml
4_1ecb.xml
4_2ecb.xml
4_2ecbplus.xml
4_3ecb.xml
4_3ecbplus.xml
4_4ecbplus.xml
4_5ecb.xml
4_5ecbplus.xml
4_4ecb.xml
4_7ecb.xml
4_6ecbplus.xml
4_8ecbplus.xml
4_6ecb.xml
4_8ecb.xml
4_9ecb.xml
4_9ecbplus.xml
3_10ecbplus.xml
3_11ecbplus.xml
3_2ecbplus.xml
3_1ecbplus.xml
3_2ecb.xml
3_1ecb.xml
3_3ecb.xml
3_5ecb.xml
3_4ecb.xml
3_3ecbplus.xml
3_4ecbplus.xml
3_5ecbplus.xml
3_6ecb.xml
3_7ecbplus.xml
3_6ecbplus.xml
3_7ecb.xml
3_9ecb.xml
3_8ecb.xml
3_8ecbplus.xml
3_9ecbplus.xml
24_10ecb.xml
24_11ecb.xml
24_12ecb.xml
24_11ecbplus.xml
24_13ecb.xml
24_10ecbplus.xml
24_1ecb.xml
24_14ecb.xml
24_15ecb.xml
24_2ecb.xml
24_4ecb.xml
24_3ecb.xml
24_2ecbplus.xml
24_1ecbplus.xml
24_3ecbplus.xml
24_6ecb.xml
24_5ecb.xml
24_4ecbplus.xml
24_5ecbplus.xml
24_7ecb.xml
24_8ecb.xml
24_7ecbplus.xml
24_6ecbplus.xml
24_8ecbplus.xml
24_9ecbplus.xml
24_9ecb.xml
13_11ecb.xml
13_10ecbplus.xml
13_10ecb.xml
13_11ecbplus.xml
13_12ecbplus.xml
13_12ecb.xml
13_13ecb.xml
13_14ecbplus.xml
13_15ecb.xml
13_13ecbp




In [None]:
# Pair each document name with its info tuple and store the result as JSON
trainset = dict(zip(train_docs, train_info))

file_path = ecb_path + "processed/train_with_original_index.json"
with open(file_path, 'w') as f:
    json.dump(trainset,f)