In [1]:
import codecs

# Read every line of the corpus file, decoded as UTF-8 (line endings are kept)
with codecs.open("ned.testb.txt", mode="r", encoding="utf-8") as corpus_file:
    content = corpus_file.readlines()

In [2]:
# Remove line endings.
# codecs.open reads the file in binary mode and does not translate newlines,
# so Windows-style files still carry a trailing '\r'. Strip both characters;
# otherwise blank separator lines would not compare equal to ''.
content = [line.rstrip('\r\n') for line in content]

In [3]:
# Drop every line that carries the -DOCSTART- document boundary marker
content = [row for row in content if row.find('-DOCSTART-') == -1]

In [4]:
# Split the content into sentences (an empty line ends a sentence)

sentences = []
current_sentence = []

for line in content:
    if line == '':
        # Several blank lines in a row (e.g. around removed -DOCSTART- lines)
        # must not yield empty sentences.
        if current_sentence:
            sentences.append(current_sentence)
            current_sentence = []
    else:
        current_sentence.append(line)

# Flush the last sentence when the file does not end with a blank line
if current_sentence:
    sentences.append(current_sentence)
    current_sentence = []

In [5]:
def format_sentence(sentence):
    """Build the plain text of a sentence and locate its location tokens.

    Parameters
    ----------
    sentence : list of str
        The token lines of one sentence. The first space-separated field of
        each line is taken as the word; a line containing ``B-LOC`` or
        ``I-LOC`` marks that word as a toponym.

    Returns
    -------
    dict
        ``{'text': str, 'toponyms': list}``. ``text`` is the words joined by
        single spaces. Each toponym is a list ``[word, start, end]`` with
        character offsets into ``text`` such that ``text[start:end] == word``.
    """
    words = []
    toponyms = []

    # Character offset in the final text at which the next word will start
    offset = 0

    for line in sentence:
        word = line.split(' ')[0]

        # Record the offsets while the text is assembled. Searching the text
        # afterwards is unreliable: it can hit the same string inside an
        # earlier word, and cutting matches out shifts all later positions.
        if 'B-LOC' in line or 'I-LOC' in line:
            toponyms.append([word, offset, offset + len(word)])

        words.append(word)
        # +1 accounts for the single space inserted by the join below
        offset += len(word) + 1

    return {'text': ' '.join(words), 'toponyms': toponyms}

In [6]:
# Convert each sentence into its {'text', 'toponyms'} record
formatted_content = list(map(format_sentence, sentences))

In [7]:
# Keep only the records that contain at least one toponym
subset = [record for record in formatted_content if len(record['toponyms']) > 0]

In [10]:
# Discard records whose text has fewer than four whitespace-separated words
subset = [record for record in subset if len(record['text'].split()) > 3]

In [11]:
# Serialise the filtered records to XML with the layout
#   entries > entry > (text, toponyms > toponym > (phrase, start, end, place))

import xml.etree.ElementTree as ET

entries = ET.Element('entries')

for record in subset:
    entry_node = ET.SubElement(entries, 'entry')
    ET.SubElement(entry_node, 'text').text = record['text']

    toponyms_node = ET.SubElement(entry_node, 'toponyms')

    for item in record['toponyms']:
        toponym_node = ET.SubElement(toponyms_node, 'toponym')
        ET.SubElement(toponym_node, 'phrase').text = item[0]
        ET.SubElement(toponym_node, 'start').text = str(item[1])
        ET.SubElement(toponym_node, 'end').text = str(item[2])
        # The <place> element is written empty
        ET.SubElement(toponym_node, 'place')

tree = ET.ElementTree(entries)
tree.write('testing.xml')

