forked from sebneu/geolabelling
-
Notifications
You must be signed in to change notification settings - Fork 0
/
time_tagger.py
46 lines (38 loc) · 1.66 KB
/
time_tagger.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import subprocess
import datetime
from dateutil import parser
import lxml.etree as etree
from xml.sax.saxutils import escape, unescape
import os
def get_heideltime_annotations(value, heideltime_path, language):
    """Run the HeidelTime standalone jar on ``value`` and parse its output.

    The text is written to a temporary file, which is passed to
    ``java -jar de.unihd.dbs.heideltime.standalone.jar -l <language> <file>``
    executed with ``heideltime_path`` as working directory. Whatever the
    process prints on stdout is parsed as XML.

    Args:
        value: Text to annotate. Text strings are encoded as UTF-8; byte
            strings are written unchanged.
        heideltime_path: Directory the Java process is started in (the jar
            is referenced by a relative name, so it must be located there).
        language: Value passed to HeidelTime's ``-l`` option.

    Returns:
        The root element obtained from ``etree.fromstring`` on the process
        output.

    Raises:
        Whatever ``etree.fromstring`` raises when the output is not
        well-formed XML (e.g. when the Java process failed and printed
        nothing), and ``OSError`` if ``java`` cannot be started.
    """
    # Local import so this block does not depend on a change to the
    # file-level import list.
    import tempfile

    # Always write bytes: writing an encoded string to a text-mode file
    # fails on Python 3, and already-encoded input must not be re-encoded.
    data = value if isinstance(value, bytes) else value.encode('utf-8')

    # A unique file per call avoids clashes between concurrent runs; mkstemp
    # returns an absolute path, which is required because the subprocess runs
    # in a different working directory.
    fd, tmp_path = tempfile.mkstemp(prefix='tmp_temporalinfo_', suffix='.txt')
    try:
        with os.fdopen(fd, 'wb') as f:
            f.write(data)
        p = subprocess.Popen(
            ['java', '-jar', 'de.unihd.dbs.heideltime.standalone.jar',
             '-l', language, tmp_path],
            cwd=heideltime_path, stdout=subprocess.PIPE)
        # stderr is not redirected, so it is inherited from this process.
        res, _ = p.communicate()
    finally:
        # Do not leave the temporary input file behind.
        os.remove(tmp_path)
    return etree.fromstring(res)
def get_temporal_information(dist, dataset, heideltime_path='heideltime-standalone', language='GERMAN'):
    """Extract a (start, end) date range from distribution/dataset metadata.

    The metadata texts are tagged one at a time with HeidelTime, in priority
    order: distribution name, distribution description, dataset name,
    dataset description, and finally the comma-joined dataset keywords. The
    first text that yields at least one parsable ``DATE`` annotation
    determines the result.

    Args:
        dist: Mapping with optional ``'name'`` and ``'description'`` entries.
        dataset: Mapping with optional ``'name'``, ``'description'`` and
            ``'keywords'`` (list of strings) entries.
        heideltime_path: Directory containing the HeidelTime standalone jar.
        language: Language name passed on to HeidelTime.

    Returns:
        A tuple ``(start, end)`` of ``'%Y-%m-%d'`` strings holding the
        earliest and latest date found, or ``(None, None)`` when no text
        produced a usable date.
    """
    # `or` also covers keys that are present but explicitly set to None.
    dataset_name = dataset.get('name') or ''
    dataset_description = dataset.get('description') or ''
    keywords = dataset.get('keywords') or []
    dist_name = dist.get('name') or ''
    dist_description = dist.get('description') or ''

    # Without an explicit default, dateutil completes partial dates such as
    # "2010" or "2010-05" from today's date, which makes the result depend on
    # when the code runs. Anchor missing parts to January 1st instead.
    default_date = datetime.datetime(1900, 1, 1)

    # priorities to different sources of datetime information: dist > dataset info
    candidates = [dist_name, dist_description, dataset_name,
                  dataset_description, ', '.join(keywords)]
    for value in candidates:
        # Nothing to tag: avoid launching a JVM for blank text.
        if not value.strip():
            continue

        # Escape XML special characters so the annotated output, which embeds
        # the input text, can be parsed as XML.
        esc_v = escape(value)
        root = get_heideltime_annotations(esc_v, heideltime_path, language)

        dates = []
        for t in root:
            if t.get('type') != 'DATE':
                continue
            v = t.get('value')
            if v is None:
                continue
            try:
                dates.append(parser.parse(v, default=default_date))
            except (ValueError, OverflowError):
                # Values that are not plain calendar dates (week, season or
                # underspecified expressions) cannot be parsed; skip them
                # rather than aborting the whole lookup.
                continue

        if dates:
            start = min(dates).strftime("%Y-%m-%d")
            end = max(dates).strftime("%Y-%m-%d")
            return start, end
    return None, None