Skip to content
This repository has been archived by the owner on Apr 5, 2024. It is now read-only.

Commit

Permalink
added support for extracting date and time portions of an ISO datetime
Browse files Browse the repository at this point in the history
  • Loading branch information
richardboyd committed Aug 6, 2019
1 parent 54abc44 commit ed80b48
Show file tree
Hide file tree
Showing 3 changed files with 111 additions and 49 deletions.
72 changes: 45 additions & 27 deletions nlp/algorithms/finder/date_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@
###############################################################################

_VERSION_MAJOR = 0
_VERSION_MINOR = 3
_VERSION_MINOR = 4
_MODULE_NAME = 'date_finder.py'

# set to True to enable debug output
Expand Down Expand Up @@ -237,48 +237,57 @@

# optional sign, four-digit year, two-digit month, two-digit day, dashes
_str_iso_s4y2m2d = r'(?P<sign>[-+]?)' +\
r'(?P<year>' + _str_YY + r')' + r'-' +\
r'(?<!\d)(?P<year>' + _str_YY + r')' + r'-' +\
r'(?P<month>' + _str_MM + r')' + r'-' +\
r'(?P<day>' + _str_DD + r')\b'
r'(?P<day>' + _str_DD + r'(?!\d))'
_regex_iso_2 = re.compile(_str_iso_s4y2m2d)

# four-digit year, two-digit month, two-digit day, fwd slashes
_str_iso_4y2m2d = r'(?<!\d)(?P<year>' + _str_YY + r')' + r'/' +\
r'(?P<month>' + _str_MM + r')' + r'/' +\
r'(?P<day>' + _str_DD + r')\b'
r'(?P<day>' + _str_DD + r'(?!\d))'
_regex_iso_3 = re.compile(_str_iso_4y2m2d)

# two-digit year, two-digit month, two-digit day, dashes
_str_iso_2y2m2d = r'(?<!\d)(?P<year>' + _str_yy + r')' + r'-' +\
r'(?P<month>' + _str_MM + r')' + r'-' +\
r'(?P<day>' + _str_DD + r')\b'
r'(?P<day>' + _str_DD + r'(?!\d))'
_regex_iso_4 = re.compile(_str_iso_2y2m2d)

# ISO datetime format: YYYY-MM-DDTHH:MM:SS.ffffff
# fractional seconds are optional
_str_iso_datetime = _str_iso_s4y2m2d + r'T\d\d:\d\d:\d\d(\.\d+)?'
_regex_iso_datetime = re.compile(_str_iso_datetime)

# all date regexes
_regexes = [
_regex_iso_1, # 0
_regex_iso_2, # 1
_regex_iso_3, # 2
_regex_iso_4, # 3
_regex_1, # 4
_regex_2, # 5
_regex_3, # 6
_regex_4, # 7
_regex_5, # 8
_regex_6, # 9
_regex_7, # 10
_regex_8, # 11
_regex_9, # 12
_regex_10, # 13
_regex_11, # 14
_regex_12, # 15
_regex_13, # 16
_regex_14, # 17
_regex_15, # 18
_regex_16, # 19
_regex_17 # 20
_regex_iso_datetime, # 0
_regex_iso_1, # 1
_regex_iso_2, # 2
_regex_iso_3, # 3
_regex_iso_4, # 4
_regex_1, # 5
_regex_2, # 6
_regex_3, # 7
_regex_4, # 8
_regex_5, # 9
_regex_6, # 10
_regex_7, # 11
_regex_8, # 12
_regex_9, # 13
_regex_10, # 14
_regex_11, # 15
_regex_12, # 16
_regex_13, # 17
_regex_14, # 18
_regex_15, # 19
_regex_16, # 20
_regex_17 # 21
]

# index of the ISO datetime regex in the _regexes array
_ISO_DATETIME_REGEX_INDEX = 0

# match (), {}, and []
_str_brackets = r'[(){}\[\]]'
_regex_brackets = re.compile(_str_brackets)
Expand Down Expand Up @@ -327,6 +336,11 @@ def run(sentence):
iterator = regex.finditer(sentence)
for match in iterator:
match_text = match.group().strip()
if _ISO_DATETIME_REGEX_INDEX == regex_index:
# extract only the date portion
t_pos = match_text.find('T')
assert -1 != t_pos
match_text = match_text[:t_pos]
start = match.start()
end = start + len(match_text)
candidates.append(overlap.Candidate(start, end, match_text, regex))
Expand Down Expand Up @@ -364,7 +378,11 @@ def run(sentence):
for pc in pruned_candidates:

# use the saved regex to match the saved text again
match = pc.regex.match(pc.match_text)
if _regex_iso_datetime == pc.regex:
# match only the date portion
match = _regex_iso_2.match(pc.match_text)
else:
match = pc.regex.match(pc.match_text)
assert match

int_year = EMPTY_FIELD
Expand Down
26 changes: 26 additions & 0 deletions nlp/algorithms/finder/test_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,12 +388,38 @@ def test_time_finder():
if not _run_tests(_MODULE_TIME, test_data):
return False

# ISO datetime YYYY-MM-DDTHH:MM:SS.ffffff (no timezone designator)
test_data = {
'The datetimes are 2016-05-20T11:12:13.12345, and 2016-05-20T11:12:13':[
_TimeResult(text='11:12:13.12345',
hours=11, minutes=12, seconds=13,
fractional_seconds='12345'),
_TimeResult(text='11:12:13',
hours=11, minutes=12, seconds=13)
]
}

if not _run_tests(_MODULE_TIME, test_data):
return False

return True


###############################################################################
def test_date_finder():

# ISO datetime YYYY-MM-DDTHH:MM:SS.ffffff (no timezone designator)
test_data = {
'The datetimes are 2017-06-18T11:12:13.12345, and 2017-06-18T11:12:13':[
_DateResult(text='2017-06-18', year=2017, month=6, day=18),
_DateResult(text='2017-06-18', year=2017, month=6, day=18)
]
}

if not _run_tests(_MODULE_DATE, test_data):
return False


# ISO 8601 8-digit format
test_data = {
'The date 20121128 is in iso_8 format.':[
Expand Down
62 changes: 40 additions & 22 deletions nlp/algorithms/finder/time_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@


_VERSION_MAJOR = 0
_VERSION_MINOR = 2
_VERSION_MINOR = 3
_MODULE_NAME = 'time_finder.py'

# set to True to see debug output
Expand Down Expand Up @@ -376,14 +376,14 @@
r'(?P<gmt_sign>[-+])' + _str_iso_zone_hm + r')'

# note the essential negative lookahead in these
_str_iso_hh_only = r'\b(?P<hours>' + _str_iso_hh + r'(?!\d))' +\
_str_iso_hh_only = r'(?<!\d)(?P<hours>' + _str_iso_hh + r'(?!\d))' +\
r'((?P<gmt_delta>' + _str_iso_zone + r'))?'

_str_iso_hhmm_only = r'\b(?P<hours>' + _str_iso_hh + r')' +\
r'(?P<minutes>' + _str_iso_mm + r'(?!\d))' +\
_str_iso_hhmm_only = r'(?<!\d)(?P<hours>' + _str_iso_hh + r')' +\
r'(?P<minutes>' + _str_iso_mm + r'(?!\d))' +\
r'((?P<gmt_delta>' + _str_iso_zone + r'))?'

_str_iso_hms = r'\b(?P<hours>' + _str_iso_hh + r'):?' +\
_str_iso_hms = r'(?<!\d)(?P<hours>' + _str_iso_hh + r'):?' +\
r'((?P<minutes>' + _str_iso_mm + r')):?' +\
r'((?P<seconds>' + _str_iso_ss + r'))' +\
r'((?P<frac>' + r'\.\d+' + r'))?'
Expand All @@ -394,24 +394,33 @@
_regex_iso_hhmm = re.compile(_str_iso_hhmm_only)
_regex_iso_time = re.compile(_str_iso_time)

# ISO datetime format: YYYY-MM-DDTHH:MM:SS.ffffff
# fractional seconds are optional
_str_iso_datetime = r'(?<!\d-)\d{4}\-\d\d\-\d\dT' + _str_iso_hms
_regex_iso_datetime = re.compile(_str_iso_datetime)

_regexes = [
_regex_iso_hhmm, # 0
_regex_iso_hh, # 1
_regex_iso_time, # 2
_regex_h24ms_with_gmt_delta, # 3
_regex_h24ms_with_timezone, # 4
_regex_h24ms_no_colon, # 5
_regex_h24m_no_colon, # 6
_regex_h12msf_am_pm, # 7
_regex_h12ms_am_pm, # 8
_regex_h12m_am_pm, # 9
_regex_h12_am_pm, # 10
_regex_h24msf, # 11
_regex_h24ms, # 12
_regex_h24m, # 13
_regex_h12m, # 14
_regex_iso_datetime, # 0
_regex_iso_hhmm, # 1
_regex_iso_hh, # 2
_regex_iso_time, # 3
_regex_h24ms_with_gmt_delta, # 4
_regex_h24ms_with_timezone, # 5
_regex_h24ms_no_colon, # 6
_regex_h24m_no_colon, # 7
_regex_h12msf_am_pm, # 8
_regex_h12ms_am_pm, # 9
_regex_h12m_am_pm, # 10
_regex_h12_am_pm, # 11
_regex_h24msf, # 12
_regex_h24ms, # 13
_regex_h24m, # 14
_regex_h12m, # 15
]

# index of the ISO datetime regex in the _regexes array
_ISO_DATETIME_REGEX_INDEX = 0

# match (), {}, and []
_str_brackets = r'[(){}\[\]]'
_regex_brackets = re.compile(_str_brackets)
Expand Down Expand Up @@ -459,7 +468,12 @@ def run(sentence):
for regex_index, regex in enumerate(_regexes):
iterator = regex.finditer(sentence)
for match in iterator:
match_text = match.group()
match_text = match.group().strip()
if _ISO_DATETIME_REGEX_INDEX == regex_index:
# extract only the time portion
t_pos = match_text.find('T')
assert -1 != t_pos
match_text = match_text[t_pos+1:]
if _TRACE:
print('[{0:2}]: MATCH TEXT: ->{1}<-'.
format(regex_index, match_text))
Expand Down Expand Up @@ -496,7 +510,11 @@ def run(sentence):
for pc in pruned_candidates:

# use the saved regex to match the saved text again
match = pc.regex.match(pc.match_text)
if _regex_iso_datetime == pc.regex:
# match only the time portion
match = _regex_iso_time.match(pc.match_text)
else:
match = pc.regex.match(pc.match_text)
assert match

int_hours = EMPTY_FIELD
Expand Down

0 comments on commit ed80b48

Please sign in to comment.