added support for extracting date and time portions of an ISO datetime

ClarityNLP · Aug 6, 2019 · ed80b48
1 parent 54abc44
commit ed80b48
Show file tree

Hide file tree

Showing 3 changed files with 111 additions and 49 deletions.
diff --git a/nlp/algorithms/finder/date_finder.py b/nlp/algorithms/finder/date_finder.py
@@ -90,7 +90,7 @@
 ###############################################################################
 
 _VERSION_MAJOR = 0
-_VERSION_MINOR = 3
+_VERSION_MINOR = 4
 _MODULE_NAME   = 'date_finder.py'
 
 # set to True to enable debug output
@@ -237,48 +237,57 @@
 
 # optional sign, four-digit year, two-digit month, two-digit day, dashes
 _str_iso_s4y2m2d = r'(?P<sign>[-+]?)'                                   +\
-                   r'(?P<year>' + _str_YY + r')' + r'-'                 +\
+                   r'(?<!\d)(?P<year>' + _str_YY + r')' + r'-'                 +\
                    r'(?P<month>' + _str_MM + r')' + r'-'                +\
-                   r'(?P<day>' + _str_DD + r')\b'
+                   r'(?P<day>' + _str_DD + r'(?!\d))'
 _regex_iso_2 = re.compile(_str_iso_s4y2m2d)
 
 # four-digit year, two-digit month, two-digit day, fwd slashes
 _str_iso_4y2m2d = r'(?<!\d)(?P<year>' + _str_YY + r')' + r'/'           +\
                   r'(?P<month>' + _str_MM + r')' + r'/'                 +\
-                  r'(?P<day>' + _str_DD + r')\b'
+                  r'(?P<day>' + _str_DD + r'(?!\d))'
 _regex_iso_3 = re.compile(_str_iso_4y2m2d)
 
 # two-digit year, two-digit month, two-digit day, dashes
 _str_iso_2y2m2d = r'(?<!\d)(?P<year>' + _str_yy + r')' + r'-'           +\
                   r'(?P<month>' + _str_MM + r')' + r'-'                 +\
-                  r'(?P<day>' + _str_DD + r')\b'
+                  r'(?P<day>' + _str_DD + r'(?!\d))'
 _regex_iso_4 = re.compile(_str_iso_2y2m2d)
 
+# ISO datetime format: YYYY-MM-DDTHH:MM:SS.ffffff
+# fractional seconds are optional
+_str_iso_datetime = _str_iso_s4y2m2d + r'T\d\d:\d\d:\d\d(\.\d+)?'
+_regex_iso_datetime = re.compile(_str_iso_datetime)
+
 # all date regexes
 _regexes = [
-    _regex_iso_1,   # 0
-    _regex_iso_2,   # 1
-    _regex_iso_3,   # 2
-    _regex_iso_4,   # 3
-    _regex_1,       # 4
-    _regex_2,       # 5
-    _regex_3,       # 6
-    _regex_4,       # 7
-    _regex_5,       # 8
-    _regex_6,       # 9
-    _regex_7,       # 10
-    _regex_8,       # 11
-    _regex_9,       # 12
-    _regex_10,      # 13
-    _regex_11,      # 14
-    _regex_12,      # 15
-    _regex_13,      # 16
-    _regex_14,      # 17
-    _regex_15,      # 18
-    _regex_16,      # 19
-    _regex_17       # 20
+    _regex_iso_datetime, # 0
+    _regex_iso_1,        # 1
+    _regex_iso_2,        # 2
+    _regex_iso_3,        # 3
+    _regex_iso_4,        # 4
+    _regex_1,            # 5
+    _regex_2,            # 6
+    _regex_3,            # 7
+    _regex_4,            # 8
+    _regex_5,            # 9
+    _regex_6,            # 10
+    _regex_7,            # 11
+    _regex_8,            # 12
+    _regex_9,            # 13
+    _regex_10,           # 14
+    _regex_11,           # 15
+    _regex_12,           # 16
+    _regex_13,           # 17
+    _regex_14,           # 18
+    _regex_15,           # 19
+    _regex_16,           # 20
+    _regex_17            # 21
 ]
 
+# index of the ISO datetime regex in the _regexes array
+_ISO_DATETIME_REGEX_INDEX = 0
+
 # match (), {}, and []
 _str_brackets = r'[(){}\[\]]'
 _regex_brackets = re.compile(_str_brackets)
@@ -327,6 +336,11 @@ def run(sentence):
         iterator = regex.finditer(sentence)
         for match in iterator:
             match_text = match.group().strip()
+            if _ISO_DATETIME_REGEX_INDEX == regex_index:
+                # extract only the date portion
+                t_pos = match_text.find('T')
+                assert -1 != t_pos
+                match_text = match_text[:t_pos]
             start = match.start()
             end = start + len(match_text)
             candidates.append(overlap.Candidate(start, end, match_text, regex))
@@ -364,7 +378,11 @@ def run(sentence):
     for pc in pruned_candidates:
 
         # use the saved regex to match the saved text again
-        match = pc.regex.match(pc.match_text)
+        if _regex_iso_datetime == pc.regex:
+            # match only the date portion
+            match = _regex_iso_2.match(pc.match_text)
+        else:
+            match = pc.regex.match(pc.match_text)
         assert match
 
         int_year  = EMPTY_FIELD

diff --git a/nlp/algorithms/finder/test_finder.py b/nlp/algorithms/finder/test_finder.py
@@ -388,12 +388,38 @@ def test_time_finder():
     if not _run_tests(_MODULE_TIME, test_data):
         return False
 
+    # ISO datetime YYYY-MM-DDTHH:MM:SS.ffffff (fractional seconds optional)
+    test_data = {
+        'The datetimes are 2016-05-20T11:12:13.12345, and 2016-05-20T11:12:13':[
+            _TimeResult(text='11:12:13.12345',
+                        hours=11, minutes=12, seconds=13,
+                        fractional_seconds='12345'),
+            _TimeResult(text='11:12:13',
+                        hours=11, minutes=12, seconds=13)
+        ]
+    }
+
+    if not _run_tests(_MODULE_TIME, test_data):
+        return False
+
     return True
 
 
 ###############################################################################
 def test_date_finder():
 
+    # ISO datetime YYYY-MM-DDTHH:MM:SS.ffffff (fractional seconds optional)
+    test_data = {
+        'The datetimes are 2017-06-18T11:12:13.12345, and 2017-06-18T11:12:13':[
+            _DateResult(text='2017-06-18', year=2017, month=6, day=18),
+            _DateResult(text='2017-06-18', year=2017, month=6, day=18)
+        ]
+    }
+
+    if not _run_tests(_MODULE_DATE, test_data):
+        return False
+
+
     # ISO 8601 8-digit format
     test_data = {
         'The date 20121128 is in iso_8 format.':[

diff --git a/nlp/algorithms/finder/time_finder.py b/nlp/algorithms/finder/time_finder.py
@@ -194,7 +194,7 @@
 
 
 _VERSION_MAJOR = 0
-_VERSION_MINOR = 2
+_VERSION_MINOR = 3
 _MODULE_NAME = 'time_finder.py'
 
 # set to True to see debug output
@@ -376,14 +376,14 @@
                 r'(?P<gmt_sign>[-+])' + _str_iso_zone_hm + r')'
 
 # note the essential negative lookahead in these
-_str_iso_hh_only = r'\b(?P<hours>' + _str_iso_hh + r'(?!\d))'     +\
+_str_iso_hh_only = r'(?<!\d)(?P<hours>' + _str_iso_hh + r'(?!\d))'           +\
                    r'((?P<gmt_delta>' + _str_iso_zone + r'))?'
 
-_str_iso_hhmm_only = r'\b(?P<hours>' + _str_iso_hh + r')'         +\
-                     r'(?P<minutes>' + _str_iso_mm + r'(?!\d))'   +\
+_str_iso_hhmm_only = r'(?<!\d)(?P<hours>' + _str_iso_hh + r')'               +\
+                     r'(?P<minutes>' + _str_iso_mm + r'(?!\d))'              +\
                      r'((?P<gmt_delta>' + _str_iso_zone + r'))?'
 
-_str_iso_hms = r'\b(?P<hours>'  + _str_iso_hh + r'):?'                       +\
+_str_iso_hms = r'(?<!\d)(?P<hours>'  + _str_iso_hh + r'):?'                  +\
                r'((?P<minutes>' + _str_iso_mm + r')):?'                      +\
                r'((?P<seconds>' + _str_iso_ss + r'))'                        +\
                r'((?P<frac>'    + r'\.\d+'   + r'))?'
@@ -394,24 +394,33 @@
 _regex_iso_hhmm = re.compile(_str_iso_hhmm_only)
 _regex_iso_time = re.compile(_str_iso_time)
 
+# ISO datetime format: YYYY-MM-DDTHH:MM:SS.ffffff
+# fractional seconds are optional
+_str_iso_datetime = r'(?<!\d-)\d{4}\-\d\d\-\d\dT' + _str_iso_hms
+_regex_iso_datetime = re.compile(_str_iso_datetime)
+
 _regexes = [
-    _regex_iso_hhmm,             # 0
-    _regex_iso_hh,               # 1
-    _regex_iso_time,             # 2
-    _regex_h24ms_with_gmt_delta, # 3
-    _regex_h24ms_with_timezone,  # 4
-    _regex_h24ms_no_colon,       # 5
-    _regex_h24m_no_colon,        # 6
-    _regex_h12msf_am_pm,         # 7
-    _regex_h12ms_am_pm,          # 8
-    _regex_h12m_am_pm,           # 9
-    _regex_h12_am_pm,            # 10
-    _regex_h24msf,               # 11
-    _regex_h24ms,                # 12
-    _regex_h24m,                 # 13
-    _regex_h12m,                 # 14
+    _regex_iso_datetime,         # 0
+    _regex_iso_hhmm,             # 1
+    _regex_iso_hh,               # 2
+    _regex_iso_time,             # 3
+    _regex_h24ms_with_gmt_delta, # 4
+    _regex_h24ms_with_timezone,  # 5
+    _regex_h24ms_no_colon,       # 6
+    _regex_h24m_no_colon,        # 7
+    _regex_h12msf_am_pm,         # 8
+    _regex_h12ms_am_pm,          # 9
+    _regex_h12m_am_pm,           # 10
+    _regex_h12_am_pm,            # 11
+    _regex_h24msf,               # 12
+    _regex_h24ms,                # 13
+    _regex_h24m,                 # 14
+    _regex_h12m,                 # 15
 ]
 
+# index of the ISO datetime regex in the _regexes array
+_ISO_DATETIME_REGEX_INDEX = 0
+
 # match (), {}, and []
 _str_brackets = r'[(){}\[\]]'
 _regex_brackets = re.compile(_str_brackets)
@@ -459,7 +468,12 @@ def run(sentence):
     for regex_index, regex in enumerate(_regexes):
         iterator = regex.finditer(sentence)
         for match in iterator:
-            match_text = match.group()
+            match_text = match.group().strip()
+            if _ISO_DATETIME_REGEX_INDEX == regex_index:
+                # extract only the time portion
+                t_pos = match_text.find('T')
+                assert -1 != t_pos
+                match_text = match_text[t_pos+1:]
             if _TRACE:
                 print('[{0:2}]: MATCH TEXT: ->{1}<-'.
                       format(regex_index, match_text))
@@ -496,7 +510,11 @@ def run(sentence):
     for pc in pruned_candidates:
 
         # used the saved regex to match the saved text again
-        match = pc.regex.match(pc.match_text)
+        if _regex_iso_datetime == pc.regex:
+            # match only the time portion
+            match = _regex_iso_time.match(pc.match_text)
+        else:
+            match = pc.regex.match(pc.match_text)
         assert match
 
         int_hours         = EMPTY_FIELD