From d72414c3de86689b737976c85c172c8b71670dc2 Mon Sep 17 00:00:00 2001 From: Jenny Chen Date: Tue, 11 Aug 2020 13:43:07 -0700 Subject: [PATCH 1/2] fix date edge cases in i2b2 2006 preprocessing --- .../i2b2_2006_deid/to_conll.py | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/downstream_tasks/i2b2_preprocessing/i2b2_2006_deid/to_conll.py b/downstream_tasks/i2b2_preprocessing/i2b2_2006_deid/to_conll.py index 3ee8544..1ce908f 100644 --- a/downstream_tasks/i2b2_preprocessing/i2b2_2006_deid/to_conll.py +++ b/downstream_tasks/i2b2_preprocessing/i2b2_2006_deid/to_conll.py @@ -16,8 +16,6 @@ phi_tags = re.findall(regex, line) for tag in phi_tags: line = line.replace(tag[0], '__phi__').strip() - - # Walk through sentence phi_ind = 0 for w in line.split(): @@ -29,8 +27,30 @@ for t in toks[1:]: print(t, 'I-%s'%tag) phi_ind += 1 + # Two elif statements check for edge cases with Dates + elif w.startswith('__phi__'): + # examples like following format: + # 01/01/1995 or 01-01-95 + phi = phi_tags[phi_ind] + tag = phi[1] + toks = phi[2].split() + print(toks[0], 'B-%s'%tag) + if w[7:8] == '/' or w[7:8] == '-': + print(w[8:], 'O') # remove the / or - in the year + else: + print(w[7:], 'O') + phi_ind += 1 + elif w.endswith('__phi__'): + # 19950101 + phi = phi_tags[phi_ind] + tag = phi[1] + toks = phi[2].split() + print(w[:-7], 'O') + print(toks[0], 'B-%s'%tag) + phi_ind += 1 else: print(w, 'O') print() + i+=1 From 5916042f22ea99099fcb8cc2bdda5552f83f38fe Mon Sep 17 00:00:00 2001 From: Jenny Chen Date: Tue, 11 Aug 2020 13:47:39 -0700 Subject: [PATCH 2/2] fix syntax --- downstream_tasks/i2b2_preprocessing/i2b2_2006_deid/to_conll.py | 1 + 1 file changed, 1 insertion(+) diff --git a/downstream_tasks/i2b2_preprocessing/i2b2_2006_deid/to_conll.py b/downstream_tasks/i2b2_preprocessing/i2b2_2006_deid/to_conll.py index 1ce908f..083eef1 100644 --- a/downstream_tasks/i2b2_preprocessing/i2b2_2006_deid/to_conll.py +++ b/downstream_tasks/i2b2_preprocessing/i2b2_2006_deid/to_conll.py @@ -16,6 +16,7 @@ phi_tags = re.findall(regex, line) for tag in phi_tags: line = line.replace(tag[0], '__phi__').strip() + # Walk through sentence phi_ind = 0 for w in line.split():