From d72414c3de86689b737976c85c172c8b71670dc2 Mon Sep 17 00:00:00 2001
From: Jenny Chen <t-chenje@microsoft.com>
Date: Tue, 11 Aug 2020 13:43:07 -0700
Subject: [PATCH 1/2] fix date edge cases in i2b2 2006 preprocessing

---
 .../i2b2_2006_deid/to_conll.py                | 24 +++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)
diff --git a/downstream_tasks/i2b2_preprocessing/i2b2_2006_deid/to_conll.py b/downstream_tasks/i2b2_preprocessing/i2b2_2006_deid/to_conll.py
index 3ee8544..1ce908f 100644
--- a/downstream_tasks/i2b2_preprocessing/i2b2_2006_deid/to_conll.py
+++ b/downstream_tasks/i2b2_preprocessing/i2b2_2006_deid/to_conll.py
@@ -16,8 +16,6 @@
         phi_tags = re.findall(regex, line)
         for tag in phi_tags:
             line = line.replace(tag[0], '__phi__').strip()
-
-
         # Walk through sentence
         phi_ind = 0
         for w in line.split():
@@ -29,8 +27,30 @@
                 for t in toks[1:]:
                     print(t, 'I-%s'%tag)
                 phi_ind += 1
+            # The next two elif branches handle date edge cases
+            elif w.startswith('__phi__'):
+                # Handles tokens in formats like the following:
+                # <PHI TYPE="DATE">01/01</PHI>/1995 or <PHI TYPE="DATE">01-01</PHI>-95
+                phi = phi_tags[phi_ind]
+                tag = phi[1]
+                toks = phi[2].split()
+                print(toks[0], 'B-%s'%tag)
+                if w[7:8] == '/' or w[7:8] == '-':
+                    print(w[8:], 'O') # remove the / or - in the year
+                else:
+                    print(w[7:], 'O')
+                phi_ind += 1
+            elif w.endswith('__phi__'):
+                # Example: 1995<PHI TYPE="DATE">0101</PHI>
+                phi = phi_tags[phi_ind]
+                tag = phi[1]
+                toks = phi[2].split()
+                print(w[:-7], 'O')
+                print(toks[0], 'B-%s'%tag)
+                phi_ind += 1
             else:
                 print(w, 'O')
         print()
+        i+=1
 
 

From 5916042f22ea99099fcb8cc2bdda5552f83f38fe Mon Sep 17 00:00:00 2001
From: Jenny Chen <t-chenje@microsoft.com>
Date: Tue, 11 Aug 2020 13:47:39 -0700
Subject: [PATCH 2/2] fix syntax

---
 downstream_tasks/i2b2_preprocessing/i2b2_2006_deid/to_conll.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/downstream_tasks/i2b2_preprocessing/i2b2_2006_deid/to_conll.py b/downstream_tasks/i2b2_preprocessing/i2b2_2006_deid/to_conll.py
index 1ce908f..083eef1 100644
--- a/downstream_tasks/i2b2_preprocessing/i2b2_2006_deid/to_conll.py
+++ b/downstream_tasks/i2b2_preprocessing/i2b2_2006_deid/to_conll.py
@@ -16,6 +16,7 @@
         phi_tags = re.findall(regex, line)
         for tag in phi_tags:
             line = line.replace(tag[0], '__phi__').strip()
+
         # Walk through sentence
         phi_ind = 0
         for w in line.split():