Skip to content

Commit

Permalink
fix date edge cases in i2b2 2006 preprocessing
Browse files Browse the repository at this point in the history
  • Loading branch information
Jenny Chen committed Aug 11, 2020
1 parent 169542d commit d72414c
Showing 1 changed file with 22 additions and 2 deletions.
24 changes: 22 additions & 2 deletions downstream_tasks/i2b2_preprocessing/i2b2_2006_deid/to_conll.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@
phi_tags = re.findall(regex, line)
for tag in phi_tags:
line = line.replace(tag[0], '__phi__').strip()


# Walk through sentence
phi_ind = 0
for w in line.split():
Expand All @@ -29,8 +27,30 @@
for t in toks[1:]:
print(t, 'I-%s'%tag)
phi_ind += 1
# The next two elif branches handle date edge cases where a PHI placeholder is glued to adjacent untagged text
elif w.startswith('__phi__'):
# Handles examples in the following format (tagged date followed directly by untagged text):
# <PHI TYPE="DATE">01/01</PHI>/1995 or <PHI TYPE="DATE">01-01</PHI>-95
phi = phi_tags[phi_ind]
tag = phi[1]
toks = phi[2].split()
print(toks[0], 'B-%s'%tag)
if w[7:8] == '/' or w[7:8] == '-':
print(w[8:], 'O') # remove the / or - in the year
else:
print(w[7:], 'O')
phi_ind += 1
elif w.endswith('__phi__'):
# Handles examples in the following format (untagged text followed directly by a tagged date): 1995<PHI TYPE="DATE">0101</PHI>
phi = phi_tags[phi_ind]
tag = phi[1]
toks = phi[2].split()
print(w[:-7], 'O')
print(toks[0], 'B-%s'%tag)
phi_ind += 1
else:
print(w, 'O')
print()
i+=1


0 comments on commit d72414c

Please sign in to comment.