In [141]:
import re
import sys
phone_pattern ='(\d{3}[-\.\s/]??\d{3}[-\.\s/]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s/]??\d{4})'
age_pattern = r'(["is"]?\d[?<125]{1,3})'
#  , "yo", "years old", "years of age", "yrs of age"
# (?<=125)
# r'\d{1,3}'

# compiling the reg_ex would save sime time!
ph_reg = re.compile(phone_pattern)
age_reg = re.compile(age_pattern);

# TODO for age: age_reg

In [142]:
def check_for_age(patient,note,chunk, output_handle):
    """
    Inputs:
        - patient: Patient Number, will be printed in each occurance of personal information found
        - note: Note Number, will be printed in each occurance of personal information found
        - chunk: one whole record of a patient
        - output_handle: an opened file handle. The results will be written to this file.
            to avoid the time intensive operation of opening and closing the file multiple times
            during the de-identification process, the file is opened beforehand and the handle is passed
            to this function. 
    Logic:
        Search the entire chunk for age occurances. Find the location of these occurances 
        relative to the start of the chunk, and output these to the output_handle file. 
        If there are no occurances, only output Patient X Note Y (X and Y are passed in as inputs) in one line.
        Use the precompiled regular expression to find age.
    """
    # The perl code handles texts a bit differently, 
    # we found that adding this offset to start and end positions would produce the same results
    offset = 27

    # For each new note, the first line should be Patient X Note Y and then all the personal information positions
    output_handle.write('Patient {}\tNote {}\n'.format(patient,note))

    # search the whole chunk, and find every position that matches the regular expression
    # for each one write the results: "Start Start END"
    # Also for debugging purposes display on the screen (and don't write to file) 
    # the start, end and the actual personal information that we found
    for match in age_reg.finditer(chunk):
                
            # debug print, 'end=" "' stops print() from adding a new line
            print(patient, note,end=' ')
            print((match.start()-offset),match.end()-offset, match.group())
                
            # create the string that we want to write to file ('start start end')    
            result = str(match.start()-offset) + ' ' + str(match.start()-offset) +' '+ str(match.end()-offset) 
            
            # write the result to one line of output
            output_handle.write(result+'\n')

In [143]:
def check_for_phone(patient,note,chunk, output_handle):
    """
    Inputs:
        - patient: Patient Number, will be printed in each occurance of personal information found
        - note: Note Number, will be printed in each occurance of personal information found
        - chunk: one whole record of a patient
        - output_handle: an opened file handle. The results will be written to this file.
            to avoid the time intensive operation of opening and closing the file multiple times
            during the de-identification process, the file is opened beforehand and the handle is passed
            to this function. 
    Logic:
        Search the entire chunk for phone number occurances. Find the location of these occurances 
        relative to the start of the chunk, and output these to the output_handle file. 
        If there are no occurances, only output Patient X Note Y (X and Y are passed in as inputs) in one line.
        Use the precompiled regular expression to find phones.
    """
    # The perl code handles texts a bit differently, 
    # we found that adding this offset to start and end positions would produce the same results
    offset = 27

    # For each new note, the first line should be Patient X Note Y and then all the personal information positions
    output_handle.write('Patient {}\tNote {}\n'.format(patient,note))

    # search the whole chunk, and find every position that matches the regular expression
    # for each one write the results: "Start Start END"
    # Also for debugging purposes display on the screen (and don't write to file) 
    # the start, end and the actual personal information that we found
    for match in ph_reg.finditer(chunk):
                
            # debug print, 'end=" "' stops print() from adding a new line
            print(patient, note,end=' ')
            print((match.start()-offset),match.end()-offset, match.group())
                
            # create the string that we want to write to file ('start start end')    
            result = str(match.start()-offset) + ' ' + str(match.start()-offset) +' '+ str(match.end()-offset) 
            
            # write the result to one line of output
            output_handle.write(result+'\n')


In [144]:
def deid_phone(text_path= 'id.text', output_path = 'phone.phi'):
    """
    Inputs: 
        - text_path: path to the file containing patient records
        - output_path: path to the output file.
    
    Outputs:
        for each patient note, the output file will start by a line declaring the note in the format of:
            Patient X Note Y
        then for each phone number found, it will have another line in the format of:
            start start end
        where the start is the start position of the detected phone number string, and end is the detected
        end position of the string both relative to the start of the patient note.
        If there is no phone number detected in the patient note, only the first line (Patient X Note Y) is printed
        to the output
    Screen Display:
        For each phone number detected, the following information will be displayed on the screen for debugging purposes 
        (these will not be written to the output file):
            start end phone_number
        where `start` is the start position of the detected phone number string, and `end` is the detected end position of the string
        both relative to the start of patient note.
    
    """
    # start of each note has the patter: START_OF_RECORD=PATIENT||||NOTE||||
    # where PATIENT is the patient number and NOTE is the note number.
    start_of_record_pattern = '^start_of_record=(\d+)\|\|\|\|(\d+)\|\|\|\|$'

    # end of each note has the patter: ||||END_OF_RECORD
    end_of_record_pattern = '\|\|\|\|END_OF_RECORD$'

    # open the output file just once to save time on the time intensive IO
    with open(output_path,'w+') as output_file:
        with open(text_path) as text:
            # initilize an empty chunk. Go through the input file line by line
            # whenever we see the start_of_record pattern, note patient and note numbers and start 
            # adding everything to the 'chunk' until we see the end_of_record.
            chunk = ''
            for line in text:
                record_start = re.findall(start_of_record_pattern,line,flags=re.IGNORECASE)
                if len(record_start):
                    patient, note = record_start[0]
                chunk += line

                # check to see if we have seen the end of one note
                record_end = re.findall(end_of_record_pattern, line,flags=re.IGNORECASE)

                if len(record_end):
                    # Now we have a full patient note stored in `chunk`, along with patient numerb and note number
                    # pass all to check_for_phone to find any phone numbers in note.
                    check_for_phone(patient,note,chunk.strip(), output_file)
                    # initialize the chunk for the next note to be read
                    chunk = ''

In [145]:
def deid_age(text_path= 'id.text', output_path = 'age-sina-dabiri.phi'):
    """
    Inputs: 
        - text_path: path to the file containing patient records
        - output_path: path to the output file.
    
    Outputs:
        for each patient note, the output file will start by a line declaring the note in the format of:
            Patient X Note Y
        then for each age found, it will have another line in the format of:
            start start end
        where the start is the start position of the detected age string, and end is the detected
        end position of the string both relative to the start of the patient note.
        If there is no age detected in the patient note, only the first line (Patient X Note Y) is printed
        to the output
    Screen Display:
        For each phone number detected, the following information will be displayed on the screen for debugging purposes 
        (these will not be written to the output file):
            start end age
        where `start` is the start position of the detected phone number string, and `end` is the detected end position of the string
        both relative to the start of patient note.
    
    """
    # start of each note has the patter: START_OF_RECORD=PATIENT||||NOTE||||
    # where PATIENT is the patient number and NOTE is the note number.
    start_of_record_pattern = '^start_of_record=(\d+)\|\|\|\|(\d+)\|\|\|\|$'

    # end of each note has the patter: ||||END_OF_RECORD
    end_of_record_pattern = '\|\|\|\|END_OF_RECORD$'

    # open the output file just once to save time on the time intensive IO
    with open(output_path,'w+') as output_file:
        with open(text_path) as text:
            # initilize an empty chunk. Go through the input file line by line
            # whenever we see the start_of_record pattern, note patient and note numbers and start 
            # adding everything to the 'chunk' until we see the end_of_record.
            chunk = ''
            for line in text:
                record_start = re.findall(start_of_record_pattern,line,flags=re.IGNORECASE)
                if len(record_start):
                    patient, note = record_start[0]
                chunk += line

                # check to see if we have seen the end of one note
                record_end = re.findall(end_of_record_pattern, line,flags=re.IGNORECASE)

                if len(record_end):
                    # Now we have a full patient note stored in `chunk`, along with patient numerb and note number
                    
                    # check for age PHI
                    check_for_age(patient,note,chunk.strip(), output_file)
                    # initialize the chunk for the next note to be read
                    # TODO: Replace ages over 90 with 'Age over 90'
                    chunk = ''

In [9]:
pwd

'C:\\Users\\sinad\\OneDrive - Georgia Institute of Technology\\BMI 500 Bio Informatics\\wk8\\deid2021\\python'

In [10]:
deid_phone(text_path= 'C:\\Users\\sinad\\OneDrive - Georgia Institute of Technology\\BMI 500 Bio Informatics\\wk8\\deid2021\\python\\id.text', 
           output_path = 'C:\\Users\\sinad\\OneDrive - Georgia Institute of Technology\\BMI 500 Bio Informatics\\wk8\\deid2021\\python\\phone.phi')

8 1 552 564 201/324/1423
8 1 2296 2308 201-561-8910
8 4 987 999 201-223-4567
17 2 1197 1209 410-322-1419
17 2 1216 1228 603-960-5357
17 2 1235 1247 888-130-8121
17 2 1296 1308 450-928-6612
17 2 1338 1350 830-650-2352
17 2 1356 1368 301-680-6286
17 2 1376 1388 410-164-4517
28 2 1075 1087 285-482-5518
28 2 1104 1116 648-199-1703
41 2 2245 2257 204-943-1045
41 2 2288 2300 989-290-8303
44 4 1193 1205 135-442-9738
44 4 1694 1706 410-422-6213
47 2 431 443 301 944-5032
47 2 468 480 301 343-2822
48 3 961 973 410-555-9876
60 3 1628 1640 410 202-6694
60 3 1648 1660 410 671-9309
70 3 1729 1740 202 2671093
73 1 1766 1777 240444-1243
73 36 1890 1902 301 273 4516
82 2 1374 1386 410 392 0780
82 7 1109 1121 301-152-9058
96 1 1065 1076 202232-4455
139 1 1550 1562 858-492-5403
139 1 1567 1579 858-789-7896
139 1 1602 1614 415-999-8604


In [146]:
# f = open("stats-sina-dabiri.txt", "w")
deid_age(text_path= 'C:\\Users\\sinad\\OneDrive - Georgia Institute of Technology\\BMI 500 Bio Informatics\\wk8\\deid2021\\python\\id.text',
         output_path = 'C:\\Users\\sinad\\OneDrive - Georgia Institute of Technology\\BMI 500 Bio Informatics\\wk8\\deid2021\\python\\age-sina-dabiri.phi')
# f.close()
# ToDO: store in >> stats-sina-dabiri.txt

1 1 194 196 92
1 1 335 337 22
1 1 758 760 21
1 3 25 27 32
1 3 51 53 11
1 4 167 169 92
1 4 531 533 11
1 5 24 26 12
1 5 315 318 911
1 5 530 532 25
1 5 540 542 32
1 5 2298 2300 22
1 6 37 39 92
1 6 56 58 95
1 6 710 712 01
1 7 717 719 12
1 7 996 998 95
1 8 438 440 31
1 8 562 564 15
1 8 586 588 22
1 8 591 593 25
1 8 600 602 72
1 8 603 605 35
1 8 726 728 85
1 8 730 732 01
1 8 1126 1128 12
1 9 479 481 65
1 9 498 500 72
1 9 682 684 72
1 9 755 757 62
1 9 948 950 32
1 9 1009 1011 75
1 9 1074 1076 91
1 9 1121 1123 91
1 9 1158 1160 91
1 9 1161 1163 42
1 9 1169 1171 25
1 10 31 34 115
1 10 59 61 32
1 10 134 136 61
1 10 195 197 55
1 10 291 293 92
1 10 536 538 32
1 10 542 544 21
1 10 835 837 75
1 11 -6 -4 11
1 11 145 147 15
1 11 201 203 71
1 11 212 214 91
1 11 216 218 02
1 11 219 221 45
1 11 222 224 52
1 11 242 244 31
1 11 503 505 21
1 11 645 648 112
1 12 -6 -4 12
1 12 42 44 92
1 12 45 47 95
1 12 53 55 15
1 12 56 58 22
1 12 105 107 41
1 12 147 149 61
1 12 164 166 71
1 12 194 196 25
1 12 197 199 31
1 12

In [13]:
import re

exampleString = '''
Jessica is 15 years old, and Daniel is 27 years old.
Edward is 97 years old, and his grandfather, Oscar, is 102. 
'''

In [134]:
# TODO: add prefix and suffix 

# Age indicators that precede ages
age_prefix = ["age", "he is", "she is", "patient is"]
# Age indicators that follow ages
age_suffix = ["years old", "y\. o\.", "y\.o\.", "yo", "y", "years old", "year-old", "-year-old", "years-old", "-years-old", "years of age", "yrs of age"]

ages = re.findall(r'["is"]?\d{1,3}?["years old" , "."]',exampleString)

print(ages)

['15 ', '27 ', '97 ', '102.']


In [15]:
for i in range(1,len(ages)):
    if int(ages[i])> 90:
        ages[i] = 'age over 90'
        
print(ages)

['15', '27', 'age over 90', 'age over 90']
