# Purpose

The purpose of this file is to contain all of the functions relating to detecting PHIs and their types within given datasets as well as to call fake_PHI_generation.ipynb's functions and prepare for file creation. 

# Importing necessary libraries

In [None]:
import ipynb #for importing other iPython notebooks
import re #for detecting PHIs and their types

## For debugging purposes only

In order to test that the generator function calls are working, fake_PHI_generation.ipynb must be imported into this file. However, this shouldn't be done when main.ipynb runs this file, so there will be an if statement gate that should be set to False when not debugging.

## Importing required iPython notebooks

In [None]:
%run fake_PHI_generation.ipynb #for functions relating to generating synthetic PHIs

# Defining a function that detects PHIs and their types

This function will use regular expressions to find PHI markers within Philter's output files (the original ones, with brackets and stars to denote PHI locations) as well as what types of PHI each of them are (name, date, etc.).

Currently, a new random PHI is generated for every instance that is detected. For example, every name is randomized independently, even when the PHI tag in the MIMIC dataset is identical to an earlier one. This ensures the utmost security, as any real names that slip through will be indistinguishable from the randomly generated ones.

In [None]:
def detect_text_PHIs(text):
    """Locate and classify every PHI tag and trouble character in one note.

    PHI locations are marked as "[** ... **]" in the de-identified text. Each
    tag is classified from the lower-cased text between the markers. The
    characters &, <, >, " and ' are reported as well (type
    "trouble_character") so that they can be escaped later on.

    Args:
        text: the text of a single clinical note (already de-identified).

    Returns:
        A tuple (text_PHIs, unrecognized).

        text_PHIs is a list with one dictionary per match, in order of
        appearance:
            "location":  match.span() (tuple with the start and end of the match)
            "type":      "date", "name", "holiday", "contact", "ID", "age",
                         "location", "trouble_character" or "unrecognized"
            "subtype":   finer category within the type, or None
            "modifiers": dictionary with extra details (may be empty)
            "text":      lower-cased text of the PHI without "[**" and "**]",
                         or the trouble character itself

        unrecognized is a list with the text of every tag of the whole note
        that could not be classified (for debugging).
    """

    #all patterns are built once per call instead of once per match

    #for detecting all instances where "[** ... **]" exists - these are where PHIs are located (also has to detect trouble characters)
    pattern = re.compile(r'''\[\*\*.*?\*\*\]|&|<|>|"|''' + "'")

    #dates
    date_pattern = re.compile(r'[0-9]+-[0-9]+-[0-9]+|[0-9]+/[0-9]+/[0-9]+') #YYYY-MM-DD and similar
    second_date_pattern = re.compile(r'[0-9]+-[0-9]+|[0-9]+/[0-9]+') #MM-DD and similar
    third_date_pattern = re.compile('date|month|year|january|february|march|april|may|june|july|august|september|october|november|december', re.IGNORECASE) #date ranges, "month (only)", "year (only)", and month names
    fourth_date_pattern = re.compile(r'[0-9][0-9][0-9][0-9]') #YYYY (only checked at the start of the tag text)
    date_range_pattern = re.compile('range', re.IGNORECASE)

    #names
    name_pattern = re.compile('name', re.IGNORECASE)
    female_pattern = re.compile('female', re.IGNORECASE)
    male_pattern = re.compile('male', re.IGNORECASE)
    first_name_pattern = re.compile('first', re.IGNORECASE)
    last_name_pattern = re.compile('last', re.IGNORECASE)

    #holidays
    holiday_pattern = re.compile('holiday', re.IGNORECASE)

    #ages (over 90 only)
    age_pattern = re.compile('age', re.IGNORECASE)

    #locations
    hospital_pattern = re.compile('hospital|ward|unit', re.IGNORECASE)
    ward_pattern = re.compile('ward', re.IGNORECASE)
    address_pattern = re.compile('address', re.IGNORECASE)
    number_pattern = re.compile('number', re.IGNORECASE)

    #ordered (pattern, subtype) tables - the first pattern that matches decides the subtype
    contact_subtypes = [
        (re.compile('email', re.IGNORECASE), "email"),
        (re.compile('info', re.IGNORECASE), "contact"), #extremely vague tag
        (re.compile('phone', re.IGNORECASE), "phone"), #should usually catch both phone and fax numbers
        (re.compile('pager', re.IGNORECASE), "pager"),
        (re.compile('url', re.IGNORECASE), "url"),
    ]
    id_subtypes = [
        (re.compile('numeric.*?identifier', re.IGNORECASE), "numeric_identifier"),
        (re.compile('social.*?security', re.IGNORECASE), "social_security_number"),
        (re.compile('provider', re.IGNORECASE), "provider_number"),
        (re.compile('medical.*?record', re.IGNORECASE), "medical_record_number"),
        (re.compile('md.*?number', re.IGNORECASE), "md_number"),
        (re.compile('job.*?number', re.IGNORECASE), "job_number"),
        (re.compile('clip.*?number', re.IGNORECASE), "clip_number"),
    ]
    location_subtypes = [
        (hospital_pattern, "hospital"),
        (re.compile('home|address|zip.*?code|state|country', re.IGNORECASE), "home"),
        (re.compile('work|university', re.IGNORECASE), "work"),
        (re.compile('location', re.IGNORECASE), "unknown"),
        (re.compile('p.*?o.*?.*?box', re.IGNORECASE), "other"),
    ]

    def first_label(candidate, table):
        #return the label of the first pattern in the table found in candidate, or None
        for table_pattern, label in table:
            if table_pattern.search(candidate):
                return label
        return None

    #list to track all of the PHIs within a single clinical note
    text_PHIs = []

    #list of unrecognized PHIs (for debugging) - collected over the whole note, so it must not be reset per match
    unrecognized = []

    #searching through all PHIs for their location, type, and subtype
    for match in pattern.finditer(text):
        matched = match.group()

        #a match is either a whole "[** ... **]" tag or a single trouble character
        if matched.startswith("[**"):
            test_text = matched[3:-3].lower() #the start and end markers are 3 characters long each
        else:
            test_text = matched

        subtype = None
        modifiers = {}

        #the order of the checks below matters: the first category that matches wins

        #dates (subtypes: range)
        if (date_pattern.search(test_text)
                or second_date_pattern.search(test_text)
                or third_date_pattern.search(test_text)
                or fourth_date_pattern.match(test_text)):
            TYPE = "date"
            if date_range_pattern.search(test_text):
                subtype = "range"

        #names (modifiers: gender, format)
        elif name_pattern.search(test_text) and not hospital_pattern.search(test_text):
            TYPE = "name"

            #"female" has to be tested first because it contains "male"
            if female_pattern.search(test_text):
                modifiers["gender"] = "female"
            elif male_pattern.search(test_text):
                modifiers["gender"] = "male"
            else:
                modifiers["gender"] = None

            #checking whether the name tag is only for a first or last name
            if first_name_pattern.search(test_text):
                modifiers["format"] = "first"
            elif last_name_pattern.search(test_text):
                modifiers["format"] = "last"
            else:
                modifiers["format"] = None

        #holidays (no subtypes)
        elif holiday_pattern.search(test_text):
            TYPE = "holiday"

        #contact information (subtypes: email, contact, phone, pager, url)
        elif first_label(test_text, contact_subtypes) is not None:
            TYPE = "contact"
            subtype = first_label(test_text, contact_subtypes)

        #identification (subtypes: see id_subtypes)
        elif first_label(test_text, id_subtypes) is not None:
            TYPE = "ID"
            subtype = first_label(test_text, id_subtypes)

        #age (no subtypes)
        elif age_pattern.search(test_text):
            TYPE = "age"

        #locations (subtypes: hospital, home, work, unknown, other; modifiers: hospital_subtype, location_type)
        elif first_label(test_text, location_subtypes) is not None:
            TYPE = "location"
            subtype = first_label(test_text, location_subtypes)

            if subtype == "hospital":
                #NOTE: every hospital tag that does not mention a ward is labelled "unit"
                if ward_pattern.search(test_text):
                    modifiers["hospital_subtype"] = "ward"
                else:
                    modifiers["hospital_subtype"] = "unit"

            #checking location category modifiers (left unset when none applies)
            if name_pattern.search(test_text):
                modifiers["location_type"] = "name"
            elif address_pattern.search(test_text):
                modifiers["location_type"] = "address"
            elif number_pattern.search(test_text):
                modifiers["location_type"] = "number"

        elif test_text in ["&", "<", ">", '"', "'"]:
            TYPE = "trouble_character"

        #unrecognized (still added to the note's PHI list, and also reported separately as a sanity check)
        else:
            TYPE = "unrecognized"
            unrecognized.append(test_text)

        #------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #if there is a desire to add functionality to make synthetic PHI's consistent across multiple notes as well as occurrences within the text, the code for that should probably be added here
        #------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

        #add the PHI to the list for this text's PHIs (needed no matter what, as the text needs to have PHIs substituted)
        text_PHIs.append({
            "location": match.span(),
            "type": TYPE,
            "subtype": subtype,
            "modifiers": modifiers,
            "text": test_text,
        })

    return text_PHIs, unrecognized

## Defining a function that substitutes synthetic PHIs into the original text

This function will handle figuring out which PHIs need to be replaced and substituting synthetic versions of them into the text.

In addition to returning the text with the synthetic PHIs added in, the function will also return a list of dictionaries containing information about the PHIs it substituted (so that the XML check file can be created).

In [None]:
def substitute_PHIs(text, text_PHIs):
    """Replace every detected PHI / trouble character in text.

    Args:
        text: the note the PHIs were detected in.
        text_PHIs: list of PHI dictionaries with the keys "location", "type",
            "subtype", "modifiers" and "text". The locations refer to the
            original text and are expected in order of appearance, since a
            running shift is used to translate them to the edited text.

    Returns:
        A tuple (text, PHI_to_tag). text is the note with all substitutions
        applied. PHI_to_tag is a list with one dictionary per substituted
        date, age, name or ID whose subtype is not "range" (keys "type",
        "comment", "end", "start", "text"); "start" and "end" are positions
        in the returned text.

    Substitution rules:
        - dates that are not ranges: the tag is replaced by its own text
        - empty tags: removed
        - holidays and unrecognized tags: left as "[**text**]"
        - trouble characters: replaced by their XML entity
        - everything else: generator(type, subtype, modifiers, text)
    """

    #how much the string has changed in length so far, so that original locations can be mapped to the edited text
    shift = 0

    PHI_to_tag = []

    #regular expressions needed to determine whether the area around a PHI is uppercase or not
    PHI_pattern = re.compile(r"\[\*\*.*?\*\*\]")
    capitalized_pattern = re.compile("[A-Z]")
    total_pattern = re.compile("[a-zA-Z]")

    xml_entities = {"&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&apos;"}

    context_width = 10 #number of characters inspected on each side of the PHI
    upper_threshold = 0.75 #share of capital letters from which the surroundings count as uppercase

    for PHI in text_PHIs:
        original_start, original_end = PHI["location"]
        start = original_start + shift
        stop = original_end + shift

        #determine whether the area of text around the PHI is capitalized or not
        text_is_upper = False
        if PHI["type"] != "trouble_character":
            #the start is clamped at 0 - a negative index would make the slice count from the end of the text
            surroundings = text[max(0, start - context_width): stop + context_width]

            #removing PHI tags since they are typically lowercase
            surroundings = PHI_pattern.sub("", surroundings)

            letters = len(total_pattern.findall(surroundings))
            if letters != 0:
                capitals = len(capitalized_pattern.findall(surroundings))
                text_is_upper = capitals / letters >= upper_threshold

        #generate a synthetic version of the PHI, ignoring dates, holidays, and unrecognized PHI (which we can't do anything about)
        if PHI["type"] == "date" and PHI["subtype"] != "range":
            synthetic = PHI["text"]
        elif PHI["text"] == "":
            synthetic = ""
        elif PHI["type"] == "holiday" or PHI["type"] == "unrecognized":
            synthetic = "[**" + PHI["text"] + "**]"
        elif PHI["type"] == "trouble_character":
            #an unexpected character is kept as it is instead of reusing a stale value
            synthetic = xml_entities.get(PHI["text"], PHI["text"])
        else:
            synthetic = generator(PHI["type"], PHI["subtype"], PHI["modifiers"], PHI["text"])

        if text_is_upper:
            #str() first, in case the generated value is not a string
            synthetic = str(synthetic).upper()

        replacement = str(synthetic)

        #substitute the synthetic PHI into the text
        text = text[:start] + replacement + text[stop:]

        #altering the shift variable according to the original and new lengths of the PHI
        shift += len(replacement) - (original_end - original_start)

        #end position of the synthetic PHI in the edited text
        end = original_end + shift

        if PHI["type"] in ["date", "age", "name", "ID"] and PHI["subtype"] != "range":
            #add an entry to PHI_to_tag (for XML file creation purposes) - this might seem like a strange ordering, but this is based on the order in the example XML files
            PHI_to_tag.append({
                "type": (PHI["type"], PHI["subtype"], PHI["modifiers"]),
                "comment": "",
                "end": end,
                "start": start,
                "text": synthetic
            })

    return text, PHI_to_tag