# Imports

In [8]:
import spacy
import re
from spacy import displacy
from collections import Counter
import en_core_web_sm
import rstr
import string
import names
import barnum
from random import randint
# Load the spaCy pipeline once at import time; filter_text() calls it on every
# input and reads the named entities (doc.ents) it produces.
nlp = en_core_web_sm.load()

# Data used for detection of private information
The dictionary maps gender-specific words and pronouns to gender-neutral replacements.
The list of tuples pairs a name with a regular expression for each kind of private information to detect.

In [9]:
# Gender-specific words mapped to their gender-neutral replacements.
# Insertion order matters whenever the entries are applied one after another
# as substring replacements: 'female' has to come before 'male', otherwise
# 'female' is rewritten to 'fehuman' before its own entry is ever reached.
gender_filter_dict = {
    'female': 'human',
    'male': 'human',
    ' he ': ' it ',
    'she ': 'it ',
    'his ': 'its ',
    'hers ': 'its ',
    ' him': ' it',
    ' her': ' it',
    ' man': ' person',
}

# (name, pattern) pairs describing private information, in the order they are
# applied. Each pattern describes only the value itself; the surrounding
# delimiters are added by the code that applies the pattern.
regexes = [
    # E-mail addresses.
    ('email_regex', r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)'),
    # Any stand-alone two-digit number (for example an age).
    ('two_digits', r'[0-9]{2}'),
    # Numbers formatted as ddd-dd-dddd.
    ('ssn_regex', r'\d{3}-\d{2}-\d{4}'),
    # Dotted quads; the octet range is not validated.
    ('ipv4_regex', r'\d{1,3}[.]\d{1,3}[.]\d{1,3}[.]\d{1,3}'),
    # 16-digit numbers starting with a 51-55 or 2221-2720 prefix.
    ('mastercard_regex', r'(?:5[1-5][0-9]{2}|222[1-9]|22[3-9][0-9]|2[3-6][0-9]{2}|27[01][0-9]|2720)[0-9]{12}'),
    # 16-digit numbers starting with 4, optionally grouped by space, dash or dot.
    ('visacard_regex', r'\b([4]\d{3}[\s]\d{4}[\s]\d{4}[\s]\d{4}|[4]\d{3}[-]\d{4}[-]\d{4}[-]\d{4}|[4]\d{3}[.]\d{4}[.]\d{4}[.]\d{4}|[4]\d{3}\d{4}\d{4}\d{4})\b'),
    # 15-digit numbers starting with 34 or 37. The pattern no longer embeds a
    # trailing delimiter group: the caller already appends one, so the old
    # pattern demanded two delimiters after the number and produced a stray
    # trailing character when used to generate a replacement.
    ('american-express_regex', r'3[47][0-9]{13}'),
    # ddddd, ddddd-dddd, or the "A1A 1A1" letter/digit form.
    ('zip-code_regex', r'((\d{5}-\d{4})|(\d{5})|([A-Z]\d[A-Z]\s\d[A-Z]\d))'),
]

# Main method
This function takes a string as input and returns that string with everything that might be private information replaced by synthetic data following the same pattern.

In [10]:
def filter_text(text, filter_gender=True):
    """Return ``text`` with potentially private information replaced.

    The text is processed in four passes:

    1. Every pattern in ``regexes`` is replaced by freshly generated data
       (``barnum.create_email()`` for e-mail addresses, ``rstr.xeger`` for
       everything else).
    2. If ``filter_gender`` is true, the words in ``gender_filter_dict`` are
       replaced by their gender-neutral counterparts.
    3. Tokens mixing letters and digits are replaced by a random number of
       4 to 8 digits.
    4. PERSON, TIME, GPE and CARDINAL entities found by the spaCy pipeline
       are replaced by synthetic values.

    Args:
        text: The input string.
        filter_gender: Whether to apply the gender-neutral replacements.

    Returns:
        The filtered string. The synthetic values are random, so the result
        differs from call to call.
    """
    # Entities are detected on the original text, before any substitution.
    doc = nlp(text)

    # Uncomment if you want to see the NLP data
    # print([(X.text, X.label_) for X in doc.ents])

    # First, traverse the regular expressions and replace the data.
    # Lookarounds (instead of consuming the neighbouring characters) keep the
    # surrounding punctuation intact and allow adjacent matches to be replaced.
    for name, regex in regexes:
        pattern = r'(?<![A-Za-z0-9])(?:' + regex + r')(?![A-Za-z0-9])'
        # Strings are compared with ==; `is` tests identity, not equality.
        if name == 'email_regex':
            def synthesize(_match):
                return barnum.create_email()
        else:
            def synthesize(_match, regex=regex):
                return rstr.xeger(regex)
        # A callable replacement yields a fresh value per match and is not
        # parsed as a template, so generated backslashes are harmless.
        text = re.sub(pattern, synthesize, text)

    # Replace the gender specific pronouns with gender neutral pronouns.
    # Keys are matched as whole words so that e.g. "this" or "many" are left
    # alone; the capitalised form covers sentence starts.
    if filter_gender:
        for key, value in gender_filter_dict.items():
            word = key.strip()
            neutral = value.strip()
            for source, target in (
                (word, neutral),
                (word.capitalize(), neutral.capitalize()),
            ):
                text = re.sub(
                    r'\b' + re.escape(source) + r'\b',
                    lambda _match, target=target: target,
                    text,
                )

    # Replace all words containing both letters and numbers with a random
    # number in between 4 and 8 digits (a new number for every word).
    text = re.sub(
        r'((?:[a-zA-Z]+[0-9]|[0-9]+[a-zA-Z])[a-zA-Z0-9]*)',
        lambda _match: rstr.rstr(string.digits, randint(4, 8)),
        text,
    )

    # Replace the relevant named entities with synthetic information.
    for entity in doc.ents:
        label = entity.label_
        if label == 'PERSON':
            generated = barnum.create_name()
            if len(entity.text) > 8:
                # Longer names are assumed to be "first last".
                replacement = generated[0] + " " + generated[1]
            else:
                replacement = generated[0]
        elif label == 'TIME':
            replacement = barnum.create_date(past=True).strftime("%H:%M")
        elif label == 'GPE':
            # NOTE(review): index 1 is presumably the city name - confirm
            # against the barnum documentation.
            replacement = barnum.create_city_state_zip()[1]
        elif label == 'CARDINAL':
            replacement = rstr.rstr(string.digits, len(entity.text))
        else:
            continue
        text = text.replace(entity.text, replacement)

    return text
            

# Example
The string "George Bush is a 22 year old man living in the USA" should be changed to "RANDOM NAME is a XX year old PERSON living in the RANDOM LOCATION"

In [13]:
# Run the filter on a sample sentence and display the result. The replacement
# values are generated randomly, so the output shown below varies per run.
result = filter_text('George Bush is a 22 year old man living in the USA')
result

'Dorian Bryan is a 47 year old person living in the Rockland'