In [61]:
import spacy
import re
from spacy import displacy
from collections import Counter
import en_core_web_sm
import rstr
import string

In [43]:
nlp = en_core_web_sm.load()

In [40]:
rstr.xeger(r'\d{3}-\d{2}-\d{4}')

'846-78-3964'

In [59]:
gender_filter_dict = {
    'male ': '',
    'female ': '',
    'he': 'it',
    'she': 'it',
    'his': 'its',
    'hers': 'its'
}

regexes = [
('email_regex', r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)')
, ('ssn_regex', r'\d{3}-\d{2}-\d{4}')
, ('ipv4_regex', r'\d{1,3}[.]\d{1,3}[.]\d{1,3}[.]\d{1,3}')
, ('mastercard_regex', r'(?:5[1-5][0-9]{2}|222[1-9]|22[3-9][0-9]|2[3-6][0-9]{2}|27[01][0-9]|2720)[0-9]{12}')
, ('visacard_regex', r'\b([4]\d{3}[\s]\d{4}[\s]\d{4}[\s]\d{4}|[4]\d{3}[-]\d{4}[-]\d{4}[-]\d{4}|[4]\d{3}[.]\d{4}[.]\d{4}[.]\d{4}|[4]\d{3}\d{4}\d{4}\d{4})\b')
, ('american-express_regex', r'3[47][0-9]{13}(?:\s|$|[^a-z0-9])')
, ('zip-code_regex', r'((\d{5}-\d{4})|(\d{5})|([A-Z]\d[A-Z]\s\d[A-Z]\d))')
]
def filter_text(text, filter_gender=True, filter_dates=True):
    doc = nlp(text)
    print([(X.text, X.label_) for X in doc.ents])
    
    for (name, regex) in regexes:
        text = re.sub('(?:^|\s|[^a-z0-9])' + regex + '(?:\s|$|[^a-z0-9])', " " + rstr.xeger(regex) + " ", text)
    
    if filter_gender:
        for key in gender_filter_dict:
            text = text.replace(key, gender_filter_dict[key])
            text = text.replace(key.capitalize(), gender_filter_dict[key].capitalize())
        
        
    for X in doc.ents:
        if X.label_ is 'PERSON':
            text = text.replace(X.text, 'PERSON')
        
        if X.label_ is 'GPE':
            text = text.replace(X.text, 'LOCATION')
            
        if X.label_ is 'CARDINAL':
            text = text.replace(X.text, rstr.rstr(string.digits, len(X.text)))
            
        if filter_dates:
            if X.label_ is 'DATE':
                text = text.replace(X.text, 'DATE')
    
    return text
            

In [72]:
result = filter_text('Jeffrey is a 23 years old police officer, 11, his zip code is 222-22-2222, he is very good at communicating. He lives in Texas and his credit card number is 12345,')
result

[('Jeffrey', 'PERSON'), ('23 years old', 'DATE'), ('11', 'CARDINAL'), ('222', 'CARDINAL'), ('Texas', 'GPE'), ('12345', 'DATE')]


'PERSON is a DATE police officer, 60, its zip code is 159-39-6862  it is very good at communicating. It lives in LOCATION and its credit card number is 61335-6045 '

In [6]:
result = filter_text('email is ,Deez@nuts.com')
result

[]


'email is email_regex'