In [22]:
import spacy
from spacy.matcher import Matcher

# Load English tokenizer, tagger, parser and NER
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

pattern = [{"POS": "CARDINAL"},{"POS": "FAC"}] # Trying to identify address pattern
matcher.add("address", [pattern])


# Process whole documents
text = ("My email is dummyemail@gmail.com. It was another early sunset on a rainy day in Seattle. Andrew was walking with a paper bag of groceries back to his downtown studio apartment at 400 Main Street. He had gotten government permission to stay in this apartment as part of the Upbeat program, focusing on bettering drug-affected neighborhoods in King County. It was an artist supplement program, and Andrew was a published haiku poet and teacher of haiku in public education and sometimes colleges. He had to attend an audition for artistic achievements to be able to get the subsidy. Though it was a miracle to live in downtown Seattle for $800 a month, it did have its downsides, with junkies visiting the weekly exhibitions of his haiku and accompanying minimalist art made by his neighbor, Patrick.")
        
doc = nlp(text)
matches = matcher(doc)

print("Matches:", [doc[start:end].text for match_id, start, end in matches])

# Analyze syntax
print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

# Find named entities, phrases and concepts
for entity in doc.ents:
    print(entity.text, entity.label_)

Matches: []
Noun phrases: ['My email', 'It', 'another early sunset', 'a rainy day', 'Seattle', 'Andrew', 'a paper bag', 'groceries', 'his downtown studio apartment', '400 Main Street', 'He', 'government permission', 'this apartment', 'part', 'the Upbeat program', 'drug-affected neighborhoods', 'King County', 'It', 'an artist supplement program', 'Andrew', 'a published haiku poet', 'teacher', 'haiku', 'public education', 'sometimes colleges', 'He', 'an audition', 'artistic achievements', 'the subsidy', 'it', 'a miracle', 'downtown Seattle', 'it', 'its downsides', 'junkies', 'the weekly exhibitions', 'his haiku', 'accompanying minimalist art', 'his neighbor', 'Patrick']
Verbs: ['walk', 'get', 'stay', 'focus', 'better', 'affect', 'publish', 'have', 'attend', 'get', 'live', 'have', 'visit', 'accompany', 'make']
Seattle GPE
400 CARDINAL
Main Street FAC
Upbeat ORG
King County GPE
Andrew PERSON
Seattle GPE
800 MONEY
weekly DATE
Patrick ORG


In [10]:
new_text = ""
persons = []
locations = []
pronouns = ["her","Her","him","Him","he","He","she","She","his","His"]
for entity in doc.ents:
    if (entity.label_ == "PERSON"): persons.append(entity.text)
    if (entity.label_ == "GPE"): locations.append(entity.text)
    if (entity.label_ == "FAC"): 
        x = entity.text.split()
        for y in x:
            locations.append(y)
for token in doc:
    string = str(token.text)
    #print(string)
    if(token.i+1 < len(doc)): 
        next_token = doc[token.i+1]
    if(token.text in persons):
        new_text+="[Name " + str(persons.index(token.text)) + "] "
    elif(token.text.find('@') > -1):
        new_text+="[Email]"
    elif(str(token.text + " " + next_token.text) in persons):
        new_text+="[Name " + str(persons.index(next_token.text)) + "] "
    elif(token.text in pronouns):
        new_text+="[Pronoun] "
    elif(token.text == "man" or token.text == "woman" or token.text == "non-binary"):
        new_text+="person "
    elif(token.text in locations):
        new_text+="[Location] "
    elif(token.pos_ == "CARDINAL" and next_token.text in locations):
        new_text+="[Location] "
    elif(str(token.text + " " + next_token.text) in locations):
        new_text+="[Location] "
    elif((token.pos_ == "AUX" and next_token.text == "n\'t") or (next_token.pos_ == "PUNCT")):
        new_text+=token.text + ""
    else:
        new_text+=token.text + " "
print(new_text)


My email is [Email]. It was another early sunset on a rainy day in [Location] . [Name 0] was walking with a paper bag of groceries back to [Pronoun] downtown studio apartment at 400 [Location] [Location] . [Pronoun] had gotten government permission to stay in this apartment as part of the Upbeat program, focusing on bettering drug- affected neighborhoods in [Location] County. It was an artist supplement program, and [Name 0] was a published haiku poet and teacher of haiku in public education and sometimes colleges. [Pronoun] had to attend an audition for artistic achievements to be able to get the subsidy. Though it was a miracle to live in downtown [Location] for $ 800 a month, it did have its downsides, with junkies visiting the weekly exhibitions of [Pronoun] haiku and accompanying minimalist art made by [Pronoun] neighbor, Patrick.


In [None]:
'''
for token in doc:
    if(token.i+1 < len(doc)): 
        next_token = doc[token.i+1]
    if(token.text in persons):
        new_text+="[Name " + str(persons.index(token.text)) + "] "
    elif(str(token.text + " " + next_token.text) in persons):
        new_text+="[Name " + str(persons.index(next_token.text)) + "] "
    elif((token.text == "her" or token.text == "him") and next_token.pos_ == "PUNCT"):
        new_text+="them "
    elif(token.text == "she" or token.text == "he"):
        new_text+="they "
    elif(token.text == "She" or token.text == "He"):
        new_text+="They "
    elif(token.text == "her" or token.text == "his"):
        new_text+="their "
    elif(token.text == "Her" or token.text == "His"):
        new_text+="Their "
    elif(token.text == "him"):
        new_text+="them "
    elif(token.text == "Him"):
        new_text+="Them "
    elif(token.text == "man" or token.text == "woman"):
        new_text+="person "
    elif(token.text in locations):
        new_text+="[Location] "
    elif(str(token.text + " " + next_token.text) in locations):
        new_text+="[Location] "
    elif((token.pos_ == "AUX" and next_token.text == "n\'t") or (next_token.pos_ == "PUNCT")):
        new_text+=token.text + ""
    else:
        new_text+=token.text + " "
'''