In [7]:
import spacy
import en_core_web_lg
from spacy import displacy
import textacy

In [2]:
# Load the large English NLP model
nlp = en_core_web_lg.load()

# The text we want to examine
text = """London is the capital and most populous city of England and 
the United Kingdom.  Standing on the River Thames in the south east 
of the island of Great Britain, London has been a major settlement 
for two millennia. It was founded by the Romans, who named it Londinium.
"""

In [3]:
# Parse the text with spaCy. This runs the entire pipeline.
doc = nlp(text)

# 'doc' now contains a parsed version of text. We can use it to do anything we want!
# For example, this will print out all the named entities that were detected:
for entity in doc.ents:
    print(f"{entity.text} ({entity.label_})")

London (GPE)
England (GPE)
the United Kingdom (GPE)
the River Thames (LOC)
the south east 
 (LOC)
Great Britain (GPE)
London (GPE)
two millennia (DATE)
Romans (NORP)
Londinium (ORG)


In [43]:
def replace_name_with_placeholder(token):
    if token.ent_iob != 0 and token.ent_type_ == "PERSON":
        return "[REDACTED] "
    else:
        return token.string

# Loop through all the entities in a document and check if they are names
def scrub(text):
    doc = nlp(text)
    for ent in doc.ents:
        ent.merge()
    tokens = map(replace_name_with_placeholder, doc)
    return "".join(tokens)

s = """
In 1950, Alan Turing published his famous article "Computing Machinery and Intelligence". In 1957, Noam Chomsky’s 
Syntactic Structures revolutionized Linguistics with 'universal grammar', a rule based system of syntactic structures.
"""

print(scrub(s))

[REDACTED] [REDACTED] [REDACTED] [REDACTED] [REDACTED] [REDACTED] [REDACTED] [REDACTED] [REDACTED] [REDACTED] [REDACTED] [REDACTED] [REDACTED] [REDACTED] [REDACTED] [REDACTED] [REDACTED] [REDACTED] [REDACTED] [REDACTED] [REDACTED] [REDACTED] [REDACTED] [REDACTED] [REDACTED] [REDACTED] [REDACTED] [REDACTED] [REDACTED] [REDACTED] [REDACTED] [REDACTED] [REDACTED] [REDACTED] [REDACTED] [REDACTED] [REDACTED] 


In [44]:
# The text we want to examine
text = """London is the capital and most populous city of England and  the United Kingdom.  
Standing on the River Thames in the south east of the island of Great Britain, 
London has been a major settlement  for two millennia.  It was founded by the Romans, 
who named it Londinium.
"""

# Parse the document with spaCy
doc = nlp(text)

# Extract semi-structured statements
statements = textacy.extract.semistructured_statements(doc, "London")

# Print the results
print("Here are the things I know about London:")

for statement in statements:
    subject, verb, fact = statement
    print(f" - {fact}")

Here are the things I know about London:
 - the capital and most populous city of England and  the United Kingdom.  

 - a major settlement  for two millennia.  


In [9]:
noun_chunks = textacy.extract.noun_chunks(doc, min_freq=3)

# Convert noun chunks to lowercase strings
noun_chunks = map(str, noun_chunks)
noun_chunks = map(str.lower, noun_chunks)

# Print out any nouns that are at least 2 words long
for noun_chunk in set(noun_chunks):
    if len(noun_chunk.split(" ")) > 1:
        print(noun_chunk)

In [10]:
options = {"compact": True, "bg": "#09a3d5",
           "color": "white", "font": "Source Sans Pro"}

sentence_spans = list(doc.sents)
displacy.render(sentence_spans, style="dep", options=options)

In [11]:
displacy.render(sentence_spans, style="ent")

In [12]:
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

London London PROPN NNP nsubj Xxxxx True False
is be AUX VBZ ROOT xx True True
the the DET DT det xxx True True
capital capital NOUN NN attr xxxx True False
and and CCONJ CC cc xxx True True
most most ADV RBS advmod xxxx True True
populous populous ADJ JJ amod xxxx True False
city city NOUN NN conj xxxx True False
of of ADP IN prep xx True True
England England PROPN NNP pobj Xxxxx True False
and and CCONJ CC cc xxx True True
    SPACE _SP    False False
the the DET DT det xxx True True
United United PROPN NNP compound Xxxxx True False
Kingdom Kingdom PROPN NNP conj Xxxxx True False
. . PUNCT . punct . False False
 
  
 SPACE _SP   
 False False
Standing stand VERB VBG ROOT Xxxxx True False
on on ADP IN prep xx True True
the the DET DT det xxx True True
River River PROPN NNP compound Xxxxx True False
Thames Thames PROPN NNP pobj Xxxxx True False
in in ADP IN prep xx True True
the the DET DT det xxx True True
south south PROPN NNP compound xxxx True False
east east PROPN NNP pobj xxxx Tr

In [27]:
for chunk in doc.noun_chunks:
    if len(chunk.text) > 10:
        print(chunk.text, "---", chunk.root.text,"---", chunk.root.dep_,"---",
                chunk.root.head.text)

the capital --- capital --- attr --- is
most populous city --- city --- conj --- capital
the United Kingdom --- Kingdom --- conj --- capital
the River Thames --- Thames --- pobj --- on
the south east --- east --- pobj --- in
Great Britain --- Britain --- pobj --- of
a major settlement --- settlement --- attr --- been
two millennia --- millennia --- pobj --- for


In [14]:
# document level
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
print(ents)

# token level
ent_san = [doc[0].text, doc[0].ent_iob_, doc[0].ent_type_]
ent_francisco = [doc[1].text, doc[1].ent_iob_, doc[1].ent_type_]
print(ent_san)  # ['San', 'B', 'GPE']
print(ent_francisco)  # ['Francisco', 'I', 'GPE']

[('London', 0, 6, 'GPE'), ('England', 48, 55, 'GPE'), ('the United Kingdom', 61, 79, 'GPE'), ('the River Thames', 95, 111, 'LOC'), ('the south east', 115, 129, 'LOC'), ('Great Britain', 147, 160, 'GPE'), ('London', 163, 169, 'GPE'), ('two', 203, 206, 'CARDINAL'), ('Romans', 241, 247, 'NORP'), ('Londinium', 263, 272, 'ORG')]
['London', 'B', 'GPE']
['is', 'O', '']
