In [6]:
import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")

In [7]:
# show an example 
doc = nlp(u"the avengers is better than transformers in the storyline")
print([(w.text, w.pos_) for w in doc])
displacy.render(doc, style= 'dep', jupyter = True, options={'distance':90})

[('the', 'DET'), ('avengers', 'NOUN'), ('is', 'VERB'), ('better', 'ADJ'), ('than', 'ADP'), ('transformers', 'NOUN'), ('in', 'ADP'), ('the', 'DET'), ('storyline', 'NOUN')]


In [5]:
# # ‘nlp’ already apply dependency parsing on all the movie review
doc = open('movies_com.txt').read()
doc = nlp(doc)

In [16]:
# let us take a look 
dp = [sent for sent in doc.sents if 'avengers' in sent.string.lower()]
sentence = dp[10] 
for word in sentence:
    print(word,':', str(list(word.children)))

the : []
directing : [the, of]
of : [War]
Avengers : []
Infinify : []
War : [Avengers, Infinify]
is : [directing, straightforward, .]
more : []
straightforward : [more, than]
than : [Endgame]
the : []
Avengers : []
Endgame : [the, Avengers]
. : [
]

 : []


In [177]:
all_tags = {w.pos: w.pos_ for w in document}

In [182]:
for word in list(doc.sents)[0]:  
    print(word, word.tag_, word.ancestors)

Captain NNP <generator object at 0x000001FCFD83B048>
America NNP <generator object at 0x000001FCFD83B048>
Civil NNP <generator object at 0x000001FCFD83B048>
War NNP <generator object at 0x000001FCFD83B048>
has VBZ <generator object at 0x000001FCFD83B048>
a DT <generator object at 0x000001FCFD83B048>
more RBR <generator object at 0x000001FCFD83B048>
tedious JJ <generator object at 0x000001FCFD83B048>
stories NNS <generator object at 0x000001FCFD83B048>
than IN <generator object at 0x000001FCFD83B048>
Spider NNP <generator object at 0x000001FCFD83B048>
Man NNP <generator object at 0x000001FCFD83B048>
, , <generator object at 0x000001FCFD83B048>
superheroes NNS <generator object at 0x000001FCFD83B048>
are VBP <generator object at 0x000001FCFD83B048>
a DT <generator object at 0x000001FCFD83B048>
waste NN <generator object at 0x000001FCFD83B048>
of IN <generator object at 0x000001FCFD83B048>
time NN <generator object at 0x000001FCFD83B048>
always RB <generator object at 0x000001FCFD83B048>


In [41]:
#一些参数定义
noisy_pos_tags = ['PROP'
min_token_length = 2

#检查 token 是不是噪音的函数
def isNoise(token):     
    is_noise = False
    if token.pos_ in noisy_pos_tags:
        is_noise = True
    elif token.is_stop == True:
        is_noise = True
    elif len(token.string) <= min_token_length:
        is_noise = True
    return is_noise
def cleanup(token, lower = True):
    if lower:
       token = token.lower()
    return token.strip()



In [42]:
from collections import Counter
cleaned_list = [cleanup(word.string) for word in document if not isNoise(word)]
Counter(cleaned_list) .most_common(5)


[('avengers', 290),
 ('war', 199),
 ('infinity', 144),
 ('endgame', 143),
 ('better', 74)]

In [55]:
labels = set([w.label_ for w in document.ents])
for label in labels:
    entities = [cleanup(e.string, lower=False) for e in document.ents if label==e.label_]
    entities = list(set(entities))
    print(label,entities)


WORK_OF_ART ['The Winter Soldier', 'X-Men Apocalypse', 'The Winter Soldier and Ragnarok', 'the Doctor Strange', 'Avengers Infinity War', 'the Avengers Endgame']
TIME ['3 and half hours']
ORDINAL ['second', 'first']
EVENT ['the Avengers Infinity War', 'Captain America Civil War', 'Civil War', 'America Civil War', 'The Avengers Infinity War', 'Avengers Infinity War', 'the Doctor Strange', 'Infinity War', 'Avengers Infinity War to Avengers Endgame']
NORP ['Marvel', 'Better', 'Guardians']
PERSON ["Captain Marvel's", 'Avengers Endgame', 'Matrix', 'Forrest Gump', 'Strange', 'Avengers Age', 'Captain Marvel', 'Netflix John Wick 3', 'Team Thor', 'Marvel', 'America Civil', 'Stan Lee']
LAW ['the Doctor Strange']
ORG ['Avengers Infinity War to Avengers Endgame', 'the Avengers Infinity War', 'Avengers Infinity', 'Ultron', 'Avengers Infinify War', 'Team Thorn', 'Avengers Infinity War and Avengers Endgame', 'Avengers Infinity War', 'Blade', "Avengers Infinity War's", 'the Captain America Civil War', 

In [56]:
Avengers = [sent for sent in document.sents if 'avengers' in sent.string.lower()]

In [57]:
sentence = Avengers[2] 
for word in sentence:
    print(word, ': ', str(list(word.children)))

the :  []
directing :  [the, of]
of :  [War]
Avengers :  []
Infinify :  []
War :  [Avengers, Infinify]
is :  [directing, straightforward, .]
more :  []
straightforward :  [more, than]
than :  [Endgame]
the :  []
Avengers :  []
Endgame :  [the, Avengers]
. :  [
]

 :  []


In [48]:
def pos_words (sentence, token, ptag):
    sentences = [sent for sent in sentence.sents if token in sent.string]     
    pwrds = []
    for sent in sentences:
        for word in sent:
            for character in word.string:
                   pwrds.extend([child.string.strip() for child in word.children
                                                      if child.pos_ == ptag] )
    return Counter(pwrds).most_common(10)

pos_words(document, 'Infinity War', 'ADJ')


[('better', 126),
 ('more', 63),
 ('much', 38),
 ('best', 27),
 ('surprising', 26),
 ('final', 22),
 ('latest', 22),
 ('powerful', 21),
 ('exciting', 20),
 ('good', 18)]

In [None]:
# 寻找token 为 better 主语为 电影名称的句子
def pos_words (sentence, token, ptag):
    sentences = [sent for sent in sentence.sents if token in sent.string]     
    pwrds = []
    for sent in sentences:
        for word in sent:
            for character in word.string:
                   pwrds.extend([child.string.strip() for child in word.children
                                                      if child.pos_ == ptag] )
    return Counter(pwrds).most_common(10)

pos_words(document, 'Infinity War', 'ADJ')

In [111]:
document = nlp(u'Captain America Civil War definitely is better than spider-man, well, a litter brother. ')
verbs = []
nsubj = []

for possible_verb in document:
    if possible_verb.pos == VERB:
        for possible_subject in possible_verb.children:
            if possible_subject.dep == nsubj:
                verbs.append(possible_verb)

In [146]:
document = nlp(u'Captain America Civil War definitely is better than spider-man, well, a litter brother. ')

In [119]:
for token in document:
    print(token.dep_)

compound
compound
compound
nsubj
advmod
ROOT
acomp
prep
compound
punct
pobj
punct
intj
punct
det
compound
attr
punct


In [171]:
# try Shortest path in spacy
import spacy
import networkx as nx
nlp = spacy.load("en_core_web_sm")
doc = nlp(u'Captain America Civil War has a more tedious stories than Spider Man, superheroes are a waste of time always. ')
print('sentence:'.format(doc))
# Load spacy's dependency tree into a networkx graph
edges = []
for token in doc:
    for child in token.children:
        edges.append(('{0}'.format(token.lower_),
                      '{0}'.format(child.lower_)))
graph = nx.Graph(edges)
# Get the length and path
entity1 = 'War'.lower()
entity2 = 'Man'.lower()
print(nx.shortest_path_length(graph, source=entity1, target=entity2))
print(nx.shortest_path(graph, source=entity1, target=entity2))

sentence:
4
['war', 'has', 'stories', 'than', 'man']


In [172]:
from __future__ import unicode_literals, print_function

import plac
import spacy
doc = nlp(
        "displaCy uses CSS and JavaScript to show you how computers "
        "understand language"
    )

    # The easiest way is to find the head of the subtree you want, and then use
    # the `.subtree`, `.children`, `.lefts` and `.rights` iterators. `.subtree`
    # is the one that does what you're asking for most directly:
for word in doc:
        if word.dep_ in ("xcomp", "ccomp"):
            print("".join(w.text_with_ws for w in word.subtree))

    # It'd probably be better for `word.subtree` to return a `Span` object
    # instead of a generator over the tokens. If you want the `Span` you can
    # get it via the `.right_edge` and `.left_edge` properties. The `Span`
    # object is nice because you can easily get a vector, merge it, etc.
for word in doc:
        if word.dep_ in ("xcomp", "ccomp"):
            subtree_span = doc[word.left_edge.i : word.right_edge.i + 1]
            print(subtree_span.text, "|", subtree_span.root.text)

to show you how computers understand language
how computers understand language
to show you how computers understand language | show
how computers understand language | understand
