In [17]:
import spacy
from sentences import text
from spacy import displacy
nlp = spacy.load('en_core_web_sm')
doc = nlp(text) # use doc by convention? - spacy documentation

# Extract single sentence
Split text into sentences

In [18]:
def get_sent(text):
    """
    return:     list of sentences for given text,
                you can extract single sentence using
                indexing. It return str object.
    """
    tokens = nlp(text)
    sents = []
    for sent in tokens.sents:
        sents.append(sent.string.strip())
    return sents

print(f"there is: {len(get_sent(text))} sentences in input text")
print(f"\nfirst sentnece: \n{get_sent(text)[0]}")

there is: 7 sentences in input text

first sentnece: 
An engineer had to plan the construction of an artificial lake to produce electric energy.


### Sentence Analysis

In [25]:
def sent_(sent, dep=False):
    """
    Analyse sentence, breaks sentence into text, pos, dep
    part of speach
    dependency

    """

    sent = nlp(sent)
    print("Text\tPOS\tDEP\tExplained")
    for token in sent:
        # Get the token text, part-of-speech tag and dependency label
        token_text = token.text
        token_pos = token.pos_
        token_dep = token.dep_
        # This is for formatting only
    
        print('{:<12}{:<10}{:<10}{:<10}'.format(token_text, token_pos, token_dep,spacy.explain(token_pos)))
    if dep==True:
        displacy.render(sent, style='dep')

sent_(get_sent(text)[0])

Text	POS	DEP	Explained
An          DET       det       determiner
engineer    NOUN      nsubj     noun      
had         AUX       ROOT      auxiliary 
to          PART      aux       particle  
plan        VERB      xcomp     verb      
the         DET       det       determiner
constructionNOUN      dobj      noun      
of          ADP       prep      adposition
an          DET       det       determiner
artificial  ADJ       amod      adjective 
lake        NOUN      pobj      noun      
to          PART      aux       particle  
produce     VERB      advcl     verb      
electric    ADJ       amod      adjective 
energy      NOUN      dobj      noun      
.           PUNCT     punct     punctuation


In [24]:
spacy.explain('PRE')

In [12]:
## Looking for roots os sentence.
def n_chunk():
    roots = ''
    for chunk in doc.noun_chunks:
        print(f"Text: {chunk.text}\n-> Root: {chunk.root.text}\n-> Arc label:{chunk.root.dep_}\n-> Root head: {chunk.root.head.text}\n")
        roots += chunk.root.text+ ' ' +chunk.root.head.text+' '


    """
    EXPECTED OUTPUT:

    engineer,had,construction
    engineer,plan,construction
    engineer,produce,energy

    """

    return roots

In [35]:
def noun_chunks_sent(sentence):
    '''
    Take in single sntence and output
    nound chunks
    '''
    chunks = []
    sent = nlp(sentence)
    for chunk in sent.noun_chunks:
        chunks.append(chunk)
    displacy.render(sent,style='dep')
    return chunks
noun_chunks_sent(get_sent(text)[0])

[An engineer, the construction, an artificial lake, electric energy]

In [56]:
have_right = []
for i in doc.noun_chunks:
    print(i)
    print(i.lower_)
    print(i.lemma_)
    print(i.n_lefts)
    print(i.n_rights)
    print(i.noun_chunks)
    print(i.rights)
    print(i.subtree)
    print(i.vocab)
    print("\n")
    if (i.n_rights == 1):
        have_right.append
        

An engineer
an engineer
an engineer
0
0
<generator object at 0x11c9caaf0>
<generator object at 0x11c9caaf0>
<generator object at 0x11c9caaf0>
<spacy.vocab.Vocab object at 0x11cb1a740>


the construction
the construction
the construction
0
1
<generator object at 0x11f0f8c10>
<generator object at 0x11f0f8c10>
<generator object at 0x11f0f8c10>
<spacy.vocab.Vocab object at 0x11cb1a740>


an artificial lake
an artificial lake
an artificial lake
0
0
<generator object at 0x11c9d1ee0>
<generator object at 0x11c9d1ee0>
<generator object at 0x11c9d1ee0>
<spacy.vocab.Vocab object at 0x11cb1a740>


electric energy
electric energy
electric energy
0
0
<generator object at 0x11f2ff3a0>
<generator object at 0x11f2ff3a0>
<generator object at 0x11f2ff3a0>
<spacy.vocab.Vocab object at 0x11cb1a740>


the lake
the lake
the lake
0
1
<generator object at 0x11f2ffee0>
<generator object at 0x11f2ffee0>
<generator object at 0x11f2ffee0>
<spacy.vocab.Vocab object at 0x11cb1a740>


he
-PRON-
he
0
0
<generator obj

# Dependency Parser
spaCy features a fast and accurate syntactic dependency parser, and has a rich API for navigating the tree. The parser also powers the sentence boundary detection, and lets you iterate over base noun phrases, or “chunks”. You can check whether a Doc object has been parsed with the doc.is_parsed attribute, which returns a boolean value. If this attribute is False, the default sentence iterator will raise an exception.

## Noun chunks
Noun chunks are “base noun phrases” – flat phrases that have a noun as their head. You can think of noun chunks as a noun plus the words describing the noun – for example, “the lavish green grass” or “the world’s largest tech fund”. To get the noun chunks in a document, simply iterate over Doc.noun_chunks

## Navigating the parse tree
spaCy uses the terms head and child to describe the words connected by a single arc in the dependency tree. The term dep is used for the arc label, which describes the type of syntactic relation that connects the child to the head. As with other attributes, the value of .dep is a hash value. You can get the string value with .dep_.

In [27]:
doc = nlp(sents[0])
for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,
            [child for child in token.children])

An det engineer NOUN []
engineer nsubj had AUX [An]
had ROOT had AUX [engineer, plan, .]
to aux plan VERB []
plan xcomp had AUX [to, construction, produce]
the det construction NOUN []
construction dobj plan VERB [the, of]
of prep construction NOUN [lake]
an det lake NOUN []
artificial amod lake NOUN []
lake pobj of ADP [an, artificial]
to aux produce VERB []
produce advcl plan VERB [to, energy]
electric amod energy NOUN []
energy dobj produce VERB [electric]
. punct had AUX []


## Accessing token childrens
first loop for iterate over tokens in nlp object, second if for accessing token childrens 

In [37]:
for token in doc:
    # print(token.text, token.dep_, token.head.text, token.head.pos_,[child for child in token.children])
    # print(token)
    for t in token.children:
        print(t,t.dep_,t.head.text)

An det engineer
engineer nsubj had
plan xcomp had
. punct had
to aux plan
construction dobj plan
produce advcl plan
the det construction
of prep construction
lake pobj of
an det lake
artificial amod lake
to aux produce
energy dobj produce
electric amod energy


Because the syntactic relations form a tree, every word has exactly one head. You can therefore iterate over the arcs in the tree by iterating over the words in the sentence. This is usually the best way to match an arc of interest — from below:

In [48]:
verbs = set()
for possible_subject in doc:
    if possible_subject.dep == nsubj and possible_subject.head.pos == VERB:
        verbs.add(possible_subject.head)
        
print(verbs)

{thought}


In [49]:
print([token.text for token in doc[2].lefts])  # ['bright', 'red']
print([token.text for token in doc[2].rights])  # ['on']
print(doc[2].n_lefts)  # 2
print(doc[2].n_rights)  # 1

[]
[]
0
0


## Verb Extraction

In [41]:
from spacy.symbols import nsubj, VERB

# nlp = spacy.load("en_core_web_sm")
doc = nlp(sents[1])

# Finding a verb with a subject from below — good
verbs = set()
for possible_subject in doc:
    if possible_subject.dep == nsubj and possible_subject.head.pos == VERB:
        verbs.add(possible_subject.head)
print(verbs)

# Finding a verb with a subject from above — less good
verbs = []
for possible_verb in doc:
    if possible_verb.pos == VERB:
        for possible_subject in possible_verb.children:
            if possible_subject.dep == nsubj:
                verbs.append(possible_verb)
                break
print(verbs)

{thought}
[thought]
