### Imports

In [22]:
import spacy
import os
import neuralcoref

from spacy import displacy
from textblob import TextBlob

nlp = spacy.load('en_core_web_sm')

### Helpers

In [19]:
def create_nlp_pipe(coreference = True):
    """Creates the nlp pipeline using spacy. Adds neuralcoref if True
    
    Arguments:
        coreference {bool}: If True, coreference is added to pipeline
    
    Returns:
        nlp {obj}: nlp object
    """
    
    if coreference:
        coref = neuralcoref.NeuralCoref(nlp.vocab)
        nlp.add_pipe(coref, name='neuralcoref')
    return nlp

In [3]:
def tokenize(text):
    """Parses the given text using nlp pipeline
    
    Arguments:
        text {str}: Text to be parsed
    
    Returns:
        doc {obj}: nlp document object
    """
    
    doc = nlp(text)
    return doc

In [4]:
def plot_dependencies(doc):
    """Plots the dependencies in the nlp document
    
    Arguments:
        doc {obj}: nlp document object
    
    Returns: 
        displacy plot
    """
    
    displacy.serve(doc, style="dep")

In [5]:
def get_pos_tags(doc):
    """Prints the pos tags for each token in the document
    
    Arguments:
        doc {obj}: nlp document object
    
    Returns:
        token.pos_ {unicode}: POS tag of each token
    """
    
    for token in doc:
        print(token.text, token.pos_)

In [6]:
# nlp.add_pipe(nlp.create_pipe('merge_noun_chunks'))

def merge_nouns(doc):
    """If two consecutive tokens are nouns, it concatenates them into one \
    representing one aspect term.
    
    Arguments:
        doc {obj}: nlp document object
    
    Returns:
        new_text {str}: text for retokenization after nouns have been merged
    """
    
    i=0
    new_text = ''
    while i<len(doc):
        if doc[i].dep_ == 'compound':
            compound_noun = doc[i].text + doc[i+1].text
            new_text = new_text + ' ' + compound_noun
            i += 2
        else:
            new_text = new_text + ' ' + doc[i].text
            i += 1
    return new_text

In [7]:
def get_aspect_terms(doc):
    """This function returns the root noun present in noun chunks of the document.
    
    Arguments:
        doc {obj}: nlp document object
    
    Returns:
        aspects {list{str}}: list of root nouns as aspects
    """
                                            
    aspects = [(chunk.root.text) for chunk in doc.noun_chunks if chunk.root.pos_ == 'NOUN']
    return aspects

In [8]:
def get_sentiment_terms(doc):
    """This function return the adjectives and verbs that are not stopwords \
    or punctuations, indicating a descriptive/polarized word.
    
    Arguments:
        doc {obj}: nlp document object
    
    Returns:
        sentiment_terms {list{str}}: list of sentiment terms
    """
    
    sentiment_terms = []
    if doc.is_parsed:
        sentiment_terms.append([token.lemma_ for token in doc if \
                                (not token.is_stop and not token.is_punct \
                                 and (token.pos_ == "ADJ" or token.pos_ == "VERB"))])
    else:
        sentiment_terms.append('') 
    return sentiment_terms

In [9]:
def get_dependencies(doc):
    """Prints the dependency tree of the document
    
    Arguments:
        doc {obj}: nlp document object
    
    Returns:
        token.text {unicode}: Verbatim text content
        token.dep_ {unicode}: Syntactic dependency relation
        token.head.text {unicode}: The syntactic parent, or “governor”, of a token
        token.head.pos_ {unicode}: POS tag of the governor
        children {list}: list of children of a token
        
    """
    
    for token in doc:
        print(token.text, token.dep_, token.head.text, token.head.pos_,
                [child for child in token.children])

In [10]:
def get_opinion_pairs(doc):
    """This function returns the opinion pairs based on pre-defined rules.
    
    Arguments:
        doc {obj}: nlp document object
    
    Returns:
        opinion_pairs {list{tuple}}: list of tuples consisiting of (aspect, opinion)
    """
    
    opinion_pairs = []
    for token in doc:
        if token.dep_ == 'nsubj' and TextBlob(token.head.text).polarity > 0.4:
            opinion_pairs.append((token.text, token.head.text))
        elif token.dep_ == 'dobj' and (token.head.pos_ == 'ADJ' or TextBlob(token.head.text).polarity > 0.4):
            opinion_pairs.append((token.text, token.head.text))
        elif token.dep_ == 'amod' and token.head.pos_ == 'ADJ':
            opinion_pairs.append((token.text, token.head.text))
    return opinion_pairs

In [12]:
def target_extraction_pipeline(text):
    """This is the main pipeline that runs for extracting targets using rule based method
    
    Arguments:
        text {str}: string of text to extract targets from
        
    Returns:
        aspects {list}: list of target terms
    
    """
    nlp = create_nlp_pipe(False)
    doc = tokenize(text)
    aspects = get_aspect_terms(doc)
    
    return aspects

### Implementation

In [11]:
# text = 'I enjoyed the screen resolution, it is amazing for such a cheap laptop.'
text = "Slow service, but the waiter were friendly. He kept us engaged in conversation"

In [21]:
aspects = target_extraction_pipeline(text)
aspects

['service', 'waiter', 'conversation']

### Other functions

In [85]:
nlp = create_nlp_pipe(True)

In [86]:
doc = tokenize(text)

In [87]:
get_pos_tags(doc)

Slow ADJ
service NOUN
, PUNCT
but CCONJ
the DET
waiter NOUN
were VERB
friendly ADJ
. PUNCT
He PRON
kept VERB
us PRON
engaged VERB
in ADP
conversation NOUN


In [88]:
new_text = merge_nouns(doc) #TODO: make lookup table to reverse the process
new_text

' Slow service , but the waiter were friendly . He kept us engaged in conversation'

In [89]:
doc2 = tokenize(new_text)

In [90]:
aspects = get_aspect_terms(doc2)
aspects

['service', 'waiter', 'conversation']

In [91]:
sentiment_terms = get_sentiment_terms(doc2)
sentiment_terms

[['slow', 'friendly', 'keep', 'engage']]

In [128]:
plot_dependencies(doc2)




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [92]:
get_dependencies(doc2)

   Slow ADJ []
Slow ROOT Slow ADJ [ , service, ,, but, were]
service dobj Slow ADJ []
, punct Slow ADJ []
but cc Slow ADJ []
the det waiter NOUN []
waiter nsubj were VERB [the]
were conj Slow ADJ [waiter, friendly, .]
friendly acomp were VERB []
. punct were VERB []
He nsubj kept VERB []
kept ROOT kept VERB [He, us, engaged]
us dobj kept VERB []
engaged oprd kept VERB [in]
in prep engaged VERB [conversation]
conversation pobj in ADP []


Above structure is: (dependent, relation, governor)

Rules for get_aspect_sentiment:
- If relation-type is nsubj and the Governor is polarized word then (Dependent, Governor) is a Opinion-pair.
- If relation-type is amod and the Governor is tagged as adjective then (Dependent, Governor) is a Opinion-pair.
- If relation-type is dobj and the Governor has pos tag as adjective(JJ) then (Dependent, Governor) is a Opinion-pair.

In [93]:
opinion_pairs = get_opinion_pairs(doc2)
opinion_pairs

[('service', 'Slow')]

### Coreference Resolution (huggingface neuralcoref)

In [94]:
print(doc._.has_coref)
print(doc._.coref_clusters)

True
[the waiter: [the waiter, He]]
