In [52]:
import pandas as pd
import re
import warnings
import socket
import json
import nltk
import operator
from stanfordcorenlp import StanfordCoreNLP
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/ashwinramesh/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [53]:
class Stack:
    """LIFO container whose entries are expected to be (val1, val2) pairs.

    The plain push/pop/top operations work on any object; the ``*_val1`` and
    ``*_val2`` helpers index the entry with [0] and [1] respectively.
    Positions "from top" are zero-based: position 0 is the top entry.
    """

    def __init__(self):
        self.__items = []

    def __slot(self, pos):
        # Translate a distance-from-top into a list index.
        return self.size() - pos - 1

    def size(self):
        """Return the number of stored entries."""
        return len(self.__items)

    def isEmpty(self):
        """Return True when nothing is stored."""
        return not self.__items

    def push(self, p):
        """Place ``p`` on top of the stack."""
        self.__items.append(p)

    def pop(self):
        """Remove and return the top entry."""
        return self.__items.pop()

    def pop_val1(self):
        """Remove the top entry and return its first component."""
        entry = self.__items.pop()
        return entry[0]

    def pop_val2(self):
        """Remove the top entry and return its second component."""
        entry = self.__items.pop()
        return entry[1]

    def top(self):
        """Return the top entry without removing it."""
        return self.__items[-1]

    def top_val1(self):
        """Return the first component of the top entry."""
        return self.top()[0]

    def top_val2(self):
        """Return the second component of the top entry."""
        return self.top()[1]

    def top_at_pos_from_top(self, pos):
        """Return the entry ``pos`` places below the top."""
        return self.__items[self.__slot(pos)]

    def top_val1_at_pos_from_top(self, pos):
        """Return the first component of the entry ``pos`` places below the top."""
        return self.__items[self.__slot(pos)][0]

    def top_val2_at_pos_from_top(self, pos):
        """Return the second component of the entry ``pos`` places below the top."""
        return self.__items[self.__slot(pos)][1]

    def combine_val1(self, cur_val):
        """Append ``cur_val`` (space separated) to the top entry's first component."""
        first, second = self.pop()[:2]
        self.push((first + ' ' + cur_val, second))

    def update_val2(self, cur_val):
        """Replace the second component of the top entry with ``cur_val``."""
        previous = self.pop()
        self.push((previous[0], cur_val))

    def update_val2_at_pos_from_top(self, cur_val, pos):
        """Replace the second component of the entry ``pos`` places below the top."""
        # Entries are tuples, so a new pair is built instead of mutating in place.
        kept_first = self.__items[self.__slot(pos)][0]
        self.__items[self.__slot(pos)] = (kept_first, cur_val)

    def prnt(self):
        """Return the stored list as a string, or '' when the stack is empty."""
        return str(self.__items) if self.__items else ''

In [54]:
# Global variables
# Names assigned at module level are already module globals, so no `global`
# statement is needed here; functions that rebind them declare `global` locally.

_nlp = None                # CoreNLP client; assigned by init()
_ip = '100.81.36.227'      # host that send_over_socket() connects to
_port = 7183               # TCP port that send_over_socket() connects to
_buffer_size = 1024        # NOTE(review): not referenced by the visible code - confirm it is still needed
_sen_analyzer = None       # VADER analyzer; assigned by init()

In [55]:
def init():
    """Create the shared NLP helpers and store them in the module globals.

    Rebinds ``_nlp`` to a StanfordCoreNLP client built from the local
    'stanford-corenlp-full-2017-06-09' path and ``_sen_analyzer`` to a new
    SentimentIntensityAnalyzer.
    """
    global _nlp, _sen_analyzer
    corenlp_path = r'stanford-corenlp-full-2017-06-09'
    _nlp = StanfordCoreNLP(corenlp_path)
    _sen_analyzer = SentimentIntensityAnalyzer()

In [56]:
def cleanup():
    """Close the CoreNLP client held in ``_nlp``, if there is one.

    Safe to call when init() never ran or when cleanup() was already called:
    in both cases ``_nlp`` is None and nothing happens. After closing, ``_nlp``
    is reset to None so a stale, closed client is never reused by accident.
    Any exception raised by ``close()`` still propagates.
    """
    global _nlp
    if _nlp is None:
        return
    try:
        _nlp.close()
    finally:
        # Drop the reference even if close() failed.
        _nlp = None

In [57]:
def send_over_socket(data):
    """Serialize ``data`` as JSON and send it to (_ip, _port) over a new TCP connection.

    Args:
        data: JSON-serializable container (the callers pass a dict). Nothing is
            sent and None is returned when ``len(data) == 0``.

    Raises:
        Whatever the socket layer raises on connect/send failure; the socket is
        closed before the exception propagates.
    """
    if len(data) == 0:
        return
    print("Send Data: " + str(data))
    # Encode explicitly so the payload is bytes on both Python 2 and Python 3.
    payload = json.dumps(data).encode('utf-8')
    client_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        client_socket.connect((_ip, _port))
        # sendall() keeps writing until the whole payload is out; send() may
        # transmit only a prefix and silently truncate the JSON message.
        client_socket.sendall(payload)
        print("Sent Data")
    finally:
        # Always release the descriptor, including when connect/send fails.
        client_socket.close()

In [58]:
def get_data():
    """Return a one-row DataFrame (index 0) holding a hard-coded sample article.

    The frame has a 'title' column and a 'content' column.
    """
    sample_title = 'Trump’s Promises to Kim Jong-un Leave U.S. and Allies Scrambling'
    sample_content = 'Along with 10 other nations, Canada is trying to revive the Trans-Pacific Partnership, a trade deal championed by the Obama administration and abandoned by Mr. Trump.'
    article = {'title': sample_title, 'content': sample_content}
    return pd.DataFrame(data=article, index=[0])

In [59]:
def get_named_entity(phrase):
    """Return the highest-priority entity label that ``_nlp.ner`` finds in ``phrase``.

    Priority order is PERSON, then ORGANIZATION, then LOCATION. Labels outside
    that list are ignored. Returns "-" when none of the three is present.
    """
    priority = ['PERSON', 'ORGANIZATION', 'LOCATION']
    best_rank = None
    for tagged in _nlp.ner(phrase):
        label = tagged[1]
        if label not in priority:
            continue
        rank = priority.index(label)
        if best_rank is None or rank < best_rank:
            best_rank = rank
    if best_rank is None:
        return "-"
    return priority[best_rank]

In [66]:
def get_sentiment(phrase):
    """Map the dominant polarity score of ``phrase`` to a sentiment code string.

    ``_sen_analyzer.polarity_scores(phrase)`` is queried and its 'compound'
    entry is ignored; the key with the largest remaining score decides the
    result.

    Returns:
        '2' for 'pos', '-2' for 'neg', '1' for 'neu', or '0' when the largest
        score is 0. The result is always a string, matching the '0' placeholder
        that parse_dependency_tree stores when no verb is available
        (previously this branch returned the integer 0).
    """
    code_by_key = {'pos': '2', 'neg': '-2', 'neu': '1'}
    # Work on a copy so the analyzer's own result dict is left untouched.
    scores = dict(_sen_analyzer.polarity_scores(phrase))
    scores.pop('compound', None)
    # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
    max_key = max(scores.items(), key=operator.itemgetter(1))[0]
    if scores[max_key] == 0:
        return '0'
    return code_by_key[max_key]

In [61]:
# noun_st: stack holding the nouns; val1 is the noun phrase and val2 is its level of nesting
# verb_st: stack holding the verbs; val1 is the verb phrase and val2 is its level of nesting
# prev_added_verb: helps determine whether verbs are consecutive and can be combined
# cur_nest: level of nesting of the tags on the current line (computed from the number of leading spaces on the line)
# nouns_popped_count: add two nouns at a time to the noun list (NOTE: no variable with this name exists in the function below; nouns are appended to the list in pairs directly)

def parse_dependency_tree(tree):
    """Extract noun pairs, connecting verbs, entity labels and sentiment from a parse tree.

    The bracketed tree text is processed one line at a time. The number of
    leading spaces on a line is used as its nesting level. Noun-like and
    verb-like leaves found on a line are pushed on two stacks together with
    that level; whenever the nesting level decreases, nouns are popped in
    pairs and matched with the verb on top of the verb stack.

    Progress is printed for every line (debug trace).

    Args:
        tree: multi-line bracketed parse string, one constituent per line.

    Returns:
        (noun_list, verb_list, ner_list, sen_list) where
        noun_list holds nouns two at a time (popped noun, then the noun below it),
        ner_list holds get_named_entity() of each entry in noun_list,
        verb_list holds one entry per noun pair (the verb text, or '-'),
        sen_list holds get_sentiment() of that verb, or '0' when there is none.
    """
    # Define patterns for Regular expression matching
    # (raw strings so that \s is not treated as a string escape).
    open_b, close_b = '()'
    open_pattern, close_pattern = (re.escape(open_b), re.escape(close_b))
    node_pattern = r'[^\s%s%s]+' % (open_pattern, close_pattern)
    leaf_pattern = r'[^\s%s%s]+' % (open_pattern, close_pattern)
    token_re = re.compile(r'%s\s*(%s)?|%s|(%s)' % (
            open_pattern, node_pattern, close_pattern, leaf_pattern))

    # State that is constant or must be retained between lines of the tree
    prev_added_verb = False
    allowed_NP = ['NP', 'NN', 'NNP', 'NNPS', 'NNS', 'JJ', 'CD']
    allowed_VP = ['VP', 'VB', 'VBP', 'VBZ', 'VBD', 'VBG', 'VBN', 'MD', 'TO', 'PP']
    prev_nest, cur_nest = 0, 0
    noun_st = Stack()
    verb_st = Stack()
    noun_list = []
    verb_list = []
    ner_list = []
    sen_list = []

    # For every line in the tree
    for line in tree.split('\n'):
        print('\nLINE: ' + line)

        #### 1. Find and process all matching patterns ####

        main_tag, cur_tag, noun, verb = '', '', '', ''
        prev_nest = cur_nest
        cur_nest = len(line) - len(line.lstrip(' '))

        for match in token_re.finditer(line):
            token = match.group()
            # Beginning of a tree/subtree
            if token[0] == open_b:
                if main_tag == '':
                    # main_tag is usually ROOT, S, NP, VP, JJ etc
                    main_tag = token[1:].lstrip()
                else:
                    # get the current tag, the nested one
                    cur_tag = token[1:].lstrip()
            # End of a tree/subtree - nothing to do
            elif token == close_b:
                continue
            # Leaf node
            else:
                # if main tag is a noun (or adjective) and the nested tags are part of the noun, combine them all
                if main_tag in allowed_NP and (cur_tag in allowed_NP or cur_tag == ''):
                    noun += token + ' '
                # if main tag is a verb (or preposition) and the nested tags are part of the verb, combine them all
                elif main_tag in allowed_VP and (cur_tag in allowed_VP or cur_tag == ''):
                    verb += token + ' '
                # if main tag is a preposition, regardless of inner tags, consider it a verb
                elif main_tag == 'PP':
                    verb += token + ' '

        print('#### 1. Find and process all matching patterns ####')
        print('NOUN: ' + noun.rstrip())
        print('VERB: ' + verb.rstrip())
        print('MAIN_TAG: ' + main_tag)
        print('CUR_TAG: ' + cur_tag)
        print('PREV NEST: ' + str(prev_nest))
        print('CUR NEST: ' + str(cur_nest))

        #### 2. If exiting nesting, pop the necessary stack elements and to graph data ####

        if cur_nest < prev_nest:
            print('#### 2. If exiting nesting, pop the necessary stack elements and to graph data ####')
            while noun_st.size() > 0 and cur_nest <= noun_st.top_val2():
                # A pair needs two nouns; a lone noun stays on the stack.
                if noun_st.size() <= 1:
                    break

                popped_noun = noun_st.pop()
                # Pair the popped noun with the one below it when that noun, or
                # the top verb, is at least as deeply nested as the current line.
                # The verb stack may be empty here; it must be checked before
                # peeking, otherwise top_val2() raises IndexError.
                can_pair = cur_nest <= noun_st.top_val2() or (
                    verb_st.size() > 0 and cur_nest <= verb_st.top_val2())
                if not can_pair:
                    noun_st.push(popped_noun)
                    break

                v1 = popped_noun[0]
                v2 = noun_st.top()[0]
                noun_list.append(v1)
                noun_list.append(v2)
                ner_list.append(get_named_entity(v1))
                ner_list.append(get_named_entity(v2))

                if verb_st.size() > 0 and (verb_st.top_val2() >= noun_st.top_val2() or verb_st.top_val2() >= popped_noun[1]):
                    e = verb_st.pop_val1()
                    verb_list.append(e)
                    sen_list.append(get_sentiment(e))
                else:
                    # No verb links this pair: record placeholders.
                    verb_list.append('-')
                    sen_list.append('0')

            # Clamp the remaining top noun to the current nesting level.
            if noun_st.size() > 0:
                noun_st.update_val2(min(cur_nest, noun_st.top_val2()))

            # Clamp verbs, from the top down, until one is not deeper than the current level.
            cur_pos_from_top = 0
            while verb_st.size() > cur_pos_from_top and cur_nest < verb_st.top_val2_at_pos_from_top(cur_pos_from_top):
                verb_st.update_val2_at_pos_from_top(cur_nest, cur_pos_from_top)
                cur_pos_from_top += 1

            print('NOUN STACK: ' + noun_st.prnt())
            print('VERB STACK: ' + verb_st.prnt())
            print('NOUN LIST: ' + str(noun_list))
            print('VERB LIST: ' + str(verb_list))
            print('ENTITY LIST: ' + str(ner_list))
            print('SENTIMENT LIST: ' + str(sen_list))

        #### 3. Add necessary elements to stack ####

        if noun != '':
            noun_st.push((noun.rstrip(), cur_nest))
            prev_added_verb = False

        if verb != '':
            # Consecutive verb lines are merged into the entry already on top.
            if prev_added_verb and verb_st.size() > 0:
                verb_st.combine_val1(verb.rstrip())
            else:
                verb_st.push((verb.rstrip(), cur_nest))
            prev_added_verb = True

        print('#### 3. Add necessary elements to stack ####')
        print('NOUN STACK: ' + noun_st.prnt())
        print('VERB STACK: ' + verb_st.prnt())

    return noun_list, verb_list, ner_list, sen_list

In [62]:
def parse_data(data):
    """Parse every article in ``data`` and send one graph update per sentence.

    For each row, the 'content' text is split into sentences with
    ``_nlp.annotate``; each sentence is re-joined from its tokens, parsed with
    ``_nlp.parse``, reduced by parse_dependency_tree(), and the four resulting
    lists are sent with send_over_socket() under the 'append_graph_data'
    operation.

    Args:
        data: DataFrame with a 'content' column (other columns are not read).
    """
    # For each article
    for _, row in data.iterrows():
        # Get the content
        content = row['content']

        # Split content into sentences
        result = _nlp.annotate(content,
                               properties={
                                   'annotators': 'ssplit',
                                   'outputFormat': 'json'
                               })
        annotated_content = json.loads(result)

        # For each sentence
        for annotated_sentence in annotated_content['sentences']:
            sentence = ' '.join([t['word'] for t in annotated_sentence['tokens']])

            # Get the parse tree for the sentence and reduce it to graph data
            tree = _nlp.parse(sentence)
            vertices, edges, entities, sentiment = parse_dependency_tree(tree)
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print('\n\nSEND OVER SOCKET')
            socket_data = {
                'operation': 'append_graph_data',
                'vertices': vertices,
                'edges': edges,
                'entities': entities,
                'sentiment': sentiment,
            }
            send_over_socket(socket_data)

In [63]:
def get_sample_results():
    """Send a sample 'get_sentiment_around_person' query for 'President Trump'."""
    # Other operations that were tried here:
    #   'get_all_persons'
    #   'get_all_persons_with_degree'
    request = {
        'operation': 'get_sentiment_around_person',
        'person': 'President Trump',
    }
    send_over_socket(request)

In [64]:
def main():
    """Run the pipeline: init(), load the sample data, parse and send it, then cleanup().

    cleanup() runs even when loading or parsing raises, so the resources
    created by init() are always released; the exception still propagates.
    """
    init()
    try:
        data = get_data()
        parse_data(data)
        #get_sample_results()
    finally:
        cleanup()

In [65]:
# Entry point when executed as a script/notebook cell: suppress all warnings,
# then run the pipeline.
if __name__ == '__main__':
    warnings.filterwarnings('ignore')
    main()


LINE: (ROOT
#### 1. Find and process all matching patterns ####
NOUN: 
VERB: 
MAIN_TAG: ROOT
CUR_TAG: 
PREV NEST: 0
CUR NEST: 0
#### 3. Add necessary elements to stack ####
NOUN STACK: 
VERB STACK: 

LINE:   (S
#### 1. Find and process all matching patterns ####
NOUN: 
VERB: 
MAIN_TAG: S
CUR_TAG: 
PREV NEST: 0
CUR NEST: 2
#### 3. Add necessary elements to stack ####
NOUN STACK: 
VERB STACK: 

LINE:     (PP (IN Along)
#### 1. Find and process all matching patterns ####
NOUN: 
VERB: Along
MAIN_TAG: PP
CUR_TAG: IN
PREV NEST: 2
CUR NEST: 4
#### 3. Add necessary elements to stack ####
NOUN STACK: 
VERB STACK: [(u'Along', 4)]

LINE:       (PP (IN with)
#### 1. Find and process all matching patterns ####
NOUN: 
VERB: with
MAIN_TAG: PP
CUR_TAG: IN
PREV NEST: 4
CUR NEST: 6
#### 3. Add necessary elements to stack ####
NOUN STACK: 
VERB STACK: [(u'Along with', 4)]

LINE:         (NP (CD 10) (JJ other) (NNS nations))))
#### 1. Find and process all matching patterns ####
NOUN: 10 other nations
VER