# Part I Extract subject-verb-object (SVO) relations from the article

In [6]:
### define the function (code from class exercise)

import spacy
nlp = spacy.load('en_core_web_sm')

SUBJECTS = ["nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl"]
OBJECTS = ["dobj", "dative", "attr", "oprd"]

def getSubsFromConjunctions(subs):
    moreSubs = []
    for sub in subs:
        # rights is a generator
        rights = list(sub.rights)
        rightDeps = {tok.lower_ for tok in rights}
        if "and" in rightDeps:
            moreSubs.extend([tok for tok in rights if tok.dep_ in SUBJECTS or tok.pos_ == "NOUN"])
            if len(moreSubs) > 0:
                moreSubs.extend(getSubsFromConjunctions(moreSubs))
    return moreSubs

def getObjsFromConjunctions(objs):
    moreObjs = []
    for obj in objs:
        # rights is a generator
        rights = list(obj.rights)
        rightDeps = {tok.lower_ for tok in rights}
        if "and" in rightDeps:
            moreObjs.extend([tok for tok in rights if tok.dep_ in OBJECTS or tok.pos_ == "NOUN"])
            if len(moreObjs) > 0:
                moreObjs.extend(getObjsFromConjunctions(moreObjs))
    return moreObjs

def getVerbsFromConjunctions(verbs):
    moreVerbs = []
    for verb in verbs:
        rightDeps = {tok.lower_ for tok in verb.rights}
        if "and" in rightDeps:
            moreVerbs.extend([tok for tok in verb.rights if tok.pos_ == "VERB"])
            if len(moreVerbs) > 0:
                moreVerbs.extend(getVerbsFromConjunctions(moreVerbs))
    return moreVerbs

def findSubs(tok):
    head = tok.head
    while head.pos_ != "VERB" and head.pos_ != "NOUN" and head.head != head:
        head = head.head
    if head.pos_ == "VERB":
        subs = [tok for tok in head.lefts if tok.dep_ == "SUB"]
        if len(subs) > 0:
            verbNegated = isNegated(head)
            subs.extend(getSubsFromConjunctions(subs))
            return subs, verbNegated
        elif head.head != head:
            return findSubs(head)
    elif head.pos_ == "NOUN":
        return [head], isNegated(tok)
    return [], False

def isNegated(tok):
    negations = {"no", "not", "n't", "never", "none"}
    for dep in list(tok.lefts) + list(tok.rights):
        if dep.lower_ in negations:
            return True
    return False

def findSVs(tokens):
    svs = []
    verbs = [tok for tok in tokens if tok.pos_ == "VERB"]
    for v in verbs:
        subs, verbNegated = getAllSubs(v)
        if len(subs) > 0:
            for sub in subs:
                svs.append((sub.orth_, "!" + v.orth_ if verbNegated else v.orth_))
    return svs

def getObjsFromPrepositions(deps):
    objs = []
    for dep in deps:
        if dep.pos_ == "ADP" and dep.dep_ == "prep":
            objs.extend([tok for tok in dep.rights if tok.dep_  in OBJECTS or (tok.pos_ == "PRON" and tok.lower_ == "me")])
    return objs

def getObjsFromAttrs(deps):
    for dep in deps:
        if dep.pos_ == "NOUN" and dep.dep_ == "attr":
            verbs = [tok for tok in dep.rights if tok.pos_ == "VERB"]
            if len(verbs) > 0:
                for v in verbs:
                    rights = list(v.rights)
                    objs = [tok for tok in rights if tok.dep_ in OBJECTS]
                    objs.extend(getObjsFromPrepositions(rights))
                    if len(objs) > 0:
                        return v, objs
    return None, None

def getObjFromXComp(deps):
    for dep in deps:
        if dep.pos_ == "VERB" and dep.dep_ == "xcomp":
            v = dep
            rights = list(v.rights)
            objs = [tok for tok in rights if tok.dep_ in OBJECTS]
            objs.extend(getObjsFromPrepositions(rights))
            if len(objs) > 0:
                return v, objs
    return None, None

def getAllSubs(v):
    verbNegated = isNegated(v)
    subs = [tok for tok in v.lefts if tok.dep_ in SUBJECTS and tok.pos_ != "DET"]
    if len(subs) > 0:
        subs.extend(getSubsFromConjunctions(subs))
    else:
        foundSubs, verbNegated = findSubs(v)
        subs.extend(foundSubs)
    return subs, verbNegated

def getAllObjs(v):
    # rights is a generator
    rights = list(v.rights)
    objs = [tok for tok in rights if tok.dep_ in OBJECTS]
    objs.extend(getObjsFromPrepositions(rights))

    #potentialNewVerb, potentialNewObjs = getObjsFromAttrs(rights)
    #if potentialNewVerb is not None and potentialNewObjs is not None and len(potentialNewObjs) > 0:
    #    objs.extend(potentialNewObjs)
    #    v = potentialNewVerb

    potentialNewVerb, potentialNewObjs = getObjFromXComp(rights)
    if potentialNewVerb is not None and potentialNewObjs is not None and len(potentialNewObjs) > 0:
        objs.extend(potentialNewObjs)
        v = potentialNewVerb
    if len(objs) > 0:
        objs.extend(getObjsFromConjunctions(objs))
    return v, objs

def findSVOs(tokens):
    svos = []
    verbs = [tok for tok in tokens if tok.pos_ == "VERB" and tok.dep_ != "aux"]
    for v in verbs:
        subs, verbNegated = getAllSubs(v)
        # hopefully there are subs, if not, don't examine this verb any longer
        if len(subs) > 0:
            v, objs = getAllObjs(v)
            for sub in subs:
                for obj in objs:
                    objNegated = isNegated(obj)
                    svos.append((sub.lower_, "!" + v.lower_ if verbNegated or objNegated else v.lower_, obj.lower_))
    return svos

def printDeps(toks):
    for tok in toks:
        print(tok.orth_, tok.dep_, tok.pos_, tok.head.orth_, [t.orth_ for t in tok.lefts], [t.orth_ for t in tok.rights])

In [8]:
### apply SVO function to an article

# To be simple, I just use the same article as Assignment 4
text = "Centene (NYSE:CNC) updated its FY20 earnings guidance on Friday. The company provided earnings per share guidance of $4.76-4.96 for the period, compared to the Thomson Reuters consensus earnings per share estimate of $4.72. The company issued revenue guidance of $109.5-111.9 billion, compared to the consensus revenue estimate of $110.88 billion.Centene also updated its FY 2020 Pre-Market guidance to 4.76-4.96 EPS.\nA number of analysts recently commented on CNC shares. Citigroup increased their price objective on Centene from $70.00 to $81.00 and gave the company a buy rating in a research note on Wednesday, April 29th. Deutsche Bank decreased their price target on shares of Centene from $82.00 to $80.00 and set a buy rating on the stock in a research note on Wednesday, April 29th. Credit Suisse Group reiterated a hold rating and set a $72.50 price target on shares of Centene in a report on Wednesday, April 29th. Morgan Stanley upped their price objective on shares of Centene from $85.00 to $89.00 and gave the company an overweight rating in a research note on Monday, April 13th. Finally, SunTrust Banks lifted their target price on shares of Centene from $85.00 to $90.00 and gave the stock a buy rating in a research note on Monday, April 13th. One research analyst has rated the stock with a sell rating, two have issued a hold rating, thirteen have given a buy rating and one has given a strong buy rating to the stock. The company presently has an average rating of Buy and a consensus price target of $82.78. Get Centene alerts:\nShares of NYSE CNC opened at $61.94 on Friday. The stock has a market capitalization of $34.90 billion, a PE ratio of 15.49, a P/E/G ratio of 0.87 and a beta of 0.71. The company has a quick ratio of 1.14, a current ratio of 1.14 and a debt-to-equity ratio of 0.72. The firm has a fifty day moving average of $66.30 and a 200-day moving average of $62.59. Centene has a 52-week low of $41.62 and a 52-week high of $74.70. Centene (NYSE:CNC) last issued its earnings results on Tuesday, April 28th. The company reported $0.86 earnings per share for the quarter, missing the Zacks’ consensus estimate of $0.99 by ($0.13). The firm had revenue of $26.03 billion during the quarter, compared to analyst estimates of $23.96 billion. Centene had a net margin of 1.03% and a return on equity of 11.47%. Centene’s revenue was up 41.1% compared to the same quarter last year. During the same quarter in the previous year, the business posted $1.39 EPS. Equities research analysts expect that Centene will post 4.74 EPS for the current fiscal year.\nIn related news, Director Tommy G. Thompson sold 1,500 shares of the firm’s stock in a transaction that occurred on Monday, April 6th. The shares were sold at an average price of $58.00, for a total value of $87,000.00. The transaction was disclosed in a document filed with the SEC, which is available at this hyperlink . Also, Director Robert K. Ditmore sold 33,333 shares of Centene stock in a transaction that occurred on Monday, June 8th. The shares were sold at an average price of $65.31, for a total transaction of $2,176,978.23. The disclosure for this sale can be found here . Insiders have sold a total of 180,019 shares of company stock valued at $11,900,641 in the last quarter. Company insiders own 1.90% of the company’s stock.\nCentene Company Profile\nCentene Corporation operates as a diversified and multi-national healthcare enterprise that provides programs and services to under-insured and uninsured individuals in the United States. The company's Managed Care segment offers health plan coverage to individuals through government subsidized programs, including Medicaid, the State children's health insurance program, long-term services and support, foster care, and medicare-medicaid plans, which covers dually eligible individuals, as well as aged, blind, or disabled programs. Receive News & Ratings for Centene Daily - Enter your email address below to receive a concise daily summary of the latest news and analysts' ratings for Centene and related companies with MarketBeat.com's FREE daily email newsletter . «"

# apply
tok = nlp(text)
svos = findSVOs(tok)
print(svos)

[('centene', 'updated', 'guidance'), ('company', 'provided', 'earnings'), ('company', 'issued', 'guidance'), ('centene', 'updated', 'guidance'), ('citigroup', 'increased', 'objective'), ('bank', 'decreased', 'target'), ('group', 'reiterated', 'rating'), ('stanley', 'upped', 'objective'), ('banks', 'lifted', 'price'), ('analyst', 'rated', 'stock'), ('two', 'issued', 'rating'), ('thirteen', 'given', 'rating'), ('thirteen', 'given', 'one'), ('company', 'reported', 'earnings'), ('business', 'posted', 'eps'), ('centene', 'post', 'eps'), ('thompson', 'sold', 'shares'), ('ditmore', 'sold', 'shares'), ('insiders', 'sold', 'total'), ('insiders', 'own', '%'), ('enterprise', 'provides', 'programs'), ('enterprise', 'provides', 'services'), ('segment', 'offers', 'coverage'), ('segment', 'offers', 'to'), ('programs', 'covers', 'individuals'), ('news', 'enter', 'address')]


# Part II Extract keywords (key phrases) from the article

#### Method 1

In [9]:
### define the function (code from class exercise)

from collections import OrderedDict
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en_core_web_sm')

class TextRank4Keyword():
    """Extract keywords from text"""
    
    def __init__(self):
        self.d = 0.85 # damping coefficient, usually is .85
        self.min_diff = 1e-5 # convergence threshold
        self.steps = 10 # iteration steps
        self.node_weight = None # save keywords and its weight

    
    def set_stopwords(self, stopwords):  
        """Set stop words"""
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True
    
    def sentence_segment(self, doc, candidate_pos, lower):
        """Store those words only in cadidate_pos"""
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                # Store words only with cadidate POS tag
                if token.pos_ in candidate_pos and token.is_stop is False:
                    if lower is True:
                        selected_words.append(token.text.lower())
                    else:
                        selected_words.append(token.text)
            sentences.append(selected_words)
        return sentences
        
    def get_vocab(self, sentences):
        """Get all tokens"""
        vocab = OrderedDict()
        i = 0
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = i
                    i += 1
        return vocab
    
    def get_token_pairs(self, window_size, sentences):
        """Build token_pairs from windows in sentences"""
        token_pairs = list()
        for sentence in sentences:
            for i, word in enumerate(sentence):
                for j in range(i+1, i+window_size):
                    if j >= len(sentence):
                        break
                    pair = (word, sentence[j])
                    if pair not in token_pairs:
                        token_pairs.append(pair)
        return token_pairs
        
    def symmetrize(self, a):
        return a + a.T - np.diag(a.diagonal())
    
    def get_matrix(self, vocab, token_pairs):
        """Get normalized matrix"""
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            i, j = vocab[word1], vocab[word2]
            g[i][j] = 1
            
        # Get Symmeric matrix
        g = self.symmetrize(g)
        
        # Normalize matrix by column
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, where=norm!=0) # this is ignore the 0 element in norm
        
        return g_norm

    
    def get_keywords(self, number=10):
        """Print top number keywords"""
        node_weight = OrderedDict(sorted(self.node_weight.items(), key=lambda t: t[1], reverse=True))
        for i, (key, value) in enumerate(node_weight.items()):
            print(key + ' - ' + str(value))
            if i > number:
                break
        
        
    def analyze(self, text, 
                candidate_pos=['NOUN', 'PROPN'], 
                window_size=4, lower=False, stopwords=list()):
        """Main function to analyze text"""
        
        # Set stop words
        self.set_stopwords(stopwords)
        
        # Pare text by spaCy
        doc = nlp(text)
        
        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower) # list of list of words
        
        # Build vocabulary
        vocab = self.get_vocab(sentences)
        
        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)
        
        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)
        
        # Initionlization for weight(pagerank value)
        pr = np.array([1] * len(vocab))
        
        # Iteration
        previous_pr = 0
        for epoch in range(self.steps):
            pr = (1-self.d) + self.d * np.dot(g, pr)
            if abs(previous_pr - sum(pr))  < self.min_diff:
                break
            else:
                previous_pr = sum(pr)

        # Get weight for each node
        node_weight = dict()
        for word, index in vocab.items():
            node_weight[word] = pr[index]
        
        self.node_weight = node_weight


        keyphrase_extractor = TextRank4Keyword()

In [16]:
### apply function to an article

tr4w = TextRank4Keyword()
tr4w.analyze(text, candidate_pos = ['NOUN', 'PROPN',"ADP"], window_size=8, lower=False)
tr4w.get_keywords(20)

Centene - 6.379846362332974
stock - 3.4534517059998873
company - 3.166833619363474
shares - 3.077348558616903
rating - 2.365556396891015
earnings - 2.154611572349058
price - 2.0965174965878406
individuals - 1.9354508556969545
programs - 1.9309265463315595
transaction - 1.886708149637478
April - 1.8830185321184252
research - 1.8730735936528768
guidance - 1.677421527870922
health - 1.6514214891097612
buy - 1.6427468694355998
ratio - 1.6190469352311192
firm - 1.606970263741923
quarter - 1.6022627138507448
analysts - 1.58126659559293
news - 1.5685951631739155
target - 1.5442919886990412
Monday - 1.5054865477342525


#### Method 2

In [14]:
### define the function (code from class exercise)

def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    import itertools, nltk, string
    
    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent)) for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda word__pos__chunk: word__pos__chunk[2] != 'O') if key]

    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]

def extract_candidate_words(text, good_tags=set(['JJ','JJR','JJS','NN','NNP','NNS','NNPS'])):
    import itertools, nltk, string

    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize and POS-tag words
    tagged_words = itertools.chain.from_iterable(nltk.pos_tag_sents(nltk.word_tokenize(sent)
                                                                    for sent in nltk.sent_tokenize(text)))
    # filter on certain POS tags and lowercase all words
    candidates = [word.lower() for word, tag in tagged_words
                  if tag in good_tags and word.lower() not in stop_words
                  and not all(char in punct for char in word)]

    return candidates

def score_keyphrases_by_tfidf(texts, candidates='chunks'):
    import gensim, nltk
    
    # extract candidates from each text in texts, either chunks or words
    if candidates == 'chunks':
        boc_texts = [extract_candidate_chunks(text) for text in texts]
    elif candidates == 'words':
        boc_texts = [extract_candidate_words(text) for text in texts]
    # make gensim dictionary and corpus
    dictionary = gensim.corpora.Dictionary(boc_texts)
    corpus = [dictionary.doc2bow(boc_text) for boc_text in boc_texts]
    # transform corpus with tf*idf model
    tfidf = gensim.models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    
    return corpus_tfidf, dictionary

def score_keyphrases_by_textrank(text, n_keywords=0.05):
    from itertools import takewhile, tee
    import operator
    import networkx, nltk
    
    # tokenize for all words, and extract *candidate* words
    words = [word.lower()
             for sent in nltk.sent_tokenize(text)
             for word in nltk.word_tokenize(sent)]
    candidates = extract_candidate_words(text)
    # build graph, each node is a unique candidate
    graph = networkx.Graph()
    graph.add_nodes_from(set(candidates))
    # iterate over word-pairs, add unweighted edges into graph
    def pairwise(iterable):
        """s -> (s0,s1), (s1,s2), (s2, s3), ..."""
        a, b = tee(iterable)
        next(b, None)
        return zip(a, b)
    for w1, w2 in pairwise(candidates):
        if w2:
            graph.add_edge(*sorted([w1, w2]))
    # score nodes using default pagerank algorithm, sort by score, keep top n_keywords
    ranks = networkx.pagerank(graph)
    if 0 < n_keywords < 1:
        n_keywords = int(round(len(candidates) * n_keywords))
    word_ranks = {word_rank[0]: word_rank[1]
                  for word_rank in sorted(ranks.items(), key=operator.itemgetter(1), reverse=True)[:n_keywords]}
                  #for word_rank in sorted(ranks.iteritems(), key=lambda x: x[1], reverse=True)[:n_keywords]}
                  
    #sorted(max_value_score.items(), key=operator.itemgetter(1), reverse=True)[:3]
    keywords = set(word_ranks.keys())
    # merge keywords into keyphrases
    keyphrases = {}
    j = 0
    for i, word in enumerate(words):
        if i < j:
            continue
        if word in keywords:
            kp_words = list(takewhile(lambda x: x in keywords, words[i:i+10]))
            avg_pagerank = sum(word_ranks[w] for w in kp_words) / float(len(kp_words))
            keyphrases[' '.join(kp_words)] = avg_pagerank
            # counter as hackish way to ensure merged keyphrases are non-overlapping
            j = i + len(kp_words)
            
    return sorted(keyphrases.items(), key=operator.itemgetter(1), reverse=True)
    #return sorted(keyphrases.iteritems(), key=lambda x: x[1], reverse=True)

In [15]:
### apply function to an article

score_keyphrases_by_textrank(text)

[('centene', 0.03628291642725251),
 ('centene company', 0.03146128418412235),
 ('centene stock', 0.026780776637427943),
 ('company', 0.02663965194099219),
 ('centene daily', 0.024302512538490827),
 ('shares', 0.02413351429988731),
 ('company stock', 0.021959144394297782),
 ('stock', 0.017278636847603376),
 ('april', 0.016747566398765228),
 ('price', 0.015244183943823292),
 ('rating', 0.015133053984744986),
 ('average price', 0.0138717720838031),
 ('average rating', 0.013816207104263947),
 ('individuals', 0.013713590351348793),
 ('ratio', 0.013627200802207504),
 ('programs', 0.0134988582120739),
 ('average', 0.012499360223782908),
 ('daily', 0.012322108649729146),
 ('quarter', 0.011807728617272044),
 ('earnings', 0.011297137770688317),
 ('news', 0.011211754447982373),
 ('earnings guidance', 0.01119038556742483),
 ('guidance', 0.01108363336416134)]

# Part III Produce an extractive summary of the article

In [66]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

length_of_summary = input("length of summary in sentences : ")

parser = PlaintextParser.from_string(text, Tokenizer("english"))
summarizer = LexRankSummarizer()
sentences = summarizer(parser.document, length_of_summary)

summary = str()
for sentence in sentences:
    summary += (sentence.__unicode__())
print(summary)

length of summary in sentences : 3
The company presently has an average rating of Buy and a consensus price target of $82.78.In related news, Director Tommy G. Thompson sold 1,500 shares of the firm’s stock in a transaction that occurred on Monday, April 6th.The shares were sold at an average price of $65.31, for a total transaction of $2,176,978.23.
