In [2]:
import spacy

# Load the `en_core_web_sm` spaCy pipeline once; the cells below call `nlp`
# to parse text and to register a tokenizer special case.
nlp = spacy.load("en_core_web_sm")

In [2]:
from spacy.attrs import ORTH, NORM

# Register the phrase 'out of date' as a tokenizer special case: one token
# whose ORTH is the phrase itself and whose NORM is 'outdated'.
# NOTE(review): analyse_content() further down rewrites 'out of date' with
# str.replace before parsing -- confirm whether this special case actually
# fires for a phrase that contains spaces, or whether it is redundant.
nlp.tokenizer.add_special_case('out of date', [{ORTH:'out of date', NORM: 'outdated'}])

In [29]:
# Sample text: six short sentences, each using one keyword in a different
# grammatical construction (negated, hyphenated, attributive, predicative).
sample_text = ("This is not outdated. That document get deprecated. It's not discouraged. "
               "Service is non-outdated. It's an obsolete document. The article is "
               "obsolete.")
doc = nlp(sample_text)

# Show how the pipeline segmented the text into sentences.
for sentence in doc.sents:
    print(sentence)

This is not outdated.
That document get deprecated.
It's not discouraged.
Service is non-outdated.
It's an obsolete document.
The article is obsolete.


In [34]:
from spacy.tokens  import Token
from typing import List

# Keywords whose tokens are inspected in this cell.
KEYWORDS = ['outdated', 'discouraged', 'deprecated', 'obsolete']

tokens: List[Token] = list(doc)

# Keep only the tokens whose exact text is one of the keywords, then print
# one tab-separated row per hit: index, text, dependency label, fine-grained
# tag, coarse POS, head, ancestors and children.
keyword_tokens = (tok for tok in tokens if tok.text in KEYWORDS)
for tok in keyword_tokens:
    row = (f"{tok.i}\t{tok.text}\t{tok.dep_}"
           f"\t{tok.tag_}\t{tok.pos_}\t{tok.head}"
           f"\t{list(tok.ancestors)}\t{list(tok.children)}")
    print(row)

3	outdated	ROOT	VBN	VERB	outdated	[]	[is, not, .]
8	deprecated	ROOT	JJ	ADJ	deprecated	[]	[document, get, .]
13	discouraged	ROOT	VBN	VERB	discouraged	[]	[It, 's, not, .]
19	outdated	acomp	JJ	ADJ	is	[is]	[-]
24	obsolete	amod	JJ	ADJ	document	[document, 's]	[]
30	obsolete	acomp	JJ	ADJ	is	[is]	[]


In [22]:
# Token at index 3 of `doc`. In the keyword listing printed above, index 3 is
# the first 'outdated' -- assumes `doc` is still the sample document built
# earlier in this notebook (execution counts are out of order; TODO confirm).
t_outdated = doc[3]
# Materialise the token's direct syntactic children so they can be inspected.
t_outdated_children: List[Token] = list(t_outdated.children)

In [25]:
# Is any direct child of the keyword attached with the negation dependency
# label ('neg'), as a word such as "not" would be?
'neg' in [child.dep_ for child in t_outdated.children]

False

In [27]:
# Print every nominal subject ('nsubj') found among the first five tokens.
subjects = [tok for tok in doc[:5] if tok.dep_ == 'nsubj']
for subject in subjects:
    print(subject)

This


In [28]:
# Token at index 24 of `doc`. The keyword listing printed earlier shows index
# 24 as 'obsolete' with dependency 'amod' and head 'document' -- assumes `doc`
# is unchanged since that listing (TODO confirm).
t_modifier = doc[24]


In [None]:
# One line per token: lemma, dependency label, coarse POS and the head's text.
row_template = "lemma: {}  \t dep: {} \t pos: {} \t head: {}"
for token in doc:
    fields = (token.lemma_, token.dep_, token.pos_, token.head.text)
    print(row_template.format(*fields))

In [None]:
# For each sentence show its token bounds (end is exclusive), its first and
# last token, the character offset where it ends, and its text.
for sentence in doc.sents:
    first_idx, stop_idx = sentence.start, sentence.end
    print(first_idx, stop_idx)
    print(doc[first_idx], doc[stop_idx - 1])
    print(sentence.end_char)
    print(str(sentence))

In [3]:
from spacy.tokens import Span

# Keywords looked up by analyse_sentence() below (substring test on the
# sentence text, then exact match on token text).
# NOTE(review): this rebinds the KEYWORDS list from an earlier cell, which also
# contained 'discouraged', 'deprecated' and 'obsolete' -- confirm that
# narrowing the search to 'outdated' only is intended.
KEYWORDS = ['outdated']

from typing import Optional, Dict, List
from spacy.tokens import Doc, Token

def analyse_sentence(sentence: spacy.tokens.Span) -> Optional[Dict]:
    """
    Analyse one sentence of a parsed doc for a keyword from ``KEYWORDS``.

    The sentence is skipped when none of the keywords occurs as a substring
    of its text. Otherwise every token of the sentence is inspected:

    * any token whose dependency label is ``neg`` marks the whole sentence
      as a negative statement (it need not be attached to the keyword);
    * for each token whose text is a keyword, the noun it refers to is
      looked up according to the keyword's own dependency label:

      - ``ROOT``:  the keyword's ``nsubjpass`` child,
      - ``amod``:  the keyword's head, i.e. the noun it modifies,
      - ``acomp``: the first ``nsubj`` token of the sentence.

      A keyword with any other label is reported by printing its doc and
      leaves the result untouched.

    https://spacy.io/usage/linguistic-features#pos-tagging

    :param sentence: a sentence span of a parsed doc, e.g. from ``doc.sents``.
    :return: ``None`` if no keyword occurs in the sentence text, otherwise a
        dict with ``modified_noun`` (text of the noun, or ``None`` when none
        was found), ``punctuation`` (the last token of the sentence,
        normally its closing ``?``, ``.`` or ``!``) and
        ``negative_statement`` (bool).
    """
    text = str(sentence)
    if not any(kw in text for kw in KEYWORDS):
        return None

    ref_doc = sentence.doc
    # The final token of the sentence is reported as its punctuation mark.
    punctuation = ref_doc[sentence.end - 1]
    target = None
    negation = False

    # Walk the *whole* sentence. The previous version stopped one token early
    # (assuming the last token is always punctuation), so a keyword closing an
    # unpunctuated sentence such as "This is outdated" was never examined.
    for token in sentence:
        if token.dep_ == 'neg':
            negation = True
        if token.text not in KEYWORDS:
            continue

        if token.dep_ == 'ROOT':
            # No break: the last nsubjpass child wins, as before.
            for child in token.children:
                if child.dep_ == 'nsubjpass':
                    target = child.text
        elif token.dep_ == 'amod':
            # An adjectival modifier hangs off the noun it modifies, so the
            # noun is the keyword's head. Searching the adjective's children
            # for an 'attr' (the old behaviour) never found it.
            target = token.head.text
        elif token.dep_ == 'acomp':
            # Predicative use ("X is outdated"): take the sentence's first
            # nominal subject.
            for other in sentence:
                if other.dep_ == 'nsubj':
                    target = other.text
                    break
        else:
            # Unhandled construction: dump the doc for manual inspection.
            print(token.doc)

    return {
        "modified_noun": target,
        "punctuation": punctuation,
        "negative_statement": negation
    }

def analyse_content(text: str, nlp) -> Dict:
    """Analyse a text i.e. the comment.

    The phrase 'out of date' is rewritten to 'outdated' before parsing, then
    each sentence is passed to ``analyse_sentence``; sentences without a
    keyword (``None`` results) are dropped.

    :param text: raw text to analyse.
    :param nlp: callable that turns a string into a parsed doc exposing
        ``.sents`` (here, the loaded spaCy pipeline).
    :return: dict with
        ``cnt_keywords`` -- number of sentences that contain a keyword
        (0 when there is none),
        ``subject`` / ``punctuation`` -- ``modified_noun`` and
        ``punctuation`` of the first such sentence, or ``""`` when there is
        no such sentence,
        ``negative_statement`` -- number of keyword sentences flagged as
        negative.
    """
    clean_text = text.replace('out of date', 'outdated')
    document = nlp(clean_text)
    analysis: List[dict] = [
        result
        for result in (analyse_sentence(s) for s in document.sents)
        if result is not None
    ]
    neg_cnt = sum(r.get('negative_statement', 0) for r in analysis)
    # Use an empty stand-in only for the lookups below. Previously a
    # placeholder dict was appended to `analysis` itself, which made
    # cnt_keywords report 1 for a text with no keyword sentence at all.
    first = analysis[0] if analysis else {}
    return {
        "cnt_keywords": len(analysis),
        "subject": first.get('modified_noun', ""),
        "punctuation": first.get('punctuation', ""),
        "negative_statement": neg_cnt
    }

def print_tokens(tokens: List[Token]):
    """Print a tab-separated table describing each token of a sequence.

    A header line is written first, then one row per token holding its
    index, text, dependency label, fine-grained tag, coarse POS, head,
    list of ancestors and list of children.
    """
    header = "Idx\tText\tdep\ttag\tpos\thead\tancestor\tchildren"
    print(header)
    for tok in tokens:
        ancestors = list(tok.ancestors)
        children = list(tok.children)
        row = (f"{tok.i}\t{tok.text}\t{tok.dep_}"
               f"\t{tok.tag_}\t{tok.pos_}\t{tok.head}"
               f"\t{ancestors}\t{children}")
        print(row)

In [9]:
# Larger sample mixing several keywords and constructions (hyphenated,
# negated, coordinated adjectives, passive voice), parsed and dumped token by
# token with print_tokens().
# NOTE(review): the saved output below starts with tokens '\' and 'MD5' at
# index 0-1, which does not match this text (it starts with "It") -- the
# output looks stale; re-run the cell to refresh it.
doc2 = nlp("It is a non-outdated idea. This idea may not be outdated. This idea is never obsolete. Amazon network service is outdated. Actor server is obsolete. Data become outdated, old and obsolete. It is an old and outdated science boook. MD5 is a very weak hash function and it's usage has been discouraged for many years now: http://en.wikipedia.org/wiki/MD5. Use SHA2 nowadays. MD5 is lipstick on a pig with an identity crisis. ")
print_tokens(doc2)

Idx	Text	dep	tag	pos	head	ancestor	children
0	\	compound	NNP	PROPN	MD5	[MD5, is]	[]
1	MD5	nsubj	NNP	PROPN	is	[is]	[\]
2	is	ROOT	VBZ	AUX	is	[]	[MD5, function, and]
3	a	det	DT	DET	function	[function, is]	[]
4	very	advmod	RB	ADV	weak	[weak, function, is]	[]
5	weak	amod	JJ	ADJ	function	[function, is]	[very]
6	hash	compound	NN	NOUN	function	[function, is]	[]
7	function	attr	NN	NOUN	is	[is]	[a, weak, hash]
8	and	cc	CC	CCONJ	is	[is]	[]
9	it	nsubj	PRP	PRON	's	['s, discouraged]	[]
10	's	nsubjpass	VBZ	AUX	discouraged	[discouraged]	[it, usage, http://en.wikipedia.org/wiki/MD5]
11	usage	attr	NN	NOUN	's	['s, discouraged]	[]
12	has	aux	VBZ	AUX	discouraged	[discouraged]	[]
13	been	auxpass	VBN	AUX	discouraged	[discouraged]	[]
14	discouraged	ROOT	VBN	VERB	discouraged	[]	['s, has, been, for, now, :, .]
15	for	prep	IN	ADP	discouraged	[discouraged]	[years]
16	many	amod	JJ	ADJ	years	[years, for, discouraged]	[]
17	years	pobj	NNS	NOUN	for	[for, discouraged]	[many]
18	now	advmod	RB	ADV	discouraged	[discourag

In [17]:
# End-to-end check of analyse_content() on a comment that uses the phrase
# 'out of date' (rewritten to 'outdated' inside the function) and ends with a URL.
res = analyse_content("This question and answers are out of date now that SimpleDB supports consistent reads and "
                      "conditional puts. See http://developer.amazonwebservices.com/connect/ann.jspa?annID=611", nlp)
# Bare last expression so the notebook displays the resulting dict.
res

{'cnt_keywords': 1,
 'subject': 'question',
 'punctuation': .,
 'negative_statement': 0}