In [2]:
import nltk
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree
import random

In [3]:
# Example sentence fed to the NLTK chunker and to the first spaCy token dump.
sentence = "Jack liked Maria but she didn't like him back because she dreamed about Mexico in 1996"

In [4]:
# NLTK pipeline: tokenize -> POS-tag -> named-entity chunk. The resulting
# tree is the cell's displayed value; the trailing .draw() call stays disabled.
ne_chunk(pos_tag(word_tokenize(sentence)))#.draw()

Tree('S', [Tree('PERSON', [('Jack', 'NNP')]), ('liked', 'VBD'), Tree('PERSON', [('Maria', 'NNP')]), ('but', 'CC'), ('she', 'PRP'), ('did', 'VBD'), ("n't", 'RB'), ('like', 'VB'), ('him', 'PRP'), ('back', 'RB'), ('because', 'IN'), ('she', 'PRP'), ('dreamed', 'VBD'), ('about', 'IN'), Tree('GPE', [('Mexico', 'NNP')]), ('in', 'IN'), ('1996', 'CD')])

In [5]:
import spacy
from spacy.en import English
# Build the English pipeline once; the later cells and helper functions all
# call this module-level `nlp` object on unicode text.
# NOTE(review): `spacy.en` looks like the import path of early spaCy
# releases -- confirm the installed spaCy version before re-running.
nlp = English()

In [6]:
# Parse the example sentence with spaCy (the literal is repeated here rather
# than reusing the `sentence` variable).
x = nlp(u"Jack liked Maria but she didn't like him back because she dreamed about Mexico in 1996")

In [7]:
# Pair every token of the parsed sentence with its `tag_` value and show the list.
tags = [(t, t.tag_) for t in x]
tags

[(Jack, u'NNP'),
 (liked, u'VBD'),
 (Maria, u'NNP'),
 (but, u'CC'),
 (she, u'PRP'),
 (did, u'VBD'),
 (n't, u'RB'),
 (like, u'VB'),
 (him, u'PRP'),
 (back, u'RB'),
 (because, u'IN'),
 (she, u'PRP'),
 (dreamed, u'VBD'),
 (about, u'IN'),
 (Mexico, u'NNP'),
 (in, u'IN'),
 (1996, u'CD')]

In [8]:
# Per-token dump: surface form, entity type, lemma and dependency label.
data = nlp(unicode(sentence))
for token in data:
    # Python 2 `print` followed by a parenthesised tuple: prints the tuple repr.
    print (token.orth_, token.ent_type_, token.lemma_, token.dep_)

(u'Jack', u'PERSON', u'jack', u'nsubj')
(u'liked', u'', u'like', u'ROOT')
(u'Maria', u'PERSON', u'maria', u'dobj')
(u'but', u'', u'but', u'cc')
(u'she', u'', u'she', u'nsubj')
(u'did', u'', u'do', u'aux')
(u"n't", u'', u'not', u'neg')
(u'like', u'', u'like', u'conj')
(u'him', u'', u'him', u'dobj')
(u'back', u'', u'back', u'advmod')
(u'because', u'', u'because', u'mark')
(u'she', u'', u'she', u'nsubj')
(u'dreamed', u'', u'dream', u'advcl')
(u'about', u'', u'about', u'prep')
(u'Mexico', u'GPE', u'mexico', u'pobj')
(u'in', u'', u'in', u'prep')
(u'1996', u'DATE', u'1996', u'pobj')


In [9]:
# Second example; note that this rebinds the global `sentence`.
sentence = "In the middle of 1990s Ivan had already become accustomed to the Soviet Union law system as they never forget a chance to sue him"
data = nlp(unicode(sentence))
for token in data:
    # Same dump as before, plus each token's syntactic head and the head's index.
    print (token.orth_, token.ent_type_, token.lemma_, token.dep_, token.head, token.head.i)

(u'In', u'', u'in', u'prep', become, 8)
(u'the', u'', u'the', u'det', middle, 2)
(u'middle', u'', u'middle', u'pobj', In, 0)
(u'of', u'', u'of', u'prep', middle, 2)
(u'1990s', u'', u'1990s', u'pobj', of, 3)
(u'Ivan', u'PERSON', u'ivan', u'nsubj', become, 8)
(u'had', u'', u'have', u'aux', become, 8)
(u'already', u'', u'already', u'advmod', become, 8)
(u'become', u'', u'become', u'ROOT', become, 8)
(u'accustomed', u'', u'accustomed', u'acomp', become, 8)
(u'to', u'', u'to', u'prep', accustomed, 9)
(u'the', u'GPE', u'the', u'det', system, 15)
(u'Soviet', u'GPE', u'soviet', u'compound', Union, 13)
(u'Union', u'GPE', u'union', u'compound', system, 15)
(u'law', u'', u'law', u'compound', system, 15)
(u'system', u'', u'system', u'pobj', to, 10)
(u'as', u'', u'as', u'mark', forget, 19)
(u'they', u'', u'they', u'nsubj', forget, 19)
(u'never', u'', u'never', u'neg', forget, 19)
(u'forget', u'', u'forget', u'advcl', become, 8)
(u'a', u'', u'a', u'det', chance, 21)
(u'chance', u'', u'chance', u'dob

In [10]:
# Third example, kept in `s1` so the NLTK helper below can be run on it too.
s1 = unicode("Here Jimmy is using Python to see behavior of our users")
data = nlp(s1)
for token in data:
    # Surface form, entity type, coarse part of speech (`pos_`) and dependency label.
    print (token.orth_, token.ent_type_, token.pos_, token.dep_)

(u'Here', u'', u'ADV', u'advmod')
(u'Jimmy', u'PERSON', u'PROPN', u'nsubj')
(u'is', u'', u'VERB', u'aux')
(u'using', u'', u'VERB', u'ROOT')
(u'Python', u'PRODUCT', u'PROPN', u'dobj')
(u'to', u'', u'PART', u'aux')
(u'see', u'', u'VERB', u'xcomp')
(u'behavior', u'', u'NOUN', u'dobj')
(u'of', u'', u'ADP', u'prep')
(u'our', u'', u'ADJ', u'poss')
(u'users', u'', u'NOUN', u'pobj')


In [11]:
def get_continuous_chunks(text):
    """Return the distinct named entities NLTK finds in ``text``.

    The text is tokenized, POS-tagged and NE-chunked. Entity subtrees that
    directly follow one another are joined with a space into one name.

    Args:
        text: the sentence to analyse.

    Returns:
        A list of entity strings in order of first appearance, each listed
        once. An entity at the very end of the sentence is included.
    """
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []

    def flush():
        # Close the run of adjacent entity subtrees collected so far.
        if current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
            # Always reset, even for an entity that was already recorded;
            # otherwise its words would be glued onto the next entity.
            del current_chunk[:]

    for node in chunked:
        if isinstance(node, Tree):
            current_chunk.append(" ".join(token for token, pos in node.leaves()))
        else:
            # Any plain (token, tag) leaf ends the current entity run.
            flush()
    # The sentence may end on an entity with no ordinary token after it.
    flush()
    return continuous_chunk

In [12]:
# Run the NLTK-based entity extractor on the third example sentence.
get_continuous_chunks(s1)

[u'Jimmy', u'Python']

In [13]:
# Fourth example: the sentence already starts with "Maybe", the same prefix
# the question helpers below prepend before parsing.
s2 = unicode("Maybe Vasyl was playing baseball in New York last summer with his dog")
data = nlp(s2)
for token in data:
    # Surface form, entity type, coarse part of speech (`pos_`) and dependency label.
    print (token.orth_, token.ent_type_, token.pos_, token.dep_)

(u'Maybe', u'', u'ADV', u'advmod')
(u'Vasyl', u'PERSON', u'PROPN', u'nsubj')
(u'was', u'', u'VERB', u'aux')
(u'playing', u'', u'VERB', u'ROOT')
(u'baseball', u'', u'NOUN', u'dobj')
(u'in', u'', u'ADP', u'prep')
(u'New', u'GPE', u'PROPN', u'compound')
(u'York', u'GPE', u'PROPN', u'pobj')
(u'last', u'DATE', u'ADJ', u'amod')
(u'summer', u'DATE', u'NOUN', u'npadvmod')
(u'with', u'', u'ADP', u'prep')
(u'his', u'', u'ADJ', u'poss')
(u'dog', u'', u'NOUN', u'pobj')


In [14]:
def did_you_know(sentence):
    """Wrap a declarative sentence into a "Did you know that ...?" question.

    The sentence is parsed with a leading "Maybe " so that its first word is
    no longer sentence-initial. That word keeps its capital letter only when
    it is a named entity, a proper noun or the pronoun "I"; otherwise it is
    lower-cased. Surrounding whitespace and sentence-final ".", "!" or "?"
    are removed so the result ends in a single "?".

    Args:
        sentence: the declarative sentence to wrap.

    Returns:
        One randomly chosen opener + the adjusted sentence + "?", or an empty
        string when ``sentence`` has no content.
    """
    sentence = sentence.strip().rstrip('.!?').rstrip()
    if not sentence:
        # Nothing to ask about; avoids indexing into an empty sentence.
        return ''

    check = nlp(unicode("Maybe "+sentence))
    # Token 0 is the artificial "Maybe"; token 1 is the sentence's first word.
    first = check[1]
    keep_capital = (first.ent_type_ != ''
                    or first.pos_ == 'PROPN'
                    or first.orth_ == 'I')
    if not keep_capital:
        sentence = sentence[0].lower()+sentence[1:]

    question_begin = ['Did you know that ', 'Are you familiar with the fact that ', 'Have you heard that ']
    return random.choice(question_begin)+sentence+'?'
# Smoke test; the opener is chosen with random.choice, so the output varies between runs.
did_you_know('He is playing baseball in New York')

'Have you heard that he is playing baseball in New York?'

In [15]:
#http://homepages.inf.ed.ac.uk/mroth/demo.html
def form_question(start, grouping, i):
    """Build a question by dropping one group and prefixing a question word.

    Args:
        start: the question word placed at the beginning (e.g. "Who").
        grouping: sequence of [label, text] pairs.
        i: index of the pair left out of the question.

    Returns:
        ``start``, a space, the texts of the remaining pairs joined by
        spaces, and a closing "?".
    """
    kept = grouping[:i] + grouping[i + 1:]
    body = " ".join(entry[1] for entry in kept)
    return " ".join((start, body)) + "?"



def squestion(sentence):
    """Generate simple wh-questions from a declarative sentence.

    The sentence is parsed with a leading "Maybe " (skipped afterwards), each
    token is labelled with its entity type, or with its coarse part of
    speech when it is not an entity, and neighbouring tokens with the same
    label are merged into one group. For every group labelled PERSON, GPE,
    DATE or NOUN one question is produced by ``form_question``, which leaves
    that group out and prefixes Who / Where / When / What. Words are not
    reordered.

    A root verb that has no auxiliary is replaced by a form of "do" plus its
    lemma (e.g. "likes" -> "does like"). Verbs that already have an auxiliary
    (including passive ones and ones separated by a negation or adverb) and
    the verb "be" are left unchanged.

    Args:
        sentence: the declarative sentence; surrounding whitespace and
            sentence-final ".", "!" or "?" are ignored.

    Returns:
        A list of question strings in sentence order; empty when the
        sentence is blank or contains no askable group.
    """
    sentence = sentence.strip().rstrip('.!?').rstrip()
    if not sentence:
        return []

    check = nlp(unicode("Maybe "+sentence))

    aux_deps = ('aux', 'auxpass')
    # Present-tense tags get "does"/"do"; everything else falls back to "did".
    do_forms = {'VBZ': 'does', 'VBP': 'do'}

    def has_auxiliary(index):
        # Either the directly preceding token is an auxiliary, or one is
        # attached to the verb in the parse (covers "did n't like").
        if check[index - 1].dep_ in aux_deps:
            return True
        return any(child.dep_ in aux_deps for child in check[index].children)

    chunks = []
    # Index 0 is the artificial "Maybe" and is never emitted.
    for index in range(1, len(check)):
        token = check[index]
        if token.ent_type_ != '':
            chunks.append([token.ent_type_, token.orth_])
            continue
        is_main_verb = (token.pos_ == 'VERB' and token.dep_ == 'ROOT'
                        and token.lemma_ != 'be')
        if is_main_verb and not has_auxiliary(index):
            chunks.append(['VERB-aux', do_forms.get(token.tag_, 'did')])
            chunks.append([token.pos_, token.lemma_])
        else:
            chunks.append([token.pos_, token.orth_])

    # Merge runs of identically labelled tokens ("New" + "York" -> "New York").
    grouping = []
    for label, text in chunks:
        if grouping and grouping[-1][0] == label:
            grouping[-1][1] += ' '+text
        else:
            grouping.append([label, text])

    quest_begin = {'PERSON':'Who', 'GPE':'Where', 'DATE':'When', 'NOUN':'What'}
    return [form_question(quest_begin[label], grouping, position)
            for position, (label, _text) in enumerate(grouping)
            if label in quest_begin]

In [16]:
# Demo: one question per PERSON / NOUN / GPE / DATE group found in the sentence.
squestion('Igor Lushchyk was programming in python in New York last summer')

[u'Who was programming in python in New York last summer?',
 u'What Igor Lushchyk was programming in in New York last summer?',
 u'Where Igor Lushchyk was programming in python in last summer?',
 u'When Igor Lushchyk was programming in python in New York?']

In [18]:
# Second demo sentence for the question generator.
squestion('Sergiy is showing us a demo in Lviv right now')

[u'Who is showing us a demo in Lviv right now?',
 u'What Sergiy is showing us a in Lviv right now?',
 u'Where Sergiy is showing us a demo in right now?']