In [2]:
#SPLITTING SENTENCES INTO CLAUSES
import spacy

In [3]:
nlp = spacy.load('en_core_web_sm')

In [4]:
sentence = "He eats cheese, but he won't eat ice cream."

In [5]:
doc = nlp(sentence)

In [6]:
#Function to look at the structure of the input sentence by printing out the part of speech, dependency tag, ancestors, and children of each token
for token in doc:
    ancestors = [t.text for t in token.ancestors]
    children = [t.text for t in token.children]
    print(token.text, "\t", token.i, "\t",
        token.pos_, "\t", token.dep_, "\t",
        ancestors, "\t", children)

He 	 0 	 PRON 	 nsubj 	 ['eats'] 	 []
eats 	 1 	 VERB 	 ROOT 	 [] 	 ['He', 'cheese', ',', 'but', 'eat']
cheese 	 2 	 NOUN 	 dobj 	 ['eats'] 	 []
, 	 3 	 PUNCT 	 punct 	 ['eats'] 	 []
but 	 4 	 CCONJ 	 cc 	 ['eats'] 	 []
he 	 5 	 PRON 	 nsubj 	 ['eat', 'eats'] 	 []
wo 	 6 	 AUX 	 aux 	 ['eat', 'eats'] 	 []
n't 	 7 	 PART 	 neg 	 ['eat', 'eats'] 	 []
eat 	 8 	 VERB 	 conj 	 ['eats'] 	 ['he', 'wo', "n't", 'cream', '.']
ice 	 9 	 NOUN 	 compound 	 ['cream', 'eat', 'eats'] 	 []
cream 	 10 	 NOUN 	 dobj 	 ['eat', 'eats'] 	 ['ice']
. 	 11 	 PUNCT 	 punct 	 ['eat', 'eats'] 	 []


In [24]:
#Function to find the root token of the sentence, which is usually the main verb
def find_root_of_sentence(doc):
    root_token = None
    for token in doc:
        #print(token.text)
        if (token.dep_ == "ROOT"):
            return token

In [27]:
print(find_root_of_sentence(doc))

eats


In [28]:
root_token = find_root_of_sentence(doc)

In [29]:
#Function to find the other verbs in the sentence
def find_other_verbs(doc, root_token):
    other_verbs = []
    for token in doc:
        ancestors = list(token.ancestors)
        if (token.pos_ == "VERB" and len(ancestors) == 1\
            and ancestors[0] == root_token):
            other_verbs.append(token)
    return other_verbs

In [30]:
#Function to find the remaining verbs in the sentence
other_verbs = find_other_verbs(doc, root_token)

In [31]:
#Function to find the token spans for each verb
def get_clause_token_span_for_verb(verb, doc, all_verbs):
    first_token_index = len(doc)
    last_token_index = 0
    this_verb_children = list(verb.children)
    for child in this_verb_children:
        if (child not in all_verbs):
            if (child.i < first_token_index):
                first_token_index = child.i
        if (child.i > last_token_index):
            last_token_index = child.i
    return(first_token_index, last_token_index)

In [32]:
#To put together all the verbs in one array and process each using the preceding function. This will return a tuple of start and end indices for each verb's clause
token_spans = []
all_verbs = [root_token] + other_verbs
for other_verb in all_verbs:
    (first_token_index, last_token_index) = \
    get_clause_token_span_for_verb(other_verb, doc, all_verbs)
    token_spans.append((first_token_index, last_token_index))

In [33]:
#To put together token spans for each clause
sentence_clauses = []
for token_span in token_spans:
    start = token_span[0]
    end = token_span[1]
    if (start < end):
        clause = doc[start:end]
        sentence_clauses.append(clause)
sentence_clauses = sorted(sentence_clauses, key=lambda tup: tup[0])

In [34]:
clauses_text = [clause.text for clause in sentence_clauses]
print(clauses_text)

["He eats cheese, but he won't", "he won't eat ice cream"]
