In [1]:
import spacy
nlp=spacy.load("en_core_web_sm")

In [2]:
doc=nlp(u"This is the first sentence. This is another sentence. This is the last sentence.")

In [3]:
for sent in doc.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [4]:
# doc.sents is a generator, so it cannot be indexed directly.
# The failure is the point of this cell, so catch it and show the message
# instead of letting the exception stop a "Run All".
try:
    doc.sents[0]
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: 'generator' object is not subscriptable

In [5]:
doc[0]

This

In [6]:
# Materialize the generator into a list so individual sentences can be indexed.
list(doc.sents)

[This is the first sentence.,
 This is another sentence.,
 This is the last sentence.]

In [7]:
list(doc.sents)[0]

This is the first sentence.

In [8]:
type(list(doc.sents)[0])
# Each sentence is a spaCy Span object, not a plain Python string.

spacy.tokens.span.Span

In [9]:
doc2=nlp(u'"Management is doing the right thing; leadership is doing the right things." -Peter Drucker')

In [10]:
doc2.text

'"Management is doing the right thing; leadership is doing the right things." -Peter Drucker'

In [11]:
for sent in doc2.sents:
    print(sent)
    print("\n")

"Management is doing the right thing; leadership is doing the right things."


-Peter Drucker




In [12]:
#Add a Segmentation Rule

In [13]:
def set_custom_boundaries(doc):
    """Force a sentence break after every semicolon.

    For each ";" token (other than the final token of the doc), the token
    that follows it is flagged as the start of a new sentence.

    Args:
        doc: The document to annotate; it is modified in place.

    Returns:
        The same ``doc`` object, so the function can be used as a pipeline
        component.
    """
    # The last token is excluded because nothing follows it.
    semicolon_positions = [tok.i for tok in doc[:-1] if tok.text == ";"]
    for position in semicolon_positions:
        doc[position + 1].is_sent_start = True
    return doc

# Register the rule ahead of the parser so the forced sentence starts are
# already set when the parser runs.
# The membership check keeps this cell safe to re-run: adding a component
# whose name is already in the pipeline raises an error.
if "set_custom_boundaries" not in nlp.pipe_names:
    nlp.add_pipe(set_custom_boundaries, before="parser")

nlp.pipe_names

['tagger', 'set_custom_boundaries', 'parser', 'ner']

In [None]:
# NOTE(review): doc2 was already run through the full pipeline (parser
# included) when it was created. spaCy 2.x appears to refuse changing
# is_sent_start on a parsed Doc, so this direct call likely raises
# ValueError -- confirm. The rule does apply to text processed after the
# component was added to the pipeline.
set_custom_boundaries(doc2)

In [14]:
doc2[:-1]

"Management is doing the right thing; leadership is doing the right things." -Peter

In [15]:
doc3=nlp(u'"Management is doing the right thing; leadership is doing the right things." -Peter Drucker')

In [16]:
for sent in doc3.sents:
    print(sent)

"Management is doing the right thing;
leadership is doing the right things."
-Peter Drucker


In [17]:
#Change Segmentation Rules

In [18]:
# Reload the model to get a fresh pipeline that no longer contains the
# set_custom_boundaries component added earlier.
nlp=spacy.load("en_core_web_sm")

In [23]:
mystring = u"This is the first sentence. This is another sentence.\n\nThis is the \nlast sentence."

In [24]:
print(mystring)

This is the first sentence. This is another sentence.

This is the 
last sentence.


In [25]:
doc=nlp(mystring)

In [26]:
for sent in doc.sents:
    print(sent)

This is the first sentence.
This is another sentence.


This is the 
last sentence.


In [27]:
from spacy.pipeline import SentenceSegmenter

In [28]:
def split_on_newlines(doc):
    """Yield sentence spans from ``doc``, breaking after newline tokens.

    A new sentence starts at the first non-whitespace token that follows a
    token whose text begins with a newline. Any whitespace tokens that come
    right after the newline stay attached to the end of the preceding
    sentence rather than starting a new one.

    Args:
        doc: The document to segment. It must support iteration, slicing
            and ``len()``, and its tokens must expose ``text``, ``i`` and
            ``is_space``.

    Yields:
        Slices of ``doc``, one per sentence. Nothing is yielded for an
        empty ``doc``.
    """
    start = 0
    seen_newline = False
    for word in doc:
        # Only break on real content: a whitespace token directly after a
        # newline must neither open a new sentence nor clear the flag.
        if seen_newline and not word.is_space:
            yield doc[start:word.i]
            start = word.i
            seen_newline = False
        elif word.text.startswith("\n"):
            seen_newline = True
    # Skip the final span when nothing is left (e.g. an empty doc).
    if start < len(doc):
        yield doc[start:]

In [29]:
# Build a sentence-segmenter component that uses split_on_newlines as its
# splitting strategy; it is added to the pipeline in the next cell.
sbd = SentenceSegmenter(nlp.vocab,strategy=split_on_newlines)

In [30]:
# Append the newline-based segmenter to the pipeline. The component is
# registered under the name "sbd"; the check keeps this cell safe to re-run,
# since adding a name that is already in the pipeline raises an error.
if "sbd" not in nlp.pipe_names:
    nlp.add_pipe(sbd)

In [31]:
nlp.pipe_names

['tagger', 'parser', 'ner', 'sbd']

In [32]:
doc1=nlp(mystring)

In [33]:
for sent in doc1.sents:
    print(sent)

This is the first sentence. This is another sentence.


This is the 

last sentence.
