In [1]:
import spacy
from spacy import displacy

In [10]:
# Load the en_core_web_sm model; the resulting `nlp` pipeline is reused by the cells below.
nlp = spacy.load("en_core_web_sm")

In [11]:
# Two-sentence sample text for the entity demos.
# Adjacent string literals are concatenated by Python; the explicit trailing space
# after "million." keeps the sentences apart. (A backslash continuation inside a
# triple-quoted string removes the line break entirely and would fuse the text
# into "million.By", hiding the sentence boundary from the pipeline.)
doc = nlp(
    "Over the last quarter Apple sold nearly 20 thousand iPods for a profit of 6 million. "
    "By contrast, Sony only sold 8 thousand Walkman music players."
)

In [12]:
# Render the whole document with the "ent" (entity) style, inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

In [14]:
# Visualise entities one sentence at a time: each sentence's text is parsed
# again as its own document before being handed to displaCy.
for sentence in doc.sents:
    sentence_doc = nlp(sentence.text)
    displacy.render(sentence_doc, style="ent", jupyter=True)

## Sentence Segmentation

In [15]:
# Three plainly punctuated sentences to demonstrate the default segmentation.
doc = nlp(u"This is the first sentence. This is another sentence. This is the last sentence.")

In [16]:
# doc.sents is a generator rather than a list, so walk it with a loop
for sentence in doc.sents:
    print(sentence)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [19]:
# A quotation containing a semicolon; the output of the next cell shows the
# pipeline does not start a new sentence after the ";".
doc = nlp('"Management is doing the right things; leadership is doing the right things." - Peter Drucker')

In [20]:
# Print each detected sentence on its own line.
for sentence in doc.sents:
    print(sentence)

"Management is doing the right things; leadership is doing the right things."
- Peter Drucker


In [None]:
# A pipeline for custom segmentation

In [27]:
def set_custom_boundaries(doc):
    """Mark the token following every semicolon as the start of a sentence.

    Written to be used as a pipeline component: it receives the document,
    sets ``is_sent_start`` on the relevant tokens in place, and returns the
    same object. The final token is never inspected because nothing follows it.
    """
    candidates = doc[:-1]  # every token except the last one
    for tok in candidates:
        if tok.text != ";":
            continue
        follower = doc[tok.i + 1]
        follower.is_sent_start = True
    return doc

In [29]:
# Register the semicolon rule ahead of the parser so the parser sees the
# pre-set sentence starts. The membership check keeps this cell re-runnable:
# adding a component whose name is already in the pipeline raises an error.
if "set_custom_boundaries" not in nlp.pipe_names:
    nlp.add_pipe(set_custom_boundaries, before="parser")
nlp.pipe_names

['tagger', 'set_custom_boundaries', 'parser', 'ner']

In [31]:
# Parse the same quotation again, now that `nlp` contains set_custom_boundaries.
doc_2 = nlp('"Management is doing the right things; leadership is doing the right things." - Peter Drucker')

In [32]:
# Sentences produced with the custom boundary rule applied
for sentence in doc_2.sents:
    print(sentence)

"Management is doing the right things;
leadership is doing the right things."
- Peter Drucker


## Custom Segmenter

In [49]:
# Load the model again so `nlp` is a fresh pipeline without the
# set_custom_boundaries component added in the previous section.
nlp = spacy.load("en_core_web_sm")

In [50]:
# Sample text that mixes sentence punctuation with a blank line ("\n\n")
# and a line break in the middle of the last sentence.
mystring = u"This is a sentence. This is another.\n\nThis is a \nthird sentence."

In [51]:
# Parse the sample with the freshly loaded pipeline.
doc = nlp(mystring)

In [52]:
# Print each detected sentence on its own line.
for sentence in doc.sents:
    print(sentence)

This is a sentence.
This is another.


This is a 
third sentence.


In [53]:
from spacy.pipeline import SentenceSegmenter

In [77]:
def split_on_newlines(doc):
    """Yield consecutive slices of ``doc``, starting a new slice after newlines.

    Tokens are accumulated until a token whose text starts with a newline has
    been seen; the slice is then closed at the next non-whitespace token, so
    the newline token(s) stay at the end of the slice they terminate.

    Args:
        doc: An indexable, sliceable sequence of tokens exposing ``.text`` and
            ``.i`` (the token's position in ``doc``).

    Yields:
        Slices ``doc[start:end]`` that together cover every token exactly once.
        Nothing is yielded for an empty ``doc``.
    """
    start = 0
    seen_newline = False
    for word in doc:
        # Only close the current slice on a non-whitespace token. Otherwise a
        # run of several newline/whitespace tokens would be split apart,
        # producing slices that are whitespace-only or begin with whitespace.
        if seen_newline and not word.text.isspace():
            yield doc[start:word.i]
            start = word.i
            seen_newline = False
        elif word.text.startswith("\n"):
            seen_newline = True
    # Emit whatever remains; skip the empty tail (e.g. when doc has no tokens).
    if start < len(doc):
        yield doc[start:]

In [55]:
# Build a SentenceSegmenter that uses split_on_newlines as its strategy
# and the pipeline's own vocab.
sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)

In [56]:
# Add the newline-based segmenter to the pipeline (no position argument given).
nlp.add_pipe(sbd)

In [57]:
# Parse the sample again, now that the segmenter has been added to `nlp`.
doc = nlp(mystring)

In [78]:
# Print each detected sentence on its own line.
for sentence in doc.sents:
    print(sentence)

This is a sentence. This is another.


This is a 

third sentence.
