In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
doc = nlp(u'This is the first sentence. this is second sentence. this is third sentence.')

In [4]:
# default sentence segmentation.
for sents in doc.sents:
    print(sents)

This is the first sentence.
this is second sentence.
this is third sentence.


In [5]:
doc2 = nlp(u'This is the first sentence.'
           u' this is second sentence.'
          u' this is third sentence.')

In [6]:
# default sentence segmentation.
for sents in doc2.sents:
    print(sents)

This is the first sentence.
this is second sentence.
this is third sentence.


In [8]:
# individual tokens can be fetched from the doc
doc[9]

sentence

In [7]:
# but individual sentences cannot be fetched from the doc
# wrong approach for getting a single sentence
doc.sents[0]

TypeError: 'generator' object is not subscriptable

In [11]:
# correct approach for grabbing sentences.
list(doc.sents)[1]

this is second sentence.

In [12]:
list(doc2.sents)[2]

this is third sentence.

In [17]:
doc3 = nlp(u'"This is first half; this is second half." -HaHaHa')

In [18]:
# default sentence segmentation won't work with other punct..
for sents in doc3.sents:
    print(sents)

"This is first half; this is second half."
-HaHaHa


In [19]:
# normal nlp pipeline
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [24]:
from spacy.language import Language

In [30]:
# Adding new segmentation rule
@Language.component("add_semicolon_rule")
def add_rules(doc):
    for token in doc[:-1]:
        if token.text == ';':
            doc[token.i+1].is_sent_start = True
    return doc

In [31]:
nlp.add_pipe("add_semicolon_rule", before='parser')

ValueError: [E007] 'add_semicolon_rule' already exists in pipeline. Existing names: ['tok2vec', 'tagger', 'add_semicolon_rule', 'parser', 'senter', 'attribute_ruler', 'lemmatizer', 'ner']

In [32]:
# modified nlp pipeline with custom rule
nlp.pipe_names

['tok2vec',
 'tagger',
 'add_semicolon_rule',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner']

In [37]:
# modified sentence segmentation with ';' punct.
doc4 = nlp(u'This is the first sentence. this is second sentence; this is third sentence.')
for sents in doc4.sents:
    print(sents)

This is the first sentence.
this is second sentence;
this is third sentence.
