This repository has been archived by the owner on Dec 16, 2022. It is now read-only.
/
sentence_splitter.py
68 lines (52 loc) · 2.58 KB
/
sentence_splitter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from typing import List
from overrides import overrides
import spacy
from allennlp.common import Registrable
from allennlp.common.util import get_spacy_model
class SentenceSplitter(Registrable):
    """
    Base class for components that break a piece of text up into sentences.
    """

    default_implementation = "spacy"

    def split_sentences(self, text: str) -> List[str]:
        """
        Break the paragraph `text` into sentences and return them as a list of :class:`str`.

        This base implementation always raises `NotImplementedError`.
        """
        raise NotImplementedError

    def batch_split_sentences(self, texts: List[str]) -> List[List[str]]:
        """
        Split every entry of `texts`, returning one list of sentences per input text.

        The default behaviour simply calls `split_sentences` on each text in order.
        """
        sentences_per_text: List[List[str]] = []
        for text in texts:
            sentences_per_text.append(self.split_sentences(text))
        return sentences_per_text
@SentenceSplitter.register("spacy")
class SpacySentenceSplitter(SentenceSplitter):
    """
    A `SentenceSplitter` that uses spaCy's built-in sentence boundary detection.

    Spacy's default sentence splitter uses a dependency parse to detect sentence boundaries, so
    it is slow, but accurate.

    Another option is to use rule-based sentence boundary detection. It's fast and has a small memory footprint,
    since it uses punctuation to detect sentence boundaries. This can be activated with the `rule_based` flag.

    By default, `SpacySentenceSplitter` calls the default spacy boundary detector.

    Registered as a `SentenceSplitter` with name "spacy".

    # Parameters

    language : `str`, optional (default = `"en_core_web_sm"`)
        Name of the spaCy model handed to `get_spacy_model`.
    rule_based : `bool`, optional (default = `False`)
        If `True`, the dependency parser is not requested and a rule-based sentence boundary
        detection component is added to the pipeline instead.
    """

    def __init__(self, language: str = "en_core_web_sm", rule_based: bool = False) -> None:
        # we need spacy's dependency parser if we're not using rule-based sentence boundary detection.
        self.spacy = get_spacy_model(language, parse=not rule_based, ner=False, pos_tags=False)
        if rule_based:
            # Compare versions numerically: comparing the raw version strings is
            # lexicographic and would, e.g., consider "10.0" to be older than "2.1".
            version = self._spacy_version()
            # we use `sentencizer`, a built-in spacy module for rule-based sentence boundary detection.
            # depending on the spacy version, it could be called 'sentencizer' or 'sbd'
            sbd_name = "sbd" if version < (2, 1) else "sentencizer"
            if not self.spacy.has_pipe(sbd_name):
                if version >= (3, 0):
                    # spaCy 3 registers built-in components by their string name.
                    self.spacy.add_pipe(sbd_name)
                else:
                    # spaCy 2 needs the component object to be created first.
                    self.spacy.add_pipe(self.spacy.create_pipe(sbd_name))

    @staticmethod
    def _spacy_version() -> tuple:
        """
        Return the installed spaCy version as a `(major, minor)` tuple of ints.

        Only the leading digits of each component are used, so suffixes such as
        pre-release tags are ignored; a missing or non-numeric component counts as 0.
        """
        numbers = []
        for part in spacy.__version__.split(".")[:2]:
            # Keep just the run of digits at the start of this component.
            leading_digits = part[: len(part) - len(part.lstrip("0123456789"))]
            numbers.append(int(leading_digits) if leading_digits else 0)
        while len(numbers) < 2:
            numbers.append(0)
        return tuple(numbers)

    @overrides
    def split_sentences(self, text: str) -> List[str]:
        """
        Run the spaCy pipeline on `text` and return its sentences, stripped of
        surrounding whitespace.
        """
        # `Span.text` is used rather than `Span.string` (removed in spaCy 3);
        # after `strip()` both yield the same sentence text.
        return [sent.text.strip() for sent in self.spacy(text).sents]

    @overrides
    def batch_split_sentences(self, texts: List[str]) -> List[List[str]]:
        """
        This method lets you take advantage of spacy's batch processing.

        Returns one list of whitespace-stripped sentences per input text.
        """
        return [
            [sentence.text.strip() for sentence in doc.sents] for doc in self.spacy.pipe(texts)
        ]