Skip to content

Commit

Permalink
Merge pull request #905 from varunkatiyar819/varunkatiyar819-patch-1
Browse files Browse the repository at this point in the history
Updated crfcut.py
  • Loading branch information
wannaphong committed Apr 3, 2024
2 parents fa0a2ca + d1b64a7 commit 71ef3e3
Showing 1 changed file with 10 additions and 1 deletion.
11 changes: 10 additions & 1 deletion pythainlp/tokenize/crfcut.py
Expand Up @@ -199,11 +199,20 @@ def segment(text: str) -> List[str]:
labs = _tagger.tag(feat)
labs[-1] = "E" # make sure it cuts the last sentence

# Ensure sentences are split at terminal punctuation ("!", ".", "?")
for idx, _ in enumerate(toks):
if toks[idx].strip().endswith(("!", ".", "?")):
labs[idx] = "E"
# A whitespace-only or empty token at the start of a sentence (index 0 or right after an "E") is relabelled "I" so it is not treated as the end of a sentence.
elif (idx == 0 or labs[idx-1] == "E") and toks[idx].strip() == "":
labs[idx] = "I"

sentences = []
sentence = ""
for i, w in enumerate(toks):
sentence = sentence + w
if labs[i] == "E":
# Empty strings should not be part of output.
if labs[i] == "E" and sentence != "":
sentences.append(sentence)
sentence = ""

Expand Down

0 comments on commit 71ef3e3

Please sign in to comment.