# test_fuzzy_segmenting.py
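"""Tests for the fuzzy segmentation helpers in dariah_topics.preprocessing:
paragraph splitting (split_paragraphs), tolerance-based segmentation
(segment_fuzzy), and the segment() convenience wrapper."""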
from dariah_topics.preprocessing import segment_fuzzy, split_paragraphs, \
    segment, tokenize
from functools import partial
from nose.tools import eq_
from itertools import chain
from pathlib import Path
import re

project_path = Path(__file__).absolute().parent.parent
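
# Demo texts: _DEMO_DPAR separates paragraphs by blank lines (double
# newlines), _DEMO_SPAR by single newlines.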
_DEMO_DPAR = """
"Wedlock suits you," he remarked. "I think Watson, that you have put on
seven and a half pounds since I saw you."

"Seven," I answered.

"Indeed, I should have thought a little more. Just a trifle more, I
fancy, Watson. And in practice again, I observe. You did not tell me
that you intended to go into harness."

"Then how do you know?"
"""

_DEMO_SPAR = '''"Wedlock suits you," he remarked. "I think Watson, that you have put on seven and a half pounds since I saw you."
"Seven," I answered.
"Indeed, I should have thought a little more. Just a trifle more, I fancy, Watson. And in practice again, I observe. You did not tell me that you intended to go into harness."
"Then how do you know?"'''


def test_split_paragraphs_spar():
    chunked = split_paragraphs(_DEMO_SPAR)
    eq_(len(chunked), 4, msg='not 4 chunks: ' + str(chunked))


def test_split_paragraphs_dpar():
    chunked = split_paragraphs(_DEMO_DPAR, sep=r'\n\n')
    eq_(len(chunked), 4, msg='not 4 chunks: ' + str(chunked))


def test_split_paragraphs_dpar_re():
    chunked = split_paragraphs(_DEMO_DPAR, sep=re.compile(r'\n\n'))
    eq_(len(chunked), 4, msg='not 4 chunks: ' + str(chunked))


def test_plain_segments():
    """segment_size chunks, zero tolerance"""
    document = ("01234 " * 4).split()
    segments = list(segment_fuzzy(document, segment_size=5))
    eq_(segments, [[['0', '1', '2', '3', '4']],
                   [['0', '1', '2', '3', '4']],
                   [['0', '1', '2', '3', '4']],
                   [['0', '1', '2', '3', '4']]])


def test_shorter_segments():
    """shorter segments than chunks, no tolerance"""
    document = ("01234 " * 4).split()
    segments = list(segment_fuzzy(document, segment_size=4))
    eq_(segments, [[['0', '1', '2', '3']],
                   [['4'], ['0', '1', '2']],
                   [['3', '4'], ['0', '1']],
                   [['2', '3', '4'], ['0']],
                   [['1', '2', '3', '4']]])
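
# The tolerance tests below exercise segment_fuzzy's boundary adjustment:
# a segment boundary may be shifted by up to `tolerance` tokens so that it
# coincides with a chunk boundary.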


def test_tolerance():
    """chunk size within tolerance"""
    document = ("01234 " * 4).split()
    segments = list(segment_fuzzy(document, segment_size=4, tolerance=1))
    eq_(segments, [[['0', '1', '2', '3', '4']],
                   [['0', '1', '2', '3', '4']],
                   [['0', '1', '2', '3', '4']],
                   [['0', '1', '2', '3', '4']]])


def test_tolerance_2():
    """segment size ~ 2*chunk_size"""
    document = ("01234 " * 4).split()
    segments = list(segment_fuzzy(document, segment_size=8, tolerance=2))
    eq_(segments, [[['0', '1', '2', '3', '4'],
                    ['0', '1', '2', '3', '4']],
                   [['0', '1', '2', '3', '4'],
                    ['0', '1', '2', '3', '4']]])


def test_overlong_chunk():
    """single chunk is longer than two segments"""
    document = "012345678901234 01234".split()
    segments = list(segment_fuzzy(document, segment_size=4, tolerance=1))
    flattened = [list(chain.from_iterable(seg)) for seg in segments]
    lengths = list(map(len, flattened))
    # the final segment may legitimately be shorter, hence lengths[:-1]
    assert min(lengths[:-1]) >= 3, "a segment is too short in " + str(segments)
    assert max(lengths) <= 5, "a segment is too long in " + str(segments)


def test_segment():
    """segment convenience wrapper"""
    path = project_path.joinpath('corpus_txt', 'Doyle_AStudyinScarlet.txt')
    text = path.read_text(encoding='utf-8')
    segments = segment(text, segment_size=1000, tolerance=0.05,
                       chunker=partial(split_paragraphs,
                                       sep=re.compile(r'\n\n')),
                       tokenizer=tokenize,
                       flatten_chunks=True)
    lengths = list(map(len, segments))
    # relative tolerance of 0.05 allows segments of 1000 +/- 50 tokens;
    # the final segment may legitimately be shorter
    assert min(lengths[:-1]) >= 950, "a segment is too short in " + str(segments)
    assert max(lengths) <= 1050, "a segment is too long in " + str(segments)
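

# A minimal sketch for running this module's tests directly, assuming the
# nose package (which provides the eq_ helper used above) is installed:
if __name__ == '__main__':
    import nose
    nose.runmodule()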