forked from pielstroem/Topics
-
Notifications
You must be signed in to change notification settings - Fork 13
/
doclist.py
214 lines (172 loc) · 6.98 KB
/
doclist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
"""
Maintaining Lists of Documents
==============================
A __document list__ manages a list of documents. There are several
implementations of varying capability; all of them have the following in common:
* A document list keeps a fixed list of documents in a fixed order, i.e. after
  you have created the list you can call the iteration functions and get the
  same file at the same position (even if, e.g., the underlying directory
  changes). So these files can be matched with lists of document _contents_.
* A document list separates the _base directory_ from the way _file names_ are
  formed. Thus, you can easily create a mirror (of, e.g., files transformed
  in some way) in a different directory, or modify the way file names are formed.
Segments
--------
"""
from pathlib import Path
from itertools import zip_longest
from abc import abstractmethod, abstractproperty
class BaseDocList:
    """
    Common base for document lists.

    A document list couples a base directory with an ordered sequence of
    documents. Subclasses decide how a single document is represented and
    must provide `get_docs()` and `label()`.

    Used as a sequence or iterable, an instance behaves like a sequence of
    full paths given as `str`.

    Notes:
        `get_docs` and `label` are decorated with `@abstractmethod`, but the
        class does not use `ABCMeta`, so this is not enforced when the class
        is instantiated.
    """

    def __init__(self, basepath):
        """
        Args:
            basepath (Path or str): directory all documents are relative to.
        """
        self.basepath = Path(basepath)
        # Filled by flatten_segments(); None until segments were recorded.
        self._segment_counts = None

    def full_path(self, document, as_str=False):
        """
        Constructs a full path for the given document.

        Args:
            document: this is one document in the way the subclass chooses to
                represent documents.
            as_str (bool): if True, the result is a str, otherwise it is a
                `Path`.

        Returns:
            The base path joined with the document, as `Path` or `str`.

        Notes:
            The default implementation passes document on to `Path()`.
            Implementers will most probably want to override this.
        """
        path = Path(self.basepath, document)
        if as_str:
            path = str(path)
        return path

    @abstractmethod
    def get_docs(self):
        """
        Returns a sequence of documents, in the form the implementing class
        chooses.

        Note:
            Subclasses may implement a method `_getitem(self, index)` (the
            spelling `_get_item` is accepted as well), with index being an
            integer or a slice, to speed access up. `__getitem__` uses it
            instead of indexing the result of `get_docs()`.
        """
        pass

    def full_paths(self, as_str=False):
        """
        Returns a list of full paths, one per document. Calls full_path.
        """
        return [self.full_path(doc, as_str) for doc in self.get_docs()]

    @abstractmethod
    def label(self, document):
        """
        Returns a label suitable for the document.
        """
        pass

    def __iter__(self):
        """
        When used as an iterable, this object looks like an iterable of full
        paths (as str).
        """
        return iter(self.full_paths(as_str=True))

    def __len__(self):
        """
        The number of documents, i.e. the length of `get_docs()`.
        """
        return len(self.get_docs())

    def __getitem__(self, index):
        """
        When used as a sequence, this object looks like a sequence of full
        paths (as str).

        Args:
            index (int or slice): position(s) of the requested document(s).

        Returns:
            A str for an integer index, a list of str for a slice.
        """
        # Look the optional fast-path hook up explicitly instead of calling it
        # inside try/except AttributeError: that way an AttributeError raised
        # *inside* the hook is not mistaken for "hook not implemented".
        accessor = getattr(self, '_getitem', None)
        if accessor is None:
            accessor = getattr(self, '_get_item', None)

        if accessor is not None:
            selection = accessor(index)
        else:
            selection = self.get_docs()[index]

        if isinstance(index, slice):
            return [self.full_path(doc, as_str=True) for doc in selection]
        else:
            return self.full_path(selection, as_str=True)

    def labels(self):
        """
        Returns a list of (human-readable) labels for all documents.
        """
        return [self.label(doc) for doc in self.get_docs()]

    def flatten_segments(self, segmented_docs):
        """
        Records and flattens segment counts according to the stream of
        documents.

        Assume you have three documents

            A : I am an example document
            B : Me too
            C : All examples reference themselves

            docs = SimpleDocList('.', filenames=['A','B','C'])

        Now, you have an (external) segmenter function that segments each
        document into segments each being at most two tokens long. The data
        structure your segmenter will produce looks similar to the following:

            segmented_corpus = [
                [['I', 'am'], ['an', 'example'], ['document']],
                [['Me', 'too']],
                [['All', 'examples'], ['reference', 'themselves']]]

        Now, if you run docs.flatten_segments(segmented_corpus), it will do
        two things: it will record how many segments each document has
        (A: 3, B: 1, C: 2), and it will return a structure flattened by one
        level as in the following:

            [['I', 'am'], ['an', 'example'], ['document'], ['Me', 'too'],
             ['All', 'examples'], ['reference', 'themselves']]

        I.e. the result will look like a corpus of six shorter documents. This
        matches with the iteration you get when you call docs.segments().

        Args:
            segmented_docs: Iterable of documents, each document being an
                iterable of segments.

        Returns:
            Iterable of segments.

        Notes:
            This is a generator: instead of a list you receive a lazy
            iterable. The segment counts are recorded while you iterate, so
            they are only complete once the result has been consumed.
        """
        segment_counts = []
        self._segment_counts = segment_counts
        for doc in segmented_docs:
            segment_counts.append(0)
            for segment in doc:
                segment_counts[-1] += 1
                yield segment

    def segment_counts(self):
        """
        Returns the number of segments for each document as recorded by
        `flatten_segments`, or None if nothing has been recorded yet.
        """
        return self._segment_counts

    def segments(self):
        """
        Yields a tuple (document, segment_no) for each segment, with document
        being the internal representation of each document and segment_no an
        integer starting at 0.

        For documents without a recorded segment count (e.g. because
        `flatten_segments` has not been run), a single tuple
        (document, None) is yielded.
        """
        counts = self.segment_counts()
        if counts is None:
            # Nothing recorded yet: zip_longest cannot iterate over None, so
            # substitute an empty sequence and let every document pair with
            # the None fill value.
            counts = ()
        for document, segment_count in zip_longest(self.get_docs(), counts):
            if segment_count is None:
                yield (document, None)
            else:
                for segment_no in range(segment_count):
                    yield (document, segment_no)
class PathDocList(BaseDocList):
    """
    Document list based on a list of Paths.

    Each document is represented by a `Path` relative to the base path.
    """

    def __init__(self, basepath, glob_pattern='*', filenames=None):
        """
        Creates a new document list either from the given file names
        or by looking for files matching the glob_pattern in the basepath.

        Args:
            basepath (Path or str): Root directory where your corpus resides
            glob_pattern (str): A file glob pattern matching the files to
                include. If `None`, no filtering takes place.
            filenames (list): An iterable of paths or file names relative to
                basepath. If `None`, look for files on the file system.
        """
        # Let the base class set basepath and the segment bookkeeping. Do not
        # assign an instance attribute called `segments` here: it would shadow
        # the inherited segments() method.
        super().__init__(basepath)

        if filenames is None:
            # Path.glob() needs an actual pattern; treat None as "everything".
            pattern = '*' if glob_pattern is None else glob_pattern
            self._files = [p.relative_to(self.basepath)
                           for p in self.basepath.glob(pattern)]
        else:
            paths = (Path(name) for name in filenames)
            if glob_pattern is not None:
                # The pathlib method is `match`, there is no `matches`.
                paths = (path for path in paths if path.match(glob_pattern))
            self._files = list(paths)

    def get_docs(self):
        """
        Returns the list of documents as `Path` objects in the order
        determined at construction time.
        """
        return self._files

    def label(self, document):
        """
        Returns the document's file name without its last suffix
        (`Path.stem`).
        """
        return document.stem