public
Description: RESTful Blog for Google App Engine
Homepage:
Clone URL: git://github.com/DocSavage/bloog.git
bloog / models / search.py
bb33d4b0 » DocSavage 2008-08-12 Added ability to remove pro... 1 #!/usr/bin/env python
2 #
3 # Copyright 2007 Google Inc.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 #
17
18 """Full text indexing and search, implemented in pure python.
19
20 Note: This code is slightly altered from google.appengine.ext.search.
21 The original code has timeout/quota problems when running in the
bd5235f6 » DocSavage 2008-08-18 Added code highlighting tha... 22 cloud.
23
24 Changes by Bill Katz on original:
25 - Brought Query out from SearchableModel and renamed it FullTextQuery
26 due to issues with scoping (?) and online shell app.
27 - Added unsearchable_properties class variable that lets you remove
28 string-based properties from indexing.
29 - Planned (still a TODO in _FullTextIndex): don't index code inside pre with name 'code'.
bb33d4b0 » DocSavage 2008-08-12 Added ability to remove pro... 30
31 Defines a SearchableModel subclass of db.Model that supports full text
32 indexing and search, based on the datastore's existing indexes.
33
34 Don't expect too much. First, there's no ranking, which is a killer drawback.
35 There's also no exact phrase match, substring match, boolean operators,
36 stemming, or other common full text search features. Finally, support for stop
37 words (common words that are not indexed) is currently limited to English.
38
39 To be indexed, entities must be created and saved as SearchableModel
40 instances, e.g.:
41
42 class Article(search.SearchableModel):
43 text = db.TextProperty()
44 ...
45
46 article = Article(text=...)
47 article.save()
48
49 To search the full text index, use the SearchableModel.all() method to get an
50 instance of FullTextQuery, which subclasses db.Query. Use its search()
51 method to provide a search query, in addition to any other filters or sort
52 orders, e.g.:
53
54 query = article.all().search('a search query').filter(...).order(...)
55 for result in query:
56 ...
57
58 The full text index is stored in a property named __searchable_text_index.
59
60
61 In general, if you just want to provide full text search, you *don't* need to
62 add any extra indexes to your index.yaml. However, if you want to use search()
63 in a query *in addition to* an ancestor, filter, or sort order, you'll need to
64 create an index in index.yaml with the __searchable_text_index property. For
65 example:
66
67 - kind: Article
68 properties:
69 - name: __searchable_text_index
70 - name: date
71 direction: desc
72 ...
73
74 Note that using SearchableModel will noticeably increase the latency of save()
75 operations, since it writes an index row for each indexable word. This also
76 means that the latency of save() will increase roughly with the size of the
77 properties in a given entity. Caveat hacker!
78 """
79
80 import logging
81 import re
82 import string
83 import sys
84
85 from google.appengine.api import datastore
86 from google.appengine.api import datastore_errors
87 from google.appengine.api import datastore_types
88 from google.appengine.ext import db
89 from google.appengine.datastore import datastore_pb
90
class SearchableEntity(datastore.Entity):
    """A subclass of datastore.Entity that supports full text indexing.

    Automatically indexes all string and Text properties, using the
    datastore's built-in per-property indices. To search, use the
    SearchableQuery class and its Search() method.

    Properties whose names are listed in the unsearchable_properties
    attribute are left out of the index.
    """
    # Note that AppEngine servers will cache all imported modules including
    # the interior of a class definition. So the following _FULL_TEXT_*
    # properties will be executed once and cached.

    # Name of the property under which the keyword list is stored.
    _FULL_TEXT_INDEX_PROPERTY = '__searchable_text_index'

    # Words shorter than this many characters are not indexed.
    _FULL_TEXT_MIN_LENGTH = 4

    _FULL_TEXT_STOP_WORDS = frozenset([
        'a', 'about', 'according', 'accordingly', 'affected', 'affecting', 'after',
        'again', 'against', 'all', 'almost', 'already', 'also', 'although',
        'always', 'am', 'among', 'an', 'and', 'any', 'anyone', 'apparently', 'are',
        'arise', 'as', 'aside', 'at', 'away', 'be', 'became', 'because', 'become',
        'becomes', 'been', 'before', 'being', 'between', 'both', 'briefly', 'but',
        'by', 'came', 'can', 'cannot', 'certain', 'certainly', 'could', 'did', 'do',
        'does', 'done', 'during', 'each', 'either', 'else', 'etc', 'ever', 'every',
        'following', 'for', 'found', 'from', 'further', 'gave', 'gets', 'give',
        'given', 'giving', 'gone', 'got', 'had', 'hardly', 'has', 'have', 'having',
        'here', 'how', 'however', 'i', 'if', 'in', 'into', 'is', 'it', 'itself',
        'just', 'keep', 'kept', 'knowledge', 'largely', 'like', 'made', 'mainly',
        'make', 'many', 'might', 'more', 'most', 'mostly', 'much', 'must', 'nearly',
        'necessarily', 'neither', 'next', 'no', 'none', 'nor', 'normally', 'not',
        'noted', 'now', 'obtain', 'obtained', 'of', 'often', 'on', 'only', 'or',
        'other', 'our', 'out', 'owing', 'particularly', 'past', 'perhaps', 'please',
        'poorly', 'possible', 'possibly', 'potentially', 'predominantly', 'present',
        'previously', 'primarily', 'probably', 'prompt', 'promptly', 'put',
        'quickly', 'quite', 'rather', 'readily', 'really', 'recently', 'regarding',
        'regardless', 'relatively', 'respectively', 'resulted', 'resulting',
        'results', 'said', 'same', 'seem', 'seen', 'several', 'shall', 'should',
        'show', 'showed', 'shown', 'shows', 'significantly', 'similar', 'similarly',
        'since', 'slightly', 'so', 'some', 'sometime', 'somewhat', 'soon',
        'specifically', 'state', 'states', 'strongly', 'substantially',
        'successfully', 'such', 'sufficiently', 'than', 'that', 'the', 'their',
        'theirs', 'them', 'then', 'there', 'therefore', 'these', 'they', 'this',
        'those', 'though', 'through', 'throughout', 'to', 'too', 'toward', 'under',
        'unless', 'until', 'up', 'upon', 'use', 'used', 'usefully', 'usefulness',
        'using', 'usually', 'various', 'very', 'was', 'we', 'were', 'what', 'when',
        'where', 'whether', 'which', 'while', 'who', 'whose', 'why', 'widely',
        'will', 'with', 'within', 'without', 'would', 'yet', 'you'])

    _PUNCTUATION_REGEX = re.compile('[' + re.escape(string.punctuation) + ']')

    # Names of properties to leave out of the index. This class-level default
    # keeps _ToPb() working for entities that never had the attribute
    # assigned (e.g. ones built directly or through the copy constructor).
    unsearchable_properties = ()

    def __init__(self, kind_or_entity, *args, **kwargs):
        """Constructor. May be called as a copy constructor.

        If kind_or_entity is a datastore.Entity, copies it into this Entity.
        datastore.Get() and Query() returns instances of datastore.Entity, so
        this is useful for converting them back to SearchableEntity so that
        they'll be indexed when they're stored back in the datastore.

        Otherwise, passes through the positional and keyword args to the
        datastore.Entity constructor.

        Args:
          kind_or_entity: string or datastore.Entity
        """
        if isinstance(kind_or_entity, datastore.Entity):
            self._Entity__key = kind_or_entity._Entity__key
            self.update(kind_or_entity)
            if isinstance(kind_or_entity, SearchableEntity):
                # Keep the source entity's exclusion list so the copy is
                # indexed the same way as the original.
                self.unsearchable_properties = (
                    kind_or_entity.unsearchable_properties)
        else:
            super(SearchableEntity, self).__init__(
                kind_or_entity, *args, **kwargs)

    def _ToPb(self):
        """Rebuilds the full text index, then delegates to the superclass.

        Any existing index property is dropped first, then the keywords of
        every string-valued property not named in unsearchable_properties are
        collected. The index property is only written when at least one
        keyword was found.

        Returns:
          entity_pb.Entity
        """
        index_name = SearchableEntity._FULL_TEXT_INDEX_PROPERTY
        if index_name in self:
            del self[index_name]

        excluded = self.unsearchable_properties or ()
        index = set()
        for name, values in self.items():
            if name in excluded:
                continue
            if not isinstance(values, list):
                values = [values]
            if not values:
                # Nothing to inspect; also avoids IndexError on values[0].
                continue
            # Only the first element decides whether the property is indexed.
            # Blob values are skipped explicitly.
            first = values[0]
            if (isinstance(first, basestring) and
                    not isinstance(first, datastore_types.Blob)):
                for value in values:
                    index.update(SearchableEntity._FullTextIndex(value))

        if index:
            self[index_name] = list(index)

        return super(SearchableEntity, self)._ToPb()

    @classmethod
    def _FullTextIndex(cls, text):
        """Returns a set of keywords appropriate for full text indexing.

        Punctuation is replaced by spaces, the text is lowercased and split
        on whitespace, and stop words and words shorter than
        _FULL_TEXT_MIN_LENGTH are discarded.

        See SearchableQuery.Search() for details.

        Args:
          text: string

        Returns:
          set of strings (empty when text is empty or None)
        """
        if not text:
            return set()

        datastore_types.ValidateString(text, 'text', max_len=sys.maxint)
        # TODO -- Remove embedded code blocks marked by 'pre' tags
        # and name="code"
        text = cls._PUNCTUATION_REGEX.sub(' ', text)
        words = set(text.lower().split())
        words -= cls._FULL_TEXT_STOP_WORDS

        min_length = cls._FULL_TEXT_MIN_LENGTH
        return set(word for word in words if len(word) >= min_length)
216
217
class SearchableQuery(datastore.Query):
    """A datastore.Query subclass that adds full text search.

    Only entities that were created and stored through SearchableEntity or
    SearchableModel can be found, since only those carry the index property.
    """

    def Search(self, search_query):
        """Attaches a search query; it may be combined with other filters.

        Keywords that are stop words or too short (i.e. that would not have
        been indexed) are silently dropped from the search.

        Args:
          search_query: string

        Returns:
          # this query
          SearchableQuery
        """
        datastore_types.ValidateString(search_query, 'search query')
        self._search_query = search_query
        return self

    def _ToPb(self, limit=None, offset=None):
        """Builds the superclass query pb and appends the keyword filters.

        One equality filter on the index property is added per keyword
        extracted from the search query.

        Raises BadFilterError if the index property name is already a key of
        this query.

        Args:
          # an upper bound on the number of results returned by the query.
          limit: int
          # number of results that match the query to skip. limit is applied
          # after the offset is fulfilled.
          offset: int

        Returns:
          datastore_pb.Query
        """
        index_name = SearchableEntity._FULL_TEXT_INDEX_PROPERTY
        if index_name in self:
            raise datastore_errors.BadFilterError(
                '%s is a reserved name.' % index_name)

        query_pb = super(SearchableQuery, self)._ToPb(
            limit=limit, offset=offset)

        # Search() was never called: nothing to add.
        if not hasattr(self, '_search_query'):
            return query_pb

        for keyword in SearchableEntity._FullTextIndex(self._search_query):
            keyword_filter = query_pb.add_filter()
            keyword_filter.set_op(datastore_pb.Query_Filter.EQUAL)
            filter_property = keyword_filter.add_property()
            filter_property.set_name(index_name)
            filter_property.mutable_value().set_stringvalue(keyword)

        return query_pb
273
class FullTextQuery(db.Query):
    """A db.Query subclass that adds full text search."""

    # Search string set by search(); None until then.
    _search_query = None

    def search(self, search_query):
        """Records a full text search to run with this query.

        Args:
          search_query, a string containing the full text search query.

        Returns:
          self
        """
        self._search_query = search_query
        return self

    def _get_query(self):
        """Wraps db.Query._get_query() and injects SearchableQuery.

        The recorded search string, if non-empty, is passed on to the
        SearchableQuery via its Search() method.
        """
        datastore_query = db.Query._get_query(
            self, _query_class=SearchableQuery)
        if not self._search_query:
            return datastore_query
        datastore_query.Search(self._search_query)
        return datastore_query
296
22e92f4d » DocSavage 2008-08-20 Consolidated db.Model exten... 297 import models
298
class SearchableModel(models.SerializableModel):
    """A model class that supports full text search and indexing.

    All string-based properties are indexed automatically. To search, call
    all() to obtain a FullTextQuery and then use its search() method.

    The class variable unsearchable_properties lists property names that
    should be left out of the index. Only properties with string base types
    are indexed in any case.
    """
    unsearchable_properties = []

    def _populate_internal_entity(self):
        """Wraps db.Model._populate_internal_entity() and injects
        SearchableEntity.

        The class-level unsearchable_properties list is attached to the
        returned entity.
        """
        searchable_entity = db.Model._populate_internal_entity(
            self, _entity_class=SearchableEntity)
        searchable_entity.unsearchable_properties = (
            self.__class__.unsearchable_properties)
        return searchable_entity

    @classmethod
    def from_entity(cls, entity):
        """Wraps db.Model.from_entity() and injects SearchableEntity.

        Entities that are not already SearchableEntity instances are copied
        into one before being handed to the superclass.
        """
        if isinstance(entity, SearchableEntity):
            searchable_entity = entity
        else:
            searchable_entity = SearchableEntity(entity)
        return super(SearchableModel, cls).from_entity(searchable_entity)

    @classmethod
    def all(cls):
        """Returns a FullTextQuery for this kind."""
        return FullTextQuery(cls)