20
20
import re
21
21
import sys
22
22
23
+ try :
24
+ import Levenshtein
25
+ levenshtein_supported = True
26
+ except ImportError :
27
+ levenshtein_supported = False
28
+
23
29
class NotesIndex :
24
30
25
31
def __init__ (self ):
@@ -33,7 +39,7 @@ def __init__(self):
33
39
self .list_keywords (self .keyword_filter )
34
40
else :
35
41
matches = self .search_index (keywords )
36
- print '\n ' .join (sorted (matches ))
42
+ print self . encode ( '\n ' .join (sorted (matches ) ))
37
43
38
44
def parse_args (self ):
39
45
''' Parse the command line arguments. '''
@@ -64,6 +70,8 @@ def parse_args(self):
64
70
sys .exit (0 )
65
71
else :
66
72
assert False , "Unhandled option"
73
+ if self .keyword_filter is not None :
74
+ self .keyword_filter = self .decode (self .keyword_filter )
67
75
# Canonicalize pathnames, check validity.
68
76
self .database_file = self .munge_path (self .database_file )
69
77
self .user_directory = self .munge_path (self .user_directory )
@@ -147,27 +155,37 @@ def search_index(self, keywords):
147
155
matches &= set (filenames )
148
156
return list (matches ) if matches else []
149
157
150
- def list_keywords (self , substring , limit = 100 ):
158
+ def list_keywords (self , substring , limit = 25 ):
151
159
''' Print all (matching) keywords to standard output. '''
152
- i = 0
153
- for kw in self .index ['keywords' ]:
160
+ decorated = []
161
+ for kw , filenames in self .index ['keywords' ]. iteritems () :
154
162
if substring in kw .lower ():
155
- print kw
156
- if i < limit :
157
- i += 1
163
+ if levenshtein_supported :
164
+ decorated .append ((Levenshtein .distance (kw .lower (), substring ), - len (filenames ), kw ))
158
165
else :
159
- break
166
+ decorated .append ((- len (filenames ), kw ))
167
+ decorated .sort ()
168
+ selection = [d [- 1 ] for d in decorated [:limit ]]
169
+ print self .encode ('\n ' .join (selection ))
160
170
161
171
def tokenize(self, text):
    ''' Tokenize a string into a set of normalized, unique keywords.

    The text is decoded with self.decode() and lower-cased; the result is
    the set of distinct runs of word characters found in it. '''
    text = self.decode(text).lower()
    # A \w+ match is never empty and never contains whitespace, so the
    # matches can be collected directly without stripping or filtering.
    return set(re.findall(r'\w+', text, re.UNICODE))
170
180
181
def encode(self, text):
    ''' Return text encoded in self.character_encoding.

    Characters that cannot be represented in that encoding are dropped
    (the 'ignore' error handler). '''
    charset = self.character_encoding
    return text.encode(charset, 'ignore')
184
+
185
def decode(self, text):
    ''' Return text decoded from self.character_encoding.

    Byte sequences that are invalid in that encoding are dropped (the
    'ignore' error handler). '''
    charset = self.character_encoding
    return text.decode(charset, 'ignore')
188
+
171
189
def munge_path(self, path):
    ''' Return the absolute form of a user-defined path, expanding "~". '''
    expanded = os.path.expanduser(path)
    return os.path.abspath(expanded)
0 commit comments