@@ -141,30 +141,37 @@ def _split(self, data):
 
         return data[:len1], binary, data[idx:]
 
-    _whitespace = re.compile(br'[\0\t\r\014\n ]+')
-    _token = re.compile(br'/{0,2}[^]\0\t\r\v\n ()<>{}/%[]+')
-    _comment = re.compile(br'%[^\r\n\v]*')
-    _instring = re.compile(br'[()\\]')
+    _whitespace_re = re.compile(br'[\0\t\r\014\n ]+')
+    _token_re = re.compile(br'/{0,2}[^]\0\t\r\v\n ()<>{}/%[]+')
+    _comment_re = re.compile(br'%[^\r\n\v]*')
+    _instring_re = re.compile(br'[()\\]')
+
+    # token types
+    _whitespace = object()
+    _name = object()
+    _string = object()
+    _delimiter = object()
+    _number = object()
 
     @classmethod
     def _tokens(cls, text):
         """
         A PostScript tokenizer. Yield (token, value) pairs such as
-        ('whitespace', ' ') or ('name', '/Foobar').
+        (cls._whitespace, ' ') or (cls._name, '/Foobar').
         """
         pos = 0
         while pos < len(text):
-            match = (cls._comment.match(text[pos:]) or
-                     cls._whitespace.match(text[pos:]))
+            match = (cls._comment_re.match(text[pos:]) or
+                     cls._whitespace_re.match(text[pos:]))
             if match:
-                yield ('whitespace', match.group())
+                yield (cls._whitespace, match.group())
                 pos += match.end()
             elif text[pos] == '(':
                 start = pos
                 pos += 1
                 depth = 1
                 while depth:
-                    match = cls._instring.search(text[pos:])
+                    match = cls._instring_re.search(text[pos:])
                     if match is None:
                         return
                     pos += match.end()
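The token types introduced in this hunk use the object() sentinel idiom: each value is unique and equal only to itself. A minimal standalone sketch of the idiom, with illustrative names that are not from this diff:

    # Each object() is a distinct sentinel; comparison is by identity.
    class TokenKind:
        whitespace = object()
        name = object()

    tok = TokenKind.name
    print(tok is TokenKind.name)        # True
    print(tok is TokenKind.whitespace)  # False
    # A typo such as TokenKind.nmae fails loudly with AttributeError,
    # whereas a misspelled string tag like 'nmae' would silently never match.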
@@ -174,25 +181,25 @@ def _tokens(cls, text):
                         depth -= 1
                     else:  # a backslash - skip the next character
                         pos += 1
-                yield ('string', text[start:pos])
+                yield (cls._string, text[start:pos])
             elif text[pos:pos + 2] in ('<<', '>>'):
-                yield ('delimiter', text[pos:pos + 2])
+                yield (cls._delimiter, text[pos:pos + 2])
                 pos += 2
             elif text[pos] == '<':
                 start = pos
                 pos += text[pos:].index('>')
-                yield ('string', text[start:pos])
+                yield (cls._string, text[start:pos])
             else:
-                match = cls._token.match(text[pos:])
+                match = cls._token_re.match(text[pos:])
                 if match:
                     try:
                         float(match.group())
-                        yield ('number', match.group())
+                        yield (cls._number, match.group())
                     except ValueError:
-                        yield ('name', match.group())
+                        yield (cls._name, match.group())
                     pos += match.end()
                 else:
-                    yield ('delimiter', text[pos])
+                    yield (cls._delimiter, text[pos:pos + 1])
                     pos += 1
 
     def _parse(self):
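For context, a sketch of how a caller might drive the retagged tokenizer. The class name Type1Font and its import path are assumptions based on the surrounding methods (_split, _parse), and the sample input is made up:

    # Hypothetical usage: skip whitespace tokens by identity, now that the
    # old string tags ('whitespace', 'name', ...) are sentinel objects.
    from matplotlib.type1font import Type1Font  # assumed location of this class

    text = b'/FontName /Foobar def  % trailing comment'
    for token, value in Type1Font._tokens(text):
        if token is Type1Font._whitespace:
            continue
        print(value)

The old code filtered with token == 'whitespace'; identity checks against the sentinels cannot collide with ordinary string or bytes values.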
@@ -205,26 +212,30 @@ def _parse(self):
         prop = {'weight': 'Regular', 'ItalicAngle': 0.0, 'isFixedPitch': False,
                 'UnderlinePosition': -100, 'UnderlineThickness': 50}
         tokenizer = self._tokens(self.parts[0])
-        filtered = filter(lambda x: x[0] != 'whitespace', tokenizer)
+        filtered = filter(lambda x: x[0] != self._whitespace, tokenizer)
+        # The spec calls this an ASCII format; in Python 2.x we could
+        # just treat the strings and names as opaque bytes but let's
+        # turn them into proper Unicode, and be lenient in case of high bytes.
+        convert = lambda x: x.decode('ascii', errors='replace')
         for token, value in filtered:
-            if token == b'name' and value.startswith(b'/'):
-                key = value[1:]
+            if token is self._name and value.startswith(b'/'):
+                key = convert(value[1:])
                 token, value = next(filtered)
-                if token == b'name':
+                if token is self._name:
                     if value in (b'true', b'false'):
                         value = value == b'true'
                     else:
-                        value = value.lstrip(b'/')
-                elif token == b'string':
-                    value = value.lstrip(b'(').rstrip(b')')
-                elif token == b'number':
+                        value = convert(value.lstrip(b'/'))
+                elif token is self._string:
+                    value = convert(value.lstrip(b'(').rstrip(b')'))
+                elif token is self._number:
                     if b'.' in value:
                         value = float(value)
                     else:
                         value = int(value)
                 else:  # more complicated value such as an array
                     value = None
-                if key != b'FontInfo' and value is not None:
+                if key != 'FontInfo' and value is not None:
                     prop[key] = value
 
         # Fill in the various *Name properties
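The lenient decoding added in _parse is easy to check in isolation; a standalone sketch mirroring the convert lambda from the diff:

    # errors='replace' turns any non-ASCII byte into U+FFFD instead of
    # raising UnicodeDecodeError, so parsed keys and values are always str.
    convert = lambda x: x.decode('ascii', errors='replace')

    print(convert(b'Foobar'))   # Foobar
    print(convert(b'Caf\xe9'))  # Caf\ufffd -- the high byte is replaced

This also means prop is now keyed by str rather than bytes, which is why the comparison at the end changes from b'FontInfo' to 'FontInfo'.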