forked from Freely-Given-org/BibleOrgSys
-
Notifications
You must be signed in to change notification settings - Fork 0
/
SFMFile.py
executable file
·422 lines (358 loc) · 18 KB
/
SFMFile.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# SFMFile.py
#
# SFM (Standard Format Marker) data file reader
#
# Copyright (C) 2010-2017 Robert Hunt
# Author: Robert Hunt <Freely.Given.org@gmail.com>
# License: See gpl-3.0.txt
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
Module for reading UTF-8 SFM (Standard Format Marker) files.
There are three kinds of SFM encoded files which can be loaded:
1/ SFMLines: A "flat" file, read line by line into a list.
This could be any kind of SFM data.
2/ SFMRecords: A "record based" file (e.g., a dictionary), read record by record into a list
3/ SFMRecords: A header segment, then a "record based" structure read into the same list,
for example an interlinearized text.
In each case, the SFM and its data field are read into a 2-tuple and saved (in order) in the list.
Raises IOError if file doesn't exist.
"""
from gettext import gettext as _
# Module identification strings (used in version banners and by the command-line set-up below)
LastModifiedDate = '2017-10-19' # by RJH
ShortProgName = "SFMFile"
ProgName = "SFM Files loader"
ProgVersion = '0.86'
# e.g., "SFMFile v0.86"
ProgNameVersion = '{} v{}'.format( ShortProgName, ProgVersion )
# As above, followed by the (translatable) words "last modified" and the date
ProgNameVersionDate = '{} {} {}'.format( ProgNameVersion, _("last modified"), LastModifiedDate )
# Module-level debugging switch (not read anywhere in the visible part of this file)
debuggingThisModule = False
import logging, sys
import BibleOrgSysGlobals
#def splitMarkerText( line ):
#"""
#Given a line of text (may be empty),
#returns a backslash marker and the text.
#Returns None for the backslash marker if there isn't one.
#Returns an empty string for the text if there isn't any.
#"""
#if not line: return None, ''
#if line[0] != '\\': return None, line # Not a SFM line
## We have a line that starts with a backslash
## The marker can end with a space or another marker
#lineAfterBackslash = line[1:]
#si1 = lineAfterBackslash.find( ' ' )
#si2 = lineAfterBackslash.find( '\\' )
#if si2!=-1 and (si1==-1 or si2<si1): # Marker stops at a backslash
#marker = lineAfterBackslash[:si2]
#text = lineAfterBackslash[si2:]
#elif si1!=-1: # Marker stops at a space
#marker = lineAfterBackslash[:si1]
#text = lineAfterBackslash[si1+1:] # We drop the space
#else: # The line is only the marker
#marker = lineAfterBackslash
#text = ''
#return marker, text
## end of splitMarkerText
class SFMLines:
    """
    Class holding a list of (non-blank) SFM lines.

    Each line is a 2-tuple consisting of (SFMMarker, SFMValue).
    The marker is stored without its leading backslash.
    After read() has been called, the tuples are available (in file order) in self.lines.
    """

    def __init__(self):
        self.lines = []

    def __str__(self):
        """
        This method returns the string representation of a SFM lines object.

        @return: a title line, followed by one line for each stored (marker, value) tuple
        @rtype: string
        """
        title = "SFM Lines Object"
        if BibleOrgSysGlobals.debugFlag or BibleOrgSysGlobals.verbosityLevel>2: title += ' v' + ProgVersion
        return '\n'.join( [title] + [str( line ) for line in self.lines] )

    @staticmethod
    def _splitMarkerText( line ):
        """
        Split a line that starts with a backslash into its marker and its text.

        The marker ends at the first space (which is dropped)
            or at the next backslash (which is kept as the start of the text),
            whichever comes first.

        @param line: a non-empty line whose first character is a backslash
        @type line: string
        @return: (marker, text) -- text is an empty string if the line is only the marker
        @rtype: 2-tuple of strings
        """
        lineAfterBackslash = line[1:]
        si1 = lineAfterBackslash.find( ' ' )
        si2 = lineAfterBackslash.find( '\\' )
        if si2!=-1 and (si1==-1 or si2<si1): # Marker stops at a backslash
            return lineAfterBackslash[:si2], lineAfterBackslash[si2:]
        if si1!=-1: # Marker stops at a space
            return lineAfterBackslash[:si1], lineAfterBackslash[si1+1:] # We drop the space
        return lineAfterBackslash, '' # The line is only the marker

    def read( self, SFMFilepath, ignoreSFMs=None, encoding='utf-8' ):
        """
        Read a simple SFM (Standard Format Marker) file into a list of tuples.

        Blank lines and lines starting with # are discarded.
        A line that doesn't start with a backslash is treated as a continuation
            of the previous SFM line (joined on with a single space).

        @param SFMFilepath: The filename
        @type SFMFilepath: string
        @param ignoreSFMs: SFM markers (not including the backslash) whose lines
            (and continuation lines) should be discarded
        @type ignoreSFMs: tuple or other container of strings (or None)
        @param encoding: The encoding used to open the file
        @type encoding: string
        @return: None -- the list of (marker, text) 2-tuples is saved in self.lines

        Any error from open() (e.g., file doesn't exist) is not caught here.
        A UnicodeError while reading is reported but not re-raised:
            reading stops at that point and the lines read so far are kept.
        """
        # Check/handle parameters
        if ignoreSFMs is None: ignoreSFMs = ()

        lastLine, lineCount, result = '', 0, []
        marker = None # The marker from the most recent SFM line (None until we have had one)
        with open( SFMFilepath, encoding=encoding ) as myFile: # Automatically closes the file when done
            try:
                for line in myFile:
                    lineCount += 1
                    if lineCount==1 and encoding.lower()=='utf-8' and line and line[0]==chr(65279): #U+FEFF or \ufeff
                        logging.info( "SFMLines: Detected Unicode Byte Order Marker (BOM) in {}".format( SFMFilepath ) )
                        line = line[1:] # Remove the Unicode Byte Order Marker (BOM)
                    if line and line[-1]=='\n': line = line[:-1] # Removing trailing newline character
                    if not line: continue # Just discard blank lines
                    lastLine = line
                    if line[0]=='#': continue # Just discard comment lines

                    if line[0]!='\\': # Not a SFM line
                        if marker is None: # We haven't had any SFM line yet
                            if BibleOrgSysGlobals.verbosityLevel > 2:
                                logging.error( "Non-SFM line in " + SFMFilepath + " -- line ignored at #" + str(lineCount) )
                        elif marker not in ignoreSFMs: # Append this continuation line
                            # The previous SFM line wasn't ignored so it must be the last entry in result
                            oldmarker, oldtext = result.pop()
                            result.append( (oldmarker, oldtext+' '+line) )
                        # else it's the continuation of an ignored SFM line, so it's silently dropped also
                        continue

                    marker, text = self._splitMarkerText( line )
                    if marker not in ignoreSFMs:
                        result.append( (marker, text) )

            except UnicodeError as err:
                # NOTE: The exception ends the loop, so nothing after the bad line gets read
                print( "Unicode error:", sys.exc_info()[0], err )
                logging.critical( "Invalid line in " + SFMFilepath + " -- line ignored at #" + str(lineCount) )
                if lineCount > 1: print( 'Previous line was: ', lastLine )

        self.lines = result
    # end of SFMLines.read
# end of class SFMLines
class SFMRecords:
    """
    Class holding a list of SFM records.

    Each record is a list of SFM lines.
        (The record always starts with the same SFMMarker, except perhaps the first record.)
    Each line is a 2-tuple consisting of (SFMMarker, SFMValue).
    After read() has been called, the records are available (in file order) in self.records.
    """

    def __init__(self):
        self.records = []

    def __str__(self):
        """
        This method returns the string representation of a SFM records object.

        @return: one line for each (marker, value) tuple, with a blank line between records
        @rtype: string
        """
        result = ""
        for record in self.records:
            if result: result += '\n' # Blank line between records
            for line in record:
                result += ('\n' if result else '') + str( line )
        return result

    def read( self, SFMFilepath, key=None, ignoreSFMs=None, ignoreEntries=None, changePairs=None, encoding='utf-8' ):
        """
        Read a simple SFM (Standard Format Marker) file into a list of lists of tuples.

        Blank lines and lines starting with # are discarded.
        A line that doesn't start with a backslash is treated as a continuation
            of the previous SFM line (joined on with a single space).
        A new record is started each time that the key marker is encountered.

        @param SFMFilepath: The filename
        @type SFMFilepath: string
        @param key: The SFM record marker (not including the backslash).
            If not given, the first marker found that's not in ignoreSFMs is assumed to be the key
                (and is then saved in self.key).
        @type key: string (or None)
        @param ignoreSFMs: SFM markers (not including the backslash) whose lines are removed from the saved records
        @type ignoreSFMs: tuple or other container of strings (or None)
        @param ignoreEntries: Records are discarded if the text of their first line is in here
        @type ignoreEntries: tuple or other container of strings (or None)
        @param changePairs: (findMarker, replaceMarker) pairs -- markers are changed as they are read
        @type changePairs: iterable of 2-tuples of strings (or None)
        @param encoding: The encoding used to open the file
        @type encoding: string
        @return: None -- the list of records is saved in self.records

        Raises ValueError if the given key contains a backslash or a space.
        Raises IOError if a non-SFM line is found at the start of a record.
        Any error from open() (e.g., file doesn't exist) is not caught here.
        A UnicodeError while reading is reported but not re-raised:
            reading stops at that point and the records read so far are kept.
        """
        def changeMarker( currentMarker, changePairs ):
            """
            Change the SFM marker if required
            """
            if changePairs:
                for findMarker, replaceMarker in changePairs:
                    if findMarker==currentMarker: return replaceMarker
            return currentMarker
        # end of changeMarker

        def saveRecord( record ):
            """
            Append the record (without any ignored markers) to result
                unless it's empty or is one of the entries to be ignored.
            """
            if record and record[0][1] not in ignoreEntries: # Looks at the text associated with the first (record key) marker
                strippedRecord = [(savedMarker,savedText) for savedMarker,savedText in record if savedMarker not in ignoreSFMs]
                if strippedRecord:
                    result.append( strippedRecord )
        # end of saveRecord

        # Main code for SFMRecords.read()
        # Check/handle parameters
        if ignoreSFMs is None: ignoreSFMs = ()
        if ignoreEntries is None: ignoreEntries = ()
        if key:
            if '\\' in key: raise ValueError('SFM marker must not contain backslash')
            if ' ' in key: raise ValueError('SFM marker must not contain spaces')
        self.SFMFilepath = SFMFilepath
        self.key = key
        self.ignoreSFMs = ignoreSFMs
        self.ignoreEntries = ignoreEntries
        self.changePairs = changePairs
        self.encoding = encoding

        lastLine, lineCount, record, result = '', 0, [], []
        with open( SFMFilepath, encoding=encoding ) as myFile: # Automatically closes the file when done
            try:
                for line in myFile:
                    lineCount += 1
                    if lineCount==1 and encoding.lower()=='utf-8' and line and line[0]==chr(65279): #U+FEFF
                        logging.info( "SFMRecords: Detected Unicode Byte Order Marker (BOM) in {}".format( SFMFilepath ) )
                        line = line[1:] # Remove the Unicode Byte Order Marker (BOM)
                    if line and line[-1]=='\n': line = line[:-1] # Removing trailing newline character
                    if not line: continue # Just discard blank lines
                    lastLine = line
                    if line[0]=='#': continue # Just discard comment lines

                    if line[0]!='\\': # Not a SFM line
                        if len(record)==0: # There's nothing to append this line to
                            print( 'SFMFile.py: SFM file line is "' + line + '"' )
                            print( "First character of line is '" + line[0] + "' (" + str(ord(line[0])) + ")" )
                            print( "XXXRecord is", record)
                            raise IOError('Oops: Line break on last line of record not handled here "' + line + '"')
                        else: # Append this continuation line
                            oldmarker, oldtext = record.pop()
                            record.append( (oldmarker, oldtext+' '+line) )
                            continue

                    lineAfterBackslash = line[1:]
                    si1 = lineAfterBackslash.find( ' ' )
                    si2 = lineAfterBackslash.find( '\\' )
                    if si2!=-1 and (si1==-1 or si2<si1): # Marker stops at a backslash
                        marker = changeMarker( lineAfterBackslash[:si2], changePairs )
                        text = lineAfterBackslash[si2:]
                    elif si1!=-1: # Marker stops at a space
                        marker = changeMarker( lineAfterBackslash[:si1], changePairs )
                        text = lineAfterBackslash[si1+1:] # We drop the space
                    else: # The line is only the marker
                        marker = changeMarker( lineAfterBackslash, changePairs )
                        text = ''
                        if marker==key: print ("Warning: Have a blank key field after", record)

                    if not key and marker not in ignoreSFMs:
                        print (' Assuming', marker, 'to be the SFM key for', SFMFilepath)
                        # Remember the assumed key in the object also, because copyToDict() checks against self.key
                        key = self.key = marker
                    if marker==key: # Save the previous record
                        saveRecord( record )
                        record = []
                    # Save the current marker and text
                    record.append( (marker, text) )

            except UnicodeError as err:
                # NOTE: The exception ends the loop, so nothing after the bad line gets read
                print( "Unicode error:", sys.exc_info()[0], err )
                logging.critical( "Invalid line in " + SFMFilepath + " -- line ignored at " + str(lineCount) )
                if lineCount > 1: print( 'Previous line was: ', lastLine )
                else: print( 'Possible encoding error -- expected', encoding )

        # Write the final record
        saveRecord( record )
        self.records = result
    # end of SFMRecords.read

    def analyze( self ):
        """
        Analyzes the list of records read in from the file
            to find the smallest and largest size (number of lines) of each record
            as well as making a list of all the SFM marker types (in order of first appearance)
            and a dictionary of all the possible values of all the various SFM markers
                (marker -> list of distinct values in order of first appearance).

        Returns these two integers
            plus the list and the dictionary.
        If there are no records at all, the two integers are 9999 and -1.
        """
        smallestSize, largestSize, markerList, markerSets = 9999, -1, [], {}
        recordSizes = [len( record ) for record in self.records]
        if recordSizes: # Use the real sizes so that records with more than 9999 lines are handled correctly
            smallestSize, largestSize = min( recordSizes ), max( recordSizes )
        for record in self.records:
            for marker, value in record:
                if marker not in markerSets: # markerSets has exactly the same markers as markerList
                    markerList.append( marker )
                    markerSets[marker] = []
                if value not in markerSets[marker]:
                    markerSets[marker].append( value )
        return smallestSize, largestSize, markerList, markerSets
    # end of SFMRecords.analyze

    def copyToDict( self, internalStructure ):
        """
        self.records is a list of lists.

        This function copies them to a dictionary (also saved as self.dataDict)
            where the keys are the values of the given marker (self.key).
            If more than one record has the same key value, the later record replaces the earlier one.

        The inner structure can either be lists (if the parameter is "list" )
            which is most useful if lines with the identical SFM can be repeated within the record.
            Each list contains the (marker, value) 2-tuples which follow the key line.

        The inner structure can be dicts (if the parameter is "dict" )
            which then checks that each line within the record starts with a unique marker
            (a warning is logged if not, and the later value overwrites the earlier one).

        Every record must start with the self.key marker (checked with an assert).

        Returns the dictionary.
        """
        assert internalStructure in ( "list", "dict" )
        self.dataDict = {}
        for record in self.records:
            for j, (marker,value) in enumerate( record ):
                if j==0: # This first line gives us the dictionary key for this record
                    assert marker == self.key
                    key = value
                    self.dataDict[key] = [] if internalStructure=="list" else {}
                elif internalStructure=="list":
                    self.dataDict[key].append( (marker,value) )
                else: # internalStructure is "dict"
                    if marker in self.dataDict[key]:
                        logging.warning( "Multiple {} lines in {} record--will be overwritten".format( marker, key ) )
                    self.dataDict[key][marker] = value
        return self.dataDict
    # end of SFMRecords.copyToDict
# end of class SFMRecords
def demo():
    """
    Demonstrate reading and processing some UTF-8 SFM databases.

    Loads the same test dictionary file twice -- firstly as a flat list of SFM lines,
        and then as a list of records (keyed on the og marker) --
        and displays the first few entries plus the final entry each time.
    """
    if BibleOrgSysGlobals.verbosityLevel > 1: print( ProgNameVersion )

    import os.path
    filepath = os.path.join( 'Tests/DataFilesForTests/', 'MatigsalugDictionaryA.sfm' )
    if BibleOrgSysGlobals.verbosityLevel > 2: print( "Using {} as test file…".format( filepath ) )
    unwantedMarkers = ('mn','aMU','aMW','cu','cp') # Same set is ignored for both loads

    # Firstly load it as a flat list of lines
    linesDB = SFMLines()
    linesDB.read( filepath, ignoreSFMs=unwantedMarkers )
    loadedLines = linesDB.lines
    print( len(loadedLines), 'lines read from file', filepath )
    for index, entry in enumerate( loadedLines[:11] ): # Display (up to) the first eleven lines
        print( index, entry )
    print( '…\n', len(loadedLines)-1, loadedLines[-1], '\n' ) # Display the last line

    # Now load it as a list of records
    recordsDB = SFMRecords()
    recordsDB.read( filepath, 'og', ignoreSFMs=unwantedMarkers )
    loadedRecords = recordsDB.records
    print( len(loadedRecords), 'records read from file', filepath )
    for index, entry in enumerate( loadedRecords[:5] ): # Display (up to) the first five records
        print( index, entry )
    print( '…\n', len(loadedRecords)-1, loadedRecords[-1] ) # Display the last record
# end of demo
if __name__ == '__main__':
    # Only when run as a script (not when imported): set up, run the demo, then close down
    #from multiprocessing import freeze_support
    #freeze_support() # Multiprocessing support for frozen Windows executables
    # Configure basic set-up
    parser = BibleOrgSysGlobals.setup( ProgName, ProgVersion )
    # The parser returned by setup() is passed on for the standard options to be added and processed
    BibleOrgSysGlobals.addStandardOptionsAndProcess( parser )
    demo()
    BibleOrgSysGlobals.closedown( ProgName, ProgVersion )
# end of SFMFile.py