forked from Freely-Given-org/BibleOrgSys
-
Notifications
You must be signed in to change notification settings - Fork 0
/
USXFilenames.py
executable file
·361 lines (320 loc) · 19.9 KB
/
USXFilenames.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# USXFilenames.py
#
# Module handling USX Bible filenames
#
# Copyright (C) 2012-2018 Robert Hunt
# Author: Robert Hunt <Freely.Given.org@gmail.com>
# License: See gpl-3.0.txt
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
Module for creating and manipulating USX filenames.
"""
from gettext import gettext as _
# Module metadata: date of the last edit plus the program names and version.
# ProgName and ProgVersion are handed to BibleOrgSysGlobals.setup()/closedown() when run as a script.
LastModifiedDate = '2018-01-18' # by RJH
ShortProgName = "USXBible"
ProgName = "USX Bible filenames handler"
ProgVersion = '0.54'
# Composite strings built from the values above (ProgNameVersion is printed by demo());
#   the "last modified" label is passed through gettext so that it can be translated.
ProgNameVersion = '{} v{}'.format( ProgName, ProgVersion )
ProgNameVersionDate = '{} {} {}'.format( ProgNameVersion, _("last modified"), LastModifiedDate )
# Debugging switch for this module -- not referenced anywhere else in the visible part of this file
debuggingThisModule = False
import os, logging
import BibleOrgSysGlobals
# The three collections below are compared against UPPER-CASED filenames,
#   so every entry in them must itself be all UPPER CASE.

# Complete filenames to skip.
#   Only needs to include names whose extensions are not listed in extensionsToIgnore below.
filenamesToIgnore = (
    'AUTOCORRECT.TXT',
    'HYPHENATEDWORDS.TXT',
    'PRINTDRAFTCHANGES.TXT',
    'README.TXT',
    'BOOK_NAMES.TXT',
    )

# Filename endings to skip -- each one must begin with a dot.
filenameEndingsToIgnore = (
    '.ZIP.GO',
    '.ZIP.DATA',
    )

# File extensions to skip -- must be UPPERCASE and must NOT begin with a dot.
# NOTE: Extensions ending in ~ are also ignored
extensionsToIgnore = (
    'ASC',
    'BAK', 'BAK2', 'BAK3', 'BAK4',
    'BBLX', 'BC',
    'CCT', 'CSS',
    'DOC', 'DTS',
    'HTM', 'HTML',
    'JAR',
    'LDS', 'LOG',
    'MYBIBLE',
    'NT', 'NTX',
    'ODT', 'ONT', 'ONTX', 'OSIS', 'OT', 'OTX',
    'PDB',
    'SAV', 'SAVE', 'STY', 'SSF',
    'USFM',
    'VRS',
    'YET',
    'XML',
    'ZIP',
    )
class USXFilenames:
    """
    Class for creating and manipulating USX Filenames.

    On construction, the given folder is scanned and the filename template
        is deduced from the first recognisable USX file found there.
    Two six-character templates are recognised:
        'dddbbb' / 'dddBBB'  e.g., 001GEN.usx
        'dd-bbb' / 'dd-BBB'  e.g., 01-GEN.usx

    Attributes set by __init__:
        givenFolderName     the folder that was inspected
        pattern             the deduced template ('' if none was recognised)
        fileExtension       the extension without the dot ('usx')
        fileList            filenames in the folder after the ignore lists are applied
        digitsIndex, hyphenIndex, USXBookCodeIndex
                            offsets of those fields inside the filename
                            (hyphenIndex is None when the template has no hyphen)
    """
    # The XML declarations accepted at the start of the first line (without and with a leading BOM)
    _XMLDeclarationStarts = ( '<?xml version="1.0"', "<?xml version='1.0'",
                              '\ufeff<?xml version="1.0"', "\ufeff<?xml version='1.0'", )


    def __init__( self, givenFolderName ):
        """
        Create the object by inspecting files in the given folder.

        Logs an error and returns early (leaving pattern empty) if a USX file
            matches a known book code but doesn't fit one of the six-character templates.
        """
        self.givenFolderName = givenFolderName
        self.pattern, self.fileExtension = '', 'usx' # Pattern should end up as 'dddBBB'
        self.fileList = [] # A list of all files in our folder (excluding folder names and backup filenames)
        self.digitsIndex, self.USXBookCodeIndex = 0, 3
        self.hyphenIndex = None # Defined up-front so that the attribute exists even if nothing gets matched

        # Get the data tables that we need for proper checking
        self._USFMBooksCodeNumberTriples = BibleOrgSysGlobals.BibleBooksCodes.getAllUSFMBooksCodeNumberTriples()

        # Collect the files in our folder that aren't excluded by the ignore lists
        for possibleFilename in os.listdir( self.givenFolderName ):
            if self._isIgnoredFilename( possibleFilename.upper() ): continue
            filepath = os.path.join( self.givenFolderName, possibleFilename )
            if os.path.isfile( filepath ): # It's a file not a folder
                self.fileList.append( possibleFilename )

        # Try to deduce the filename template from the first file that contains a known book code
        matched = False
        for foundFilename in self.fileList:
            foundFileBit, foundExtBit = os.path.splitext( foundFilename )
            foundLength = len( foundFileBit )
            containsDigits = any( char.isdigit() for char in foundFilename )
            if not ( foundLength>=6 and containsDigits and foundExtBit=='.'+self.fileExtension ):
                continue # Can't be one of our USX files

            for USXBookCode,USXDigits,BBB in BibleOrgSysGlobals.BibleBooksCodes.getAllUSXBooksCodeNumberTriples():
                if USXDigits in foundFileBit \
                and (USXBookCode in foundFileBit or USXBookCode.upper() in foundFileBit):
                    pattern = 'dddbbb' # A form like 001GEN.usx
                    digitsIndex = foundFileBit.index( USXDigits )
                    hyphenIndex = expectedHyphenIndex = None
                elif USXDigits[1:] in foundFileBit and '-' in foundFileBit \
                and (USXBookCode in foundFileBit or USXBookCode.upper() in foundFileBit):
                    pattern = 'dd-bbb' # A form like 01-GEN.usx
                    digitsIndex = foundFileBit.index( USXDigits[1:] ) # Without the leading zero for the 66 books
                    hyphenIndex = foundFileBit.index( '-' )
                    expectedHyphenIndex = 2
                else: continue # This book code isn't in this filename

                USXBookCodeIndex = foundFileBit.index( USXBookCode if USXBookCode in foundFileBit else USXBookCode.upper() )
                actualBookCode = foundFileBit[USXBookCodeIndex:USXBookCodeIndex+3] # In the case that the file uses
                if not ( foundLength==6 and digitsIndex==0 and hyphenIndex==expectedHyphenIndex and USXBookCodeIndex==3 ):
                    logging.error( _("Unrecognized USX filename template at ")+foundFileBit ); return

                if actualBookCode.isupper(): pattern = pattern.replace( 'bbb', 'BBB' )
                self.digitsIndex = digitsIndex
                self.hyphenIndex = hyphenIndex
                self.USXBookCodeIndex = USXBookCodeIndex
                self.pattern = pattern
                self.fileExtension = foundExtBit[1:]
                matched = True
                break
            if matched: break # One recognised file is enough to fix the template

        if BibleOrgSysGlobals.verbosityLevel>2 and not matched: logging.info( _("Unable to recognize valid USX files in ") + self.givenFolderName )
    # end of USXFilenames.__init__


    @staticmethod
    def _isIgnoredFilename( pFUpper ):
        """
        Given an UPPER-CASED filename, return True if it should be skipped
            because of the module-level ignore lists or because it ends with ~ (backup file).
        """
        if pFUpper in filenamesToIgnore: return True
        if pFUpper.endswith( filenameEndingsToIgnore ): return True # endswith() accepts a tuple of endings
        if pFUpper.endswith( '~' ): return True
        return os.path.splitext( pFUpper )[1][1:] in extensionsToIgnore # Compare without the first dot
    # end of USXFilenames._isIgnoredFilename


    @staticmethod
    def _firstLinesLookLikeUSX( firstLines ):
        """
        Given the list of the first lines peeked from a file (or None),
            return True if there's at least three lines and '<usx' occurs in either of the first two.
        """
        if not firstLines or len(firstLines)<3: return False
        return '<usx' in firstLines[0] or '<usx' in firstLines[1]
    # end of USXFilenames._firstLinesLookLikeUSX


    def _passesStrictCheck( self, possibleFilename ):
        """
        Peek at the first three lines of the given file (in our folder)
            and return True if they look like the start of a USX file.

        A first line that doesn't start with an XML declaration is only reported
            (at verbosity > 3) -- by itself it doesn't cause the file to be rejected.
        NOTE(review): the source indentation was lost, so treating the '<usx' test as
            independent of the XML declaration test is a reconstruction -- TODO confirm.
        """
        firstLines = BibleOrgSysGlobals.peekIntoFile( possibleFilename, self.givenFolderName, numLines=3 )
        if firstLines and len(firstLines)>=3 \
        and not firstLines[0].startswith( self._XMLDeclarationStarts ) \
        and BibleOrgSysGlobals.verbosityLevel > 3:
            print( "USXB (unexpected) first line was {!r} in {}".format( firstLines, possibleFilename ) )
        return self._firstLinesLookLikeUSX( firstLines )
    # end of USXFilenames._passesStrictCheck


    def __str__( self ):
        """
        This method returns the string representation of an object.

        @return: a heading line followed by one indented line for each of
                    the folder, filename pattern, and file extension (if non-empty)
        @rtype: string
        """
        result = "USX Filenames object"
        indent = 2
        if self.givenFolderName: result += '\n' + ' '*indent + _("Folder: {}").format( self.givenFolderName )
        if self.pattern: result += '\n' + ' '*indent + _("Filename pattern: {}").format( self.pattern )
        if self.fileExtension: result += '\n' + ' '*indent + _("File extension: {}").format( self.fileExtension )
        return result
    # end of USXFilenames.__str__


    def getFilenameTemplate( self ):
        """
        Returns a pattern/template for USX filenames where
            bbb = book code (lower case) or BBB = book code (UPPER CASE)
            ddd = digits
        It should be 'dddBBB' for USX files
            (or an empty string if no template was recognised).
        """
        return self.pattern
    # end of USXFilenames.getFilenameTemplate


    def doListAppend( self, BBB, filename, givenList, caller ):
        """
        Check that BBB and filename are not in the givenList,
            then add them as a 2-tuple.
        If there is a duplicate, remove both (as we're obviously unsure),
            i.e., the existing entry is removed and the new one is not added.

        The givenList is modified in place. The caller string is only used in the warning messages.
        """
        removeBBB = removeFilename = None
        for existingBBB, existingFilename in givenList:
            if existingBBB == BBB:
                if BibleOrgSysGlobals.verbosityLevel > 2: logging.warning( "{} tried to add duplicate {} {} when already had {} (removed both)".format( caller, BBB, filename, existingFilename ) )
                removeBBB, removeFilename = existingBBB, existingFilename
            if existingFilename == filename:
                if BibleOrgSysGlobals.verbosityLevel > 2: logging.warning( "{} tried to add duplicate {} {} when already had {} (removed both)".format( caller, filename, BBB, existingBBB ) )
                removeBBB, removeFilename = existingBBB, existingFilename
        # NOTE: If more than one existing entry clashes, only the last clashing entry gets removed
        if removeFilename: givenList.remove( (removeBBB,removeFilename,) )
        else: givenList.append( (BBB,filename,) )
    # end of USXFilenames.doListAppend


    def getDerivedFilenameTuples( self ):
        """
        Return a list of valid USX filenames that match our filename template.
            The result is a list of 2-tuples in the default rough sequence order from the BibleBooksCodes module.
            Each tuple contains ( BBB, filename ) not including the folder path.

        If no template was recognised, the list that gets sequenced is empty.
        """
        resultList = []
        if self.pattern:
            for USXBookCode,USXDigits,BBB in BibleOrgSysGlobals.BibleBooksCodes.getAllUSXBooksCodeNumberTriples():
                if self.hyphenIndex is None:
                    digitsPart = USXDigits
                else: # have a hyphen so assume only two digits
                    if USXDigits.isdigit():
                        USXInt = int( USXDigits )
                        if USXInt > 39: # Numbers above 39 are bumped up by one in this form (presumably 40 is skipped -- TODO confirm)
                            USXDigits = str( USXInt + 1 ).zfill( 3 )
                    digitsPart = USXDigits[1:] # Drop the leading digit

                filename = "------" # Six characters (which supplies the hyphen at index 2 if we need it)
                filename = filename[:self.digitsIndex] + digitsPart + filename[self.digitsIndex+len(digitsPart):]
                bookCodePart = USXBookCode.upper() if 'BBB' in self.pattern else USXBookCode
                filename = filename[:self.USXBookCodeIndex] + bookCodePart + filename[self.USXBookCodeIndex+len(USXBookCode):]
                filename += '.' + self.fileExtension
                resultList.append( (BBB,filename,) )
        return BibleOrgSysGlobals.BibleBooksCodes.getSequenceList( resultList )
    # end of USXFilenames.getDerivedFilenameTuples


    def getConfirmedFilenameTuples( self, strictCheck=False ):
        """
        Return a list of tuples of UPPER CASE book codes with actual (present and readable) USX filenames.
            If the strictCheck flag is set, the program also looks at the first line(s) inside the files.

        The result is a list of 2-tuples in the default rough sequence order from the BibleBooksCodes module.
            Each tuple contains ( BBB, filename ) not including the folder path.
        """
        resultList = []
        for BBB,possibleFilename in self.getDerivedFilenameTuples():
            possibleFilepath = os.path.join( self.givenFolderName, possibleFilename )
            if not os.access( possibleFilepath, os.R_OK ): continue # Not there (or not readable)
            if strictCheck or BibleOrgSysGlobals.strictCheckingFlag:
                if not self._passesStrictCheck( possibleFilename ):
                    continue # so it doesn't get added
            resultList.append( (BBB, possibleFilename,) )
        return resultList # No need to sort these, coz the above call produce sorted results
    # end of USXFilenames.getConfirmedFilenameTuples


    def getPossibleFilenameTuples( self, strictCheck=False ):
        """
        Return a list of filenames just derived from the list of files in the folder,
            i.e., look only externally at the filenames.
            If the strictCheck flag is set, the program also looks at the first line(s) inside the files.

        A file is paired with every USFM book code whose upper-case form occurs in its name
            (duplicates are handled by doListAppend).
        The unsequenced list is also saved in self.lastTupleList.
        The result is a list of ( BBB, filename ) 2-tuples as ordered by BibleBooksCodes.getSequenceList().
        """
        resultList = []
        for possibleFilename in self.fileList:
            pFUpper = possibleFilename.upper()
            # None of the ignore tests depend on the book code, so do them once per file
            if self._isIgnoredFilename( pFUpper ): continue
            pFUpperProper = os.path.splitext( pFUpper )[0]

            strictResult = None # Not determined yet -- we only peek into the file if a book code matches
            for USFMBookCode,USFMDigits,BBB in self._USFMBooksCodeNumberTriples:
                if USFMBookCode.upper() not in pFUpperProper: continue
                if strictCheck or BibleOrgSysGlobals.strictCheckingFlag:
                    if strictResult is None:
                        strictResult = self._passesStrictCheck( possibleFilename )
                    if not strictResult: break # This file can't be accepted for any book code
                self.doListAppend( BibleOrgSysGlobals.BibleBooksCodes.getBBBFromUSFMAbbreviation( USFMBookCode ), possibleFilename, resultList, "getPossibleFilenameTuplesExt" )
        self.lastTupleList = resultList
        return BibleOrgSysGlobals.BibleBooksCodes.getSequenceList( resultList )
    # end of USXFilenames.getPossibleFilenameTuples


    def getUnusedFilenames( self ):
        """
        Return a list of filenames which didn't match the USX template,
            i.e., every entry in the folder listing that isn't a confirmed USX filename.
            The order of the filenames in the list has no meaning.
        """
        folderFilenames = os.listdir( self.givenFolderName )
        # Filter (rather than using list.remove) so that a confirmed name that's
        #   missing from the listing can't raise a ValueError
        confirmedFilenames = { actualFilename for BBB,actualFilename in self.getConfirmedFilenameTuples() }
        return [ folderFilename for folderFilename in folderFilenames if folderFilename not in confirmedFilenames ]
    # end of USXFilenames.getUnusedFilenames
#def getSSFFilenames( self, searchAbove=False, auto=True ):
# """Return a list of full pathnames of .ssf files in the folder.
# NOTE: USX projects don't usually have the .ssf files in the project folder,
# but 'backed-up' projects often do.
# If searchAbove is set to True and no ssf files are found in the given folder,
# this routine will attempt to search the next folder up the file hierarchy.
# Furthermore, unless auto is set to False,
# it will try to find the correct one from multiple SSFs."""
# def getSSFFilenamesHelper( folder ):
# filelist = []
# files = os.listdir( folder )
# for foundFilename in files:
# if not foundFilename.endswith('~'): # Ignore backup files
# foundFileBit, foundExtBit = os.path.splitext( foundFilename )
# if foundExtBit.lower()=='.ssf':
# filelist.append( os.path.join( folder, foundFilename ) )
# return filelist
# # end of getSSFFilenamesHelper
# filelist = getSSFFilenamesHelper( self.givenFolderName )
# if not filelist and searchAbove: # try the next level up
# filelist = getSSFFilenamesHelper( os.path.join( self.givenFolderName, '../' ) )
# if auto and len(filelist)>1: # See if we can help them by automatically choosing the right one
# count, index = 0, -1
# for j, filepath in enumerate(filelist): # Check if we can find a single matching ssf file
# foundPathBit, foundExtBit = os.path.splitext( filepath )
# foundPathBit, foundFileBit = os.path.split( foundPathBit )
# #print( foundPathBit, foundFileBit, foundExtBit, self.givenFolderName )
# if foundFileBit in self.givenFolderName: index = j; count += 1 # Take a guess that this might be the right one
# #print( count, index )
# if count==1 and index!=-1: filelist = [ filelist[index] ] # Found exactly one so reduce the list down to this one filepath
# return filelist
## end of getSSFFilenames
# end of class USXFilenames
def demo():
    """
    Demonstrate finding files in some USX Bible folders.

    Prints the program name and version (if verbosity is above zero), then for each
        test folder that's readable, prints the USXFilenames object followed by
        its derived, confirmed, and unused filename lists.
    """
    if BibleOrgSysGlobals.verbosityLevel > 0: print( ProgNameVersion )

    # These are relative paths -- you can replace these with your test folder(s)
    testFolders = ('Tests/DataFilesForTests/USXTest1/', 'Tests/DataFilesForTests/USXTest2/',
                    'Tests/DataFilesForTests/USFMTest1/', 'Tests/DataFilesForTests/USFMTest2/',)
    for testFolder in testFolders:
        print( '\n' )
        if not os.access( testFolder, os.R_OK ):
            print( "Sorry, test folder {!r} doesn't exist on this computer.".format( testFolder ) )
            continue

        UFns = USXFilenames( testFolder )
        print( UFns )
        # Each entry is the label to print plus the bound method that produces the list
        for label, getFilenames in ( ("\nPossible:", UFns.getDerivedFilenameTuples),
                                     ("\nConfirmed:", UFns.getConfirmedFilenameTuples),
                                     ("\nOther:", UFns.getUnusedFilenames), ):
            result = getFilenames()
            print( label, len(result), result )
# end of demo
if __name__ == '__main__':
    # Only when run as a script (not when imported):
    #   do the basic set-up and process the standard command-line options
    parser = BibleOrgSysGlobals.setup( ProgName, ProgVersion )
    BibleOrgSysGlobals.addStandardOptionsAndProcess( parser )

    # Run the demonstration over the test folders
    demo()

    # Finish off, passing the same program name and version as for the set-up
    BibleOrgSysGlobals.closedown( ProgName, ProgVersion )
# end of USXFilenames.py