This repository has been archived by the owner on Sep 20, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 58
/
Xlout.py
293 lines (267 loc) · 17.1 KB
/
Xlout.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
# -*- coding: utf-8 -*-
"""
:mod:`EdgarRenderer.Xlout`
~~~~~~~~~~~~~~~~~~~
Edgar(tm) Renderer was created by staff of the U.S. Securities and Exchange Commission.
Data and content created by government employees within the scope of their employment
are not subject to domestic copyright protection. 17 U.S.C. 105.
"""
"""
Convert Html Tables into Excel tables
At this moment, Xlout.py requires openpyxl 2.1.4, it does not work with openpyxl 2.2.x
"""
import os.path, datetime, time, lxml, decimal, collections, openpyxl.cell, openpyxl.styles, openpyxl.utils, openpyxl.worksheet.dimensions
import regex as re
from . import IoManager
from lxml.etree import tostring as treeToString
# note that number pattern allows word before number like shares (1,234,567) (but would misfire on same in text block!)
numberPattern = re.compile(r"\s*[_A-Za-z\xC0-\xD6\xD8-\xF6\xF8-\xFF\u0100-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]*"
r"\s*([$€¥£]\s*)?[(]?\s*[+-]?[0-9,]+([.][0-9]*)?[)-]?[%]?\s*$")
# word in num only allows non-numeric unicode character in first group
# we could change to allow number after first alpha character (like in XmlValidate.py) but not sure it is necessary net
wordInNumPattern = re.compile(r"\s*([_A-Za-z\xC0-\xD6\xD8-\xF6\xF8-\xFF\u0100-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]*)"
r"\s*([$€¥£]\s*)?[(]?\s*[+-]?[0-9,]+([.][0-9]*)?[)-]?\s*$")
dateTimePattern = re.compile(r"^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-2][0-9]:[0-6][0-9]:[0-6][0-9]$")
datePattern = re.compile(r"^[0-9]{4}-[0-9]{2}-[0-9]{2}$")
forbiddenChars = re.compile(r'[\\*?:/\[\]]')
OUTPUT_FILE_NAME = "Financial_Report.xlsx"
def intCol(elt, attrName, default=None):
try:
return int(elt.get(attrName, default))
except (TypeError, ValueError):
return default
class XlWriter(object):
def __init__(self, controller, outputFolderName):
self.controller = controller
self.outputFolderName = outputFolderName
self.simplified_transform = lxml.etree.XSLT(lxml.etree.parse(controller.excelXslt))
self.wb = openpyxl.Workbook()
self.sheetNames = set() # prevent duplicates
self.workSheet = None
controller.logDebug(_('Excel writer initialized {}'.format(controller.entrypoint)),file=os.path.basename(__file__))
def __repr__(self):
return ("xlWriter(outputFolderName='{}')".format(self.outputFolderName))
def close(self):
# dereference openpyxl storage
del self.wb
del self.controller
del self.simplified_transform
def save(self, suffix="", zipDir=""):
if len(self.wb.worksheets)>1:
self.wb.remove(self.wb.worksheets[0])
if not (self.controller.reportZip or self.outputFolderName is not None):
return # no report output (just validation)
import io
file = io.BytesIO()
self.wb.save(file)
file.seek(0)
outputFileName = OUTPUT_FILE_NAME + suffix
if self.controller.reportZip:
self.controller.reportZip.writestr(zipDir + outputFileName, file.read())
else:
self.controller.writeFile(os.path.join(self.outputFolderName, outputFileName), file.read())
file.close()
del file # dereference
self.controller.renderedFiles.add(outputFileName)
self.controller.logDebug('Excel output saved {}'.format(self.controller.entrypoint),file=os.path.basename(__file__))
def createWorkSheet(self, reportNum, reportShortName):
sheetName = reportShortName[:31] # max length 31 for excel title
try:
sheetName = sheetName.encode("ascii", "replace").decode("ascii") # this can throw an exception
sheetName = re.sub(forbiddenChars, '_', sheetName) # some characters not allowed in Sheet Names, we replace them with '_'
except Exception:
sheetName = 'nonAsciiName_' + str(reportNum)
#message = ErrorMgr.getError('CANNOT_CREATE_SHEET_NAME').format(reportShortName, sheetName)
self.controller.logDebug(("Cannot convert {} to ASCII. The Excel work sheet will instead " \
"be labeled {}.").format(reportShortName, sheetName),
file=os.path.basename(__file__))
# if there are duplicate names, disambiguate them. we use casefold() because excel's sheet names aren't case sensitive, so
# it thinks 'Dog' and 'dog' are the same sheet.
sheetDupNbr = 1
while sheetName.casefold() in self.sheetNames:
sheetDupNbr += 1 # second sheet title is blahblah_2, 3rd is blahblah_3, _ needed because some sheets end in year numbers
sheetDupNbrStr = str(sheetDupNbr)
sheetName = sheetName[:30 - len(sheetDupNbrStr)] + "_" + sheetDupNbrStr # this can overshoot, won't cause an array out of index error
self.sheetNames.add(sheetName.casefold())
self.workSheet = self.wb.create_sheet(title=sheetName)
def buildWorkSheet(self, report):
ws = self.workSheet
colsWithCustomDimensions = set()
try:
# Need to Handle, not currently handled:
# rowspan attribute of <td> or <th> - this always occurs in upper leftmost cell of any R.htm
# colspan attribute (does not seem to actually happen)
# any <table> element appearing in a textBlock is being dropped.
# any other html formatting of users' text blocks is pretty much thrown away by this.
_startedAt = time.time()
rdoc = self.simplified_transform(report.rootETree,asPage=lxml.etree.XSLT.strparam('true'),method='html')
self.controller.logDebug("R{} xlout XSLT {:.3f} secs".format(report.cube.fileNumber, time.time() - _startedAt))
# uncomment to debug intermediate xml result
#with open("/Users/hermf/temp/xlout.xml", "wb") as fh:
# fh.write(treeToString(rdoc, method='xml', with_tail=False, pretty_print=True, encoding='utf-8', xml_declaration=True))
row = 0 # openpyxl changed to 1-offset col numbering in version 2
widthPerCharacter = 1
maxWidth = 80
for tableElt in rdoc.iter(tag="table"):
# handle only top level table of class 'report'
if (tableElt.xpath("count(ancestor::table)") == 0 and tableElt.xpath("count(ancestor-or-self::table[@class='report'])")>0):
def populateCell(col,row,classAttr,text,tag,fontBold):
# TODO Just cut off the text at 32767 the biggest string Excel 2010 can handle.
colLetter = openpyxl.utils.get_column_letter(col)
if colLetter not in colsWithCustomDimensions:
colsWithCustomDimensions.add(colLetter)
ws.column_dimensions[colLetter] = openpyxl.worksheet.dimensions.ColumnDimension(ws, customWidth=True)
cell = ws.cell(row=row,column=col)
# default style fields (so we can set the all at once)
wrapText = False
alignHorizontal = "general"
alignVertical = "top"
fmt = "General"
unitSymbol = ''
if dateTimePattern.match(text):
try: cell.value = datetime.datetime(text)
except: cell.value = text
elif datePattern.match(text):
try: cell.value = datetime.date(text)
except: cell.value = text
elif numberPattern.match(text) and classAttr in ("num", "nump"):
if ',' in text:
text = text.replace(',', '')
isNeg = isPct = False
for c in '()-':
if c in text:
text = text.replace(c, '')
isNeg = True
if "%" in text:
isPct = True
text = text.replace('%', '')
try:
mWordInNum = wordInNumPattern.match(text)
if mWordInNum and len(mWordInNum.group(1)) > 0:
unitSymbol = mWordInNum.group(1) + ' '
text = text.replace(unitSymbol, '').strip()
else:
for c in '$€¥£':
if c in text:
text = text.replace(c, '').strip()
unitSymbol = c + ' '
break
value = decimal.Decimal(text)
fractPart = value % decimal.Decimal(1)
if fractPart:
decimalPlaces = len(str(fractPart)) - 2
fractPattern = '.' + ''.join('0' for ignore in range(decimalPlaces))
else:
fractPattern = ''
if unitSymbol:
fmt = '_("{0}"#,##0{1}_);_("{0}"(#,##0{1})'.format(unitSymbol, fractPattern)
elif isPct:
fmt = '#,##0{0}%_);(#,##0{0}%)'.format(fractPattern)
value = value / decimal.Decimal(100)
else:
fmt = '#,##0{0}_);(#,##0{0})'.format(fractPattern)
if isNeg:
value = -value
cell.value = value
except (ValueError, decimal.InvalidOperation):
cell.value = text
fractPattern = ''
alignHorizontal = 'right'
else:
cell.value = text
wrapText = True
if tag == 'th':
alignHorizontal = 'center'
alignVertical = 'center'
# set style all at once (see http://openpyxl.readthedocs.org/en/latest/styles.html)
if (fontBold or wrapText or fmt != "General" or alignHorizontal != "general" or alignVertical != "bottom"):
if getattr(openpyxl, "__version__", "unknown") >= "2.2.0":
cell.font = openpyxl.styles.Font(bold=fontBold)
cell.alignment = openpyxl.styles.Alignment(horizontal=alignHorizontal, vertical=alignVertical, wrap_text=wrapText)
else:
cell.style = openpyxl.styles.Style(font = openpyxl.styles.Font(bold=fontBold),
alignment = openpyxl.styles.Alignment(horizontal=alignHorizontal, vertical=alignVertical, wrap_text=wrapText)
# HF causes crash ,number_format=fmt
)
cell.number_format = fmt
try:
currentWidth = ws.column_dimensions[colLetter].width
w = len(text) + 1
if unitSymbol:
for c, n in collections.Counter(unitSymbol).items():
if c in ('MW_'): w += n * 2
elif c.islower() or c.isdigit() or c in (' .,'): w += n
else: w += n * 1.3
w += 2 # assure room for unit negative numbers and padding
newWidth = min(int(w) * widthPerCharacter,maxWidth)
ws.column_dimensions[colLetter].width = newWidth if currentWidth is None else max(currentWidth,newWidth)
except Exception as ex:
#message = ErrorMgr.getError('CANNOT_ADJUST_WIDTH').format(cell,colLetter, ex)
self.controller.logDebug(("Internal error in worksheet generation: {} could not adjust width on column {}: {}").format(
cell, colLetter, ex), file='Xlout.py', messageCode="debug")
return cell
mergedAreas = {} # colNumber: (colspan,lastrow)
for rowNum, trElt in enumerate(tableElt.iterchildren(tag="tr")):
# remove passed mergedAreas
for mergeCol in [col
for col, mergedArea in mergedAreas.items()
if rowNum >= mergedArea[1]]: # rowNum is 0-based, row is 1-based
del mergedAreas[mergeCol]
col = 1 # openpyxl 2.0.2 cell A1 row=1 col=1
for trTdElt in trElt.iterchildren():
if trTdElt.tag in ('th', 'td'):
if col == 1:
row += 1 # new row
if col in mergedAreas:
col += mergedAreas[col][0]
colspan = intCol(trTdElt, "colspan", 1)
rowspan = intCol(trTdElt, "rowspan", 1)
if rowspan > 1:
mergedAreas[col] = (colspan, row + rowspan - 1)
#text = trTdElt.text_content().strip()
text = ''.join([s for s in trTdElt.itertext(tag=lxml.etree.Element)])
textNodes = tryExtractingTextNodes(text)
if textNodes is not None:
text = textNodes
if not text == '':
isBold = any(True for ignore in trTdElt.iterdescendants('strong','b'))
ignore = populateCell(col,row,trTdElt.get("class"),text,trTdElt.tag,isBold)
elif rowspan > 1 or colspan > 1:
ws.cell(row=row,column=col)
if colspan > 1 or rowspan > 1:
# openpyxl bug in 2.0.2, these row/col are 0 offset, not 1 offset
#ws.merge_cells(start_row=row, end_row=row+rowspan-1, start_column=col, end_column=col+colspan-1)
ws.merge_cells(range_string='%s%s:%s%s' % (openpyxl.utils.get_column_letter(col),
row,
openpyxl.utils.get_column_letter(col+colspan-1),
row+rowspan-1))
col += colspan
except (lxml.etree.LxmlError, lxml.etree.XSLTError) as err:
self.controller.logDebug("Internal error in worksheet generation: {}".format(err.args),file='Xlout.py', messageCode="debug")
except (Exception) as err:
try: message = err.message
except: message = str(getattr(err,'args',err))
self.controller.logDebug("Internal error in worksheet generation: {}".format(message),file='Xlout.py', messageCode="debug")
def tryExtractingTextNodes(text):
# if the string looks like HTML, and it can be parsed, then
# strip out all the tags
# collapse all whitespace but preserve one newline whenever one appears
# in whitespace except at the beginning and end.
# Otherwise return None.
# This flattening can't be done effectively in XSLT because the quoted text inside the XML element is HTML.
if '<' in text and '>' in text:
blank = ' '
newline = '\n'
blanks_pat='['+''.join(['\xA0','\x40','\x09',blank])+']+'
newlines_pat='[ \r\n]*[\r\n][ \r\n]*'
try:
tree = lxml.etree.HTML('<body>'+text+'</body>')
plain = ' '.join([x for x in tree.itertext(tag=lxml.etree.Element,with_tail=False)])
plain = re.sub(blanks_pat,blank,plain)
plain = re.sub(newlines_pat,newline,plain)
plain = plain.strip('\n')
del tree
return plain
except Exception as ignore:
return None
return None # from tryExtractingTextNodes