Skip to content

Commit 1e8dfdc

Browse files
committed
including miscutils from w4py3
1 parent 62efe65 commit 1e8dfdc

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

54 files changed

+4069
-0
lines changed

MiscUtils/CSVJoiner.py

+26
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
"""CSVJoiner.py
2+
3+
A helper function for joining CSV fields.
4+
"""
5+
6+
7+
def joinCSVFields(fields):
    """Create a CSV record by joining its fields.

    Returns a CSV record (i.e. a string) from a sequence of fields.
    Fields containing commas (,), double quotes (") or line breaks
    are quoted, and double quotes are escaped ("").
    The terminating newline is *not* included.

    Raises a TypeError if any of the fields is not a string.
    """
    newFields = []
    for field in fields:
        if not isinstance(field, str):
            # UnicodeDecodeError cannot be constructed from a single
            # message argument (it requires five arguments), so raising it
            # here only produced a confusing TypeError about the argument
            # count. Raise the TypeError directly with the real message.
            raise TypeError('CSV fields should be strings')
        if '"' in field:
            # embedded quotes are escaped by doubling them
            newField = '"{}"'.format(field.replace('"', '""'))
        elif ',' in field or '\n' in field or '\r' in field:
            # separators and line breaks only require surrounding quotes
            newField = f'"{field}"'
        else:
            newField = field
        newFields.append(newField)
    return ','.join(newFields)

MiscUtils/CSVParser.py

+264
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,264 @@
1+
"""CSVParser.py
2+
3+
A parser for CSV files.
4+
"""
5+
6+
# The states of the parser
7+
(StartRecord, StartField, InField, QuoteInField,
 InQuotedField, QuoteInQuotedField, EndQuotedField) = range(7)

# State handlers can return Finished to terminate parsing early
Finished = 10


class ParseError(Exception):
    """CSV file parse error."""


class CSVParser:
    """Parser for CSV files.

    Parses CSV files including all subtleties such as:

      * commas in fields
      * double quotes in fields
      * embedded newlines in fields

    Examples of programs that produce such beasts include MySQL and Excel.

    For a higher-level, friendlier CSV class with many conveniences,
    see `DataTable` (which uses this class for its parsing).

    Example::

        records = []
        parse = CSVParser().parse
        for line in lines:
            results = parse(line)
            if results is not None:
                records.append(results)


    CREDIT

    The algorithm was taken directly from the open source Python
    C-extension, csv: https://www.object-craft.com.au/projects/csv/

    It would be nice to use the csv module when present, since it is
    substantially faster. Before that can be done, it needs to support
    `allowComments` and `stripWhitespace`, and pass the TestCSVParser.py
    test suite.
    """

    def __init__(
            self, allowComments=True, stripWhitespace=True, fieldSep=',',
            autoReset=True, doubleQuote=True):
        """Create a new CSV parser.

        `allowComments`:
            If true (the default), then comment lines using the Python comment
            marker are allowed.
        `stripWhitespace`:
            If true (the default), then left and right whitespace is stripped
            off from all fields.
        `fieldSep`:
            Defines the field separator (a comma by default). The input is
            compared against it one character at a time, so it should be
            a single character.
        `autoReset`:
            If true (the default), recover from errors automatically.
        `doubleQuote`:
            If true (the default), assume quotes in fields are escaped by
            appearing doubled.
        """
        # settings
        self._allowComments = allowComments
        self._stripWhitespace = stripWhitespace
        self._doubleQuote = doubleQuote
        self._fieldSep = fieldSep
        self._autoReset = autoReset

        # Other
        self._state = StartRecord
        self._fields = []  # the completed fields of the current record
        self._hadParseError = False
        self._field = []  # a list of chars for the cur field
        self.addChar = self._field.append

        # The handlers for the various states,
        # indexed by the numeric value of the state
        self._handlers = [
            self.startRecord,
            self.startField,
            self.inField,
            self.quoteInField,
            self.inQuotedField,
            self.quoteInQuotedField,
            self.endQuotedField,
        ]

    # region Parse

    def parse(self, line):
        """Parse a single line and return a list of string fields.

        Returns None if the CSV record contains embedded newlines and
        the record is not yet complete.

        Raises a ParseError if the line contains a line break anywhere
        but at its very end, or if a closing quote is not followed by
        the field separator or the end of the line.
        """
        if self._autoReset and self._hadParseError:
            self.reset()
        handlers = self._handlers

        i = 0
        lineLen = len(line)
        while i < lineLen:
            c = line[i]
            if c == '\r':
                i += 1
                if i == lineLen:
                    break  # Mac end of line
                c = line[i]
                if c == '\n':
                    i += 1
                    if i == lineLen:
                        break  # Win end of line

                self._hadParseError = True
                raise ParseError('Newline inside string')

            if c == '\n':
                i += 1
                if i == lineLen:
                    break  # Unix end of line

                self._hadParseError = True
                raise ParseError('Newline inside string')

            if handlers[self._state](c) == Finished:
                break  # process a character

            i += 1

        handlers[self._state]('\0')  # signal the end of the input

        if self._state == StartRecord:
            # the record is complete, hand over the collected fields
            fields = self._fields
            self._fields = []
            if self._stripWhitespace:
                fields = [field.strip() for field in fields]
            return fields

        return None  # indicates multi-line record; e.g. not finished

    # endregion Parse

    # region Reset

    def reset(self):
        """Reset the parser.

        Resets the parser to a fresh state in order to recover from
        exceptions. But if autoReset is true (the default), this is
        done automatically.
        """
        self._fields = []
        # Also discard the characters of a partially parsed field; otherwise
        # they would be prepended to the first field of the next record.
        self._field = []
        self.addChar = self._field.append
        self._state = StartRecord
        self._hadParseError = False

    # endregion Reset

    # region State Handlers

    # Each handler receives a single character; the character '\0' is
    # passed by parse() to signal the end of the input line.

    def startRecord(self, c):
        """Handle the first character of a new record."""
        if c != '\0':  # not empty line
            if c == '#' and self._allowComments:
                return Finished
            self._state = StartField
            self.startField(c)

    def startField(self, c):
        """Handle the first character of a new field."""
        if c == '"':
            self._state = InQuotedField  # start quoted field
        elif c == self._fieldSep:
            self.saveField()  # save empty field
        elif c == ' ' and self._stripWhitespace:
            pass  # skip over preceding whitespace
        elif c == '\0':
            self.saveField()  # save empty field
            self._state = StartRecord
        else:
            self.addChar(c)  # begin new unquoted field
            self._state = InField

    def inField(self, c):
        """Handle a character inside of an unquoted field."""
        if c == self._fieldSep:
            self.saveField()
            self._state = StartField
        elif c == '\0':
            self.saveField()  # end of line
            self._state = StartRecord
        elif c == '"' and self._doubleQuote:
            self._state = QuoteInField
        else:
            self.addChar(c)  # normal character

    def quoteInField(self, c):
        """Handle the character after a quote in an unquoted field."""
        self.addChar('"')  # the quote seen by inField() is kept in any case
        if c == '"':
            self._state = InField  # save "" as "
        elif c == '\0':
            self.saveField()  # end of line
            self._state = StartRecord
        elif c == self._fieldSep:
            self.saveField()
            self._state = StartField
        else:
            self.addChar(c)  # normal character
            self._state = InField

    def inQuotedField(self, c):
        """Handle a character inside of a quoted field."""
        if c == '"':
            if self._doubleQuote:
                # may be either an escaped quote or the closing quote
                self._state = QuoteInQuotedField
            else:
                self.saveField()  # end of field
                self._state = EndQuotedField
        elif c == '\0':
            # the field continues on the next line passed to parse()
            self.addChar('\n')  # end of line
        else:
            self.addChar(c)  # normal character

    def quoteInQuotedField(self, c):
        """Handle the character after a quote in a quoted field."""
        if c == '"':
            self.addChar('"')  # save "" as "
            self._state = InQuotedField
        elif c == self._fieldSep:
            self.saveField()
            self._state = StartField
        elif c == ' ' and self._stripWhitespace:
            pass  # skip it
        elif c == '\0':
            self.saveField()  # end of line
            self._state = StartRecord
        else:
            self._hadParseError = True  # illegal
            raise ParseError(f'{self._fieldSep} expected after "')

    def endQuotedField(self, c):
        """Handle the character after the closing quote of a quoted field.

        This state is only entered when doubleQuote is false.
        """
        if c == self._fieldSep:  # seen closing " on quoted field
            self._state = StartField  # wait for new field
        elif c == '\0':
            self._state = StartRecord  # end of line
        else:
            self._hadParseError = True
            raise ParseError(f'{self._fieldSep} expected after "')

    def saveField(self):
        """Append the current field to the record and start a new one."""
        self._fields.append(''.join(self._field))
        self._field = []
        # addChar is bound to the list, so it must be rebound as well
        self.addChar = self._field.append

    # endregion State Handlers


# Call the global function parse() if you like the default settings
_parser = CSVParser()
parse = _parser.parse

0 commit comments

Comments
 (0)