--- * module
--- ** doc
-- In Emacs, use TAB on lines beginning with "-- *" to collapse/expand sections.
{-|
A reader for a CSV rules file.
This reads the actual data from a file specified by a `source` rule
or from a similarly-named file in the same directory.
Most of the code for reading rules files and csv files is in this module.
-}
-- Lots of haddocks in this file are for non-exported types.
-- Here's a command that will render them:
-- stack haddock hledger-lib --fast --no-haddock-deps --haddock-arguments='--ignore-all-exports' --open
--- ** language
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE ViewPatterns #-}
{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
{-# LANGUAGE LambdaCase #-}
--- ** exports
module Hledger.Read.RulesReader (
-- * Reader
reader,
-- * Misc.
readJournalFromCsv,
-- readRulesFile,
-- parseCsvRules,
-- validateCsvRules,
-- CsvRules,
dataFileFor,
rulesFileFor,
parseBalanceAssertionType,
-- * Tests
tests_RulesReader,
)
where
--- ** imports
import Prelude hiding (Applicative(..))
import Control.Applicative (Applicative(..))
import Control.Monad (unless, when, void)
import Control.Monad.Except (ExceptT(..), liftEither, throwError)
import qualified Control.Monad.Fail as Fail
import Control.Monad.IO.Class (MonadIO, liftIO)
import Control.Monad.State.Strict (StateT, get, modify', evalStateT)
import Control.Monad.Trans.Class (lift)
import Data.Char (toLower, isDigit, isSpace, isAlphaNum, ord)
import Data.Bifunctor (first)
import Data.Functor ((<&>))
import Data.List (elemIndex, foldl', mapAccumL, nub, sortOn)
import Data.List.Extra (groupOn)
import Data.Maybe (catMaybes, fromMaybe, isJust)
import Data.MemoUgly (memo)
import qualified Data.Set as S
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import qualified Data.Text.IO as T
import Data.Time ( Day, TimeZone, UTCTime, LocalTime, ZonedTime(ZonedTime),
defaultTimeLocale, getCurrentTimeZone, localDay, parseTimeM, utcToLocalTime, localTimeToUTC, zonedTimeToUTC)
import Safe (atMay, headMay, lastMay, readMay)
import System.FilePath ((</>), takeDirectory, takeExtension, stripExtension, takeFileName)
import qualified Data.Csv as Cassava
import qualified Data.Csv.Parser.Megaparsec as CassavaMegaparsec
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as BL
import Data.Foldable (asum, toList)
import Text.Megaparsec hiding (match, parse)
import Text.Megaparsec.Char (char, newline, string, digitChar)
import Text.Megaparsec.Custom (parseErrorAt)
import Text.Printf (printf)
import Hledger.Data
import Hledger.Utils
import Hledger.Read.Common (aliasesFromOpts, Reader(..), InputOpts(..), amountp, statusp, journalFinalise, accountnamep, commenttagsp )
import Hledger.Read.CsvUtils
import System.Directory (doesFileExist, getHomeDirectory)
import Data.Either (fromRight)
--- ** doctest setup
-- $setup
-- >>> :set -XOverloadedStrings
--- ** reader
_READER__________________________________________ = undefined -- VSCode outline separator
reader :: MonadIO m => Reader m
reader = Reader
{rFormat = Rules
,rExtensions = ["rules"]
,rReadFn = parse
,rParser = error' "sorry, rules files can't be included" -- PARTIAL:
}
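-- | Is this file path just a bare file name, with no directory part ?
-- A couple of illustrative examples:
--
-- >>> isFileName "bank.csv"
-- True
-- >>> isFileName "Downloads/bank.csv"
-- False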
isFileName f = takeFileName f == f
getDownloadDir = do
home <- getHomeDirectory
return $ home </> "Downloads" -- XXX
-- | Parse and post-process a "Journal" from the given rules file path, or give an error.
-- A data file is inferred from the @source@ rule, otherwise from a similarly-named file
-- in the same directory.
-- The source rule can specify a glob pattern and supports ~ for home directory.
-- If it is a bare file name it will be relative to the default download directory
-- on this system. If it is a relative file path it will be relative to the rules
-- file's directory. When a glob pattern matches multiple files, the most recently
-- modified one is used. (Eg in case of multiple numbered downloads, the newest
-- download will be used.)
-- The provided text, and any --rules-file option, are ignored by this reader.
-- Balance assertions are not checked.
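-- Eg (an illustrative sketch): "source Checking1*.csv" would read the newest matching file
-- in the download directory, while "source ./checking.csv" would read a file relative to
-- the rules file's directory.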
parse :: InputOpts -> FilePath -> Text -> ExceptT String IO Journal
parse iopts f _ = do
rules <- readRulesFile $ dbg4 "reading rules file" f
-- XXX higher-than-usual debug level for file reading, to bypass excessive noise from elsewhere; normally 6 or 7
mdatafile <- liftIO $ do
dldir <- getDownloadDir
let rulesdir = takeDirectory f
let msource = T.unpack <$> getDirective "source" rules
fs <- case msource of
Just src -> expandGlob dir (dbg4 "source" src) >>= sortByModTime <&> dbg4 ("matched files"<>desc<>", newest first")
where (dir,desc) = if isFileName src then (dldir," in download directory") else (rulesdir,"")
Nothing -> return [maybe err (dbg4 "inferred source") $ dataFileFor f] -- shouldn't fail, f has .rules extension
where err = error' $ "could not infer a data file for " <> f
return $ dbg4 "data file" $ headMay fs
case mdatafile of
Nothing -> return nulljournal -- data file specified by source rule was not found
Just dat -> do
exists <- liftIO $ doesFileExist dat
if not (dat=="-" || exists)
then return nulljournal -- data file inferred from rules file name was not found
else do
t <- liftIO $ readFileOrStdinPortably dat
readJournalFromCsv (Just $ Left rules) dat t Nothing
-- apply any command line account aliases. Can fail with a bad replacement pattern.
>>= liftEither . journalApplyAliases (aliasesFromOpts iopts)
-- journalFinalise assumes the journal's items are
-- reversed, as produced by JournalReader's parser.
-- But here they are already properly ordered. So we'd
-- better preemptively reverse them once more. XXX inefficient
. journalReverse
>>= journalFinalise iopts{balancingopts_=(balancingopts_ iopts){ignore_assertions_=True}} f ""
--- ** reading rules files
--- *** rules utilities
_RULES_READING__________________________________________ = undefined
-- | Given a rules file path, what would be the corresponding data file ?
-- (Remove a .rules extension.)
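--
-- A couple of illustrative examples:
--
-- >>> dataFileFor "bank.csv.rules"
-- Just "bank.csv"
-- >>> dataFileFor "bank.csv"
-- Nothing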
dataFileFor :: FilePath -> Maybe FilePath
dataFileFor = stripExtension "rules"
-- | Given a csv file path, what would be the corresponding rules file ?
-- (Add a .rules extension.)
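--
-- An illustrative example:
--
-- >>> rulesFileFor "bank.csv"
-- "bank.csv.rules"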
rulesFileFor :: FilePath -> FilePath
rulesFileFor = (++ ".rules")
-- | An exception-throwing IO action that reads and validates
-- the specified CSV rules file (which may include other rules files).
readRulesFile :: FilePath -> ExceptT String IO CsvRules
readRulesFile f =
liftIO (do
dbg6IO "using conversion rules file" f
readFilePortably f >>= expandIncludes (takeDirectory f)
) >>= either throwError return . parseAndValidateCsvRules f
-- | Inline all files referenced by include directives in this hledger CSV rules text, recursively.
-- Included file paths may be relative to the directory of the provided file path.
-- This is done as a pre-parse step to simplify the CSV rules parser.
expandIncludes :: FilePath -> Text -> IO Text
expandIncludes dir0 content = mapM (expandLine dir0) (T.lines content) <&> T.unlines
where
expandLine dir1 line =
case line of
(T.stripPrefix "include " -> Just f) -> expandIncludes dir2 =<< T.readFile f'
where
f' = dir1 </> T.unpack (T.dropWhile isSpace f)
dir2 = takeDirectory f'
_ -> return line
-- defaultRulesText :: FilePath -> Text
-- defaultRulesText _csvfile = T.pack $ unlines
-- ["# hledger csv conversion rules" -- for " ++ csvFileFor (takeFileName csvfile)
-- ,"# cf http://hledger.org/hledger.html#csv"
-- ,""
-- ,"account1 assets:bank:checking"
-- ,""
-- ,"fields date, description, amount1"
-- ,""
-- ,"#skip 1"
-- ,"#newest-first"
-- ,""
-- ,"#date-format %-d/%-m/%Y"
-- ,"#date-format %-m/%-d/%Y"
-- ,"#date-format %Y-%h-%d"
-- ,""
-- ,"#currency $"
-- ,""
-- ,"if ITUNES"
-- ," account2 expenses:entertainment"
-- ,""
-- ,"if (TO|FROM) SAVINGS"
-- ," account2 assets:bank:savings\n"
-- ]
-- | An error-throwing IO action that parses this text as CSV conversion rules
-- and runs some extra validation checks. The file path is used in error messages.
parseAndValidateCsvRules :: FilePath -> T.Text -> Either String CsvRules
parseAndValidateCsvRules rulesfile s =
case parseCsvRules rulesfile s of
Left err -> Left $ customErrorBundlePretty err
Right rules -> first makeFancyParseError $ validateCsvRules rules
where
makeFancyParseError :: String -> String
makeFancyParseError errorString =
parseErrorPretty (FancyError 0 (S.singleton $ ErrorFail errorString) :: ParseError Text String)
instance ShowErrorComponent String where
showErrorComponent = id
-- | Parse this text as CSV conversion rules. The file path is for error messages.
parseCsvRules :: FilePath -> T.Text -> Either (ParseErrorBundle T.Text HledgerParseErrorData) CsvRules
-- parseCsvRules rulesfile s = runParser csvrulesfile nullrules{baseAccount=takeBaseName rulesfile} rulesfile s
parseCsvRules = runParser (evalStateT rulesp defrules)
-- | Return the validated rules, or an error.
validateCsvRules :: CsvRules -> Either String CsvRules
validateCsvRules rules = do
unless (isAssigned "date") $ Left "Please specify (at top level) the date field. Eg: date %1"
Right rules
where
isAssigned f = isJust $ hledgerField rules [] f
--- *** rules types
_RULES_TYPES__________________________________________ = undefined
-- | A set of data definitions and account-matching patterns sufficient to
-- convert a particular CSV data file into meaningful journal transactions.
data CsvRules' a = CsvRules' {
rdirectives :: [(DirectiveName,Text)],
-- ^ top-level rules, as (keyword, value) pairs
rcsvfieldindexes :: [(CsvFieldName, CsvFieldIndex)],
-- ^ csv field names and their column number, if declared by a fields list
rassignments :: [(HledgerFieldName, FieldTemplate)],
-- ^ top-level assignments to hledger fields, as (field name, value template) pairs
rconditionalblocks :: [ConditionalBlock],
-- ^ conditional blocks, containing additional assignments/rules to apply to matched csv records
rblocksassigning :: a -- (String -> [ConditionalBlock])
-- ^ all conditional blocks which can potentially assign field with a given name (memoized)
}
-- | Type used by parsers. Directives, assignments and conditional blocks
-- are in reverse order compared to the file, and rblocksassigning is non-functional,
-- so it cannot yet be used for processing CSV records.
type CsvRulesParsed = CsvRules' ()
-- | Type used after parsing is done. Directives, assignments and conditional blocks
-- are in the same order as they were in the input file and rblocksassigning is functional.
-- Ready to be used for CSV record processing
type CsvRules = CsvRules' (Text -> [ConditionalBlock]) -- XXX simplify
instance Eq CsvRules where
r1 == r2 = (rdirectives r1, rcsvfieldindexes r1, rassignments r1) ==
(rdirectives r2, rcsvfieldindexes r2, rassignments r2)
-- Custom Show instance used for debug output: omit the rblocksassigning field, which isn't showable.
instance Show CsvRules where
show r = "CsvRules { rdirectives = " ++ show (rdirectives r) ++
", rcsvfieldindexes = " ++ show (rcsvfieldindexes r) ++
", rassignments = " ++ show (rassignments r) ++
", rconditionalblocks = " ++ show (rconditionalblocks r) ++
" }"
type CsvRulesParser a = StateT CsvRulesParsed SimpleTextParser a
-- | The keyword of a CSV rule - "fields", "skip", "if", etc.
type DirectiveName = Text
-- | CSV field name.
type CsvFieldName = Text
-- | 1-based CSV column number.
type CsvFieldIndex = Int
-- | Percent symbol followed by a CSV field name or column number. Eg: %date, %1.
type CsvFieldReference = Text
-- | One of the standard hledger fields or pseudo-fields that can be assigned to.
-- Eg date, account1, amount, amount1-in, date-format.
type HledgerFieldName = Text
-- | A text value to be assigned to a hledger field, possibly
-- containing csv field references to be interpolated.
type FieldTemplate = Text
-- | A reference to a regular expression match group. Eg \1.
type MatchGroupReference = Text
-- | A strptime date parsing pattern, as supported by Data.Time.Format.
type DateFormat = Text
-- | A prefix for a matcher test: & (AND), ! (NOT), or none (implicit OR).
data MatcherPrefix = And | Not | None
deriving (Show, Eq)
-- | A single test for matching a CSV record, in one way or another.
data Matcher =
RecordMatcher MatcherPrefix Regexp -- ^ match if this regexp matches the overall CSV record
| FieldMatcher MatcherPrefix CsvFieldReference Regexp -- ^ match if this regexp matches the referenced CSV field's value
deriving (Show, Eq)
-- | A conditional block: a set of CSV record matchers, and a sequence
-- of rules which will be enabled only if one or more of the matchers
-- succeeds.
--
-- Three types of rule are allowed inside conditional blocks: field
-- assignments, skip, end. (A skip or end rule is stored as if it was
-- a field assignment, and executed in validateCsv. XXX)
data ConditionalBlock = CB {
cbMatchers :: [Matcher]
,cbAssignments :: [(HledgerFieldName, FieldTemplate)]
} deriving (Show, Eq)
defrules :: CsvRulesParsed
defrules = CsvRules' {
rdirectives=[],
rcsvfieldindexes=[],
rassignments=[],
rconditionalblocks=[],
rblocksassigning = ()
}
-- | Create CsvRules from the content parsed out of the rules file
mkrules :: CsvRulesParsed -> CsvRules
mkrules rules =
let conditionalblocks = reverse $ rconditionalblocks rules
maybeMemo = if length conditionalblocks >= 15 then memo else id
in
CsvRules' {
rdirectives=reverse $ rdirectives rules,
rcsvfieldindexes=rcsvfieldindexes rules,
rassignments=reverse $ rassignments rules,
rconditionalblocks=conditionalblocks,
rblocksassigning = maybeMemo (\f -> filter (any ((==f).fst) . cbAssignments) conditionalblocks)
}
--- *** rules parsers
_RULES_PARSING__________________________________________ = undefined
{-
Grammar for the CSV conversion rules, more or less:
RULES: RULE*
RULE: ( SOURCE | FIELD-LIST | FIELD-ASSIGNMENT | CONDITIONAL-BLOCK | SKIP | TIMEZONE | NEWEST-FIRST | INTRA-DAY-REVERSED | DATE-FORMAT | DECIMAL-MARK | COMMENT | BLANK ) NEWLINE
SOURCE: source SPACE FILEPATH
FIELD-LIST: fields SPACE FIELD-NAME ( SPACE? , SPACE? FIELD-NAME )*
FIELD-NAME: QUOTED-FIELD-NAME | BARE-FIELD-NAME
QUOTED-FIELD-NAME: " (any CHAR except double-quote)+ "
BARE-FIELD-NAME: any CHAR except space, tab, #, ;
FIELD-ASSIGNMENT: JOURNAL-FIELD ASSIGNMENT-SEPARATOR FIELD-VALUE
JOURNAL-FIELD: date | date2 | status | code | description | comment | account1 | account2 | amount | JOURNAL-PSEUDO-FIELD
JOURNAL-PSEUDO-FIELD: amount-in | amount-out | currency
ASSIGNMENT-SEPARATOR: SPACE | ( : SPACE? )
FIELD-VALUE: VALUE (possibly containing CSV-FIELD-REFERENCEs and REGEX-MATCHGROUP-REFERENCEs)
CSV-FIELD-REFERENCE: % CSV-FIELD
REGEX-MATCHGROUP-REFERENCE: \ DIGIT+
CSV-FIELD: ( FIELD-NAME | FIELD-NUMBER ) (corresponding to a CSV field)
FIELD-NUMBER: DIGIT+
CONDITIONAL-BLOCK: if ( FIELD-MATCHER NEWLINE )+ INDENTED-BLOCK
FIELD-MATCHER: ( CSV-FIELD-NAME SPACE? )? ( MATCHOP SPACE? )? PATTERNS
MATCHOP: ~
PATTERNS: ( NEWLINE REGEXP )* REGEXP
INDENTED-BLOCK: ( SPACE ( FIELD-ASSIGNMENT | COMMENT ) NEWLINE )+
REGEXP: ( NONSPACE CHAR* ) SPACE?
VALUE: SPACE? ( CHAR* ) SPACE?
COMMENT: SPACE? COMMENT-CHAR VALUE
COMMENT-CHAR: # | ; | *
NONSPACE: any CHAR not a SPACE-CHAR
BLANK: SPACE?
SPACE: SPACE-CHAR+
SPACE-CHAR: space | tab
CHAR: any character except newline
DIGIT: 0-9
-}
addDirective :: (DirectiveName, Text) -> CsvRulesParsed -> CsvRulesParsed
addDirective d r = r{rdirectives=d:rdirectives r}
addAssignment :: (HledgerFieldName, FieldTemplate) -> CsvRulesParsed -> CsvRulesParsed
addAssignment a r = r{rassignments=a:rassignments r}
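-- | Record the CSV field indexes and hledger field assignments implied by a fields list.
-- Eg (illustrative): "fields date, description, amount1" gives csv field indexes
-- [("date",1),("description",2),("amount1",3)] and the assignments
-- date %1, description %2, amount1 %3.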
setIndexesAndAssignmentsFromList :: [CsvFieldName] -> CsvRulesParsed -> CsvRulesParsed
setIndexesAndAssignmentsFromList fs = addAssignmentsFromList fs . setCsvFieldIndexesFromList fs
where
setCsvFieldIndexesFromList :: [CsvFieldName] -> CsvRulesParsed -> CsvRulesParsed
setCsvFieldIndexesFromList fs' r = r{rcsvfieldindexes=zip fs' [1..]}
addAssignmentsFromList :: [CsvFieldName] -> CsvRulesParsed -> CsvRulesParsed
addAssignmentsFromList fs' r = foldl' maybeAddAssignment r journalfieldnames
where
maybeAddAssignment rules f = (maybe id addAssignmentFromIndex $ elemIndex f fs') rules
where
addAssignmentFromIndex i = addAssignment (f, T.pack $ '%':show (i+1))
addConditionalBlock :: ConditionalBlock -> CsvRulesParsed -> CsvRulesParsed
addConditionalBlock b r = r{rconditionalblocks=b:rconditionalblocks r}
addConditionalBlocks :: [ConditionalBlock] -> CsvRulesParsed -> CsvRulesParsed
addConditionalBlocks bs r = r{rconditionalblocks=bs++rconditionalblocks r}
rulesp :: CsvRulesParser CsvRules
rulesp = do
_ <- many $ choice
[blankorcommentlinep <?> "blank or comment line"
,(directivep >>= modify' . addDirective) <?> "directive"
,(fieldnamelistp >>= modify' . setIndexesAndAssignmentsFromList) <?> "field name list"
,(fieldassignmentp >>= modify' . addAssignment) <?> "field assignment"
-- conditionalblockp backtracks because it shares "if" prefix with conditionaltablep.
,try (conditionalblockp >>= modify' . addConditionalBlock) <?> "conditional block"
-- 'reverse' is there to ensure that conditions are added in the order they are listed in the file
,(conditionaltablep >>= modify' . addConditionalBlocks . reverse) <?> "conditional table"
]
eof
mkrules <$> get
blankorcommentlinep :: CsvRulesParser ()
blankorcommentlinep = lift (dbgparse 8 "trying blankorcommentlinep") >> choiceInState [blanklinep, commentlinep]
blanklinep :: CsvRulesParser ()
blanklinep = lift skipNonNewlineSpaces >> newline >> return () <?> "blank line"
commentlinep :: CsvRulesParser ()
commentlinep = lift skipNonNewlineSpaces >> commentcharp >> lift restofline >> return () <?> "comment line"
commentcharp :: CsvRulesParser Char
commentcharp = oneOf (";#*" :: [Char])
directivep :: CsvRulesParser (DirectiveName, Text)
directivep = (do
lift $ dbgparse 8 "trying directive"
d <- choiceInState $ map (lift . string) directives
v <- (((char ':' >> lift (many spacenonewline)) <|> lift (some spacenonewline)) >> directivevalp)
<|> (optional (char ':') >> lift skipNonNewlineSpaces >> lift eolof >> return "")
return (d, v)
) <?> "directive"
directives :: [Text]
directives =
["source"
,"date-format"
,"decimal-mark"
,"separator"
-- ,"default-account"
-- ,"default-currency"
,"skip"
,"timezone"
,"newest-first"
,"intra-day-reversed"
, "balance-type"
]
directivevalp :: CsvRulesParser Text
directivevalp = T.pack <$> anySingle `manyTill` lift eolof
fieldnamelistp :: CsvRulesParser [CsvFieldName]
fieldnamelistp = (do
lift $ dbgparse 8 "trying fieldnamelist"
string "fields"
optional $ char ':'
lift skipNonNewlineSpaces1
let separator = lift skipNonNewlineSpaces >> char ',' >> lift skipNonNewlineSpaces
f <- fromMaybe "" <$> optional fieldnamep
fs <- some $ (separator >> fromMaybe "" <$> optional fieldnamep)
lift restofline
return . map T.toLower $ f:fs
) <?> "field name list"
fieldnamep :: CsvRulesParser Text
fieldnamep = quotedfieldnamep <|> barefieldnamep
quotedfieldnamep :: CsvRulesParser Text
quotedfieldnamep =
char '"' *> takeWhile1P Nothing (`notElem` ("\"\n:;#~" :: [Char])) <* char '"'
barefieldnamep :: CsvRulesParser Text
barefieldnamep = takeWhile1P Nothing (`notElem` (" \t\n,;#~" :: [Char]))
fieldassignmentp :: CsvRulesParser (HledgerFieldName, FieldTemplate)
fieldassignmentp = do
lift $ dbgparse 8 "trying fieldassignmentp"
f <- journalfieldnamep
v <- choiceInState [ assignmentseparatorp >> fieldvalp
, lift eolof >> return ""
]
return (f,v)
<?> "field assignment"
journalfieldnamep :: CsvRulesParser Text
journalfieldnamep = do
lift (dbgparse 8 "trying journalfieldnamep")
choiceInState $ map (lift . string) journalfieldnames
maxpostings = 99
-- Transaction fields and pseudo fields for CSV conversion.
-- Names must precede any other name they contain, for the parser
-- (amount-in before amount; date2 before date). TODO: fix
journalfieldnames =
concat [[ "account" <> i
,"amount" <> i <> "-in"
,"amount" <> i <> "-out"
,"amount" <> i
,"balance" <> i
,"comment" <> i
,"currency" <> i
] | x <- [maxpostings, (maxpostings-1)..1], let i = T.pack $ show x]
++
["amount-in"
,"amount-out"
,"amount"
,"balance"
,"code"
,"comment"
,"currency"
,"date2"
,"date"
,"description"
,"status"
,"skip" -- skip and end are not really fields, but we list it here to allow conditional rules that skip records
,"end"
]
assignmentseparatorp :: CsvRulesParser ()
assignmentseparatorp = do
lift $ dbgparse 8 "trying assignmentseparatorp"
_ <- choiceInState [ lift skipNonNewlineSpaces >> char ':' >> lift skipNonNewlineSpaces
, lift skipNonNewlineSpaces1
]
return ()
fieldvalp :: CsvRulesParser Text
fieldvalp = do
lift $ dbgparse 8 "trying fieldvalp"
T.pack <$> anySingle `manyTill` lift eolof
-- A conditional block: one or more matchers, one per line, followed by one or more indented rules.
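-- Eg (an illustrative sketch):
--
--   if (TO|FROM) SAVINGS
--    account2 assets:bank:savings
--
-- or, with several OR'd matchers:
--
--   if
--   ITUNES
--   APPLE.COM/BILL
--    account2 expenses:entertainment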
conditionalblockp :: CsvRulesParser ConditionalBlock
conditionalblockp = do
lift $ dbgparse 8 "trying conditionalblockp"
-- "if\nMATCHER" or "if \nMATCHER" or "if MATCHER"
start <- getOffset
string "if" >> ( (newline >> return Nothing)
<|> (lift skipNonNewlineSpaces1 >> optional newline))
ms <- some matcherp
as <- catMaybes <$>
many (lift skipNonNewlineSpaces1 >>
choice [ lift eolof >> return Nothing
, fmap Just fieldassignmentp
])
when (null as) $
customFailure $ parseErrorAt start $ "start of conditional block found, but no assignment rules afterward\n(assignment rules in a conditional block should be indented)"
return $ CB{cbMatchers=ms, cbAssignments=as}
<?> "conditional block"
-- A conditional table: "if" followed by a separator, followed by some field names,
-- followed by many lines, each of which has:
-- one matcher, followed by field assignments (as many as there are fields)
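-- Eg (an illustrative sketch, using comma as the separator):
--
--   if,account2,comment2
--   ITUNES,expenses:entertainment,
--   (TO|FROM) SAVINGS,assets:bank:savings,transfer
--
-- which behaves like one single-matcher conditional block per line.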
conditionaltablep :: CsvRulesParser [ConditionalBlock]
conditionaltablep = do
lift $ dbgparse 8 "trying conditionaltablep"
start <- getOffset
string "if"
sep <- lift $ satisfy (\c -> not (isAlphaNum c || isSpace c))
fields <- journalfieldnamep `sepBy1` (char sep)
newline
body <- flip manyTill (lift eolof) $ do
off <- getOffset
m <- matcherp' $ void $ char sep
vs <- T.split (==sep) . T.pack <$> lift restofline
if (length vs /= length fields)
then customFailure $ parseErrorAt off $ ((printf "line of conditional table should have %d values, but this one has only %d" (length fields) (length vs)) :: String)
else return (m,vs)
when (null body) $
customFailure $ parseErrorAt start $ "start of conditional table found, but no assignment rules afterward"
return $ flip map body $ \(m,vs) ->
CB{cbMatchers=[m], cbAssignments=zip fields vs}
<?> "conditional table"
-- A single matcher, on one line.
matcherp' :: CsvRulesParser () -> CsvRulesParser Matcher
matcherp' end = try (fieldmatcherp end) <|> recordmatcherp end
matcherp :: CsvRulesParser Matcher
matcherp = matcherp' (lift eolof)
-- A single whole-record matcher.
-- A pattern on the whole line, not beginning with a csv field reference.
recordmatcherp :: CsvRulesParser () -> CsvRulesParser Matcher
recordmatcherp end = do
lift $ dbgparse 8 "trying recordmatcherp"
-- pos <- currentPos
-- _ <- optional (matchoperatorp >> lift skipNonNewlineSpaces >> optional newline)
p <- matcherprefixp
r <- regexp end
return $ RecordMatcher p r
-- when (null ps) $
-- Fail.fail "start of record matcher found, but no patterns afterward\n(patterns should not be indented)"
<?> "record matcher"
-- | A single matcher for a specific field. A csv field reference
-- (like %date or %1), and a pattern on the rest of the line,
-- optionally space-separated. Eg:
-- %description chez jacques
fieldmatcherp :: CsvRulesParser () -> CsvRulesParser Matcher
fieldmatcherp end = do
lift $ dbgparse 8 "trying fieldmatcher"
-- An optional fieldname (default: "all")
-- f <- fromMaybe "all" `fmap` (optional $ do
-- f' <- fieldnamep
-- lift skipNonNewlineSpaces
-- return f')
p <- matcherprefixp
f <- csvfieldreferencep <* lift skipNonNewlineSpaces
-- optional operator.. just ~ (case insensitive infix regex) for now
-- _op <- fromMaybe "~" <$> optional matchoperatorp
lift skipNonNewlineSpaces
r <- regexp end
return $ FieldMatcher p f r
<?> "field matcher"
matcherprefixp :: CsvRulesParser MatcherPrefix
matcherprefixp = do
lift $ dbgparse 8 "trying matcherprefixp"
(char '&' >> lift skipNonNewlineSpaces >> return And) <|> (char '!' >> lift skipNonNewlineSpaces >> return Not) <|> return None
csvfieldreferencep :: CsvRulesParser CsvFieldReference
csvfieldreferencep = do
lift $ dbgparse 8 "trying csvfieldreferencep"
char '%'
T.cons '%' . textQuoteIfNeeded <$> fieldnamep
-- A single regular expression
regexp :: CsvRulesParser () -> CsvRulesParser Regexp
regexp end = do
lift $ dbgparse 8 "trying regexp"
-- notFollowedBy matchoperatorp
c <- lift nonspace
cs <- anySingle `manyTill` end
case toRegexCI . T.strip . T.pack $ c:cs of
Left x -> Fail.fail $ "CSV parser: " ++ x
Right x -> return x
-- -- A match operator, indicating the type of match to perform.
-- -- Currently just ~ meaning case insensitive infix regex match.
-- matchoperatorp :: CsvRulesParser String
-- matchoperatorp = fmap T.unpack $ choiceInState $ map string
-- ["~"
-- -- ,"!~"
-- -- ,"="
-- -- ,"!="
-- ]
_RULES_LOOKUP__________________________________________ = undefined
getDirective :: DirectiveName -> CsvRules -> Maybe FieldTemplate
getDirective directivename = lookup directivename . rdirectives
-- | Look up the value (template) of a csv rule by rule keyword.
csvRule :: CsvRules -> DirectiveName -> Maybe FieldTemplate
csvRule rules = (`getDirective` rules)
-- | Look up the value template assigned to a hledger field by field
-- list/field assignment rules, taking into account the current record and
-- conditional rules.
hledgerField :: CsvRules -> CsvRecord -> HledgerFieldName -> Maybe FieldTemplate
hledgerField rules record f = fmap
(either id (lastCBAssignmentTemplate f))
(getEffectiveAssignment rules record f)
-- | Look up the final value assigned to a hledger field, with csv field
-- references and regular expression match group references interpolated.
hledgerFieldValue rules record f = (flip fmap) (getEffectiveAssignment rules record f)
$ either (renderTemplate rules record)
$ \cb -> let
t = lastCBAssignmentTemplate f cb
r = rules { rconditionalblocks = [cb] } -- XXX handle rblocksassigning
in renderTemplate r record t
lastCBAssignmentTemplate :: HledgerFieldName -> ConditionalBlock -> FieldTemplate
lastCBAssignmentTemplate f = snd . last . filter ((==f).fst) . cbAssignments
maybeNegate :: MatcherPrefix -> Bool -> Bool
maybeNegate Not origbool = not origbool
maybeNegate _ origbool = origbool
-- | Given the conversion rules, a CSV record and a hledger field name, collect
-- the value templates (and their parent Conditional Blocks where applicable)
-- assigned to this field, if any, by top-level field assignments and
-- conditional blocks matching this record.
--
-- Note conditional blocks' patterns are matched against an approximation of the
-- CSV record: all the field values, without enclosing quotes, comma-separated.
--
getEffectiveAssignment
:: CsvRules
-> CsvRecord
-> HledgerFieldName
-> Maybe (Either FieldTemplate ConditionalBlock)
getEffectiveAssignment rules record f = lastMay assignments
where
-- all active assignments to field f, in order
assignments = dbg9 "csv assignments" $ toplevelassignments ++ conditionalassignments
-- all top level field assignments
toplevelassignments = map (Left . snd) $ filter ((==f).fst) $ rassignments rules
-- all conditional blocks assigning to field f and active for the current csv record
conditionalassignments = map Right
$ filter (any (==f) . map fst . cbAssignments)
$ filter (isBlockActive rules record)
$ (rblocksassigning rules) f
-- does this conditional block match the current csv record ?
isBlockActive :: CsvRules -> CsvRecord -> ConditionalBlock -> Bool
isBlockActive rules record CB{..} = any (all matcherMatches) $ groupedMatchers cbMatchers
where
-- does this individual matcher match the current csv record ?
matcherMatches :: Matcher -> Bool
matcherMatches (RecordMatcher prefix pat) = maybeNegate prefix origbool
where
pat' = dbg7 "regex" pat
-- A synthetic whole CSV record to match against. Note, this can be
-- different from the original CSV data:
-- - any whitespace surrounding field values is preserved
-- - any quotes enclosing field values are removed
-- - and the field separator is always comma
-- which means that a field containing a comma will look like two fields.
wholecsvline = dbg7 "wholecsvline" $ T.intercalate "," record
origbool = regexMatchText pat' wholecsvline
matcherMatches (FieldMatcher prefix csvfieldref pat) = maybeNegate prefix origbool
where
-- the value of the referenced CSV field to match against.
csvfieldvalue = dbg7 "csvfieldvalue" $ replaceCsvFieldReference rules record csvfieldref
origbool = regexMatchText pat csvfieldvalue
-- | Group matchers by prefix: each non-&-prefixed matcher starts a new group, and any following &-prefixed matchers join it. Eg:
-- A
-- & B
-- C
-- D
-- & E
-- => [[A, B], [C], [D, E]]
groupedMatchers :: [Matcher] -> [[Matcher]]
groupedMatchers [] = []
groupedMatchers (x:xs) = (x:ys) : groupedMatchers zs
where
(ys, zs) = span (\y -> matcherPrefix y == And) xs
matcherPrefix :: Matcher -> MatcherPrefix
matcherPrefix (RecordMatcher prefix _) = prefix
matcherPrefix (FieldMatcher prefix _ _) = prefix
-- | Render a field assignment's template, possibly interpolating referenced
-- CSV field values or match groups. Outer whitespace is removed from interpolated values.
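--
-- Eg (illustrative): with a "fields date, amount" list and the CSV record
-- ["2024-01-02"," 5.00"], the template "%amount EUR" renders as "5.00 EUR".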
renderTemplate :: CsvRules -> CsvRecord -> FieldTemplate -> Text
renderTemplate rules record t =
maybe t mconcat $ parseMaybe
(many
( literaltextp
<|> (matchrefp <&> replaceRegexGroupReference rules record)
<|> (fieldrefp <&> replaceCsvFieldReference rules record)
)
)
t
where
literaltextp :: SimpleTextParser Text
literaltextp = some (nonBackslashOrPercent <|> nonRefBackslash <|> nonRefPercent) <&> T.pack
where
nonBackslashOrPercent = noneOf ['\\', '%'] <?> "character other than backslash or percent"
nonRefBackslash = try (char '\\' <* notFollowedBy digitChar) <?> "backslash that does not begin a match group reference"
nonRefPercent = try (char '%' <* notFollowedBy (satisfy isFieldNameChar)) <?> "percent that does not begin a field reference"
matchrefp = liftA2 T.cons (char '\\') (takeWhile1P (Just "matchref") isDigit)
fieldrefp = liftA2 T.cons (char '%') (takeWhile1P (Just "reference") isFieldNameChar)
isFieldNameChar c = isAlphaNum c || c == '_' || c == '-'
-- | Replace something that looks like a Regex match group reference with the
-- resulting match group value after applying the Regex.
replaceRegexGroupReference :: CsvRules -> CsvRecord -> MatchGroupReference -> Text
replaceRegexGroupReference rules record s = case T.uncons s of
Just ('\\', group) -> fromMaybe "" $ regexMatchValue rules record group
_ -> s
regexMatchValue :: CsvRules -> CsvRecord -> Text -> Maybe Text
regexMatchValue rules record sgroup = let
matchgroups = concatMap (getMatchGroups rules record)
$ concatMap cbMatchers
$ filter (isBlockActive rules record)
$ rconditionalblocks rules
-- ^ XXX adjusted to not use memoized field as caller might be sending a subset of rules with just one CB (hacky)
group = (read (T.unpack sgroup) :: Int) - 1 -- adjust to 0-indexing
in atMay matchgroups group
getMatchGroups :: CsvRules -> CsvRecord -> Matcher -> [Text]
getMatchGroups _ record (RecordMatcher _ regex) = let
txt = T.intercalate "," record -- see caveats of wholecsvline, in `isBlockActive`
in regexMatchTextGroups regex txt
getMatchGroups rules record (FieldMatcher _ fieldref regex) = let
txt = replaceCsvFieldReference rules record fieldref
in regexMatchTextGroups regex txt
-- | Replace something that looks like a reference to a csv field ("%date" or "%1")
-- with that field's value. If it doesn't look like a field reference, or if we
-- can't find such a field, replace it with the empty string.
replaceCsvFieldReference :: CsvRules -> CsvRecord -> CsvFieldReference -> Text
replaceCsvFieldReference rules record s = case T.uncons s of
Just ('%', fieldname) -> fromMaybe "" $ csvFieldValue rules record fieldname
_ -> s
-- | Get the (whitespace-stripped) value of a CSV field, identified by its name or
-- column number ("date" or "1"), from the given CSV record, if such a field exists.
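--
-- Eg (illustrative): with csv field indexes [("date",1)] and the record ["2024-01-02 "],
-- both "date" and "1" give Just "2024-01-02".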
csvFieldValue :: CsvRules -> CsvRecord -> CsvFieldName -> Maybe Text
csvFieldValue rules record fieldname = do
fieldindex <-
if T.all isDigit fieldname
then readMay $ T.unpack fieldname
else lookup (T.toLower fieldname) $ rcsvfieldindexes rules
T.strip <$> atMay record (fieldindex-1)
_CSV_READING__________________________________________ = undefined
-- | Read a Journal from the given CSV data (and filename, used for error
-- messages), or return an error. Proceed as follows:
--
-- 1. Conversion rules are provided, or they are parsed from the specified
-- rules file, or from the default rules file for the CSV data file.
-- If rules parsing fails, or the required rules file does not exist, throw an error.
--
-- 2. Parse the CSV data using the rules, or throw an error.
--
-- 3. Convert the CSV records to hledger transactions using the rules.
--
-- 4. Return the transactions as a Journal.
--
readJournalFromCsv :: Maybe (Either CsvRules FilePath) -> FilePath -> Text -> Maybe SepFormat -> ExceptT String IO Journal
readJournalFromCsv Nothing "-" _ _ = throwError "please use --rules-file when reading CSV from stdin"
readJournalFromCsv merulesfile csvfile csvtext sep = do
-- for now, correctness is the priority here, efficiency not so much
rules <- case merulesfile of
Just (Left rs) -> return rs
Just (Right rulesfile) -> readRulesFile rulesfile
Nothing -> readRulesFile $ rulesFileFor csvfile
dbg6IO "csv rules" rules
-- convert the csv data to lines and remove all empty/blank lines
let csvlines1 = dbg9 "csvlines1" $ filter (not . T.null . T.strip) $ dbg9 "csvlines0" $ T.lines csvtext
-- if there is a top-level skip rule, skip the specified number of non-empty lines
skiplines <- case getDirective "skip" rules of
Nothing -> return 0
Just "" -> return 1
Just s -> maybe (throwError $ "could not parse skip value: " ++ show s) return . readMay $ T.unpack s
let csvlines2 = dbg9 "csvlines2" $ drop skiplines csvlines1
-- convert back to text and parse as csv records
let
csvtext1 = T.unlines csvlines2
-- The separator in the rules file takes precedence over the extension or prefix
separator = case getDirective "separator" rules >>= parseSeparator of
Just c -> c
_ | ext == "ssv" -> ';'
_ | ext == "tsv" -> '\t'
_ ->
case sep of
Just Csv -> ','
Just Ssv -> ';'
Just Tsv -> '\t'
Nothing -> ','
where
ext = map toLower $ drop 1 $ takeExtension csvfile
-- parsec seemed to fail if you pass it "-" here -- TODO: try again with megaparsec
parsecfilename = if csvfile == "-" then "(stdin)" else csvfile
dbg6IO "using separator" separator
-- parse csv records
csvrecords0 <- dbg7 "parseCsv" <$> parseCsv separator parsecfilename csvtext1
-- remove any records skipped by conditional skip or end rules
let csvrecords1 = applyConditionalSkips rules csvrecords0
-- and check the remaining records for any obvious problems
csvrecords <- liftEither $ dbg7 "validateCsv" <$> validateCsv csvrecords1
dbg6IO "first 3 csv records" $ take 3 csvrecords
-- XXX identify header lines some day ?
-- let (headerlines, datalines) = identifyHeaderLines csvrecords'
-- mfieldnames = lastMay headerlines
tzout <- liftIO getCurrentTimeZone
mtzin <- case getDirective "timezone" rules of
Nothing -> return Nothing
Just s ->
maybe (throwError $ "could not parse time zone: " ++ T.unpack s) (return.Just) $
parseTimeM False defaultTimeLocale "%Z" $ T.unpack s
let
-- convert CSV records to transactions, saving the CSV line numbers for error positions
txns = dbg7 "csv txns" $ snd $ mapAccumL
(\pos r ->
let
SourcePos name line col = pos
line' = (mkPos . (+1) . unPos) line
pos' = SourcePos name line' col
in
(pos', transactionFromCsvRecord timesarezoned mtzin tzout pos rules r)
)
(initialPos parsecfilename) csvrecords
where
timesarezoned =
case csvRule rules "date-format" of
Just f | any (`T.isInfixOf` f) ["%Z","%z","%EZ","%Ez"] -> True
_ -> False
-- Do our best to ensure transactions will be ordered chronologically,
-- from oldest to newest. This is done in several steps:
-- 1. Intra-day order: if there's an "intra-day-reversed" rule,
-- assume each day's CSV records were ordered in reverse of the overall date order,
-- so reverse each day's txns.
intradayreversed = dbg6 "intra-day-reversed" $ isJust $ getDirective "intra-day-reversed" rules
txns1 = dbg7 "txns1" $
(if intradayreversed then concatMap reverse . groupOn tdate else id) txns
-- 2. Overall date order: now if there's a "newest-first" rule,
-- or if there's multiple dates and the first is more recent than the last,
-- assume CSV records were ordered newest dates first,
-- so reverse all txns.
newestfirst = dbg6 "newest-first" $ isJust $ getDirective "newest-first" rules
mdatalooksnewestfirst = dbg6 "mdatalooksnewestfirst" $
case nub $ map tdate txns of
ds@(d:_) -> Just $ d > last ds
[] -> Nothing
txns2 = dbg7 "txns2" $
(if newestfirst || mdatalooksnewestfirst == Just True then reverse else id) txns1
-- 3. Disordered dates: in case the CSV records were ordered by chaos,
-- do a final sort by date. If it was only a few records out of order,
-- this will hopefully refine any good ordering done by steps 1 and 2.
txns3 = dbg7 "date-sorted csv txns" $ sortOn tdate txns2
return nulljournal{jtxns=txns3}
-- | Parse special separator names TAB and SPACE, or return the first
-- character. Return Nothing on an empty string.
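--
-- Illustrative examples:
--
-- >>> parseSeparator "TAB"
-- Just '\t'
-- >>> parseSeparator ";"
-- Just ';'
-- >>> parseSeparator ""
-- Nothing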
parseSeparator :: Text -> Maybe Char
parseSeparator = specials . T.toLower
where specials "space" = Just ' '
specials "tab" = Just '\t'
specials xs = fst <$> T.uncons xs
-- Call parseCassava on a file or stdin, converting the result to ExceptT.
parseCsv :: Char -> FilePath -> Text -> ExceptT String IO [CsvRecord]
parseCsv separator filePath csvtext = ExceptT $
case filePath of
"-" -> parseCassava separator "(stdin)" <$> T.getContents
_ -> return $ if T.null csvtext then Right mempty else parseCassava separator filePath csvtext
-- Parse text into CSV records, using Cassava and the given field separator.
parseCassava :: Char -> FilePath -> Text -> Either String [CsvRecord]
parseCassava separator path content =
-- XXX we now remove all blank lines before parsing; will Cassava still produce [""] records ?
-- filter (/=[""])
either (Left . errorBundlePretty) (Right . parseResultToCsv) <$>