/
Datasets.hs
335 lines (271 loc) · 12.8 KB
/
Datasets.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
{- |
The @datasets@ package defines three different kinds of datasets:
* Tiny datasets (up to a few tens of rows) are embedded as part of the library source code, as lists of values.
* Small data sets are embedded indirectly (via @file-embed@)
in the package as pure values and do not require IO
to be downloaded (i.e. the data is loaded and parsed at compile time).
* Larger data sets which need to be fetched over the network
and are cached in a local temporary directory for subsequent use.
This module defines the 'getDataset' function for fetching datasets
and utilities for defining new data sets and modifying their options.
It is only necessary to import this module when using fetched data sets.
Embedded data sets can be used directly.
Please refer to the dataset modules for examples.
-}
{-# LANGUAGE OverloadedStrings, GADTs, DataKinds #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE ExistentialQuantification #-}
{-# LANGUAGE ScopedTypeVariables #-}
-- {-# OPTIONS_GHC -fno-warn-unused-imports #-}
module Numeric.Datasets (getDataset, Dataset(..), Source(..), getDatavec, defaultTempDir, getFileFromSource,
-- * Parsing datasets
readDataset, safeReadDataset, ReadAs(..), csvRecord,
-- * Defining datasets
csvDataset, csvHdrDataset, csvHdrDatasetSep, csvDatasetSkipHdr,
jsonDataset, readArff, arffDataset,
-- ** Dataset options
withPreprocess, withTempDir,
-- ** Preprocessing functions
--
-- | These functions are to be used as first argument of 'withPreprocess' in order to improve the quality of the parser inputs.
dropLines, fixedWidthToCSV, removeEscQuotes, fixAmericanDecimals,
-- ** Helper functions
parseReadField, parseDashToCamelField,
yearToUTCTime,
-- * Dataset source URLs
umassMLDB, uciMLDB) where
import Data.Csv
import Data.Monoid
import Data.Foldable
import Data.List (isSuffixOf)
import Data.List.NonEmpty (NonEmpty((:|)))
import qualified Data.List.NonEmpty as NE
import System.FilePath (takeExtensions, (</>))
import System.Directory
import Data.Hashable
import Data.Monoid
import qualified Data.ByteString.Lazy as BL
import Data.Aeson as JSON
import Control.Applicative
import Data.Time
import Network.HTTP.Req (req, runReq, Url, (/:), http, https, Scheme(..), LbsResponse, lbsResponse, responseBody, GET(..), NoReqBody(..), HttpMethod(..), defaultHttpConfig)
-- import Lens.Micro ((^.))
import Control.Exception.Safe
import Data.Char (ord, toUpper)
import Text.Read (readMaybe)
import Data.Maybe (fromMaybe)
import Data.ByteString.Char8 (unpack)
import qualified Data.ByteString.Char8 as B8
import qualified Data.ByteString.Lazy.Char8 as BL8
import Data.ByteString.Lazy.Search (replace)
import Control.Monad.IO.Class (MonadIO, liftIO)
import Data.Vector.Generic (Vector)
import qualified Data.Vector as VB
import qualified Data.Vector.Generic as V
import qualified Data.Attoparsec.ByteString as Atto'
import qualified Data.Attoparsec.ByteString.Lazy as Atto
import Numeric.Datasets.Internal.ArffParser
-- * Using datasets
-- | Load a dataset into memory as a list of rows.
--
-- Fetching, caching, preprocessing and parsing are all delegated to
-- 'getDatavec'; this merely converts the resulting boxed vector to a list.
getDataset :: (MonadThrow io, MonadIO io) => Dataset a -> io [a]
getDataset = fmap VB.toList . getDatavec
-- | Load a dataset into memory as a vector of rows.
--
-- The raw bytes are fetched (or read back from the cache directory) via
-- 'getFileFromSource', each file is optionally transformed by the dataset's
-- 'preProcess' function, and the result is parsed according to 'readAs'.
getDatavec :: (MonadThrow io, MonadIO io, Vector v a) => Dataset a -> io (v a)
getDatavec ds = liftIO $ do
  cacheDir <- tempDirForDataset ds
  rawFiles <- getFileFromSource cacheDir (source ds)
  let prep = fromMaybe id (preProcess ds)
  safeReadDataset (readAs ds) (prep <$> rawFiles)
-- | Get a ByteString from the specified Source
--
-- For a 'URL' source the download is cached: the payload is written to
-- @cacheDir@ under a name derived from the hash of the URL's 'show'
-- representation, and re-read from disk on subsequent calls.  For an
-- 'ImgFolder' source, each image file is encoded as a ByteString of the
-- form @label<<.>>path@, to be decoded later by the 'ImageFolder' case
-- of 'safeReadDataset'.
getFileFromSource
  :: FilePath -- ^ Cache directory
  -> Source
  -> IO (NonEmpty BL.ByteString)
getFileFromSource cacheDir (URL url) = do
  createDirectoryIfMissing True cacheDir
  -- cache file name keyed on the hash of the URL's Show representation
  let fnm = cacheDir </> "ds" <> show (hash $ show url)
  ex <- doesFileExist fnm
  if ex
    then (:|[]) <$> BL.readFile fnm
    else do
      rsp <- runReq defaultHttpConfig $ req GET url NoReqBody lbsResponse mempty
      let bs = responseBody rsp
      BL.writeFile fnm bs
      return (bs:|[])
getFileFromSource _ (File fnm) = (:|[]) <$> BL.readFile fnm
getFileFromSource _ (ImgFolder root labels) =
  -- NOTE(review): the fold accumulates with @(agg ++)@, so entries for
  -- later labels end up before earlier ones — confirm callers don't rely
  -- on label order.  Also, 'NE.fromList' is partial: it errors if no
  -- label folder contains a file with a recognised extension.
  NE.fromList <$> foldrM allImFolderData [] labels
  where
    -- prepend the already-collected entries to this label's entries
    allImFolderData :: String -> [BL.ByteString] -> IO [BL.ByteString]
    allImFolderData label agg = (agg ++) <$> toImFolderData label
    -- list image files under @root/label@, tagged with their label
    toImFolderData :: String -> IO [BL.ByteString]
    toImFolderData l = map (asBytes l) . filter hasValidExt <$> listDirectory (root </> l)
    -- encode a (label, path) pair as @label<<.>>root/label/file@
    asBytes :: String -> FilePath -> BL8.ByteString
    asBytes label fp = BL8.pack $ label ++ "<<.>>" ++ (root </> label </> fp)
    hasValidExt :: FilePath -> Bool
    hasValidExt fp = any (`isExtensionOf` fp) ["png", "jpeg", "bitmap", "tiff"] -- "gif", "tga"]
    -- For backwards compatibility (isExtensionOf only exists in filepath >= 1.4.2)
    isExtensionOf :: String -> FilePath -> Bool
    isExtensionOf ext@('.':_) = isSuffixOf ext . takeExtensions
    isExtensionOf ext = isSuffixOf ('.':ext) . takeExtensions
-- | Parse a ByteString into a list of Haskell values
--
-- Partial wrapper around 'safeReadDataset': any parse failure is surfaced
-- via 'error'.  Prefer 'safeReadDataset' in new code.
readDataset
  :: ReadAs a -- ^ How to parse the raw data string
  -> BL.ByteString -- ^ The data strings
  -> [a]
readDataset ra bs =
  either (error . show) VB.toList (safeReadDataset ra (bs :| []))
-- | Read a ByteString into a Haskell value
--
-- Dispatches on the 'ReadAs' descriptor.  The JSON, CSV and
-- attoparsec-based formats accept exactly one input ByteString, while
-- 'ImageFolder' accepts one ByteString per image file.  Parse failures
-- are rethrown (via 'throwString') in the caller's 'MonadThrow'.
safeReadDataset :: (Vector v a, MonadThrow m) => ReadAs a -> NonEmpty BL.ByteString -> m (v a)
safeReadDataset ra bss = either throwString pure $
  case (ra, bss) of
    (JSON, bs:|[]) -> V.fromList <$> JSON.eitherDecode' bs
    (CSVRecord hhdr opts, bs:|[]) -> V.convert <$> decodeWith opts hhdr bs
    -- decodeByNameWith returns (header, rows); the header is discarded
    (CSVNamedRecord opts, bs:|[]) -> V.convert . snd <$> decodeByNameWith opts bs
    (MultiRecordParsable mpsr, bs:|[]) -> V.fromList <$> Atto.eitherResult (Atto.parse mpsr bs)
    (Parsable psr, bs:|[]) -> V.fromList <$> Atto.eitherResult (Atto.parse (Atto.many' psr) bs)
    -- each ByteString is a @label<<.>>path@ entry produced by
    -- 'getFileFromSource'; every one of them must parse
    (ImageFolder labels, _) -> do
      ds <- mapM (getImFiles labels) bss
      pure $ V.fromList (toList ds)
    -- reached only when a single-input format above was given several files
    _ -> Left $
      if length bss > 1
        then "Cannot parse more than one file for this data format"
        else "impossible: logic has changed, please file this issue on dh-core"
  where
    -- decode one @label<<.>>path@ entry
    getImFiles :: NonEmpty String -> BL.ByteString -> Either String (String, FilePath)
    getImFiles labels bs' = Atto.eitherResult (Atto.parse (parseTaggedFile labels) bs')
    -- parser for @label<<.>>path@, where label is one of the given labels
    parseTaggedFile :: NonEmpty String -> Atto.Parser (String, FilePath)
    parseTaggedFile (l0:|ls) = do
      lbl <- Atto.choice $ Atto.string . B8.pack <$> (l0:ls)
      _ <- Atto.string "<<.>>"
      fp <- Atto.takeByteString
      pure (B8.unpack lbl, B8.unpack fp)
-- | Get the cache directory for a dataset, falling back to the system
-- temporary directory when none was configured.
tempDirForDataset :: Dataset a -> IO FilePath
tempDirForDataset ds = defaultTempDir (temporaryDirectory ds)
-- | Reify an optional temporary directory
--
-- 'Nothing' falls back to the system-wide temporary directory.
defaultTempDir :: Maybe FilePath -> IO FilePath
defaultTempDir = maybe getTemporaryDirectory return
-- | A 'Dataset' source can be either a URL (for remotely-hosted datasets) or the filepath of a local file.
data Source
  = forall h . URL (Url h) -- ^ Remote dataset to be downloaded and cached (any URL scheme)
  | File FilePath -- ^ Local file on disk
  | ImgFolder FilePath (NonEmpty String) -- ^ Root directory of images together with the label subfolders to scan
-- | A 'Dataset' contains metadata for loading, caching, preprocessing and parsing data.
data Dataset a = Dataset
  { source :: Source -- ^ Dataset source
  , temporaryDirectory :: Maybe FilePath -- ^ Temporary directory (optional)
  , preProcess :: Maybe (BL.ByteString -> BL.ByteString) -- ^ Dataset preprocessing function (optional)
  , readAs :: ReadAs a -- ^ How to parse the raw bytes into values of type @a@
  }
-- | ReadAs is a datatype to describe data formats that hold data sets
data ReadAs a where
  -- | Decode the whole input as JSON
  JSON :: FromJSON a => ReadAs a
  -- | Decode CSV rows positionally, with the given header policy and options
  CSVRecord :: FromRecord a => HasHeader -> DecodeOptions -> ReadAs a
  -- | Decode CSV rows by column name, using the file's header line
  CSVNamedRecord :: FromNamedRecord a => DecodeOptions -> ReadAs a
  -- | Parsable that returns multiple records at one go
  MultiRecordParsable :: Atto.Parser [a] -> ReadAs a
  -- | Parsable that returns a single record each time it is called
  Parsable :: Atto.Parser a -> ReadAs a
  ImageFolder
    :: NonEmpty String -- labels used as folders
    -> ReadAs (String, FilePath) -- FilePaths representing images on disk, Strings are labels
-- | Reads ARFF records from the given ARFF-format string
--
-- Parse failures are raised via 'error' (see 'readDataset').
readArff :: BL.ByteString -> [ArffRecord]
readArff = readDataset arffReader
  where
    arffReader :: ReadAs ArffRecord
    arffReader = MultiRecordParsable arffRecords
-- | A CSV record with default decoding options (i.e. columns are separated by commas)
-- and no header line expected.
csvRecord :: FromRecord a => ReadAs a
csvRecord = CSVRecord NoHeader opts
  where
    opts = defaultDecodeOptions
-- * Defining datasets
-- | Define a dataset from a source for an ARFF file
arffDataset :: Source -> Dataset ArffRecord
arffDataset src = Dataset
  { source = src
  , temporaryDirectory = Nothing
  , preProcess = Nothing
  , readAs = MultiRecordParsable arffRecords
  }
-- | Define a dataset from a source for a CSV file
csvDataset :: FromRecord a => Source -> Dataset a
csvDataset src = Dataset
  { source = src
  , temporaryDirectory = Nothing
  , preProcess = Nothing
  , readAs = csvRecord
  }
-- | Define a dataset from a source for a CSV file, skipping the header line
csvDatasetSkipHdr :: FromRecord a => Source -> Dataset a
csvDatasetSkipHdr src = Dataset
  { source = src
  , temporaryDirectory = Nothing
  , preProcess = Nothing
  , readAs = CSVRecord HasHeader defaultDecodeOptions
  }
-- | Define a dataset from a source for a CSV file with a known header
csvHdrDataset :: FromNamedRecord a => Source -> Dataset a
csvHdrDataset src = Dataset
  { source = src
  , temporaryDirectory = Nothing
  , preProcess = Nothing
  , readAs = CSVNamedRecord defaultDecodeOptions
  }
-- | Define a dataset from a source for a CSV file with a known header and separator
csvHdrDatasetSep :: FromNamedRecord a => Char -> Source -> Dataset a
csvHdrDatasetSep sepc src = Dataset src Nothing Nothing (CSVNamedRecord opts)
  where
    -- cassava expects the delimiter as a Word8 byte
    opts = defaultDecodeOptions { decDelimiter = fromIntegral (ord sepc) }
-- | Define a dataset from a source for a JSON file
jsonDataset :: FromJSON a => Source -> Dataset a
jsonDataset src = Dataset
  { source = src
  , temporaryDirectory = Nothing
  , preProcess = Nothing
  , readAs = JSON
  }
-- * Modifying datasets
-- | Attach a preprocessing stage to a Dataset: the given function is applied
-- to each raw file before the data is parsed.
withPreprocess :: (BL8.ByteString -> BL8.ByteString) -> Dataset a -> Dataset a
withPreprocess f ds = ds { preProcess = Just f }
-- | Set the temporary directory in which the dataset is cached after it has
-- first been downloaded.
withTempDir :: FilePath -> Dataset a -> Dataset a
withTempDir tdir ds = ds { temporaryDirectory = Just tdir }
-- * Helper functions for parsing datasets
-- | Turn dashes to CamelCase: each @-c@ pair becomes the upper-cased @c@.
-- A dash not followed by a character is kept as-is.
dashToCamelCase :: String -> String
dashToCamelCase = \case
  '-' : c : rest -> toUpper c : dashToCamelCase rest
  c : rest -> c : dashToCamelCase rest
  [] -> []
-- | Parse a field, first turning dashes to CamelCase
--
-- Fails with @\"unknown\"@ when the camel-cased text has no 'Read' parse.
parseDashToCamelField :: Read a => Field -> Parser a
parseDashToCamelField =
  maybe (fail "unknown") pure . readMaybe . dashToCamelCase . unpack
-- | Parse a CSV field, based on its read instance
--
-- Fails with @\"unknown\"@ when the field text has no 'Read' parse.
parseReadField :: Read a => Field -> Parser a
parseReadField = maybe (fail "unknown") pure . readMaybe . unpack
-- | Drop the first @n@ lines from a lazy ByteString.
--
-- Total version: the original used the partial 'BL.tail', which threw an
-- exception when asked to drop more lines than the input contains (and a
-- negative count never terminated).  Using 'BL.drop' instead makes
-- over-dropping yield the empty string, and a non-positive count returns
-- the input unchanged.
dropLines :: Int -> BL.ByteString -> BL.ByteString
dropLines n s
  | n <= 0 = s
  | otherwise = dropLines (n - 1) (BL.drop 1 (BL8.dropWhile (/= '\n') s))
-- | Turn US-style decimals starting with a period (e.g. .2) into something @cassava@ can parse (e.g. 0.2)
--
-- NOTE(review): only the byte pattern @\",.\"@ is rewritten, so a bare
-- leading decimal in the first column of a line is left untouched —
-- confirm this is acceptable for the datasets that use it.
fixAmericanDecimals :: BL.ByteString -> BL.ByteString
fixAmericanDecimals = replace ",." (",0."::BL.ByteString)
-- | Convert a Fixed-width format to a CSV
--
-- Runs of spaces are collapsed into a single comma, and leading spaces on
-- each line are dropped.  Note that a space immediately before a newline
-- still emits a comma, producing a trailing empty field on that line.
fixedWidthToCSV :: BL.ByteString -> BL.ByteString
fixedWidthToCSV = BL8.pack . fnl . BL8.unpack where
  f [] = []
  -- a space ends a field: emit one comma and swallow any following spaces
  f (' ':cs) = ',':f (chomp cs)
  f ('\n':cs) = '\n':fnl cs
  f (c:cs) = c:f cs
  -- start of a (new) line: drop leading spaces before scanning
  fnl cs = f (chomp cs) --newline
  -- drop a run of leading spaces
  chomp (' ':cs) = chomp cs
  chomp (c:cs) = c:cs
  chomp [] = []
-- | Filter out escaped double quotes from a field
--
-- Note: every double-quote character in the input is removed, escaped or not.
removeEscQuotes :: BL8.ByteString -> BL8.ByteString
removeEscQuotes = BL8.concatMap dropQuote
  where
    dropQuote '"' = BL8.empty
    dropQuote c = BL8.singleton c
-- * Helper functions for data analysis
-- | Convert a fractional year to UTCTime with second-level precision (due to not taking into account leap seconds)
yearToUTCTime :: Double -> UTCTime
yearToUTCTime y = UTCTime (addDays wholeDays janFirst) timeOfDay
  where
    (yr, yrFrac) = properFraction y
    janFirst = fromGregorian yr 1 1
    -- scale the fractional year by the actual length of that year
    daysThisYear = if isLeapYear yr then 366 else 365
    (wholeDays, dayFrac) = properFraction (yrFrac * daysThisYear)
    timeOfDay = secondsToDiffTime (round (dayFrac * 86400))
-- * URLs
-- | The UMass machine learning database
--
-- <http://mlr.cs.umass.edu/ml/machine-learning-databases>
--
-- NOTE(review): plain-HTTP mirror of the UCI archive — confirm the host is
-- still reachable before relying on it.
umassMLDB :: Url 'Http
umassMLDB = http "mlr.cs.umass.edu" /: "ml" /: "machine-learning-databases"
-- | The UCI machine learning database
--
-- <https://archive.ics.uci.edu/ml/machine-learning-databases>
uciMLDB :: Url 'Https
uciMLDB = https "archive.ics.uci.edu" /: "ml" /: "machine-learning-databases"