/
spelling.hs
115 lines (94 loc) · 3.54 KB
/
spelling.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
{-# LANGUAGE BangPatterns #-}
module Main (main) where
import qualified Data.ByteString.Char8 as B
import Data.ByteString.Char8 (ByteString, pack, unpack)
import qualified Data.ByteString as BS
import Data.Bits
import Data.Word (Word8)
import Data.Map (Map, findWithDefault, insertWith', empty, member)
import qualified Data.Map as Map (empty)
import Data.Set (toList, fromList)
import Data.List (inits, tails, foldl')
import System.CPUTime (getCPUTime)
import Text.Printf
type WordFreq = Map ByteString Int
dataFile = "big.txt"
alphabet = "abcdefghijklmnopqrstuvwxyz"
splitWords :: ByteString -> [ByteString]
splitWords =
filter (not . BS.null) . BS.splitWith notLetter . BS.map mkLower
where mkLower :: Word8 -> Word8
mkLower x = x .|. 32
notLetter :: Word8 -> Bool
notLetter c = c < 97 || c > 122
train :: [ByteString] -> WordFreq
train = foldl' updateMap Map.empty
where updateMap model word = insertWith' (+) word 1 model
nwords :: IO WordFreq
nwords = (return $!) . train . splitWords =<< B.readFile dataFile
edits1 :: String -> [String]
edits1 s = deletes ++ transposes ++ replaces ++ inserts
where
deletes = [a ++ bs | (a, _:bs) <- splits]
transposes = [a ++ (b2:b1:bs) | (a, b1:b2:bs) <- splits]
replaces = [a ++ (c:bs) | (a, _:bs) <- splits, c <- alphabet]
inserts = [a ++ (c:b) | (a, b) <- splits, c <- alphabet]
splits = zip (inits s) (tails s)
correct :: WordFreq -> String -> String
correct wordCounts word =
unpack . fst $ foldl' maxCount (pack "?", 0) candidates
where
candidates :: [ByteString]
candidates =
known [word] `or` ((known e1) `or` known_edits2)
e1 :: [String]
e1 = toList . fromList $ edits1 word
known_edits2 :: [ByteString]
known_edits2 =
[w3 | w1 <- e1, w2 <- edits1 w1, let w3 = pack w2,
w3 `member` wordCounts]
known :: [String] -> [ByteString]
known ws = [w | w <- map pack ws, w `member` wordCounts]
maxCount :: (ByteString, Int) -> ByteString -> (ByteString, Int)
maxCount current@(_, currentMax) word
| count > currentMax = (word, count)
| otherwise = current
where count = findWithDefault 1 word wordCounts
or :: [ByteString] -> [ByteString] -> [ByteString]
or a b | null a = b
| otherwise = a
main :: IO ()
main = do
start <- getCPUTime
wordCounts <- nwords
interact (unlines . map (correct wordCounts) . words)
end <- getCPUTime
let diff = (fromIntegral (end - start)) / (10^12)
printf "Computation time: %0.3f sec\n" (diff :: Double)
-- time :: IO t -> IO t
-- time a = do
-- start <- getCPUTime
-- v <- a
-- end <- getCPUTime
-- let diff = (fromIntegral (end - start)) / (10^12)
-- printf "Computation time: %0.3f sec\n" (diff :: Double)
-- return v
-- main :: IO ()
-- main = do
-- args <- getArgs
-- wordCounts <- nwords
-- putStrLn "Starting..."
-- time $! (run wordCounts args) `seq` return ()
-- putStrLn "Done."
-- where run wordCounts args = do
-- mapM_ (printCorrect wordCounts) args
-- where
-- printCorrect :: WordFreq -> String -> IO ()
-- printCorrect wordCounts word =
-- putStrLn $ word ++ " -> " ++ correct wordCounts word
-- Testing --
-- instance Arbitrary Char where
-- arbitrary = frequency [(4, choose ('\33', '\128')), (1, return ' ')]
-- coarbitrary c = variant (ord c `rem` 4)
-- prop_words_nospaces s = all (not . elem ' ') (splitWords s)
-- prop_words_noempty s = all ((> 0) . length) (splitWords s)