Skip to content

Commit

Permalink
refactor(lexer): ♻️ Change lexer to distinguish between manual and automatic semicolon tokens
Browse files Browse the repository at this point in the history

The lexer now emits a `TokenLineSeparator` whenever it auto-inserts a "semicolon", instead of the normal `TokenSemicolon` used for explicit semicolons.
  • Loading branch information
bristermitten committed Jun 2, 2024
1 parent bda543e commit 613783b
Show file tree
Hide file tree
Showing 7 changed files with 38 additions and 21 deletions.
8 changes: 8 additions & 0 deletions src/Elara/Lexer/Token.hs
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ data Token
| TokenUnderscore
| TokenIndent
| TokenDedent
| TokenLineSeparator
| TokenEOF
deriving (Show, Eq, Ord)

Expand Down Expand Up @@ -153,6 +154,7 @@ tokenRepr = \case
TokenUnderscore -> "_"
TokenIndent -> "<INDENT>"
TokenDedent -> "<DEDENT>"
TokenLineSeparator -> "<LINESEP>"
TokenEOF -> "<EOF>"

unsafeTokenText :: Token -> Text
Expand All @@ -171,3 +173,9 @@ unsafeTokenFloat :: Token -> Double
-- Partial: extracts the payload of a 'TokenFloat' and calls 'error' on every
-- other constructor. Only use where the token kind has already been checked.
unsafeTokenFloat = \case
TokenFloat f -> f
t -> error ("unsafeTokenFloat: " <> show t)

-- | True for the layout tokens the lexer synthesises from whitespace:
-- indents, dedents, and auto-inserted line separators.
isIndent :: Token -> Bool
isIndent = \case
    TokenIndent -> True
    TokenDedent -> True
    TokenLineSeparator -> True
    _ -> False
6 changes: 3 additions & 3 deletions src/Elara/Lexer/Utils.hs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ import Elara.AST.Name (ModuleName (..))
import Elara.AST.Region (Located (Located), RealPosition (..), RealSourceRegion (..), SourceRegion (GeneratedRegion), column, line, positionToDiagnosePosition)
import Elara.Error
import Elara.Error.Codes qualified as Codes
import Elara.Lexer.Token (Lexeme, TokPosition, Token (TokenDedent, TokenIndent, TokenSemicolon))
import Elara.Lexer.Token (Lexeme, TokPosition, Token (..))
import Error.Diagnose (Marker (..), Note (..), Report (Err))
import Polysemy
import Polysemy.Error
Expand Down Expand Up @@ -149,7 +149,7 @@ startWhite _ str = do
case span (view (indent % to (> indentation))) indents of
(pre, top : xs) -> do
-- pre is all the levels that need to be closed, top is the level that we need to match
fakeClosings <- sequenceA [fake TokenDedent, fake TokenSemicolon]
fakeClosings <- sequenceA [fake TokenDedent, fake TokenLineSeparator]
if top ^. indent == indentation
then
put
Expand All @@ -160,7 +160,7 @@ startWhite _ str = do
else throw (TooMuchIndentation top (viaNonEmpty last $ init indents) indentation s)
(_, []) -> error (" Indent stack contains nothing greater than " <> show indentation)
pure Nothing
EQ -> Just <$> fake TokenSemicolon
EQ -> Just <$> fake TokenLineSeparator

-- Insert dedent for any leftover unclosed indents
cleanIndentation :: LexMonad [Lexeme]
Expand Down
2 changes: 1 addition & 1 deletion src/Elara/Parse.hs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ parse p path = fromEither . first WParseErrorBundle . runParser p path
type ParsePipelineEffects = '[Error (WParseErrorBundle TokenStream ElaraParseError)]

createTokenStream :: String -> [Lexeme] -> TokenStream
createTokenStream = TokenStream
createTokenStream i tokens = TokenStream i tokens False

parsePipeline ::
Members ParsePipelineEffects r =>
Expand Down
4 changes: 2 additions & 2 deletions src/Elara/Parse/Expression.hs
Original file line number Diff line number Diff line change
Expand Up @@ -182,10 +182,10 @@ ifElse :: Parser FrontendExpr
ifElse = locatedExpr $ do
token_ TokenIf
condition <- exprParser
_ <- optional (token_ TokenSemicolon)
_ <- optional lineSeparator
token_ TokenThen
thenBranch <- exprBlock element
_ <- optional (token_ TokenSemicolon)
_ <- optional lineSeparator
token_ TokenElse
elseBranch <- exprBlock element
pure (If condition thenBranch elseBranch)
Expand Down
6 changes: 5 additions & 1 deletion src/Elara/Parse/Indents.hs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ import Elara.Parse.Combinators (sepEndBy1')
import Elara.Parse.Primitives (Parser, token_)

import Text.Megaparsec (try)
import Text.Megaparsec.Debug

-- | A statement separator: either a 'TokenLineSeparator' auto-inserted by the
-- lexer's layout algorithm, or an explicit 'TokenSemicolon' written by the user.
lineSeparator :: Parser ()
lineSeparator = token_ TokenLineSeparator <|> token_ TokenSemicolon

-- | Start of a block: either a layout-inserted 'TokenIndent' or an explicit
-- opening brace.
indentToken :: Parser ()
indentToken = token_ TokenIndent <|> token_ TokenLeftBrace
Expand All @@ -23,7 +27,7 @@ block mergeFunction single exprParser = try singleBlock <|> wholeBlock
singleBlock = single <$> exprParser
wholeBlock = do
indentToken
exprs <- sepEndBy1' exprParser (token_ TokenSemicolon)
exprs <- sepEndBy1' exprParser lineSeparator
dedentToken
pure $ mergeFunction exprs

Expand Down
14 changes: 8 additions & 6 deletions src/Elara/Parse/Module.hs
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,18 @@ import Elara.Parse.Names (opName, varName)
import Elara.Parse.Names qualified as Parse (moduleName)
import Elara.Parse.Primitives

import Elara.Parse.Indents (lineSeparator)
import Text.Megaparsec (MonadParsec (..), PosState (pstateSourcePos), SourcePos (sourceName), State (statePosState), sepEndBy)
import Text.Megaparsec.Debug

module' :: Parser (Module 'Frontend)
module' = fmapLocated Module $ do
mHeader <- optional (header <* optional (token_ TokenSemicolon))
mHeader <- optional (header <* optional lineSeparator)
thisFile <- sourceName . pstateSourcePos . statePosState <$> getParserState
let _name = maybe (Located (GeneratedRegion thisFile) (ModuleName ("Main" :| []))) fst mHeader
imports <- sepEndBy import' (token_ TokenSemicolon)
_ <- optional (token_ TokenSemicolon)
declarations <- sepEndBy (declaration _name) (token_ TokenSemicolon)
imports <- sepEndBy import' lineSeparator

declarations <- sepEndBy (declaration _name) lineSeparator

pure $
Module'
Expand Down Expand Up @@ -58,9 +60,9 @@ exposition = exposedValue <|> exposedOp

import' :: Parser (Import 'Frontend)
import' = fmapLocated Import $ do
token_ TokenImport
dbg "import'" $ token_ TokenImport

moduleName' <- located Parse.moduleName
moduleName' <- dbg "mn" $ located Parse.moduleName
isQualified <- isJust <$> optional (token_ TokenQualified)
as <- optional . located $ do
token_ TokenAs
Expand Down
19 changes: 11 additions & 8 deletions src/Elara/Parse/Stream.hs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import Text.Megaparsec
-- | The token stream consumed by megaparsec: the raw source text paired with
-- its lexemes, plus a flag controlling whether layout tokens are visible.
data TokenStream = TokenStream
{ tokenStreamInput :: !String -- ^ the original source text the lexemes were produced from
, tokenStreamTokens :: ![Lexeme] -- ^ the remaining lexemes to be consumed
, skipIndents :: Bool -- ^ when True, 'take1_' silently drops layout tokens (see 'isIndent'); NOTE(review): this field is lazy while the others are strict — confirm that is intentional
}
deriving (Show, Eq)

Expand All @@ -27,23 +28,25 @@ instance Stream TokenStream where
chunkLength Proxy = length
chunkEmpty Proxy = null
take1_ :: TokenStream -> Maybe (Text.Megaparsec.Token TokenStream, TokenStream)
take1_ (TokenStream _ []) = Nothing
take1_ (TokenStream str (t : ts)) = Just (t, TokenStream (drop (tokensLength (Proxy @TokenStream) (t :| [])) str) ts)
takeN_ n (TokenStream str s)
| n <= 0 = Just ([], TokenStream str s)
take1_ (TokenStream _ [] _) = Nothing
take1_ (TokenStream str (Located _ t : ts) skipIndents@True) | isIndent t = take1_ (TokenStream str ts skipIndents)
take1_ (TokenStream str (t : ts) skipIndents) =
Just (t, TokenStream (drop (tokensLength (Proxy @TokenStream) (t :| [])) str) ts skipIndents)
takeN_ n (TokenStream str s skipIndents)
| n <= 0 = Just ([], TokenStream str s skipIndents)
| null s = Nothing
| otherwise -- repeatedly call take1_ until it returns Nothing
=
let (x, s') = takeWhile_ (const True) (TokenStream str s)
let (x, s') = takeWhile_ (const True) (TokenStream str s skipIndents)
in case takeN_ (n - length x) s' of
Nothing -> Nothing
Just (xs, s'') -> Just (x ++ xs, s'')

takeWhile_ f (TokenStream str s) =
takeWhile_ f (TokenStream str s skipIndents) =
let (x, s') = span f s
in case nonEmpty x of
Nothing -> (x, TokenStream str s')
Just nex -> (x, TokenStream (drop (tokensLength (Proxy @TokenStream) nex) str) s')
Nothing -> (x, TokenStream str s' skipIndents)
Just nex -> (x, TokenStream (drop (tokensLength (Proxy @TokenStream) nex) str) s' skipIndents)

instance VisualStream TokenStream where
showTokens Proxy =
Expand Down

0 comments on commit 613783b

Please sign in to comment.