In [None]:
#Лексический анализатор1

In [None]:
#Лексический анализатор2 
import re
from enum import Enum, auto

class TokenType(Enum):
    """Kinds of tokens the LaTeX lexer can produce.

    Values are spelled out explicitly and match the 1-based numbering that
    ``auto()`` would assign in this declaration order.
    """

    # Commands
    COMMAND = 1           # \command
    SECTION = 2           # \section
    BEGIN = 3             # \begin
    END = 4               # \end

    # Environments with dedicated token types
    BEGIN_FIGURE = 5      # \begin{figure}
    END_FIGURE = 6        # \end{figure}
    BEGIN_BIBLIO = 7      # \begin{thebibliography}
    END_BIBLIO = 8        # \end{thebibliography}
    BIBITEM = 9           # \bibitem

    # Delimiters
    LBRACE = 10           # {
    RBRACE = 11           # }
    LBRACKET = 12         # [
    RBRACKET = 13         # ]

    # Content
    TEXT = 14             # plain text
    COMMENT = 15          # % comment
    ESCAPED = 16          # \%
    KEY = 17              # key inside a bibitem
    EOF = 18              # end of input

class Token:
    """A single lexeme together with the position where it starts.

    Attributes are plain and public: ``type`` (a ``TokenType``), ``value``
    (the token's text), and ``line`` / ``col`` (position supplied by the
    creator of the token; both default to 0).
    """

    def __init__(self, type: TokenType, value: str = '', line: int = 0, col: int = 0):
        self.type, self.value = type, value
        self.line, self.col = line, col

    def __str__(self):
        # The value is rendered with repr() so whitespace and backslashes
        # inside it stay visible.
        template = 'Token({}, {!r}, line={}, col={})'
        return template.format(self.type, self.value, self.line, self.col)

    def __repr__(self):
        # The debug representation is the same text as str().
        return str(self)

class Lexer:
    r"""Character-by-character lexer for a small subset of LaTeX.

    The lexer walks ``text`` one character at a time, keeps a 1-based
    line/column position, and hands out ``Token`` objects through
    ``get_next_token``. ``\begin``/``\end`` of the ``figure`` and
    ``thebibliography`` environments, ``\section`` and ``\bibitem`` get
    dedicated token types; every other command is a generic COMMAND.
    """

    # Characters that end a run of plain text.
    _TEXT_STOP_CHARS = '\\{}[]%'
    # Environments that are folded into a single BEGIN_*/END_* token.
    _SPECIAL_ENVIRONMENTS = ('figure', 'thebibliography')

    def __init__(self, text: str):
        """Start lexing ``text`` at its first character (line 1, column 1)."""
        self.text = text
        self.pos = 0
        self.line = 1
        self.col = 1
        # None marks end of input; an empty text starts out exhausted.
        self.current_char = self.text[0] if self.text else None

    def error(self, message):
        """Raise ``Exception`` with ``message`` prefixed by the current position."""
        raise Exception(f'Lexer error at line {self.line}, col {self.col}: {message}')

    def advance(self):
        """Move to the next character, updating line/column bookkeeping."""
        if self.current_char == '\n':
            self.line += 1
            self.col = 1
        else:
            self.col += 1

        self.pos += 1
        if self.pos >= len(self.text):
            self.current_char = None
        else:
            self.current_char = self.text[self.pos]

    def skip_whitespace(self):
        """Consume consecutive whitespace characters (including newlines)."""
        while self.current_char is not None and self.current_char.isspace():
            self.advance()

    def get_text(self) -> str:
        """Consume and return characters up to the next special character.

        Stops (without consuming) at a backslash, brace, bracket or ``%``,
        or at end of input. Newlines are part of the returned text.
        """
        chars = []
        while (self.current_char is not None
               and self.current_char not in self._TEXT_STOP_CHARS):
            chars.append(self.current_char)
            self.advance()
        return ''.join(chars)

    def get_comment(self) -> str:
        """Consume and return the rest of the current line (newline excluded)."""
        chars = []
        while self.current_char is not None and self.current_char != '\n':
            chars.append(self.current_char)
            self.advance()
        return ''.join(chars)

    def get_command_name(self) -> str:
        """Consume and return a command name (without the leading backslash).

        The first character may be a letter or one of ``@``, ``*``, ``_``;
        the remaining characters must be letters. Returns ``''`` when the
        current character cannot start a name.
        """
        chars = []
        if self.current_char and (self.current_char.isalpha()
                                  or self.current_char in ('@', '*', '_')):
            chars.append(self.current_char)
            self.advance()
            while self.current_char and self.current_char.isalpha():
                chars.append(self.current_char)
                self.advance()
        return ''.join(chars)

    def get_key(self) -> str:
        """Consume and return a run of alphanumerics, ``-``, ``_`` and ``:``."""
        chars = []
        while self.current_char and (self.current_char.isalnum()
                                     or self.current_char in ('-', '_', ':')):
            chars.append(self.current_char)
            self.advance()
        return ''.join(chars)

    def _read_special_environment(self):
        r"""Try to consume ``{figure}`` or ``{thebibliography}``.

        Called right after a ``\begin`` or ``\end`` command name. Leading
        whitespace before the brace is allowed. Returns the environment
        name on success. Otherwise the lexer is rewound to the state it had
        on entry and ``None`` is returned, so the braces and the environment
        name are tokenized normally instead of being silently dropped.
        """
        checkpoint = (self.pos, self.line, self.col, self.current_char)
        self.skip_whitespace()
        if self.current_char == '{':
            self.advance()
            env = self.get_text()
            if self.current_char == '}' and env in self._SPECIAL_ENVIRONMENTS:
                self.advance()
                return env
        # Not a special environment: undo everything consumed above.
        self.pos, self.line, self.col, self.current_char = checkpoint
        return None

    def _lex_backslash(self, start_line, start_col):
        """Lex a command or an escaped character; current char is the backslash."""
        self.advance()  # consume the backslash

        # A lone backslash at the very end of the input.
        if self.current_char is None:
            return Token(TokenType.ESCAPED, '\\', start_line, start_col)

        # Backslash followed by a non-letter is an escaped character (e.g. \%).
        if not self.current_char.isalpha():
            char = self.current_char
            self.advance()
            return Token(TokenType.ESCAPED, '\\' + char, start_line, start_col)

        full_cmd = '\\' + self.get_command_name()

        if full_cmd in ('\\begin', '\\end'):
            env = self._read_special_environment()
            if env is not None:
                opening = full_cmd == '\\begin'
                if env == 'figure':
                    token_type = TokenType.BEGIN_FIGURE if opening else TokenType.END_FIGURE
                else:
                    token_type = TokenType.BEGIN_BIBLIO if opening else TokenType.END_BIBLIO
                return Token(token_type, full_cmd + '{' + env + '}', start_line, start_col)
            # Any other environment: fall through to a generic COMMAND; the
            # `{name}` argument is still in the input and is lexed next.
        elif full_cmd == '\\section':
            return Token(TokenType.SECTION, full_cmd, start_line, start_col)
        elif full_cmd == '\\bibitem':
            return Token(TokenType.BIBITEM, full_cmd, start_line, start_col)

        # Ordinary command.
        return Token(TokenType.COMMAND, full_cmd, start_line, start_col)

    def get_next_token(self) -> 'Token':
        """Return the next token, or an EOF token once the input is exhausted.

        Whitespace between tokens is skipped. Every token carries the
        line/column of its first character.
        """
        while self.current_char is not None:
            if self.current_char.isspace():
                self.skip_whitespace()
                continue

            # Record the position before consuming anything, so multi-line
            # text still reports where it actually starts.
            start_line, start_col = self.line, self.col
            char = self.current_char

            # Comments: everything after '%' up to the end of the line.
            if char == '%':
                self.advance()
                return Token(TokenType.COMMENT, self.get_comment(), start_line, start_col)

            # Commands and escaped characters.
            if char == '\\':
                return self._lex_backslash(start_line, start_col)

            # Braces and brackets.
            if char == '{':
                self.advance()
                return Token(TokenType.LBRACE, '{', start_line, start_col)
            if char == '}':
                self.advance()
                return Token(TokenType.RBRACE, '}', start_line, start_col)
            if char == '[':
                self.advance()
                return Token(TokenType.LBRACKET, '[', start_line, start_col)
            if char == ']':
                self.advance()
                return Token(TokenType.RBRACKET, ']', start_line, start_col)

            # Plain text.
            text = self.get_text()
            if text:
                return Token(TokenType.TEXT, text, start_line, start_col)

            self.error(f"Unexpected character: {self.current_char}")

        return Token(TokenType.EOF, '', self.line, self.col)

# Usage example
def tokenize_latex(text):
    """Lex ``text`` completely and return its tokens as a list.

    The returned list always ends with the EOF token.
    """
    lexer = Lexer(text)
    # Seed the list with the first token, then keep pulling until the
    # most recently collected token is EOF.
    collected = [lexer.get_next_token()]
    while collected[-1].type != TokenType.EOF:
        collected.append(lexer.get_next_token())
    return collected

# Test example: runs only when the module is executed as a script.
if __name__ == "__main__":
    # Sample document with commands, a comment, a figure environment and a
    # bibliography. It is a raw string so backslashes reach the lexer as-is.
    test_text = r"""
\documentclass{article}
\begin{document}
\section{Introduction}
This is a test document with % комментарий
some \textit{formatted} text and a \textbf{command}.

\begin{figure}
    \caption{A figure}
\end{figure}

\begin{thebibliography}{9}
\bibitem{key1} Author, Title
\end{thebibliography}
\end{document}
"""
    
    # Tokenize the sample and print one token per line.
    tokens = tokenize_latex(test_text)
    for token in tokens:
        print(token)




Token(TokenType.COMMAND, '\\documentclass', line=2, col=1)
Token(TokenType.LBRACE, '{', line=2, col=15)
Token(TokenType.TEXT, 'article', line=2, col=16)
Token(TokenType.RBRACE, '}', line=2, col=23)
Token(TokenType.COMMAND, '\\begin', line=3, col=1)
Token(TokenType.SECTION, '\\section', line=4, col=1)
Token(TokenType.LBRACE, '{', line=4, col=9)
Token(TokenType.TEXT, 'Introduction', line=4, col=10)
Token(TokenType.RBRACE, '}', line=4, col=22)
Token(TokenType.TEXT, 'This is a test document with ', line=5, col=1)
Token(TokenType.COMMENT, ' комментарий', line=5, col=30)
Token(TokenType.TEXT, 'some ', line=6, col=1)
Token(TokenType.COMMAND, '\\textit', line=6, col=6)
Token(TokenType.LBRACE, '{', line=6, col=13)
Token(TokenType.TEXT, 'formatted', line=6, col=14)
Token(TokenType.RBRACE, '}', line=6, col=23)
Token(TokenType.TEXT, 'text and a ', line=6, col=25)
Token(TokenType.COMMAND, '\\textbf', line=6, col=36)
Token(TokenType.LBRACE, '{', line=6, col=43)
Token(TokenType.TEXT, 'command', line=