In [None]:
# All imports
import logging
import os
import re
import shutil
import sys
import threading
import traceback
import uuid
import xml.dom.minidom as minidom
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from enum import Enum, auto
from typing import List, Optional, Tuple

import chardet
import rarfile
from rarfile import RarCannotExec, BadRarFile

In [None]:
# 1 - Program that changes the encodings of LaTeX files

# Timestamped INFO-level logging for the whole run.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Run-wide counters; the processing functions below update them via `global`.
total_tex_files = 0
success_count = 0
error_count = 0
# Not referenced by any code visible in this section.
lock = None  

def setup_unrar_tool():
    """Point ``rarfile.UNRAR_TOOL`` at an unrar/WinRAR executable.

    A fixed list of common install locations is probed first; if none of
    them exists, ``PATH`` is searched for ``unrar`` and then ``winrar``.

    Raises:
        Exception: when no executable can be located.
    """
    known_locations = (
        '/usr/bin/unrar',
        '/usr/local/bin/unrar',
        'C:\\Program Files\\WinRAR\\UnRAR.exe',
        'C:\\Program Files (x86)\\WinRAR\\UnRAR.exe',
        'C:\\Program Files\\WinRAR\\winrar.exe',
        'C:\\Program Files (x86)\\WinRAR\\winrar.exe',
    )
    # First existing well-known location wins.
    tool = next((candidate for candidate in known_locations if os.path.exists(candidate)), None)
    if tool is None:
        # Fall back to whatever is reachable through PATH.
        tool = shutil.which('unrar') or shutil.which('winrar')
    if not tool:
        raise Exception("Не удалось найти unrar/winrar. Установите WinRAR или unrar.")
    rarfile.UNRAR_TOOL = tool

# Guards success_count / error_count, which are updated from pool worker threads.
_COUNTERS_LOCK = threading.Lock()


def _decode_latex_bytes(raw_data):
    """Decode the raw bytes of a .tex file, or return None if nothing fits.

    Order of attempts:
      1. UTF-8 (a leading BOM, if any, is dropped) - strict, so a success
         is almost certainly correct.
      2. The encoding guessed by chardet, when it is reasonably confident.
      3. windows-1251, then cp866 as the last resort.

    cp866 and koi8-r map every byte value, so whichever of them is tried
    first always "succeeds"; that is why they are not tried blindly before
    the detector has had a say. koi8-r and utf-16 stay in the list for
    completeness but are in practice only reached through detection.
    """
    try:
        return raw_data.decode('utf-8-sig')
    except UnicodeDecodeError:
        pass

    candidates = []
    detection = chardet.detect(raw_data)
    detected = detection.get('encoding')
    if detected and (detection.get('confidence') or 0) >= 0.5:
        candidates.append(detected)
    candidates.extend(['windows-1251', 'cp866', 'koi8-r', 'utf-16'])

    for enc in candidates:
        try:
            return raw_data.decode(enc)
        except (UnicodeDecodeError, LookupError):
            # LookupError: chardet reported a codec name Python does not know.
            continue
    return None


def convert_latex_file(input_file):
    """Re-encode one LaTeX file to UTF-8 and declare ``[utf8]{inputenc}``.

    The converted file is written under ``output/`` at the same path the
    input has relative to ``dataset/``. Any line containing both
    ``\\usepackage[`` and ``inputenc`` is replaced by
    ``\\usepackage[utf8]{inputenc}``; if there is none, that line is inserted
    right after the first ``\\documentclass`` line.

    Never raises: every failure is logged and counted in ``error_count``;
    a success is counted in ``success_count``.

    Args:
        input_file: path of the .tex file to convert.
    """
    global success_count, error_count
    try:
        with open(input_file, 'rb') as file:
            raw_data = file.read()

        content = _decode_latex_bytes(raw_data)
        if content is None:
            logging.warning(f"Не удалось декодировать файл {input_file}. Пропускаем.")
            with _COUNTERS_LOCK:
                error_count += 1
            return

        updated_lines = []
        inputenc_found = False
        for line in content.splitlines():
            if r'\usepackage[' in line and 'inputenc' in line:
                line = r'\usepackage[utf8]{inputenc}'
                inputenc_found = True
            updated_lines.append(line)

        if not inputenc_found:
            for i, line in enumerate(updated_lines):
                if r'\documentclass' in line:
                    updated_lines.insert(i + 1, r'\usepackage[utf8]{inputenc}')
                    break

        output_file = os.path.join('output', os.path.relpath(input_file, 'dataset'))
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        with open(output_file, 'w', encoding='utf-8') as file:
            file.write('\n'.join(updated_lines))

        with _COUNTERS_LOCK:
            success_count += 1

    except Exception as e:
        logging.error(f"Ошибка при обработке файла {input_file}: {e}")
        with _COUNTERS_LOCK:
            error_count += 1

def process_extracted_files(folder_path):
    """Convert every .tex file under ``folder_path`` and tidy the folder.

    Steps:
      1. Walk the tree; remember .tex files, delete every other file.
      2. Add the number of .tex files found to ``total_tex_files``.
      3. Run ``convert_latex_file`` over them in a thread pool.
      4. Remove sub-directories that are empty, deepest first.

    Deletion failures are logged and otherwise ignored.
    """
    global total_tex_files

    latex_paths = []
    for current_dir, _subdirs, names in os.walk(folder_path):
        for name in names:
            full_path = os.path.join(current_dir, name)
            if name.lower().endswith('.tex'):
                latex_paths.append(full_path)
                continue
            # Anything that is not LaTeX source is discarded.
            try:
                os.remove(full_path)
            except Exception as e:
                logging.error(f"Ошибка при удалении {name}: {e}")

    total_tex_files += len(latex_paths)

    # Leaving the `with` block waits for all conversions to finish.
    with ThreadPoolExecutor() as pool:
        pool.map(convert_latex_file, latex_paths)

    # Bottom-up so that a parent is visited after its children.
    for current_dir, subdirs, _names in os.walk(folder_path, topdown=False):
        for candidate in (os.path.join(current_dir, sub) for sub in subdirs):
            try:
                if not os.listdir(candidate):
                    os.rmdir(candidate)
            except Exception as e:
                logging.error(f"Ошибка при удалении папки {candidate}: {e}")

def process_rar_archive(rar_path, output_folder):
    """Extract the .tex members of one RAR archive, convert them, clean up.

    Members are unpacked into ``<output_folder>/<archive name without
    extension>``, handed to ``process_extracted_files`` and the extraction
    folder is then removed. All errors are logged; nothing is raised.

    Args:
        rar_path: path of the .rar archive.
        output_folder: directory in which the temporary extraction folder
            is created.
    """
    try:
        with rarfile.RarFile(rar_path) as archive:
            stem, _ext = os.path.splitext(os.path.basename(rar_path))
            extract_folder = os.path.join(output_folder, stem)
            os.makedirs(extract_folder, exist_ok=True)

            # Only regular members whose name ends in .tex are unpacked.
            for member in archive.infolist():
                if not member.isdir() and member.filename.lower().endswith('.tex'):
                    try:
                        archive.extract(member, extract_folder)
                    except Exception as e:
                        logging.error(f"Ошибка при извлечении {member.filename}: {e}")

            process_extracted_files(extract_folder)

            try:
                shutil.rmtree(extract_folder)
            except Exception as e:
                logging.error(f"Ошибка при удалении {extract_folder}: {e}")

    except (RarCannotExec, BadRarFile) as e:
        logging.error(f"Ошибка архива {rar_path}: {e}")
    except Exception as e:
        logging.error(f"Неожиданная ошибка при обработке {rar_path}: {e}")

def process_dataset_folder(dataset_folder):
    """Process every ``.rar`` archive found directly inside ``dataset_folder``.

    Creates the ``output`` directory in the current working directory, then
    hands each archive to ``process_rar_archive`` on a thread pool, using
    ``dataset_folder`` itself as the extraction root.

    Raises:
        FileNotFoundError: if ``dataset_folder`` does not exist.
    """
    if not os.path.exists(dataset_folder):
        raise FileNotFoundError(f"Папка {dataset_folder} не найдена!")

    os.makedirs('output', exist_ok=True)

    archives = []
    for entry in os.listdir(dataset_folder):
        if entry.lower().endswith('.rar'):
            archives.append(os.path.join(dataset_folder, entry))

    if not archives:
        logging.info("Не найдено RAR архивов для обработки!")
        return

    def _handle(archive_path):
        # Each archive is unpacked next to itself, inside the dataset folder.
        return process_rar_archive(archive_path, dataset_folder)

    with ThreadPoolExecutor() as pool:
        pool.map(_handle, archives)

if __name__ == "__main__":
    # Reset the run counters. This code already executes at module scope, so a
    # plain assignment rebinds the module-level names. The `global` statement
    # that used to stand here was redundant, and Python 3.6+ rejects `global x`
    # in a code block that has already assigned `x` earlier
    # ("name ... is assigned to before global declaration").
    total_tex_files = 0
    success_count = 0
    error_count = 0

    try:
        setup_unrar_tool()
        dataset_folder = 'dataset'
        process_dataset_folder(dataset_folder)

        # Summary of the run.
        print("\n📊 Результаты обработки:")
        print(f" Успешно обработано файлов: {success_count}")
        print(f" Ошибок при обработке: {error_count}")
        print(f" Всего найдено .tex файлов: {total_tex_files}")
        if error_count > 0:
            print(" Некоторые файлы были пропущены из-за ошибок.")
        else:
            print(" Все файлы успешно обработаны!")

    except Exception as e:
        # Top-level boundary: report the failure both to the log and to stdout.
        logging.critical(f"Критическая ошибка: {e}")
        print(f"\n❌ Критическая ошибка: {e}")


📊 Результаты обработки:
 Успешно обработано файлов: 1268
 Ошибок при обработке: 0
 Всего найдено .tex файлов: 1268
 Все файлы успешно обработаны!


In [None]:
# Lexer

class TokenType(Enum):
    """Kinds of tokens that can be attached to a ``Token``.

    Values are assigned by ``auto()`` in declaration order, so inserting or
    reordering members changes their numeric values.
    """

    CODE = auto()
    DOCUMENT = auto()
    DEF = auto()
    COMMAND = auto()
    TEXT = auto()
    TITLE = auto()
    AUTHOR = auto()
    EMAIL = auto()
    COMMENT = auto()
    BRACES = auto()
    BRACKETS = auto()
    FOOTNOTE = auto()
    ANNOTATION = auto()
    KEYWORDS = auto()
    DOI = auto()
    LABEL = auto()
    PAGESTYLE = auto()
    ESCAPED = auto()
    GROUP = auto()
    FORMULA = auto()
    THEOREM = auto()
    LEMMA = auto()
    COROLLARY = auto()
    SUBSECTION = auto()
    PROOF = auto()
    BIBITEM = auto()
    SECTION = auto()
    ITEM = auto()
    EPSFXSIZE = auto()
    EPSFBOX = auto()
    FOOTNOTETEXT = auto()
    REF = auto()
    CITE = auto()
    LINEBREAK = auto()
    TEXTIT = auto()
    SPECIAL_SYMBOL = auto()
    NOINDENT = auto()
    SMALLSKIP = auto()
    TEXTBF = auto()
    VSPACE = auto()
    ENUMERATE = auto()
    ITEMIZE = auto()
    CENTER = auto()
    EOF = auto()
    FIGURE = auto()

class Token:
    """One lexical token: its kind, its text and where it started.

    Attributes are plain and public: ``type`` (a ``TokenType`` member),
    ``value`` (the token text), ``line`` and ``col`` (start position as
    recorded by the lexer).
    """

    def __init__(self, type: "TokenType", value: str, line: int, col: int):
        self.type = type
        self.value = value
        self.line = line
        self.col = col

    def __str__(self):
        # The previous format string had a stray quote after the type and no
        # comma separator, producing e.g. `Token(TokenType.TEXT", "abc", ...)`.
        return f'Token({self.type}, "{self.value}", line={self.line}, col={self.col})'

    def __repr__(self):
        return self.__str__()


class Lexer:
    def __init__(self, text: str):
        self.text = text
        self.pos = 0
        self.line = 1
        self.col = 1
        self.current_char = self.text[0] if self.text else None
        self.email_regex = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
        self.author = ['aut', 'autkol', 'Au', 'Aue', 'author', 'Author']
        self.title_commands = ['tit', 'titkol', 'title', 'Title']
        self.annotation_commands = ['abstract', 'Abstract', 'Abst', 'Abste', 'annotation', 'Annotation']
        self.affiliation_commands = ['affiliation', 'Affiliation', 'address', 'Address']
        self.math_commands = ['fr', 'hm', 'to', 'acute', 'limits', 'd', 'mathbb', 'mathcal',
                             'mathbf', 'mathit', 'mathrm', 'mathsf', 'mathtt', 'mathfrak',
                             'mathscr', 'mathbbm', 'mathbbmscript', 'mathbbmtt', 'pi', 'langle', 'rangle', 'circ']
        self.theorem_commands = ['theorem', 'lemma', 'corollary']
        self.section_commands = ['section', 'subsection']
        self.proof_commands = ['proof']
        self.reference_commands = ['ref', 'label', 'cite']
        self.formatting_commands = ['linebreak', 'textit', 'emph', 'noindent', 'smallskip', 'textbf', 'vspace']
        self.list_environments = ['enumerate', 'itemize', 'center']
        self.document_commands = ['documentclass']
        self.pagestyle_commands = ['pagestyle']
        self.keyword_commands = ['KW', 'KWE']
        self.footnote_commands = ['footnote']
        self.authors = []
        self.author_translations = {}
        self.current_footnote_index = None

    def error(self, message):
        start = max(0, self.pos - 10)
        end = min(len(self.text), self.pos + 10)
        context = self.text[start:end].replace('\n', '\\n')
        raise Exception(f'Lexer error at line {self.line}, col {self.col}: {message}\nContext: ...{context}...')

    def advance(self):
        if self.current_char == '\n':
            self.line += 1
            self.col = 1
        else:
            self.col += 1
        self.pos += 1
        if self.pos >= len(self.text):
            self.current_char = None
        else:
            self.current_char = self.text[self.pos]

    def skip_whitespace(self):
        while self.current_char is not None and self.current_char.isspace():
            self.advance()

    def get_braced_content(self):
        if self.current_char != '{':
            return ''
        result = []
        brace_count = 1
        start_line = self.line
        start_col = self.col
        self.advance()
        while self.current_char is not None and brace_count > 0:
            if self.current_char == '\\':
                result.append(self.current_char)
                self.advance()
                if self.current_char is not None:
                    result.append(self.current_char)
                    self.advance()
                continue
            if self.current_char == '{':
                brace_count += 1
                result.append(self.current_char)
                self.advance()
            elif self.current_char == '}':
                brace_count -= 1
                if brace_count > 0:
                    result.append(self.current_char)
                self.advance()
            else:
                result.append(self.current_char)
                self.advance()
        if brace_count != 0:
            self.error(f"Unclosed brace at line {start_line}, col {start_col}")
        return ''.join(result)

    def clean_text_content(self, text: str) -> str:
        """Reduce a fragment of LaTeX running text to plain text.

        * ``\\-`` (discretionary hyphen) is removed. It marks a hyphenation
          point inside a word, so replacing it with a space - as was done
          before - split words in two.
        * ``~``, optionally followed by one whitespace character or by one
          to three hyphens, becomes a single space.
        * A backslash together with a following ``hm``, ``,`` or run of
          ASCII letters is removed.
        * Whitespace runs (newlines included) collapse to one space and the
          result is trimmed. The former newline replacement and final strip
          that followed this step could never change anything.

        Args:
            text: raw fragment.

        Returns:
            The cleaned, single-line text.
        """
        without_hyphenation = re.sub(r'\\-', '', text)
        with_plain_spaces = re.sub(r'~(?:\s|-{1,3})?', ' ', without_hyphenation)
        without_commands = re.sub(r'\\(?:hm|,|[a-zA-Z]+)?', '', with_plain_spaces)
        return ' '.join(without_commands.split())

    def get_text(self, stop_chars=None):
        if stop_chars is None:
            stop_chars = ['\\', '{', '}', '[', ']', '%', '$']
        result = []
        start_line = self.line
        start_col = self.col
        while self.current_char is not None and self.current_char not in stop_chars:
            if self.current_char == '~':
                result.append(self.current_char)
                self.advance()
                if self.current_char == '-' and self.pos + 2 < len(self.text) and self.text[self.pos:self.pos+3] == '---':
                    result.append('---')
                    for _ in range(3):
                        self.advance()
                    return Token(TokenType.SPECIAL_SYMBOL, '~---', start_line, start_col)
                elif self.current_char == ' ':
                    result.append(' ')
                    self.advance()
                    return Token(TokenType.SPECIAL_SYMBOL, '~', start_line, start_col)
                continue
            elif self.current_char == '\\' and self.pos + 1 < len(self.text):
                if self.text[self.pos:self.pos+2] == '\\,':
                    result.append('\\,')
                    self.advance()
                    self.advance()
                    if self.current_char == 'п' and self.pos > 2 and self.text[self.pos-3:self.pos] == 'т.\\,':
                        result.append('п')
                        self.advance()
                        result.append('.')
                        self.advance()
                        return Token(TokenType.SPECIAL_SYMBOL, 'т.\\,п.', start_line, start_col)
                    continue
                elif self.text[self.pos:self.pos+2] == '\\-':
                    result.append('\\-')
                    self.advance()
                    self.advance()
                    continue
                continue
            result.append(self.current_char)
            self.advance()
        text = ''.join(result)
        if text:
            cleaned_text = self.clean_text_content(text)
            return Token(TokenType.TEXT, cleaned_text, start_line, start_col)
        return None

    def get_comment(self):
        result = []
        while self.current_char is not None and self.current_char != '\n':
            result.append(self.current_char)
            self.advance()
        return ''.join(result)

    def get_email(self):
        start_pos = self.pos
        temp = []
        while self.current_char is not None and not self.current_char.isspace() and self.current_char not in ['\\', '{', '}', '[', ']', '%', '$']:
            temp.append(self.current_char)
            self.advance()
        possible_email = ''.join(temp)
        if self.email_regex.match(possible_email):
            return possible_email
        self.pos = start_pos
        self.current_char = self.text[self.pos] if self.pos < len(self.text) else None
        return None

    def clean_textit_content(self, content: str) -> str:
        """Strip LaTeX markup from ``content`` while keeping formulas intact.

        ``$...$`` and ``\\[...\\]`` spans are copied verbatim. In the text
        around them:

        * ``\\-`` (discretionary hyphen) is removed, so words are not split;
        * ``~`` (optionally followed by whitespace or 1-3 hyphens) becomes a
          space;
        * a backslash with a following ``hm``, ``,`` or run of ASCII letters
          is removed.

        The text after the last formula used to be run through a pattern
        beginning with ``~`` instead of a backslash, so commands there were
        never stripped; every text segment now gets the same treatment.

        Returns:
            The cleaned string with leading/trailing whitespace trimmed.
        """
        formula_pattern = r'(\$.*?\$|\\\[.*?\\\])'

        def strip_markup(fragment):
            fragment = re.sub(r'\\-', '', fragment)
            fragment = re.sub(r'~(?:\s|-{1,3})?', ' ', fragment)
            return re.sub(r'\\(?:hm|,|[a-zA-Z]+)?', '', fragment)

        parts = []
        last_pos = 0
        for match in re.finditer(formula_pattern, content):
            text_before = content[last_pos:match.start()]
            if text_before:
                parts.append(strip_markup(text_before))
            parts.append(match.group(0))
            last_pos = match.end()
        text_after = content[last_pos:]
        if text_after:
            parts.append(strip_markup(text_after))
        return ''.join(parts).strip()

    def get_formula_inline(self, delimiter):
        """Read a formula whose opening delimiter is at the cursor.

        Args:
            delimiter: ``'$'`` for inline math, or ``'['`` for display math
                opened with ``\\[`` (the caller has already consumed the
                backslash and the cursor is on the ``[``).

        Returns:
            A FORMULA token holding the formula with its delimiters when the
            text between them is at least four characters long, otherwise a
            TEXT token with the same value. An unterminated formula runs to
            the end of the input and a warning is printed.

        Fixes over the previous version:
            * Display math now ends at ``\\]``. It used to wait for another
              ``[``, swallowing text up to the next bracket or end of file.
              Its token value is now ``\\[...\\]``.
            * A backslash followed by a non-letter (``\\{``, ``\\}``, ``\\$``,
              ``\\\\``) is copied as an escaped pair and no longer takes part
              in brace counting or delimiter matching.
            * Whitespace after a command listed in ``self.math_commands`` is
              kept; dropping it glued ``\\pi x`` into ``\\pix``. A ``{`` that
              follows is counted by the ordinary brace branch, exactly as the
              removed special case did.
        """
        is_display = delimiter == '['
        opener = '\\[' if is_display else delimiter
        closer = '\\]' if is_display else delimiter
        result = [opener]
        start_line = self.line
        start_col = self.col
        self.advance()  # step over the delimiter character
        brace_count = 0
        closed = False
        while self.current_char is not None:
            ch = self.current_char
            if ch == '{':
                brace_count += 1
                result.append(ch)
                self.advance()
            elif ch == '}':
                brace_count -= 1
                result.append(ch)
                self.advance()
            elif ch == '\\':
                if is_display and brace_count == 0 and self.text[self.pos + 1:self.pos + 2] == ']':
                    self.advance()
                    self.advance()
                    result.append(closer)
                    closed = True
                    break
                result.append(ch)
                self.advance()
                if self.current_char is None:
                    break
                cmd = self.get_command_name()
                if cmd:
                    result.append(cmd)
                else:
                    # Escaped symbol: copy it without interpreting it.
                    result.append(self.current_char)
                    self.advance()
            elif not is_display and ch == delimiter and brace_count == 0:
                # Escaped delimiters never get here (consumed above).
                result.append(ch)
                self.advance()
                closed = True
                break
            else:
                result.append(ch)
                self.advance()

        if not closed:
            print(f"Warning: Unclosed formula starting at line {start_line}, col {start_col}")
        formula = ''.join(result)
        content_end = len(formula) - len(closer) if closed else len(formula)
        content = formula[len(opener):content_end].strip()
        token_type = TokenType.FORMULA if len(content) >= 4 else TokenType.TEXT
        return Token(token_type, formula, start_line, start_col)

    def get_formula_block(self, env):
        """Read the body of a ``\\begin{env}`` ... ``\\end{env}`` environment.

        Called after ``\\begin{env}`` has been consumed. The remainder of
        the ``\\begin`` line is skipped (presumably to drop optional
        arguments - confirm), then text is copied until the matching
        ``\\end{env}``. ``$...$`` spans are read via ``get_formula_inline``
        so their contents cannot be mistaken for an ``\\end``. Nested
        environments of the same name are not tracked: the first
        ``\\end{env}`` closes the block.

        Returns:
            A token whose value is ``\\begin{env}`` + body (+ ``\\end{env}``
            if found). The type is FORMULA when ``env`` is ``equation``,
            ``align`` or ``gather`` and the body has at least four
            characters; otherwise COMMAND.

        Fixes over the previous version:
            * An ``\\end{other}`` for a different environment lost its
              ``{other`` part (only ``\\end`` and ``}`` were copied).
            * ``\\end{}`` no longer fails on ``None.value``; the name is read
              directly instead of through ``get_text``.
        """
        begin_marker = f'\\begin{{{env}}}'
        end_marker = f'\\end{{{env}}}'
        result = [begin_marker]
        start_line = self.line
        start_col = self.col

        # Skip the rest of the \begin line.
        while self.current_char is not None and self.current_char != '\n':
            self.advance()

        closed = False
        while self.current_char is not None:
            if self.current_char == '\\':
                self.advance()
                cmd = self.get_command_name()
                if cmd == 'end':
                    self.skip_whitespace()
                    if self.current_char == '{':
                        self.advance()
                        name_chars = []
                        while self.current_char is not None and self.current_char != '}':
                            name_chars.append(self.current_char)
                            self.advance()
                        end_env = ''.join(name_chars)
                        if end_env.strip() == env and self.current_char == '}':
                            self.advance()
                            result.append(end_marker)
                            closed = True
                            break
                        # Some other environment ends here: keep its text.
                        # The closing brace is copied by the generic branch.
                        result.append(f'\\end{{{end_env}')
                        continue
                result.append(f'\\{cmd}')
            elif self.current_char == '$':
                inline_formula = self.get_formula_inline('$')
                if inline_formula is not None:
                    result.append(inline_formula.value)
            else:
                result.append(self.current_char)
                self.advance()

        formula = ''.join(result)
        content_end = len(formula) - len(end_marker) if closed else len(formula)
        content = formula[len(begin_marker):content_end].strip()
        if len(content) >= 4 and env in ('equation', 'align', 'gather'):
            token_type = TokenType.FORMULA
        else:
            token_type = TokenType.COMMAND
        return Token(token_type, formula, start_line, start_col)

    def get_command_name(self):
        result = []
        if self.current_char and (self.current_char.isalpha() or self.current_char in ['@', '*', '_']):
            result.append(self.current_char)
            self.advance()
            while self.current_char and (self.current_char.isalpha() or self.current_char in ['*']):
                result.append(self.current_char)
                self.advance()
        return ''.join(result)

    def get_author(self, command):
        """Lex the braced argument of an author-like command.

        Called with the cursor on the opening ``{`` of ``\\<command>{...}``.
        ``command`` is one of the names in ``self.author`` or ``'index'``.

        The argument is read with ``get_braced_content``. Anything after a
        ``^`` is treated as affiliation indices (``$^1$``, ``$^{2}$``); the
        part before the first ``^`` is the author name. Known authors in
        ``self.authors`` get the indices merged; unknown ones are appended,
        except for ``\\index`` which never registers an author. Russian
        (``aut``/``autkol``) and English (``Au``/``Aue``/``author``/
        ``Author``) spellings are paired in ``self.author_translations``
        when one normalised name contains the other.

        Returns:
            An AUTHOR token ``\\<command>{content}``; for ``\\index`` whose
            argument is not a known author, a COMMAND token instead.

        Raises:
            Exception: from ``get_braced_content`` if the brace is unclosed.

        Fix: the method used to step past the ``{`` before calling
        ``get_braced_content``, which only reads when it is ON a ``{``. The
        content therefore came back empty and nothing below the empty check
        ever ran. The follow-up "skip one ``}``" is gone too, since
        ``get_braced_content`` already consumes the closing brace. The math
        ``$`` before ``^`` is no longer left on the name, and braces around
        an index are ignored.
        """
        english_commands = ('Au', 'Aue', 'author', 'Author')
        russian_commands = ('aut', 'autkol')

        start_line = self.line
        start_col = self.col
        content = self.get_braced_content()

        indices = []
        content_clean = content
        if '^' in content:
            parts = content.split('^')
            content_clean = parts[0].strip().rstrip('$').strip()
            for part in parts[1:]:
                index = part.strip('${}, ')
                if index.isdigit():
                    indices.append(int(index))

        if not content_clean:
            return Token(TokenType.AUTHOR, f'\\{command}{{{content}}}', start_line, start_col)

        is_english = all(ord(c) < 128 for c in content_clean if c.isalpha())

        is_author_match = False
        normalized_content = content_clean.replace(' ', '').lower()
        for author in self.authors:
            normalized_author_name = author['name'].replace(' ', '').lower()
            normalized_translation = self.author_translations.get(author['name'], '').replace(' ', '').lower()
            if normalized_content in (normalized_author_name, normalized_translation):
                is_author_match = True
                author['indices'] = sorted(set(author['indices']) | set(indices))
                break

        if command == 'index':
            if is_author_match or any(normalized_content == author['name'].replace(' ', '').lower() for author in self.authors):
                return Token(TokenType.AUTHOR, f'\\index{{{content}}}', start_line, start_col)
            return Token(TokenType.COMMAND, f'\\index{{{content}}}', start_line, start_col)

        if not is_author_match:
            self.authors.append({'name': content_clean, 'indices': indices, 'command': command})

        if is_english and command in english_commands:
            # Pair this English spelling with a Russian entry that contains it.
            for author in self.authors:
                if author['command'] in russian_commands and normalized_content in author['name'].replace(' ', '').lower():
                    self.author_translations[author['name']] = content_clean
                    break
        elif not is_english and command in russian_commands:
            # Pair this Russian spelling with an English entry it contains.
            for author in self.authors:
                if author['command'] in english_commands and author['name'].replace(' ', '').lower() in normalized_content:
                    self.author_translations[content_clean] = author['name']
                    break

        return Token(TokenType.AUTHOR, f'\\{command}{{{content}}}', start_line, start_col)

    def get_title(self):
        """Build a TITLE token ``\\title{...}`` from the text at the cursor.

        Steps one character forward, asks ``get_braced_content`` for a
        braced group, skips one ``}`` if the cursor then sits on it, and
        wraps whatever was read.

        NOTE(review): ``get_braced_content`` returns '' unless the cursor is
        on a ``{`` when it is entered. After the unconditional ``advance()``
        here that is only the case when the skipped character is directly
        followed by ``{``, so for an ordinary ``{Title}`` argument the
        content comes back empty. The call site is outside this section -
        confirm the intended entry position.
        """
        token_line, token_col = self.line, self.col
        self.advance()
        body = self.get_braced_content()
        if self.current_char == '}':
            self.advance()
        return Token(TokenType.TITLE, '\\title{' + body + '}', token_line, token_col)

    def get_annotation(self):
        """Read an abstract/annotation argument and return it as plain text.

        The character under the cursor is skipped first (presumably the
        opening ``{`` - the call site is outside this section). Raw text is
        then collected up to the brace that closes the argument, that brace
        is consumed, and the text is passed once through
        ``clean_text_content``. Reaching the end of input without a closing
        brace is tolerated.

        Returns:
            An ANNOTATION token ``\\Abst{<cleaned text>}``.

        Fixes over the previous version:
            * Nested groups such as ``\\textbf{...}`` no longer end the
              annotation at their closing brace; brace depth is tracked and
              a backslash-escaped character never counts as a brace.
            * The text is cleaned once as a whole. Cleaning it chunk by
              chunk through ``get_text`` trimmed the whitespace at chunk
              borders, so words on either side of a command were glued to
              it and could be removed together with the command.
        """
        start_line = self.line
        start_col = self.col
        self.advance()

        raw = []
        depth = 0
        while self.current_char is not None:
            ch = self.current_char
            if ch == '\\':
                # Copy the escape pair verbatim.
                raw.append(ch)
                self.advance()
                if self.current_char is not None:
                    raw.append(self.current_char)
                    self.advance()
                continue
            if ch == '{':
                depth += 1
            elif ch == '}':
                if depth == 0:
                    break  # the brace that closes the argument
                depth -= 1
            raw.append(ch)
            self.advance()

        if self.current_char == '}':
            self.advance()

        cleaned_content = self.clean_text_content(''.join(raw))
        return Token(TokenType.ANNOTATION, f'\\Abst{{{cleaned_content}}}', start_line, start_col)

    def get_footnote_content(self, command):
        """Lex ``[number]{content}`` following a footnote-text style command.

        Args:
            command: the command name, used only to rebuild the token text.

        Returns:
            A FOOTNOTETEXT token ``\\<command>[number]{content}`` when a
            braced argument follows (``number`` is '' if there was no
            ``[...]``), otherwise a COMMAND token ``\\<command>``.

        Raises:
            Exception: from ``get_braced_content`` if the brace is unclosed.

        Fixes over the previous version:
            * The ``{`` was skipped before calling ``get_braced_content``,
              which only reads when it is ON a ``{``, so the content was
              always empty. The extra "skip one ``}``" afterwards is gone
              because ``get_braced_content`` consumes the closing brace.
            * An empty ``[]`` no longer fails on ``None.value``.
        """
        start_line = self.line
        start_col = self.col
        number = ''
        if self.current_char == '[':
            self.advance()
            number_token = self.get_text(stop_chars=[']'])
            if number_token is not None:
                number = number_token.value
            if self.current_char == ']':
                self.advance()
            self.skip_whitespace()

        if self.current_char == '{':
            content = self.get_braced_content()
            return Token(TokenType.FOOTNOTETEXT, f'\\{command}[{number}]{{{content}}}', start_line, start_col)
        return Token(TokenType.COMMAND, f'\\{command}', start_line, start_col)

    def get_section_content(self):
        """Lex the braced argument of ``\\section`` into a SECTION token.

        Called with the cursor on the opening ``{``. The cursor ends just
        after the matching ``}``.

        Returns:
            A SECTION token ``\\section{content}``.

        Raises:
            Exception: from ``get_braced_content`` if the brace is unclosed.

        Fix: the ``{`` used to be skipped before ``get_braced_content`` was
        called. That helper only reads when it is ON a ``{``, so the content
        was always empty. It also consumes the closing brace itself, so the
        extra "skip one ``}``" could swallow an unrelated brace.
        """
        start_line = self.line
        start_col = self.col
        content = self.get_braced_content()
        return Token(TokenType.SECTION, f'\\section{{{content}}}', start_line, start_col)

    def get_subsection_content(self):
        """Lex the braced argument of ``\\subsection`` into a SUBSECTION token.

        Called with the cursor on the opening ``{``. The cursor ends just
        after the matching ``}``.

        Returns:
            A SUBSECTION token ``\\subsection{content}``.

        Raises:
            Exception: from ``get_braced_content`` if the brace is unclosed.

        Fix: the ``{`` used to be skipped before ``get_braced_content`` was
        called. That helper only reads when it is ON a ``{``, so the content
        was always empty. It also consumes the closing brace itself, so the
        extra "skip one ``}``" could swallow an unrelated brace.
        """
        start_line = self.line
        start_col = self.col
        content = self.get_braced_content()
        return Token(TokenType.SUBSECTION, f'\\subsection{{{content}}}', start_line, start_col)

    def get_documentclass(self):
        """Lex ``[options]{class}`` following ``\\documentclass``.

        Returns:
            A DOCUMENT token ``\\documentclass[options]{class}`` when a
            braced class name follows (``options`` is '' when no ``[...]``
            was given), otherwise the bare ``\\documentclass``.

        Raises:
            Exception: from ``get_braced_content`` if the brace is unclosed.

        Fixes over the previous version:
            * The ``{`` was skipped before calling ``get_braced_content``,
              which only reads when it is ON a ``{``, so the class name was
              always empty. The extra "skip one ``}``" afterwards is gone
              because ``get_braced_content`` consumes the closing brace.
            * An empty ``[]`` no longer fails on ``None.value``.
            * The fallback tested ``'content' in locals()``, which could
              never be true on that path; it now returns the bare command
              directly, as it always effectively did.
        """
        start_line = self.line
        start_col = self.col
        self.skip_whitespace()
        options = ''
        if self.current_char == '[':
            self.advance()
            options_token = self.get_text(stop_chars=[']'])
            if options_token is not None:
                options = options_token.value
            if self.current_char == ']':
                self.advance()
        self.skip_whitespace()
        if self.current_char == '{':
            content = self.get_braced_content()
            return Token(TokenType.DOCUMENT, f'\\documentclass[{options}]{{{content}}}', start_line, start_col)
        return Token(TokenType.DOCUMENT, '\\documentclass', start_line, start_col)

    def get_pagestyle(self):
        """Lex the braced argument of ``\\pagestyle`` into a PAGESTYLE token.

        Called with the cursor on the opening ``{``. The cursor ends just
        after the matching ``}``.

        Returns:
            A PAGESTYLE token ``\\pagestyle{content}``.

        Raises:
            Exception: from ``get_braced_content`` if the brace is unclosed.

        Fix: the ``{`` used to be skipped before ``get_braced_content`` was
        called. That helper only reads when it is ON a ``{``, so the content
        was always empty. It also consumes the closing brace itself, so the
        extra "skip one ``}``" could swallow an unrelated brace.
        """
        start_line = self.line
        start_col = self.col
        content = self.get_braced_content()
        return Token(TokenType.PAGESTYLE, f'\\pagestyle{{{content}}}', start_line, start_col)

    def get_footnote(self):
        """Lex the braced argument of ``\\footnote`` into a FOOTNOTE token.

        Called with the cursor on the opening ``{``. The cursor ends just
        after the matching ``}``; nested groups inside the footnote are kept
        verbatim in the token value.

        Returns:
            A FOOTNOTE token ``\\footnote{content}``.

        Raises:
            Exception: from ``get_braced_content`` if the brace is unclosed.

        Fix: the ``{`` used to be skipped before ``get_braced_content`` was
        called. That helper only reads when it is ON a ``{``, so the content
        was always empty. It also consumes the closing brace itself, so the
        extra "skip one ``}``" could swallow an unrelated brace.
        """
        start_line = self.line
        start_col = self.col
        content = self.get_braced_content()
        return Token(TokenType.FOOTNOTE, f'\\footnote{{{content}}}', start_line, start_col)

    def get_keywords(self, command):
        """Read a keywords argument and return it as plain text.

        Called with the cursor on the opening ``{``, which is skipped. Raw
        text is collected up to the brace that closes the argument, that
        brace is consumed, and the text is passed once through
        ``clean_text_content``. Reaching the end of input without a closing
        brace is tolerated.

        Args:
            command: the keyword command name, used to rebuild the token.

        Returns:
            A KEYWORDS token ``\\<command>{<cleaned text>}``.

        Fixes over the previous version:
            * Nested groups no longer end the argument at their closing
              brace; brace depth is tracked and a backslash-escaped
              character never counts as a brace.
            * The text is cleaned once as a whole. Cleaning it chunk by
              chunk through ``get_text`` trimmed the whitespace at chunk
              borders, so words on either side of a command were glued to
              it and could be removed together with the command.
        """
        start_line = self.line
        start_col = self.col
        self.advance()

        raw = []
        depth = 0
        while self.current_char is not None:
            ch = self.current_char
            if ch == '\\':
                # Copy the escape pair verbatim.
                raw.append(ch)
                self.advance()
                if self.current_char is not None:
                    raw.append(self.current_char)
                    self.advance()
                continue
            if ch == '{':
                depth += 1
            elif ch == '}':
                if depth == 0:
                    break  # the brace that closes the argument
                depth -= 1
            raw.append(ch)
            self.advance()

        if self.current_char == '}':
            self.advance()

        cleaned_content = self.clean_text_content(''.join(raw))
        return Token(TokenType.KEYWORDS, f'\\{command}{{{cleaned_content}}}', start_line, start_col)

    def get_group(self):
        """Build a GROUP token ``{...}`` from the text at the cursor.

        Steps one character forward, asks ``get_braced_content`` for a
        braced group, skips one ``}`` if the cursor then sits on it, and
        wraps whatever was read in braces.

        NOTE(review): ``get_braced_content`` returns '' unless the cursor is
        on a ``{`` when it is entered. After the unconditional ``advance()``
        here that is only the case for a doubled ``{{``; for a plain
        ``{text}`` the token value is ``{}`` and the group's contents are
        left for the following tokens. The call site is outside this
        section - confirm whether that is intended.
        """
        origin_line, origin_col = self.line, self.col
        self.advance()
        inner = self.get_braced_content()
        if self.current_char == '}':
            self.advance()
        return Token(TokenType.GROUP, '{' + inner + '}', origin_line, origin_col)

    def get_next_token(self):
        while self.current_char is not None:
            if self.current_char.isspace():
                self.skip_whitespace()
                continue
            if self.current_char == '%':
                start_line = self.line
                start_col = self.col
                self.advance()
                comment = self.get_comment()
                return Token(TokenType.COMMENT, comment, start_line, start_col)
            if self.current_char == '$':
                start_line = self.line
                start_col = self.col
                formula = self.get_formula_inline('$')
                if formula is not None:
                    return formula
                else:
                    text_token = self.get_text(['$', '\\', '{', '}', '[', ']', '%'])
                    text_value = text_token.value if text_token is not None else ''
                    return Token(TokenType.TEXT, '$' + text_value, start_line, start_col)
            if self.current_char == '\\' and self.pos + 1 < len(self.text) and self.text[self.pos + 1] == '[':
                start_line = self.line
                start_col = self.col
                self.advance()
                formula = self.get_formula_inline('[')
                if formula is not None:
                    return formula
                else:
                    text_token = self.get_text([']', '\\', '{', '}', '[', '%', '$'])
                    text_value = text_token.value if text_token is not None else ''
                    return Token(TokenType.TEXT, '\\[' + text_value, start_line, start_col)
            if self.current_char == '\\':
                start_line = self.line
                start_col = self.col
                self.advance()
                if self.current_char is None:
                    return Token(TokenType.ESCAPED, '\\', start_line, start_col)
                if self.current_char.isalpha() or self.current_char in ['@', '*', '_']:
                    command = self.get_command_name()
                    full_cmd = '\\' + command
                    self.skip_whitespace()
                    if command == 'begin' and self.current_char == '{':
                        self.advance()
                        env = self.get_text(stop_chars=['}']).value
                        if self.current_char == '}':
                            self.advance()
                            if env in ['equation', 'align', 'gather']:
                                formula = self.get_formula_block(env)
                                if formula is not None:
                                    return Token(TokenType.FORMULA, formula, start_line, start_col)
                            elif env in self.list_environments:
                                content = self.get_formula_block(env)
                                token_type = TokenType.ENUMERATE if env == 'enumerate' else TokenType.ITEMIZE if env == 'itemize' else TokenType.CENTER
                                return Token(token_type, content, start_line, start_col)
                            elif env == 'proof':
                                content = self.get_formula_block(env)
                                return Token(TokenType.PROOF, content, start_line, start_col)
                            elif env in ['figure', 'figure*']:
                                content = self.get_formula_block(env)
                                return Token(TokenType.FIGURE, content, start_line, start_col)
                    elif command in self.document_commands and self.current_char in ['{', '[']:
                        return self.get_documentclass()
                    elif command in self.pagestyle_commands and self.current_char == '{':
                        return self.get_pagestyle()
                    elif command in self.footnote_commands and self.current_char == '{':
                        return self.get_footnote()
                    elif command in self.keyword_commands and self.current_char == '{':
                        return self.get_keywords(command)
                    elif command in self.author and self.current_char == '{':
                        return self.get_author(command)
                    elif command == 'index' and self.current_char == '{':
                        return self.get_author(command)
                    elif command == 'section' and self.current_char == '{':
                        return self.get_section_content()
                    elif command == 'subsection' and self.current_char == '{':
                        return self.get_subsection_content()
                    elif command == 'bibitem':
                        self.skip_whitespace()
                        if self.current_char == '{':
                            self.advance()
                            bib_key = self.get_text(stop_chars=['}']).value
                            if self.current_char == '}':
                                self.advance()
                            return Token(TokenType.BIBITEM, f'\\bibitem{{{bib_key}}}', start_line, start_col)
                    elif command == 'item':
                        return Token(TokenType.ITEM, full_cmd, start_line, start_col)
                    elif command == 'def':
                        self.skip_whitespace()
                        if self.current_char == '\\':
                            self.advance()
                            macro_name = '\\' + self.get_command_name()
                            self.skip_whitespace()
                            if self.current_char == '{':
                                self.advance()
                                macro_content = self.get_braced_content()
                                if self.current_char == '}':
                                    self.advance()
                                return Token(TokenType.DEF, f'\\def{macro_name}{{{macro_content}}}', start_line, start_col)
                        return Token(TokenType.DEF, full_cmd, start_line, start_col)
                    elif command == 'titel':
                        return Token(TokenType.COMMAND, full_cmd, start_line, start_col)
                    elif command in self.annotation_commands:
                        return self.get_annotation()
                    elif command == 'DOI':
                        self.skip_whitespace()
                        if self.current_char == '{':
                            self.advance()
                            doi_content = self.get_text(stop_chars=['}']).value
                            if self.current_char == '}':
                                self.advance()
                            return Token(TokenType.DOI, f'\\DOI{{{doi_content}}}', start_line, start_col)
                        return Token(TokenType.DOI, full_cmd, start_line, start_col)
                    elif command == 'Ack':
                        return Token(TokenType.COMMAND, full_cmd, start_line, start_col)
                    elif command == 'Contrl':
                        return Token(TokenType.COMMAND, full_cmd, start_line, start_col)
                    elif command == 'Caption':
                        return Token(TokenType.COMMAND, full_cmd, start_line, start_col)
                    elif command == 'epsfxsize':
                        self.skip_whitespace()
                        if self.current_char == '=':
                            self.advance()
                            value = self.get_text(stop_chars=['\\', '{', '}', '[', ']', '%', '$']).value
                            return Token(TokenType.EPSFXSIZE, f'\\epsfxsize={value}', start_line, start_col)
                        return Token(TokenType.EPSFXSIZE, full_cmd, start_line, start_col)
                    elif command == 'epsfbox':
                        self.skip_whitespace()
                        if self.current_char == '{':
                            self.advance()
                            file_name = self.get_text(stop_chars=['}']).value
                            if self.current_char == '}':
                                self.advance()
                            return Token(TokenType.EPSFBOX, f'\\epsfbox{{{file_name}}}', start_line, start_col)
                        return Token(TokenType.EPSFBOX, full_cmd, start_line, start_col)
                    elif command == 'footnotetext':
                        return self.get_footnote_content(command)
                    elif command == 'linebreak':
                        return Token(TokenType.LINEBREAK, full_cmd, start_line, start_col)
                    elif command == 'textit':
                        self.skip_whitespace()
                        if self.current_char == '{':
                            self.advance()
                            content = self.get_braced_content()
                            if self.current_char == '}':
                                self.advance()
                            cleaned_content = self.clean_textit_content(content)
                            return Token(TokenType.TEXTIT, f'\\textit{{{cleaned_content}}}', start_line, start_col)
                        return Token(TokenType.TEXTIT, full_cmd, start_line, start_col)
                    elif command == 'noindent':
                        return Token(TokenType.NOINDENT, full_cmd, start_line, start_col)
                    elif command == 'smallskip':
                        return Token(TokenType.SMALLSKIP, full_cmd, start_line, start_col)
                    elif command == 'textbf':
                        self.skip_whitespace()
                        if self.current_char == '{':
                            self.advance()
                            content = self.get_braced_content()
                            if self.current_char == '}':
                                self.advance()
                            return Token(TokenType.TEXTBF, f'\\textbf{{{content}}}', start_line, start_col)
                        return Token(TokenType.TEXTBF, full_cmd, start_line, start_col)
                    elif command == 'vspace':
                        self.skip_whitespace()
                        if self.current_char == '{':
                            self.advance()
                            content = self.get_text(stop_chars=['}']).value
                            if self.current_char == '}':
                                self.advance()
                            return Token(TokenType.VSPACE, f'\\vspace{{{content}}}', start_line, start_col)
                        elif self.current_char == '*':
                            self.advance()
                            return Token(TokenType.VSPACE, '\\vspace*', start_line, start_col)
                        return Token(TokenType.VSPACE, full_cmd, start_line, start_col)
                    elif command in self.theorem_commands:
                        return Token(TokenType[command.upper()], full_cmd, start_line, start_col)
                    return Token(TokenType.COMMAND, full_cmd, start_line, start_col)
                elif self.current_char == '-':
                    self.advance()
                    return self.get_text()
                elif self.current_char == '=':
                    self.advance()
                    return Token(TokenType.ESCAPED, '\\=', start_line, start_col)
                else:
                    char = self.current_char
                    self.advance()
                    return Token(TokenType.ESCAPED, '\\' + char, start_line, start_col)
            elif self.current_char == '{':
                return self.get_group()
            elif self.current_char == '}':
                self.advance()
                return Token(TokenType.BRACES, '}', self.line, self.col-1)
            elif self.current_char == '[':
                self.advance()
                return Token(TokenType.BRACKETS, '[', self.line, self.col-1)
            elif self.current_char == ']':
                self.advance()
                return Token(TokenType.BRACKETS, ']', self.line, self.col-1)
            email = self.get_email()
            if email:
                return Token(TokenType.EMAIL, email, self.line, self.col - len(email))
            text_token = self.get_text()
            if text_token:
                return text_token
            self.error(f"Unexpected character: {self.current_char}")
        return Token(TokenType.EOF, '', self.line, self.col)

def tokenize_latex_from_file(file_path: str):
    """Tokenize a UTF-8 LaTeX file and merge runs of adjacent TEXT tokens.

    The file is read in full, fed to ``Lexer`` and drained until the EOF
    token (which is kept in the result).  Every run of two or more
    consecutive TEXT tokens is then replaced by a single TEXT token whose
    value is the concatenation of the run, stripped of surrounding
    whitespace, positioned at the first token of the run.  A TEXT token
    that stands alone is passed through untouched.

    Args:
        file_path: Path of the LaTeX file to read.

    Returns:
        The list of tokens, or an empty list if the file is missing, is not
        valid UTF-8, or the lexer fails; in those cases a message is printed.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as source:
            latex_text = source.read()

        lexer = Lexer(latex_text)
        raw_tokens = []
        reached_eof = False
        while not reached_eof:
            current = lexer.get_next_token()
            raw_tokens.append(current)
            reached_eof = current.type == TokenType.EOF

        merged_tokens = []
        text_run = []  # consecutive TEXT tokens that have not been emitted yet

        def flush_text_run():
            # Emit the buffered run: merged if it has several tokens, as-is if one.
            if len(text_run) > 1:
                first = text_run[0]
                joined = ''.join(part.value for part in text_run).strip()
                merged_tokens.append(Token(TokenType.TEXT, joined, first.line, first.col))
            elif text_run:
                merged_tokens.append(text_run[0])
            text_run.clear()

        for current in raw_tokens:
            if current.type == TokenType.TEXT:
                text_run.append(current)
            else:
                flush_text_run()
                merged_tokens.append(current)
        flush_text_run()

        return merged_tokens

    except FileNotFoundError:
        print(f"Ошибка: Файл '{file_path}' не найден")
        return []
    except UnicodeDecodeError:
        print(f"Ошибка: Не удалось декодировать файл '{file_path}'. Убедитесь, что файл в кодировке UTF-8")
        return []
    except Exception as e:
        print(f"Ошибка лексера: {e}")
        return []

if __name__ == "__main__":
    # Interactive smoke run: tokenize a user-supplied file and dump every token.
    file_path = input("Введите путь к файлу с LaTeX: ")
    tokens = tokenize_latex_from_file(file_path)
    if tokens:
        # One print call: header, one token per line, then the total.
        print("\nНайденные токены:", *tokens, f"\nВсего токенов: {len(tokens)}", sep="\n")


Найденные токены:
Token(TokenType.COMMAND", "\newcommand", line=1, col=1)
Token(TokenType.GROUP", "{}", line=1, col=12)
Token(TokenType.COMMAND", "\bs", line=1, col=13)
Token(TokenType.BRACES", "}", line=1, col=16)
Token(TokenType.GROUP", "{\boldsymbol{\sigma}}", line=1, col=17)
Token(TokenType.COMMAND", "\newcommand", line=2, col=1)
Token(TokenType.GROUP", "{}", line=2, col=12)
Token(TokenType.COMMAND", "\bb", line=2, col=13)
Token(TokenType.BRACES", "}", line=2, col=16)
Token(TokenType.GROUP", "{\mathbf{b}}", line=2, col=17)
Token(TokenType.COMMENT", "\newcommand{\ba}{{\mathbf{a}}}", line=5, col=1)
Token(TokenType.COMMAND", "\newcommand", line=7, col=1)
Token(TokenType.GROUP", "{}", line=7, col=12)
Token(TokenType.COMMAND", "\bx", line=7, col=13)
Token(TokenType.BRACES", "}", line=7, col=16)
Token(TokenType.GROUP", "{\mathbf{x}}", line=7, col=17)
Token(TokenType.COMMAND", "\newcommand", line=8, col=1)
Token(TokenType.GROUP", "{}", line=8, col=12)
Token(TokenType.COMMAND", "\bbf", li

In [None]:
#Парсер

@dataclass
class Node:
    """A node of the abstract syntax tree produced by the parser.

    ``str(node)`` renders the subtree as an indented outline, one node per
    line, each line terminated by a newline.
    """
    node_type: str            # kind of node, e.g. "Document", "Text", "Braces"
    value: str                # payload; coerced to str in __post_init__
    children: List['Node']    # child nodes, in document order
    id: Optional[str] = None  # short identifier; generated when not supplied

    def __post_init__(self):
        # Only generate an id when the caller did not provide one, so an
        # explicitly passed id is no longer silently overwritten.
        if self.id is None:
            self.id = str(uuid.uuid4())[-8:]  # last 8 characters of a UUID4
        # Guarantee that value is always a string.
        if not isinstance(self.value, str):
            self.value = str(self.value)

    def __str__(self, level=0):
        """Return the outline of this subtree, indented by ``level`` steps."""
        # Closing-brace nodes are noise: render only their children, at the
        # same level, as if the brace node were not there.
        if self.node_type == "Braces" and self.value == "}":
            return "".join(child.__str__(level) for child in self.children)

        indent = "  " * level
        # Long values are cut to 50 characters to keep the outline readable.
        short_value = self.value[:50] + "..." if len(self.value) > 50 else self.value
        header = f"{indent}└── {self.node_type} (ID: {self.id}): {short_value} [Children: {len(self.children)}]\n"
        return header + "".join(child.__str__(level + 1) for child in self.children)

class Parser:
    """Recursive-descent parser that builds a ``Node`` tree from lexer tokens.

    ``parse_statement`` consumes the current token (and, for container-like
    tokens, the following tokens that belong to it) and returns the matching
    node.  Author names found in AUTHOR tokens are also accumulated in
    ``self.authors`` as ``{'name': str, 'indices': [int, ...]}`` dicts.
    """

    # The dispatch tables below are keyed by TokenType member *names*, so
    # defining this class does not require TokenType to be defined yet.

    # Token types that map one-to-one onto a childless node.
    _LEAF_NODES = {
        "TEXT": "Text",
        "EMAIL": "Email",
        "DOCUMENT": "DocumentCommand",
        "DEF": "Definition",
        "TITLE": "Title",
        "FOOTNOTETEXT": "Footnotetext",
        "COMMENT": "Comment",
        "BRACES": "Braces",
        "BRACKETS": "Brackets",
        "ANNOTATION": "Annotation",
        "KEYWORDS": "Keywords",
        "DOI": "DOI",
        "LABEL": "Label",
        "PAGESTYLE": "Pagestyle",
        "ESCAPED": "Escaped",
        "FORMULA": "Formula",
        "EPSFXSIZE": "Epsfxsize",
        "EPSFBOX": "Epsfbox",
        "REF": "Ref",
        "CITE": "Cite",
        "LINEBREAK": "Linebreak",
        "SPECIAL_SYMBOL": "SpecialSymbol",
        "NOINDENT": "Noindent",
        "SMALLSKIP": "Smallskip",
        "VSPACE": "Vspace",
    }

    # Token types whose node adopts the following tokens for as long as their
    # type is in the accepted set: name -> (node type, accepted type names).
    _ADOPTING_NODES = {
        "COMMAND": ("Command", frozenset({"TEXT", "GROUP", "BRACES", "BRACKETS", "EMAIL"})),
        "FOOTNOTE": ("Footnote", frozenset({"TEXT", "GROUP", "COMMAND", "EMAIL"})),
        "GROUP": ("Group", frozenset({"TEXT", "COMMAND", "FORMULA", "EMAIL"})),
        "BIBITEM": ("Bibitem", frozenset({"TEXT", "GROUP", "COMMAND", "EMAIL"})),
        "TEXTIT": ("Textit", frozenset({"TEXT", "GROUP", "EMAIL"})),
        "TEXTBF": ("Textbf", frozenset({"TEXT", "GROUP", "EMAIL"})),
    }

    # Token types whose node adopts everything up to (not including) the first
    # token whose type is in the stop set: name -> (node type, stop type names).
    _BOUNDED_NODES = {
        "SUBSECTION": ("Subsection", frozenset({"SECTION", "SUBSECTION", "EOF"})),
        "PROOF": ("Proof", frozenset({"SECTION", "SUBSECTION", "EOF"})),
        "SECTION": ("Section", frozenset({"SECTION", "EOF"})),
        "ITEM": ("Item", frozenset({"ITEM", "EOF"})),
        "FIGURE": ("Figure", frozenset({"SECTION", "SUBSECTION", "EOF"})),
    }

    # List-like environments: their children are the ITEM tokens that follow.
    _LIST_NODES = {
        "ENUMERATE": "Enumerate",
        "ITEMIZE": "Itemize",
        "CENTER": "Center",
    }

    _ITEM_STOP = frozenset({"ITEM", "EOF"})
    _DOCUMENT_STOP = frozenset({"EOF"})

    # Author-like command followed by a brace group without nested '}'.
    _AUTHOR_PATTERN = r'\\(aut|autkol|Au|Aue|author|Author|index)\{([^}]*)\}'

    def __init__(self, tokens: List['Token']):
        """Position the parser on the first token of ``tokens`` (if any)."""
        self.tokens = tokens
        self.pos = 0
        self.current_token = self.tokens[0] if self.tokens else None
        self.email_regex = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
        self.authors = []

    def error(self, message):
        """Raise an ``Exception`` describing a parse error at the current token."""
        if self.current_token:
            raise Exception(f"Ошибка парсинга на токене {self.current_token}: {message}")
        else:
            raise Exception(f"Ошибка парсинга: {message}")

    def advance(self):
        """Move to the next token; ``current_token`` becomes None past the end."""
        self.pos += 1
        if self.pos < len(self.tokens):
            self.current_token = self.tokens[self.pos]
        else:
            self.current_token = None

    def expect(self, token_type: 'TokenType'):
        """Consume and return the current token, which must be of ``token_type``.

        Raises:
            Exception: if the input is exhausted or the type differs.
        """
        if self.current_token is None or self.current_token.type != token_type:
            self.error(f"Ожидался токен типа {token_type}, получен {self.current_token.type if self.current_token else 'None'}")
        token = self.current_token
        self.advance()
        return token

    def parse(self) -> Node:
        """Parse all tokens up to EOF and return the root "Document" node."""
        document_node = Node("Document", "[empty]", [])
        return self._adopt_until(document_node, self._DOCUMENT_STOP)

    def parse_statement(self) -> Optional[Node]:
        """Parse one statement starting at the current token.

        Returns:
            The node for the statement, or None when the input is exhausted
            or the current token is EOF.

        Raises:
            Exception: if the token type is not handled by the parser.
        """
        token = self.current_token
        if token is None:
            return None

        kind = token.type.name

        if kind in self._LEAF_NODES:
            self.advance()
            return Node(self._LEAF_NODES[kind], token.value, [])

        if kind in self._ADOPTING_NODES:
            node_type, accepted = self._ADOPTING_NODES[kind]
            self.advance()
            return self._adopt_while(Node(node_type, token.value, []), accepted)

        if kind in self._BOUNDED_NODES:
            node_type, stop = self._BOUNDED_NODES[kind]
            self.advance()
            return self._adopt_until(Node(node_type, token.value, []), stop)

        if kind in self._LIST_NODES:
            self.advance()
            return self._parse_list(Node(self._LIST_NODES[kind], token.value, []))

        if kind == "AUTHOR":
            self.advance()
            return self._parse_author(token)

        if kind == "EOF":
            self.advance()
            return None

        self.error(f"Неизвестный тип токена: {token.type}")
        return None

    def _adopt_while(self, node: Node, accepted) -> Node:
        """Append parsed statements to ``node`` while the token type is in ``accepted``."""
        while self.current_token and self.current_token.type.name in accepted:
            child = self.parse_statement()
            if child:
                node.children.append(child)
        return node

    def _adopt_until(self, node: Node, stop) -> Node:
        """Append parsed statements to ``node`` until the token type is in ``stop``."""
        while self.current_token and self.current_token.type.name not in stop:
            child = self.parse_statement()
            if child:
                node.children.append(child)
        return node

    def _parse_list(self, list_node: Node) -> Node:
        """Attach each following ITEM token, with its content, to ``list_node``."""
        while self.current_token and self.current_token.type.name == "ITEM":
            item_token = self.current_token
            self.advance()
            item_node = Node("Item", item_token.value, [])
            list_node.children.append(item_node)
            self._adopt_until(item_node, self._ITEM_STOP)
        return list_node

    def _parse_author(self, token) -> Node:
        """Build the "Author" node for ``token`` and record the author name."""
        author_match = re.match(self._AUTHOR_PATTERN, token.value)
        if author_match is None:
            # Unrecognised author syntax: keep the raw token value instead of
            # silently dropping the already-consumed token.
            return Node("Author", token.value, [])

        author_name = author_match.group(2).strip()
        indices = []
        if '^' in author_name:
            # "Name$^1$" -> the name is the part before the first '^'; every
            # later part may carry a numeric affiliation index.
            name_part, *index_parts = author_name.split('^')
            author_name = name_part.strip()
            for part in index_parts:
                index = part.strip('$,}')
                if index.isdigit():
                    indices.append(int(index))
        if author_name:
            self._register_author(author_name, indices)
        return Node("Author", author_name or token.value, [])

    def _register_author(self, author_name: str, indices: List[int]) -> None:
        """Merge ``indices`` into an existing author entry or add a new one."""
        # Names are compared ignoring spaces and letter case.
        key = author_name.replace(' ', '').lower()
        for author in self.authors:
            if author['name'].replace(' ', '').lower() == key:
                author['indices'] = sorted(set(author['indices']) | set(indices))
                return
        self.authors.append({'name': author_name, 'indices': indices})

def parse_latex(tokens: List['Token']) -> Optional[Node]:
    """Parse ``tokens`` into an AST.

    Args:
        tokens: Token list to hand to ``Parser``.

    Returns:
        The root node returned by ``Parser.parse``, or None if parsing raised
        an exception (the error message is printed).
    """
    parser = Parser(tokens)
    try:
        return parser.parse()
    except Exception as e:
        # Best-effort boundary: report the failure and signal it with None.
        print(f"Ошибка парсинга: {e}")
        return None

if __name__ == "__main__":
    # Smoke run: tokenize and parse "input.tex", then dump the resulting tree.
    tokens = tokenize_latex_from_file("input.tex")
    ast = parse_latex(tokens)
    if ast:
        print("\nАбстрактное синтаксическое дерево (AST):", ast, sep="\n")


Абстрактное синтаксическое дерево (AST):
└── Document (ID: cea880d6): [empty] [Children: 68]
  └── Definition (ID: 994ea263): \def\stat{} [Children: 0]
  └── Text (ID: 104f4c57): zatsar [Children: 0]
  └── Definition (ID: 56901023): \def\tit{} [Children: 0]
  └── Text (ID: c558a530): СИСТЕМА СИТУАЦИОННОГО УПРАВЛЕНИЯ КАК МУЛЬТИСЕРВИСН... [Children: 0]
  └── Definition (ID: af66b628): \def\titkol{} [Children: 0]
  └── Text (ID: d220bdd1): Система ситуационного управления как мультисервисн... [Children: 0]
  └── Definition (ID: 8fcb2213): \def\aut{} [Children: 0]
  └── Text (ID: b97cdcf1): А. [Children: 0]
  └── Escaped (ID: daffb256): \, [Children: 0]
  └── Text (ID: 37b082fb): А. Зацаринный$^1$, А. [Children: 0]
  └── Escaped (ID: cd56a1a5): \, [Children: 0]
  └── Text (ID: bc8ffdea): П. Сучков$^2$ [Children: 0]
  └── Definition (ID: 7051e8d7): \def\autkol{} [Children: 0]
  └── Text (ID: 527d5746): А. [Children: 0]
  └── Escaped (ID: a29dceaf): \, [Children: 0]
  └── Text (ID: 3bae60d8

In [None]:
#Генератор xml файла

import xml.etree.ElementTree as ET
import xml.dom.minidom as minidom
from typing import Optional

class XMLGenerator:
    """Serialize a LaTeX AST into a ``<papers>`` XML document.

    AST nodes are accessed only through three attributes: ``node_type``
    (a string), ``value`` and ``children`` (an iterable of nodes).

    Attribute values are handed to ElementTree as plain text. ElementTree
    escapes ``&``, ``<``, ``>`` and ``"`` itself when the tree is serialized,
    so escaping them beforehand would produce ``&amp;amp;`` in the output.
    """

    # Node types rendered as <command value="\<lower-cased type>">.
    _SIMPLE_COMMANDS = (
        "Textit", "Textbf", "SpecialSymbol", "Linebreak", "Noindent",
        "Smallskip", "Vspace", "Epsfxsize", "Epsfbox", "Ref", "Cite",
        "Footnotetext", "Footnote",
    )

    def __init__(self, ast: 'Node'):
        self.ast = ast
        self.id_counter = 1
        # Author nodes of the most recently processed Document node.
        self.authors = []

    def generate_id(self) -> str:
        """Return the next sequential element id (``id1``, ``id2``, ...)."""
        id_str = f"id{self.id_counter}"
        self.id_counter += 1
        return id_str

    def escape_xml(self, value: object) -> str:
        """Return ``value`` as a string with XML special characters escaped.

        Kept for callers that assemble XML text by hand. It is deliberately
        not applied to ElementTree attribute values (see the class docstring).
        """
        text = str(value) if not isinstance(value, str) else value
        return (
            text.replace("&", "&amp;")
                .replace("<", "&lt;")
                .replace(">", "&gt;")
                .replace('"', "&quot;")
                .replace("'", "&apos;")
        )

    @staticmethod
    def _attr_text(value: object) -> str:
        """Convert ``value`` to the ``str`` ElementTree requires for attributes."""
        return value if isinstance(value, str) else str(value)

    def process_node(self, node: 'Node', parent: ET.Element):
        """Recursively append the XML representation of ``node`` to ``parent``.

        Args:
            node: AST node to convert.
            parent: Element that receives the generated sub-elements.
        """
        node_type = node.node_type
        attr = self._attr_text

        if node_type == "Document":
            paper = ET.SubElement(parent, "paper")
            metadata = ET.SubElement(paper, "metadata")
            fulltext = ET.SubElement(paper, "fulltext")
            literature = ET.SubElement(fulltext, "literature")

            # Rebuild the author list for this document instead of appending,
            # so a repeated generate_xml() call does not duplicate authors.
            self.authors = [
                child for child in node.children if child.node_type == "Author"
            ]
            if self.authors:
                authors_elem = ET.SubElement(metadata, "authors")
                for author_node in self.authors:
                    ET.SubElement(authors_elem, "author", value=attr(author_node.value))

            # Route the remaining children: metadata, bibliography or body.
            for child in node.children:
                if child.node_type in ("Email", "Title", "Annotation", "Keywords"):
                    self.process_node(child, metadata)
                elif child.node_type == "Bibitem":
                    self.process_node(child, literature)
                elif child.node_type != "Author":  # authors were emitted above
                    self.process_node(child, fulltext)

        elif node_type == "Email":
            email = ET.SubElement(parent, "email")
            ET.SubElement(email, "text", value=attr(node.value))

        elif node_type == "Title":
            title = ET.SubElement(parent, "title")
            ET.SubElement(title, "text", value=attr(node.value))

        elif node_type == "Annotation":
            ET.SubElement(parent, "annotation", value=attr(node.value))

        elif node_type == "Keywords":
            keywords = ET.SubElement(parent, "keywords")
            ET.SubElement(keywords, "text", value=attr(node.value))

        elif node_type == "Section":
            section = ET.SubElement(parent, "section", id=self.generate_id())
            text_elem = ET.SubElement(section, "text")
            ET.SubElement(text_elem, "value", value=attr(node.value))
            for child in node.children:
                self.process_node(child, section)

        elif node_type == "Subsection":
            subsection = ET.SubElement(parent, "subsection")
            for child in node.children:
                self.process_node(child, subsection)

        elif node_type == "Text":
            if node.value and isinstance(node.value, str):
                # Separate backslashes, "\,", "\~" and "~" from the plain text.
                for part in self.split_latex_content(node.value):
                    if part == "~" or part.startswith("\\"):
                        ET.SubElement(parent, "escaped", value=attr(part))
                    else:
                        text = ET.SubElement(parent, "text")
                        ET.SubElement(text, "value", value=attr(part))
            else:
                # Empty or non-string value: emitted as-is after str().
                text = ET.SubElement(parent, "text")
                ET.SubElement(text, "value", value=attr(node.value))

        elif node_type == "Formula":
            ET.SubElement(parent, "formula", id=self.generate_id(), value=attr(node.value))

        elif node_type == "Bibitem":
            item = ET.SubElement(parent, "item", id=self.generate_id(), value="\\bibitem")
            ET.SubElement(item, "text", value=attr(node.value))
            for child in node.children:
                self.process_node(child, item)

        elif node_type in ("Enumerate", "Itemize", "Center"):
            container = ET.SubElement(parent, node_type.lower())
            for child in node.children:
                if child.node_type == "Item":
                    item = ET.SubElement(container, "item")
                    for item_child in child.children:
                        self.process_node(item_child, item)
                else:
                    # Children that are not Item nodes (for example the
                    # content of a Center) used to be dropped; keep them.
                    self.process_node(child, container)

        elif node_type == "Figure":
            figure = ET.SubElement(parent, "figure", id=self.generate_id(), value=attr(node.value))
            for child in node.children:
                self.process_node(child, figure)

        elif node_type in self._SIMPLE_COMMANDS:
            command = ET.SubElement(parent, "command", value=attr(f"\\{node_type.lower()}"))
            if node.value:
                ET.SubElement(command, "text", value=attr(node.value))
            for child in node.children:
                self.process_node(child, command)

        elif node_type == "Command":
            command = ET.SubElement(parent, "command", value=attr(node.value))
            for child in node.children:
                # Each argument gets its own <text> wrapper.
                wrapper = ET.SubElement(command, "text")
                self.process_node(child, wrapper)

        elif node_type == "Group":
            group = ET.SubElement(parent, "group")
            if node.value:
                ET.SubElement(group, "text", value=attr(node.value))
            for child in node.children:
                self.process_node(child, group)

        # NOTE(review): any node type not listed above produces no output.

    def split_latex_content(self, text: str) -> list:
        """Split ``text`` into plain-text runs and LaTeX special tokens.

        ``\\,`` and ``\\~`` are returned as two-character tokens, any other
        backslash is returned on its own (the command name that follows stays
        in the next text run), and ``~`` is returned as a separate token.
        Empty text runs are omitted.
        """
        parts = []
        buffer = []  # characters of the text run currently being collected

        def flush():
            if buffer:
                parts.append("".join(buffer))
                buffer.clear()

        i = 0
        length = len(text)
        while i < length:
            char = text[i]
            if char == "\\":
                flush()
                # The slice is empty at end of input, which selects width 1.
                width = 2 if text[i + 1:i + 2] in (",", "~") else 1
                parts.append(text[i:i + width])
                i += width
            elif char == "~":
                flush()
                parts.append("~")
                i += 1
            else:
                buffer.append(char)
                i += 1
        flush()
        return parts

    def generate_xml(self, output_file: Optional[str] = None) -> str:
        """Render the AST as pretty-printed XML.

        Args:
            output_file: Optional path; when given, the XML is also written
                there as UTF-8.

        Returns:
            The XML document as a string, with whitespace-only lines removed.
        """
        root = ET.Element("papers")
        self.process_node(self.ast, root)

        rough_string = ET.tostring(root, encoding="unicode")
        parsed = minidom.parseString(rough_string)
        # toprettyxml returns bytes when an encoding is given.
        pretty_xml = parsed.toprettyxml(indent="  ", encoding="utf-8").decode("utf-8")
        pretty_xml = "\n".join(line for line in pretty_xml.split("\n") if line.strip())

        if output_file:
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(pretty_xml)

        return pretty_xml

def generate_xml_from_ast(ast: 'Node', output_file: Optional[str] = None) -> str:
    """Convenience wrapper: build an ``XMLGenerator`` for ``ast`` and run it.

    Args:
        ast: Root node of the AST to convert.
        output_file: Optional path passed on to ``XMLGenerator.generate_xml``.

    Returns:
        The generated XML text, or an empty string if construction or
        generation raised (the error is printed to stdout).
    """
    try:
        xml_text = XMLGenerator(ast).generate_xml(output_file)
    except Exception as exc:
        print(f"Ошибка генерации XML: {exc}")
        xml_text = ""
    return xml_text

if __name__ == "__main__":
    # Demo run: tokenize and parse "input.tex", then convert the AST to XML.
    tokens = tokenize_latex_from_file("input.tex")
    ast = parse_latex(tokens)
    # parse_latex returns None when parsing raised; skip generation in that case.
    if ast:
        # The XML is written to "output1.xml" and also returned for printing.
        xml_output = generate_xml_from_ast(ast, "output1.xml")
        print("\nСгенерированный XML:")
        print(xml_output)


Сгенерированный XML:
<?xml version="1.0" encoding="utf-8"?>
<papers>
  <paper>
    <metadata>
      <authors>
        <author value="\index{}"/>
        <author value="\index{}"/>
        <author value="\index{}"/>
        <author value="\index{}"/>
      </authors>
      <annotation value="\Abst{Рассматриваются системотехнические подходы к созданию унифицированной системы управ ле ния в среде облачных технологий в виде совокупности сервисов. Предложенный подход на основе процессной пятистадийной модели позволяет определить общий вид (прототип) унифицированной системы ситуационного управления с учетом всех видов внешних и внутренних информационных взаимодействий в составе иерархической системы управления. Показано, что унифицированная система должна обладать средствами настройки (локализации) на конкретную об ласть применения. Предложен перечень сервисов локализации и сервисов обеспечения основных функций сис те мы управ ле ния. Ожидается, что такие сервисы будут востребованы в широко

In [None]:
#Генератор всех файлов 

class XMLGenerator:
    """Serialize a LaTeX AST into a ``<papers>`` XML file.

    AST nodes are accessed only through three attributes: ``node_type``
    (a string), ``value`` and ``children`` (an iterable of nodes).

    Attribute values are handed to ElementTree as plain text. ElementTree
    escapes ``&``, ``<``, ``>`` and ``"`` itself when the tree is serialized,
    so escaping them beforehand would produce ``&amp;amp;`` in the output.
    """

    # Node types rendered as <command value="\<lower-cased type>">.
    _SIMPLE_COMMANDS = (
        "Textit", "Textbf", "SpecialSymbol", "Linebreak", "Noindent",
        "Smallskip", "Vspace", "Epsfxsize", "Epsfbox", "Ref", "Cite",
        "Footnotetext", "Footnote",
    )

    def __init__(self, ast: 'Node'):
        self.ast = ast
        self.id_counter = 1

    def generate_id(self) -> str:
        """Return the next sequential element id (``id1``, ``id2``, ...)."""
        id_str = f"id{self.id_counter}"
        self.id_counter += 1
        return id_str

    def escape_xml(self, value: object) -> str:
        """Return ``value`` as a string with XML special characters escaped.

        Kept for callers that assemble XML text by hand. It is deliberately
        not applied to ElementTree attribute values (see the class docstring).
        """
        text = str(value) if not isinstance(value, str) else value
        return (
            text.replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
            .replace('"', "&quot;")
            .replace("'", "&apos;")
        )

    @staticmethod
    def _attr_text(value: object) -> str:
        """Convert ``value`` to the ``str`` ElementTree requires for attributes."""
        return value if isinstance(value, str) else str(value)

    def split_latex_specials(self, text: str) -> list:
        """Split ``text`` into plain-text runs and LaTeX special tokens.

        ``\\,`` and ``\\~`` are returned as two-character tokens, any other
        backslash is returned on its own (the command name that follows stays
        in the next text run), and ``~`` is returned as a separate token.
        Empty text runs are omitted.
        """
        parts = []
        buffer = []  # characters of the text run currently being collected

        def flush():
            if buffer:
                parts.append("".join(buffer))
                buffer.clear()

        i = 0
        length = len(text)
        while i < length:
            char = text[i]
            if char == "\\":
                flush()
                # The slice is empty at end of input, which selects width 1.
                width = 2 if text[i + 1:i + 2] in (",", "~") else 1
                parts.append(text[i:i + width])
                i += width
            elif char == "~":
                flush()
                parts.append("~")
                i += 1
            else:
                buffer.append(char)
                i += 1
        flush()
        return parts

    def process_node(self, node: 'Node', parent: ET.Element):
        """Recursively append the XML representation of ``node`` to ``parent``.

        Args:
            node: AST node to convert.
            parent: Element that receives the generated sub-elements.

        Raises:
            Exception: Any error raised during conversion is reported on
                stderr once (for the node where it occurred) and re-raised.
        """
        attr = self._attr_text
        try:
            node_type = node.node_type

            if node_type == "Document":
                paper = ET.SubElement(parent, "paper")
                metadata = ET.SubElement(paper, "metadata")
                fulltext = ET.SubElement(paper, "fulltext")
                literature = ET.SubElement(fulltext, "literature")

                # Route each child: metadata, bibliography or body text.
                for child in node.children:
                    if child.node_type in ("Author", "Email", "Title", "Annotation", "Keywords"):
                        self.process_node(child, metadata)
                    elif child.node_type == "Bibitem":
                        self.process_node(child, literature)
                    else:
                        self.process_node(child, fulltext)

            elif node_type == "Author":
                ET.SubElement(parent, "author", value=attr(node.value))

            elif node_type == "Email":
                affiliation = ET.SubElement(parent, "affiliation")
                braces = ET.SubElement(affiliation, "braces")
                ET.SubElement(braces, "text", value=attr(node.value))

            elif node_type == "Title":
                # NOTE(review): the title is emitted as a bare <text> element
                # with no <title> wrapper; confirm this is the intended schema.
                ET.SubElement(parent, "text", value=attr(node.value))

            elif node_type == "Annotation":
                ET.SubElement(parent, "annotation", value=attr(node.value))

            elif node_type == "Keywords":
                keywords = ET.SubElement(parent, "keywords")
                braces = ET.SubElement(keywords, "braces")
                ET.SubElement(braces, "text", value=attr(node.value))

            elif node_type == "Section":
                section = ET.SubElement(parent, "section", id=self.generate_id())
                braces = ET.SubElement(section, "braces")
                ET.SubElement(braces, "text", value=attr(node.value))
                for child in node.children:
                    self.process_node(child, section)

            elif node_type == "Subsection":
                subsection = ET.SubElement(parent, "subsection")
                for child in node.children:
                    self.process_node(child, subsection)

            elif node_type == "Text":
                if node.value and isinstance(node.value, str):
                    # Separate backslashes, "\,", "\~" and "~" from plain text.
                    for part in self.split_latex_specials(node.value):
                        tag = "escaped" if part == "~" or part.startswith("\\") else "text"
                        ET.SubElement(parent, tag, value=attr(part))
                else:
                    # Empty or non-string value: emitted as-is after str().
                    ET.SubElement(parent, "text", value=attr(node.value))

            elif node_type == "Formula":
                ET.SubElement(parent, "formula", id=self.generate_id(), value=attr(node.value))

            elif node_type in ("Theorem", "Lemma", "Consequence"):
                element = ET.SubElement(parent, node_type.lower(), id=self.generate_id())
                ET.SubElement(element, "text", value=attr(node.value))
                for child in node.children:
                    if child.node_type == "Proof":
                        # A proof is flattened into an <evidence> element.
                        evidence = ET.SubElement(element, "evidence")
                        for proof_child in child.children:
                            self.process_node(proof_child, evidence)
                    else:
                        self.process_node(child, element)

            elif node_type == "Bibitem":
                item = ET.SubElement(parent, "item", id=self.generate_id(), value="\\bibitem")
                braces = ET.SubElement(item, "braces")
                ET.SubElement(braces, "text", value=attr(node.value))
                for child in node.children:
                    self.process_node(child, item)

            elif node_type in ("Enumerate", "Itemize", "Center"):
                container = ET.SubElement(parent, node_type.lower())
                for child in node.children:
                    if child.node_type == "Item":
                        item = ET.SubElement(container, "item")
                        for item_child in child.children:
                            self.process_node(item_child, item)
                    else:
                        # Children that are not Item nodes (for example the
                        # content of a Center) used to be dropped; keep them.
                        self.process_node(child, container)

            elif node_type == "Figure":
                figure = ET.SubElement(parent, "figure", id=self.generate_id(), value=attr(node.value))
                for child in node.children:
                    self.process_node(child, figure)

            elif node_type in self._SIMPLE_COMMANDS:
                command = ET.SubElement(parent, "command", value=attr(f"\\{node_type.lower()}"))
                if node.value:
                    braces = ET.SubElement(command, "braces")
                    ET.SubElement(braces, "text", value=attr(node.value))
                for child in node.children:
                    self.process_node(child, command)

            elif node_type == "Command":
                command = ET.SubElement(parent, "command", value=attr(node.value))
                for child in node.children:
                    # Each argument gets its own <braces> wrapper.
                    braces = ET.SubElement(command, "braces")
                    self.process_node(child, braces)

            elif node_type == "Group":
                braces = ET.SubElement(parent, "braces")
                if node.value:
                    ET.SubElement(braces, "text", value=attr(node.value))
                for child in node.children:
                    self.process_node(child, braces)

            # NOTE(review): any node type not listed above produces no output.

        except Exception as e:
            # The exception passes through this handler at every recursion
            # level while unwinding; report it only once, at the failing node.
            if not getattr(e, "_xml_node_reported", False):
                error_msg = f"Ошибка обработки узла {node.node_type if hasattr(node, 'node_type') else 'неизвестный'}: {str(e)}"
                print(error_msg, file=sys.stderr)
                traceback.print_exc()
                try:
                    e._xml_node_reported = True
                except AttributeError:
                    pass  # exception without a __dict__: accept repeated reports
            raise

    def generate_xml(self, output_file: str) -> str:
        """Render the AST as pretty-printed XML and write it to ``output_file``.

        Missing parent directories of ``output_file`` are created.

        Args:
            output_file: Destination path of the UTF-8 encoded XML file.

        Returns:
            The XML document as a string, with whitespace-only lines removed.

        Raises:
            Exception: If the AST produced no elements, the directory or file
                could not be written, or the written file is missing or empty.
                The error is printed to stderr before being re-raised.
        """
        try:
            root = ET.Element("papers")
            self.process_node(self.ast, root)
            if len(root) == 0:
                raise Exception("XML-дерево пустое: AST не содержит узлов")
            rough_string = ET.tostring(root, encoding="unicode")
            parsed = minidom.parseString(rough_string)
            # toprettyxml returns bytes when an encoding is given.
            pretty_xml = parsed.toprettyxml(indent="  ", encoding="utf-8").decode("utf-8")
            pretty_xml = "\n".join(line for line in pretty_xml.split("\n") if line.strip())

            output_dir = os.path.dirname(output_file)
            if output_dir and not os.path.exists(output_dir):
                try:
                    # exist_ok covers the directory appearing after the check.
                    os.makedirs(output_dir, exist_ok=True)
                    print(f"Создана директория: {output_dir}")
                except Exception as e:
                    raise Exception(f"Не удалось создать директорию {output_dir}: {str(e)}") from e

            try:
                with open(output_file, "w", encoding="utf-8") as f:
                    f.write(pretty_xml)
            except IOError as e:
                raise Exception(f"Ошибка записи в файл {output_file}: {str(e)}") from e

            # Sanity checks on the file that was just written.
            if not os.path.exists(output_file):
                raise Exception(f"Файл {output_file} не был создан")
            if os.path.getsize(output_file) == 0:
                raise Exception(f"Файл {output_file} создан пустым")

            print(f"Успешно создан XML-файл: {output_file}")
            return pretty_xml

        except Exception as e:
            error_msg = f"Ошибка генерации XML для {output_file}: {str(e)}"
            print(error_msg, file=sys.stderr)
            traceback.print_exc()
            raise


def process_latex_files(input_dir: str, output_dir: str) -> Tuple[int, int, List[dict]]:
    """Convert every ``.tex`` file under ``input_dir`` to XML in ``output_dir``.

    The directory layout below ``input_dir`` is mirrored in ``output_dir``.
    A failure on one file is recorded and does not stop the run. A summary
    is written to ``processing_report.txt`` inside ``output_dir``.

    Args:
        input_dir: Directory that is walked recursively for ``.tex`` files.
        output_dir: Directory receiving the XML files and the report; it is
            created when missing.

    Returns:
        A tuple ``(successful, failed, failed_files)``. Each entry of
        ``failed_files`` is a dict with the keys ``'file'``, ``'error'`` and
        ``'traceback'``.

    Raises:
        Exception: If ``input_dir`` does not exist or ``output_dir`` cannot
            be created.
    """
    if not os.path.exists(input_dir):
        raise Exception(f"Входная директория не существует: {input_dir}")

    if not os.path.exists(output_dir):
        try:
            # exist_ok covers the directory appearing after the check above.
            os.makedirs(output_dir, exist_ok=True)
            print(f"Создана выходная директория: {output_dir}")
        except Exception as e:
            raise Exception(f"Не удалось создать выходную директорию {output_dir}: {str(e)}") from e

    successful = 0
    failed = 0
    failed_files = []

    for root, _, files in os.walk(input_dir):
        tex_files = [f for f in files if f.endswith('.tex')]

        for tex_file in tex_files:
            input_path = os.path.join(root, tex_file)
            # Mirror the input sub-directory structure in the output tree.
            relative_path = os.path.relpath(root, input_dir)
            output_subdir = os.path.join(output_dir, relative_path)
            output_file = os.path.join(output_subdir, f"{os.path.splitext(tex_file)[0]}.xml")

            try:
                print(f"\nОбработка файла: {input_path}")

                if not os.access(input_path, os.R_OK):
                    raise Exception(f"Нет доступа для чтения файла: {input_path}")
                if os.path.getsize(input_path) == 0:
                    raise Exception(f"Файл {input_path} пустой")

                tokens = tokenize_latex_from_file(input_path)
                if not tokens or (len(tokens) == 1 and tokens[0].type == TokenType.EOF):
                    raise Exception("Токенизация вернула пустой результат или только EOF")
                print(f"Получено токенов: {len(tokens)}")

                ast = parse_latex(tokens)
                if not ast or not ast.children:
                    raise Exception("Парсинг вернул пустое AST или AST без дочерних узлов")

                generator = XMLGenerator(ast)
                xml_result = generator.generate_xml(output_file)

                if not xml_result:
                    raise Exception("Генерация XML вернула пустой результат")

                successful += 1
                print(f"Успешно обработан: {input_path}")

            except Exception as e:
                # Record the failure and continue with the next file.
                failed += 1
                error_info = {
                    'file': input_path,
                    'error': str(e),
                    'traceback': traceback.format_exc()
                }
                failed_files.append(error_info)

                print(f"\n❌ Ошибка обработки {input_path}:", file=sys.stderr)
                print(f"Тип ошибки: {type(e).__name__}", file=sys.stderr)
                print(f"Сообщение: {str(e)}", file=sys.stderr)
                print("\nТрассировка:", file=sys.stderr)
                traceback.print_exc()
                print("-" * 50, file=sys.stderr)

    report_file = os.path.join(output_dir, "processing_report.txt")
    try:
        with open(report_file, "w", encoding="utf-8") as f:
            f.write(f"Отчет о обработке\n{'=' * 50}\n\n")
            f.write(f"Входная директория: {input_dir}\n")
            f.write(f"Выходная директория: {output_dir}\n\n")
            f.write(f"Всего обработано файлов: {successful + failed}\n")
            f.write(f"Успешно сконвертировано: {successful}\n")
            f.write(f"Неудачных попыток: {failed}\n")
            total_files = successful + failed
            success_percentage = (successful / total_files * 100) if total_files > 0 else 0
            f.write(f"Процент успешно обработанных файлов: {success_percentage:.2f}%\n\n")

            if failed_files:
                f.write("Детали по ошибкам:\n")
                for i, fail in enumerate(failed_files, 1):
                    f.write(f"\n{i}. Файл: {fail['file']}\n")
                    f.write(f"   Ошибка: {fail['error']}\n")
                    f.write("   Трассировка:\n")
                    f.write(f"{fail['traceback']}\n")
                    # Best-effort diagnostics: re-run tokenizer and parser so
                    # their output lands in the report next to the error.
                    try:
                        tokens = tokenize_latex_from_file(fail['file'])
                        f.write(f"   Токены: {tokens}\n")
                        ast = parse_latex(tokens)
                        f.write(f"   AST: {ast}\n")
                    except Exception:
                        # Not a bare except: Ctrl-C and SystemExit must still
                        # be able to stop the run.
                        f.write("   Не удалось получить токены или AST\n")
                    f.write("-" * 50 + "\n")

        print(f"\nОтчет сохранен в: {report_file}")
    except Exception as e:
        # A report failure must not discard the conversion results.
        print(f"\n❌ Не удалось сохранить отчет: {str(e)}", file=sys.stderr)

    return successful, failed, failed_files


if __name__ == "__main__":
    # Batch driver: convert every .tex file under "output" into XML under
    # "output_xml", then print a summary of the run.
    try:
        input_directory = "output"
        output_directory = "output_xml"

        print("Запуск конвертации LaTeX в XML...")
        successful, failed, failed_files = process_latex_files(input_directory, output_directory)

        print("\nИтоговый отчет:")
        print(f"Всего обработано файлов: {successful + failed}")
        print(f"Успешно сконвертировано: {successful}")
        print(f"Ошибок конвертации: {failed}")
        total_files = successful + failed
        # Guard against division by zero when no .tex file was found.
        success_percentage = (successful / total_files * 100) if total_files > 0 else 0
        print(f"Процент успешно обработанных файлов: {success_percentage:.2f}%")

        if failed > 0:
            print("\nСписок ошибок:")
            for i, fail in enumerate(failed_files, 1):
                print(f"\n{i}. Файл: {fail['file']}")
                print(f"   Ошибка: {fail['error']}")

            # Full tracebacks are in the report written by process_latex_files.
            print(f"\nПодробности ошибок в файле: {os.path.join(output_directory, 'processing_report.txt')}")

        print("\nОбработка завершена.")

    except Exception as e:
        # process_latex_files raises when the input directory is missing or
        # the output directory cannot be created; report it and exit with 1.
        print("\nКритическая ошибка при обработке:", file=sys.stderr)
        print(f"Тип ошибки: {type(e).__name__}", file=sys.stderr)
        print(f"Сообщение: {str(e)}", file=sys.stderr)
        print("\nТрассировка:", file=sys.stderr)
        traceback.print_exc()
        sys.exit(1)

Запуск конвертации LaTeX в XML...
Создана выходная директория: output_xml

Обработка файла: output\Informatics-2011-1\avtor-eng.tex
Получено токенов: 256
Создана директория: output_xml\Informatics-2011-1
Успешно создан XML-файл: output_xml\Informatics-2011-1\avtor-eng.xml
Успешно обработан: output\Informatics-2011-1\avtor-eng.tex

Обработка файла: output\Informatics-2011-1\avtor.tex
Получено токенов: 276
Успешно создан XML-файл: output_xml\Informatics-2011-1\avtor.xml
Успешно обработан: output\Informatics-2011-1\avtor.tex

Обработка файла: output\Informatics-2011-1\basha.tex
Получено токенов: 530
Успешно создан XML-файл: output_xml\Informatics-2011-1\basha.xml
Успешно обработан: output\Informatics-2011-1\basha.tex

Обработка файла: output\Informatics-2011-1\chirkunov.tex
Получено токенов: 1108
Успешно создан XML-файл: output_xml\Informatics-2011-1\chirkunov.xml
Успешно обработан: output\Informatics-2011-1\chirkunov.tex

Обработка файла: output\Informatics-2011-1\chubich.tex
Получено то


❌ Ошибка обработки output\IPI2016-2\ulianov.tex:
Тип ошибки: Exception
Сообщение: Токенизация вернула пустой результат или только EOF

Трассировка:
Traceback (most recent call last):
  File "C:\Users\User\AppData\Local\Temp\ipykernel_23388\1268899625.py", line 257, in process_latex_files
    raise Exception("Токенизация вернула пустой результат или только EOF")
Exception: Токенизация вернула пустой результат или только EOF
--------------------------------------------------


Успешно создан XML-файл: output_xml\IPI2016-2\Ushakov.xml
Успешно обработан: output\IPI2016-2\Ushakov.tex

Обработка файла: output\IPI2016-3\arkhipov.tex
Получено токенов: 1553
Создана директория: output_xml\IPI2016-3
Успешно создан XML-файл: output_xml\IPI2016-3\arkhipov.xml
Успешно обработан: output\IPI2016-3\arkhipov.tex

Обработка файла: output\IPI2016-3\avtor.tex
Получено токенов: 470
Успешно создан XML-файл: output_xml\IPI2016-3\avtor.xml
Успешно обработан: output\IPI2016-3\avtor.tex

Обработка файла: output\IPI2016-3\CHICHAGOV.tex
Получено токенов: 9359
Успешно создан XML-файл: output_xml\IPI2016-3\CHICHAGOV.xml
Успешно обработан: output\IPI2016-3\CHICHAGOV.tex

Обработка файла: output\IPI2016-3\fedoseev.tex
Получено токенов: 648
Успешно создан XML-файл: output_xml\IPI2016-3\fedoseev.xml
Успешно обработан: output\IPI2016-3\fedoseev.tex

Обработка файла: output\IPI2016-3\grusho.tex
Получено токенов: 1132
Успешно создан XML-файл: output_xml\IPI2016-3\grusho.xml
Успешно обработан: 


❌ Ошибка обработки output\ipi2022-4\dukova.tex:
Тип ошибки: Exception
Сообщение: Токенизация вернула пустой результат или только EOF

Трассировка:
Traceback (most recent call last):
  File "C:\Users\User\AppData\Local\Temp\ipykernel_23388\1268899625.py", line 257, in process_latex_files
    raise Exception("Токенизация вернула пустой результат или только EOF")
Exception: Токенизация вернула пустой результат или только EOF
--------------------------------------------------


Успешно создан XML-файл: output_xml\ipi2022-4\Grusho.xml
Успешно обработан: output\ipi2022-4\Grusho.tex

Обработка файла: output\ipi2022-4\Hatskevich.tex
Получено токенов: 2177
Успешно создан XML-файл: output_xml\ipi2022-4\Hatskevich.xml
Успешно обработан: output\ipi2022-4\Hatskevich.tex

Обработка файла: output\ipi2022-4\index-16i.tex
Получено токенов: 4866
Успешно создан XML-файл: output_xml\ipi2022-4\index-16i.xml
Успешно обработан: output\ipi2022-4\index-16i.tex

Обработка файла: output\ipi2022-4\ostrikova.tex
Получено токенов: 1775
Успешно создан XML-файл: output_xml\ipi2022-4\ostrikova.xml
Успешно обработан: output\ipi2022-4\ostrikova.tex

Обработка файла: output\ipi2022-4\peshkova.tex
Получено токенов: 2657
Успешно создан XML-файл: output_xml\ipi2022-4\peshkova.xml
Успешно обработан: output\ipi2022-4\peshkova.tex

Обработка файла: output\ipi2022-4\podgot-eng.tex
Получено токенов: 249
Успешно создан XML-файл: output_xml\ipi2022-4\podgot-eng.xml
Успешно обработан: output\ipi2022-4


❌ Ошибка обработки output\ipi2023-1\adu.tex:
Тип ошибки: Exception
Сообщение: Токенизация вернула пустой результат или только EOF

Трассировка:
Traceback (most recent call last):
  File "C:\Users\User\AppData\Local\Temp\ipykernel_23388\1268899625.py", line 257, in process_latex_files
    raise Exception("Токенизация вернула пустой результат или только EOF")
Exception: Токенизация вернула пустой результат или только EOF
--------------------------------------------------


Создана директория: output_xml\ipi2023-1
Успешно создан XML-файл: output_xml\ipi2023-1\agalarov.xml
Успешно обработан: output\ipi2023-1\agalarov.tex

Обработка файла: output\ipi2023-1\agasand.tex
Получено токенов: 7353
Успешно создан XML-файл: output_xml\ipi2023-1\agasand.xml
Успешно обработан: output\ipi2023-1\agasand.tex

Обработка файла: output\ipi2023-1\arkhipov.tex
Получено токенов: 1245
Успешно создан XML-файл: output_xml\ipi2023-1\arkhipov.xml
Успешно обработан: output\ipi2023-1\arkhipov.tex

Обработка файла: output\ipi2023-1\avtor.tex
Получено токенов: 421
Успешно создан XML-файл: output_xml\ipi2023-1\avtor.xml
Успешно обработан: output\ipi2023-1\avtor.tex

Обработка файла: output\ipi2023-1\bosov.tex
Получено токенов: 5840
Успешно создан XML-файл: output_xml\ipi2023-1\bosov.xml
Успешно обработан: output\ipi2023-1\bosov.tex

Обработка файла: output\ipi2023-1\dulin.tex
Получено токенов: 1580
Успешно создан XML-файл: output_xml\ipi2023-1\dulin.xml
Успешно обработан: output\ipi2023