In [1]:
!pip install pylatexenc

Collecting pylatexenc
  Using cached pylatexenc-2.10-py3-none-any.whl
Installing collected packages: pylatexenc
Successfully installed pylatexenc-2.10


In [2]:
!pip install google-genai

Collecting google-genai
  Using cached google_genai-1.20.0-py3-none-any.whl.metadata (35 kB)
Collecting google-auth<3.0.0,>=2.14.1 (from google-genai)
  Using cached google_auth-2.40.3-py2.py3-none-any.whl.metadata (6.2 kB)
Collecting websockets<15.1.0,>=13.0.0 (from google-genai)
  Using cached websockets-15.0.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting cachetools<6.0,>=2.0.0 (from google-auth<3.0.0,>=2.14.1->google-genai)
  Using cached cachetools-5.5.2-py3-none-any.whl.metadata (5.4 kB)
Collecting pyasn1-modules>=0.2.1 (from google-auth<3.0.0,>=2.14.1->google-genai)
  Using cached pyasn1_modules-0.4.2-py3-none-any.whl.metadata (3.5 kB)
Collecting rsa<5,>=3.1.4 (from google-auth<3.0.0,>=2.14.1->google-genai)
  Using cached rsa-4.9.1-py3-none-any.whl.metadata (5.6 kB)
Collecting pyasn1<0.7.0,>=0.6.1 (from pyasn1-modules>=0.2.1->google-auth<3.0.0,>=2.14.1->google-genai)
  Using cached pyasn1-0.6.1-py3-

In [3]:
import logging
import re
import os
from pylatexenc.latexwalker import (LatexWalker, LatexCharsNode, LatexMacroNode,
                                  LatexEnvironmentNode, LatexMathNode, LatexGroupNode)

import xml.etree.ElementTree as ET
import json
from itertools import groupby
from pathlib import Path

In [4]:
import requests

from google import genai
from google.genai import types as g_types

from helper import _finalize_prompt, _prepare_prompt_for_language, return_comparison, _ask_gemini_model, _ask_gemma_model, _ask_aristote, extract_translated_from_response

In [14]:


class LatexParser:
    """
    Splits LaTeX source into an ordered list of ``(kind, content)`` segments.

    ``kind`` is ``'text'`` for translatable prose and ``'placeholder'`` for
    everything that must be kept verbatim (markup, math, whitespace-only runs).
    The node tree is produced by pylatexenc's ``LatexWalker``; for environments
    the begin/end markup is sliced from the source using node positions and
    lengths.
    """
    def __init__(self, placeholder_commands: "list | None" = None,
                 placeholder_envs: "list | None" = None,
                 placeholders_with_text: "list | None" = None):
        """
        Args:
            placeholder_commands: Extra macro names (without backslash) that are
                kept verbatim together with their arguments.
            placeholder_envs: Extra environment names kept verbatim as a whole.
            placeholders_with_text: Extra math-mode macro names whose arguments
                contain translatable text (handled like ``\\text``).
        """
        # Configuration attributes
        self.placeholder_commands = {'ref', 'cite', 'label', 'includegraphics', 'input', 'include', 'frac', 'sqrt', 'path', 'url', 'href', 'footnote', '\\'}
        self.placeholder_envs = {'verbatim', 'Verbatim', 'lstlisting'}
        self.math_text_macros = {'text', 'mathrm'}

        # None (the default) and empty iterables both mean "no additions".
        # Defaults are None rather than [] to avoid shared mutable defaults.
        self.placeholder_commands.update(placeholder_commands or ())
        self.placeholder_envs.update(placeholder_envs or ())
        self.math_text_macros.update(placeholders_with_text or ())

        # State attributes (reset on every parse() call)
        self.segments = []
        self.latex_content = ""

    def parse(self, latex_content):
        """Parse ``latex_content`` and return the list of ``(kind, content)`` segments."""
        self.segments = []
        self.latex_content = latex_content
        lw = LatexWalker(latex_content)
        nodelist, _, _ = lw.get_latex_nodes()
        self._walk_text_nodes(nodelist)
        return self.segments

    def _add_placeholder(self, content):
        """Append a placeholder segment; empty content is ignored."""
        if content:
            self.segments.append(('placeholder', content))

    def _add_text(self, content):
        """
        Append a text segment. Whitespace-only content is stored as a
        placeholder so it is preserved but never offered for translation.
        """
        if content.strip():
            self.segments.append(('text', content))
        elif content:
            self.segments.append(('placeholder', content))

    def _process_chars_node(self, node):
        """Split a chars node on ``&`` so the separator itself becomes a placeholder."""
        for part in re.split(r'(&)', node.chars):
            if not part:
                continue
            if part == '&':
                self._add_placeholder(part)
            else:
                self._add_text(part)

    def _walk_text_nodes(self, nodelist):
        """Recursively processes nodes in 'text' mode."""
        if nodelist is None:
            return

        for node in nodelist:
            if node is None:
                # Defensive: an argument slot may be empty; nothing to emit.
                continue
            if node.isNodeType(LatexCharsNode):
                self._process_chars_node(node)
            elif node.isNodeType(LatexMathNode):
                # Delimiters stay as placeholders; the body is handled in math mode.
                self._add_placeholder(node.delimiters[0])
                self._walk_math_nodes(node.nodelist)
                self._add_placeholder(node.delimiters[1])
            elif node.isNodeType(LatexGroupNode):
                # Use the node's own delimiters when available so non-brace
                # groups are not rewritten as {...}; fall back to braces.
                open_delim, close_delim = getattr(node, 'delimiters', None) or ('{', '}')
                self._add_placeholder(open_delim)
                self._walk_text_nodes(node.nodelist)
                self._add_placeholder(close_delim)
            elif node.isNodeType(LatexMacroNode):
                if node.macroname in self.placeholder_commands:
                    self._add_placeholder(node.latex_verbatim())
                else:
                    self._add_placeholder(f"\\{node.macroname}{node.macro_post_space}")
                    # NOTE(review): only `nodeargs` is walked. If the parser
                    # exposes optional/star arguments elsewhere (e.g. for
                    # \item[...] or \section*), they would not be emitted here
                    # -- TODO confirm against pylatexenc.
                    if node.nodeargs:
                        self._walk_text_nodes(node.nodeargs)
            elif node.isNodeType(LatexEnvironmentNode):
                envname = node.environmentname
                if envname in self.placeholder_envs:
                    self._add_placeholder(node.latex_verbatim())
                else:
                    if not node.nodelist:
                        self._add_placeholder(node.latex_verbatim())
                        continue

                    # Everything between the environment start and its first
                    # child is the \begin{...} markup; everything after the
                    # last child up to the environment end is the \end{...}.
                    content_start_pos = node.nodelist[0].pos
                    last_node = node.nodelist[-1]
                    content_end_pos = last_node.pos + last_node.len

                    begin_placeholder = self.latex_content[node.pos:content_start_pos]
                    self._add_placeholder(begin_placeholder)

                    self._walk_text_nodes(node.nodelist)

                    end_placeholder = self.latex_content[content_end_pos:(node.pos + node.len)]
                    self._add_placeholder(end_placeholder)
            else:
                # Any other node type is kept verbatim.
                self._add_placeholder(node.latex_verbatim())

    def _walk_math_nodes(self, nodelist):
        """Recursively processes nodes in 'math' mode."""
        if nodelist is None:
            return
        for node in nodelist:
            if node is None:
                continue
            if node.isNodeType(LatexMacroNode) and node.macroname in self.math_text_macros:
                # Keep the whitespace that followed the macro name, exactly as
                # the text-mode walker does, so the source round-trips.
                self._add_placeholder(f"\\{node.macroname}{node.macro_post_space}")
                if node.nodeargs:
                    # Arguments of \text-like macros are prose: back to text mode.
                    self._walk_text_nodes(node.nodeargs)
            else:
                self._add_placeholder(node.latex_verbatim())

def parse_latex(latex_content):
    """Convenience wrapper: segment ``latex_content`` with a default-configured LatexParser."""
    return LatexParser().parse(latex_content)

In [15]:
text_contents = r'''
\begin{theorem}.
    \begin{enumerate}
        \item Soit $U_i$,  $i \in I$ une collection d'ouverts. Alors,  $\cup_{i \in I} \,U_i$ est ouvert.\\
            Translate: Une union quelconque des ensembles ouverts est ouvert.
        \item Si $U_1, \ldots, U_n$ sont ouverts
            \[
                \bigcap\limits_{i=1}^{n} \, U_i \text{ est ouvert.}
            \] 
            Translate: intersection \underline{finie} des ensembles ouverts est ouvert.
    \end{enumerate}
    \begin{enumerate}
        \item Soit $U_i$,  $i \in I$ une collection de fermés. Alors,  $\cup_{i \in I} \,U_i$ est fermé.\\
            Translate: Une union quelconque des ensembles fermés est fermé.
        \item Si $U_1, \ldots, U_n$ sont fermés 
            \[
                \bigcap\limits_{i=1}^{n} \, U_i \text{ est fermé.}
            \] 
            Translate: intersection \underline{finie} des ensembles fermés est fermé.
    \end{enumerate}
\end{theorem}
'''

In [16]:
parsed_res = parse_latex(text_contents)

In [17]:
# Dump every parsed segment as "<kind> | <repr of content>" for inspection.
for seg_kind, seg_content in parsed_res:
    print(seg_kind, "|", repr(seg_content))
    print("----")

placeholder | '\n'
----
placeholder | '\\begin{theorem}'
----
text | '.\n    '
----
placeholder | '\\begin{enumerate}'
----
placeholder | '\n        '
----
placeholder | '\\item '
----
text | 'Soit '
----
placeholder | '$'
----
placeholder | 'U_i'
----
placeholder | '$'
----
text | ',  '
----
placeholder | '$'
----
placeholder | 'i '
----
placeholder | '\\in '
----
placeholder | 'I'
----
placeholder | '$'
----
text | " une collection d'ouverts. Alors,  "
----
placeholder | '$'
----
placeholder | '\\cup'
----
placeholder | '_'
----
placeholder | '{i \\in I}'
----
placeholder | ' '
----
placeholder | '\\,'
----
placeholder | 'U_i'
----
placeholder | '$'
----
text | ' est ouvert.'
----
placeholder | '\\\\'
----
text | '\n            Translate: Une union quelconque des ensembles ouverts est ouvert.\n        '
----
placeholder | '\\item '
----
text | 'Si '
----
placeholder | '$'
----
placeholder | 'U_1, '
----
placeholder | '\\ldots'
----
placeholder | ', U_n'
----
placeholder | '$'
----
te

In [18]:
def segment_for_translation(segments, output_dir, save_placeholders: bool = False):
    """
    Converts parsed segments into a flat XML structure with <TEXT> and <PH> tags.

    Each non-blank ``'text'`` segment becomes ``<TEXT id="text_N">``; every other
    segment becomes ``<PH id="ph_N" original="...">`` and is recorded in the
    placeholder dictionary.

    Args:
        segments: Iterable of ``(segment_type, content)`` pairs.
        output_dir: Directory for ``placeholders.json``. Only used when
            ``save_placeholders`` is true.
        save_placeholders: When true, write the placeholder dictionary to
            ``output_dir / 'placeholders.json'`` (the directory is created if
            needed). Defaults to False, which performs no file I/O.

    Returns:
        tuple[str, dict]: The XML string and the placeholder dictionary.
    """
    root = ET.Element('document')
    placeholders = {}
    text_id = 1
    ph_id = 1

    for segment_type, content in segments:
        if segment_type == 'text' and content.strip():
            # Only create TEXT tags for non-empty, non-whitespace strings
            text_elem = ET.SubElement(root, 'TEXT', id=f'text_{text_id}')
            text_elem.text = content
            text_id += 1
        else:
            # Everything else is a placeholder
            current_ph_id = f'ph_{ph_id}'
            ET.SubElement(root, 'PH', id=current_ph_id, original=content)
            placeholders[current_ph_id] = content
            ph_id += 1

    xml_string = ET.tostring(root, encoding='unicode', short_empty_elements=False)

    if save_placeholders:
        # Save placeholders for reconstruction
        target_dir = Path(output_dir)
        target_dir.mkdir(parents=True, exist_ok=True)
        with open(target_dir / 'placeholders.json', 'w', encoding='utf-8') as f:
            json.dump(placeholders, f, indent=2, ensure_ascii=False)

    return xml_string, placeholders

In [19]:
def create_translation_xml(segments, output_dir: Path, save_placeholders: bool = False):
    """
    Converts parsed segments into a single <TEXT> tag containing mixed content
    (text and <PH> tags), which is ideal for translation.

    - Merges consecutive non-text segments into single <PH> tags.
    - Creates one top-level <TEXT> tag.
    - Places text nodes and <PH> elements inside the <TEXT> tag.
    - Optionally saves a mapping of placeholder IDs to their original content.

    Args:
        segments: Iterable of ``(segment_type, content)`` pairs, where
            ``segment_type`` is ``'text'`` or anything else (treated as a
            placeholder).
        output_dir: Directory that is created if missing. It receives
            ``placeholders.json`` only when ``save_placeholders`` is true.
        save_placeholders: When true, write the placeholder dictionary to
            ``output_dir / 'placeholders.json'``. Defaults to False.

    Returns:
        tuple[str, dict]: A tuple containing the XML string and the placeholder dictionary.
    """
    # -- Step 1: Coalesce consecutive placeholders --
    # groupby() yields runs of same-typed segments. Text runs are kept item by
    # item; placeholder runs are joined into one string.
    merged_segments = []
    for seg_type, group in groupby(segments, key=lambda seg: seg[0]):
        content_parts = [item[1] for item in group]
        if seg_type == 'text':
            merged_segments.extend(('text', content) for content in content_parts)
        else:
            merged_content = "".join(content_parts)
            if merged_content:  # Only add if there's content
                merged_segments.append(('placeholder', merged_content))

    # -- Step 2: Build the Mixed-Content XML --
    root = ET.Element('document')
    # All content will go inside a single <TEXT> tag
    text_container = ET.SubElement(root, 'TEXT')

    placeholders = {}
    ph_id = 1
    last_element = None  # Most recently added <PH>; text after it goes into its .tail

    for seg_type, content in merged_segments:
        if seg_type == 'text':
            if last_element is not None:
                # If text follows a <PH> tag, it becomes the .tail of that tag.
                last_element.tail = (last_element.tail or '') + content
            else:
                # Text before the first <PH> becomes the .text of the container.
                text_container.text = (text_container.text or '') + content
        else:
            current_ph_id = str(ph_id)
            last_element = ET.SubElement(text_container, 'PH', id=current_ph_id, original=content)
            placeholders[current_ph_id] = content
            ph_id += 1

    # -- Step 3: Finalize and (optionally) Save --
    # short_empty_elements=True renders <PH ... /> as self-closing tags.
    xml_string = ET.tostring(root, encoding='unicode', method='xml', short_empty_elements=True)

    # Ensure the output directory exists. This is done unconditionally to keep
    # the function's historical side effect, even when nothing is written.
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    if save_placeholders:
        # Save placeholders for reconstruction
        with open(target_dir / 'placeholders.json', 'w', encoding='utf-8') as f:
            json.dump(placeholders, f, indent=2, ensure_ascii=False)

    return xml_string, placeholders

In [20]:
create_translation_xml(parse_latex(r'''
\section{Introduction}
Soit $A \subset E$ borné si $\exists R > 0$ tel que $A \subset B(x, R)$.
'''), Path(""))

('<document><TEXT><PH id="1" original="&#10;\\section{" />Introduction<PH id="2" original="}" />\nSoit <PH id="3" original="$A \\subset E$" /> borné si <PH id="4" original="$\\exists R &gt; 0$" /> tel que <PH id="5" original="$A \\subset B(x, R)$" />.\n</TEXT></document>',
 {'1': '\n\\section{',
  '2': '}',
  '3': '$A \\subset E$',
  '4': '$\\exists R > 0$',
  '5': '$A \\subset B(x, R)$'})

In [22]:
xml_doc, db = create_translation_xml(parsed_res, Path(""))

In [23]:
xml_doc

'<document><TEXT><PH id="1" original="&#10;\\begin{theorem}" />.\n    <PH id="2" original="\\begin{enumerate}&#10;        \\item " />Soit <PH id="3" original="$U_i$" />,  <PH id="4" original="$i \\in I$" /> une collection d\'ouverts. Alors,  <PH id="5" original="$\\cup_{i \\in I} \\,U_i$" /> est ouvert.<PH id="6" original="\\\\" />\n            Translate: Une union quelconque des ensembles ouverts est ouvert.\n        <PH id="7" original="\\item " />Si <PH id="8" original="$U_1, \\ldots, U_n$" /> sont ouverts\n            <PH id="9" original="\\[&#10;                \\bigcap\\limits_{i=1}^{n} \\, U_i \\text{" /> est ouvert.<PH id="10" original="}&#10;            \\]" /> \n            Translate: intersection <PH id="11" original="\\underline{" />finie<PH id="12" original="}" /> des ensembles ouverts est ouvert.\n    <PH id="13" original="\\end{enumerate}&#10;    \\begin{enumerate}&#10;        \\item " />Soit <PH id="14" original="$U_i$" />,  <PH id="15" original="$i \\in I$" /> une coll

In [24]:
# Show the original content of every placeholder, in insertion order.
for original_content in db.values():
    print(repr(original_content))

'\n\\begin{theorem}'
'\\begin{enumerate}\n        \\item '
'$U_i$'
'$i \\in I$'
'$\\cup_{i \\in I} \\,U_i$'
'\\\\'
'\\item '
'$U_1, \\ldots, U_n$'
'\\[\n                \\bigcap\\limits_{i=1}^{n} \\, U_i \\text{'
'}\n            \\]'
'\\underline{'
'}'
'\\end{enumerate}\n    \\begin{enumerate}\n        \\item '
'$U_i$'
'$i \\in I$'
'$\\cup_{i \\in I} \\,U_i$'
'\\\\'
'\\item '
'$U_1, \\ldots, U_n$'
'\\[\n                \\bigcap\\limits_{i=1}^{n} \\, U_i \\text{'
'}\n            \\]'
'\\underline{'
'}'
'\\end{enumerate}\n\\end{theorem}\n'


## Reconstruction

In [25]:
def reconstruct_from_xml(translated_xml: str, placeholders: dict) -> str:
    """
    Rebuilds the source document from translated XML that uses <TEXT> tags
    with mixed content (text nodes and <PH> elements).

    The .text of each <TEXT> tag and the .tail of each child element are
    emitted in document order; every <PH> is replaced by the content stored
    for its ``id`` in ``placeholders``.

    Args:
        translated_xml (str): The XML string from the translation process.
            Expected to be ``<document><TEXT>...</TEXT></document>``; a bare
            ``<TEXT>...</TEXT>`` root is accepted as well, and several <TEXT>
            children of the root are concatenated in order.
        placeholders (dict): The dictionary mapping placeholder IDs to their
            original, non-translatable content.

    Returns:
        str: The reconstructed document with translated text and original
        placeholders, or an empty string if no <TEXT> tag is found.

    Raises:
        xml.etree.ElementTree.ParseError: If ``translated_xml`` is not
            well-formed XML (the error is logged, then re-raised).
    """
    try:
        root = ET.fromstring(translated_xml)
    except ET.ParseError as e:
        logging.error(f"Failed to parse translated XML: {e}")
        logging.error(f"XML Content that failed:\n{translated_xml}")
        raise

    # Locate the <TEXT> container(s). Translators sometimes return the <TEXT>
    # block without the <document> wrapper, so accept it as the root too.
    if root.tag == 'TEXT':
        text_containers = [root]
    else:
        text_containers = root.findall('TEXT')
    if not text_containers:
        logging.warning("Could not find a <TEXT> tag in the provided XML. Returning an empty string.")
        return ""

    reconstructed_parts = []
    seen_ids = set()

    for text_container in text_containers:
        # 1. Text before the very first <PH> child element.
        if text_container.text:
            reconstructed_parts.append(text_container.text)

        # 2. Child elements (<PH> tags), each followed by its tail text.
        for element in text_container:
            if element.tag == 'PH':
                ph_id = element.get('id')
                original_content = placeholders.get(ph_id)
                if original_content is not None:
                    reconstructed_parts.append(original_content)
                    seen_ids.add(ph_id)
                else:
                    # The id is not one we issued, e.g. the translator altered
                    # or invented it.
                    logging.warning(f"Placeholder ID '{ph_id}' found in XML but not in the map. It will be skipped.")
            else:
                logging.warning(f"Unexpected tag <{element.tag}> found inside <TEXT>. It will be ignored.")

            # The "tail" is the text that immediately follows the element.
            if element.tail:
                reconstructed_parts.append(element.tail)

    # Placeholders that never appeared were dropped by the translator; the
    # output is then missing markup, so make that visible.
    missing_ids = [ph_id for ph_id in placeholders if ph_id not in seen_ids]
    if missing_ids:
        logging.warning(f"Placeholder IDs missing from the translated XML: {missing_ids}")

    return "".join(reconstructed_parts)

In [26]:
str(reconstruct_from_xml(xml_doc, db))

"\n\\begin{theorem}.\n    \\begin{enumerate}\n        \\item Soit $U_i$,  $i \\in I$ une collection d'ouverts. Alors,  $\\cup_{i \\in I} \\,U_i$ est ouvert.\\\\\n            Translate: Une union quelconque des ensembles ouverts est ouvert.\n        \\item Si $U_1, \\ldots, U_n$ sont ouverts\n            \\[\n                \\bigcap\\limits_{i=1}^{n} \\, U_i \\text{ est ouvert.}\n            \\] \n            Translate: intersection \\underline{finie} des ensembles ouverts est ouvert.\n    \\end{enumerate}\n    \\begin{enumerate}\n        \\item Soit $U_i$,  $i \\in I$ une collection de fermés. Alors,  $\\cup_{i \\in I} \\,U_i$ est fermé.\\\\\n            Translate: Une union quelconque des ensembles fermés est fermé.\n        \\item Si $U_1, \\ldots, U_n$ sont fermés \n            \\[\n                \\bigcap\\limits_{i=1}^{n} \\, U_i \\text{ est fermé.}\n            \\] \n            Translate: intersection \\underline{finie} des ensembles fermés est fermé.\n    \\end{enumerate}\n\\

In [165]:
# Round-trip check: rebuild the untranslated document and write it to disk.
with open("output.tex", "w") as f:
    f.write(reconstruct_from_xml(xml_doc, db))

## Translation

In [27]:
def replace_tags(text: str, original: str, tagged: str, tgt_lang: str) -> str:
    """
    Fill the prompt template tokens [ORIGINAL], [TAGGED] and the target-language token.

    The target-language token is accepted both as ``[TARGET_LANGUAGE]`` and in
    the historical misspelling ``[TAGET_LANGUAGE]`` used by existing templates.
    All tokens are substituted in a single pass, so a token that happens to
    occur inside ``original`` or ``tagged`` is left untouched.

    Args:
        text (str): The template containing the tokens.
        original (str): The value to replace [ORIGINAL].
        tagged (str): The value to replace [TAGGED].
        tgt_lang (str): The value to replace the target-language token.

    Returns:
        str: The text with the tokens replaced.
    """
    substitutions = {
        "[ORIGINAL]": original,
        "[TAGGED]": tagged,
        "[TAGET_LANGUAGE]": tgt_lang,
        "[TARGET_LANGUAGE]": tgt_lang,
    }
    token_pattern = "|".join(re.escape(token) for token in substitutions)
    # A function replacement is used so backslashes in LaTeX values are
    # inserted literally instead of being read as regex escapes.
    return re.sub(token_pattern, lambda match: substitutions[match.group(0)], text)

In [28]:
context_priming_prompt = r'''
Below is a scientific passage in its original language.
Use it to understand the full meaning and grammar.

Then you will see the same content with tags (<TEXT>, <PH>) separating structure and translatable units.

Translate the <TEXT> elements based on the full context (original source) into [TAGET_LANGUAGE], while keeping all <PH> tags untouched.

- Do NOT remove or modify any XML tags.

---

Original source:
[ORIGINAL]

Structured version:
[TAGGED]

Return the translated structured version below (without any additional text, only the translated tagged version) (same tags, translated <TEXT> content only):
'''

In [99]:
prompt = r'''
You are tasked with translating scientific text from [SOURCE_LANGUAGE] to [TARGET_LANGUAGE] using a structured XML format.

The document is composed of <TEXT> elements that contain the full translatable content (sentences or paragraphs), interleaved with <PH> tags for non-translatable content such as LaTeX commands, math expressions, or code.
Instructions:
    - Translate only the content inside <TEXT> tags, excluding anything inside <PH> tags.
    - Do not remove, modify any <PH/> tags or their attributes.
    - Use the original attribute of each <PH/> tag to understand the context and grammar. This will help you make correct translation decisions (e.g., for plurality, case, or syntax), but you must not change or translate the contents of the <PH> tags themselves.
    - Treat each <TEXT> block as a complete sentence or paragraph. You may reorder words, adjust structure, and apply natural grammar in the target language — as long as all <PH> tags remain in place and unchanged.
    - Your response must contain only the translated XML — return the modified <TEXT> block with embedded <PH> tags and nothing else (no explanations, no markdown, no prefix/suffix text).
    - All <PH> tags must be self-closing and written in the form: 
        <PH id="..." original="..."/>
    - Do not produce </PH> closing tags, and do not place content inside <PH> elements. Any other structure is invalid and will break XML parsing.
Output Format:
<document>
<TEXT>
  ...translated text and inline <PH id="..." original="..."/> tags...
</TEXT>
</document>

The document is provided below:
'''

In [65]:
orig_contents = r'''
\begin{theorem}.
    \begin{enumerate}
        \item Soit $U_i$,  $i \in I$ une collection d'ouverts. Alors,  $\cup_{i \in I} \,U_i$ est ouvert.\\
            Translate: Une union quelconque des ensembles ouverts est ouvert.
        \item Si $U_1, \ldots, U_n$ sont ouverts
            \[
                \bigcap\limits_{i=1}^{n} \, U_i \text{ est ouvert.}
            \] 
            Translate: intersection \underline{finie} des ensembles ouverts est ouvert.
    \end{enumerate}
    \begin{enumerate}
        \item Soit $U_i$,  $i \in I$ une collection de fermés. Alors,  $\cup_{i \in I} \,U_i$ est fermé.\\
            Translate: Une union quelconque des ensembles fermés est fermé.
        \item Si $U_1, \ldots, U_n$ sont fermés 
            \[
                \bigcap\limits_{i=1}^{n} \, U_i \text{ est fermé.}
            \] 
            Translate: intersection \underline{finie} des ensembles fermés est fermé.
    \end{enumerate}
\end{theorem}
'''

In [66]:
parsed_res = parse_latex(orig_contents)

In [67]:
xml_doc, db = create_translation_xml(parsed_res, Path(""))

In [68]:
full_question = replace_tags(context_priming_prompt, orig_contents, xml_doc, "Ukrainian").replace("[SOURCE_LANGUAGE]", "French")

In [38]:
xml_doc

'\nBelow is a scientific passage in its original language.\nUse it to understand the full meaning and grammar.\n\nThen you will see the same content with tags (<TEXT>, <PH>) separating structure and translatable units.\n\nTranslate the <TEXT> elements based on the full context (original source) into Ukrainian, while keeping all <PH> tags untouched.\n\n- Do NOT remove or modify any XML tags.\n\n---\n\nOriginal source:\n\n\\begin{theorem}.\n    \\begin{enumerate}\n        \\item Soit $U_i$,  $i \\in I$ une collection d\'ouverts. Alors,  $\\cup_{i \\in I} \\,U_i$ est ouvert.\\\\\n            Translate: Une union quelconque des ensembles ouverts est ouvert.\n        \\item Si $U_1, \\ldots, U_n$ sont ouverts\n            \\[\n                \\bigcap\\limits_{i=1}^{n} \\, U_i \\text{ est ouvert.}\n            \\] \n            Translate: intersection \\underline{finie} des ensembles ouverts est ouvert.\n    \\end{enumerate}\n    \\begin{enumerate}\n        \\item Soit $U_i$,  $i \\in I$ un

In [256]:
answer2 = await _ask_aristote(full_question)

In [109]:
answer2 = await _ask_gemma_model(full_question)

In [260]:
translated_tex = reconstruct_from_xml(answer2, db)

In [261]:
with open("output.tex", "w") as f:
    f.write(translated_tex)

## Without context

In [100]:
parsed_res = parse_latex(orig_contents)

In [101]:
xml_doc, db = create_translation_xml(parsed_res, Path(""))

In [102]:
xml_doc

'<document><TEXT><PH id="1" original="&#10;\\begin{theorem}" />.\n    <PH id="2" original="\\begin{enumerate}&#10;        \\item " />Soit <PH id="3" original="$U_i$" />,  <PH id="4" original="$i \\in I$" /> une collection d\'ouverts. Alors,  <PH id="5" original="$\\cup_{i \\in I} \\,U_i$" /> est ouvert.<PH id="6" original="\\\\" />\n            Translate: Une union quelconque des ensembles ouverts est ouvert.\n        <PH id="7" original="\\item " />Si <PH id="8" original="$U_1, \\ldots, U_n$" /> sont ouverts\n            <PH id="9" original="\\[&#10;                \\bigcap\\limits_{i=1}^{n} \\, U_i \\text{" /> est ouvert.<PH id="10" original="}&#10;            \\]" /> \n            Translate: intersection <PH id="11" original="\\underline{" />finie<PH id="12" original="}" /> des ensembles ouverts est ouvert.\n    <PH id="13" original="\\end{enumerate}&#10;    \\begin{enumerate}&#10;        \\item " />Soit <PH id="14" original="$U_i$" />,  <PH id="15" original="$i \\in I$" /> une coll

In [127]:
full_question_without_context = prompt.replace("[TARGET_LANGUAGE]", "English").replace("[SOURCE_LANGUAGE]", "French") + xml_doc

In [128]:
full_question_without_context

'\nYou are tasked with translating scientific text from French to English using a structured XML format.\n\nThe document is composed of <TEXT> elements that contain the full translatable content (sentences or paragraphs), interleaved with <PH> tags for non-translatable content such as LaTeX commands, math expressions, or code.\nInstructions:\n    - Translate only the content inside <TEXT> tags, excluding anything inside <PH> tags.\n    - Do not remove, modify any <PH/> tags or their attributes.\n    - Use the original attribute of each <PH/> tag to understand the context and grammar. This will help you make correct translation decisions (e.g., for plurality, case, or syntax), but you must not change or translate the contents of the <PH> tags themselves.\n    - Treat each <TEXT> block as a complete sentence or paragraph. You may reorder words, adjust structure, and apply natural grammar in the target language — as long as all <PH> tags remain in place and unchanged.\n    - Your response

In [130]:
answer2_without_context = await _ask_aristote(full_question_without_context)

In [110]:
answer2_without_context = await _ask_gemma_model(full_question_without_context)


In [373]:
# Drop an optional Markdown fence (```xml ... ```) around the reply. Unlike a
# fixed [6:-3] slice this leaves an unfenced reply intact and is safe to re-run.
answer2_without_context = answer2_without_context.strip().removeprefix("```xml").removesuffix("```")

In [367]:
answer2_without_context = answer2_without_context.replace("Here is the translated XML:\n\n\n", "")

In [131]:
answer2_without_context

'<document>\n<TEXT>\n    <PH id="1" original="&#10;\\begin{theorem}" />.\n    <PH id="2" original="\\begin{enumerate}&#10;        \\item " />Let <PH id="3" original="$U_i$" />,  <PH id="4" original="$i \\in I$" /> be a collection of open sets. Then,  <PH id="5" original="$\\cup_{i \\in I} \\,U_i$" /> is open.<PH id="6" original="\\\\" />\n    Any union of open sets is open.\n    <PH id="7" original="\\item " />If <PH id="8" original="$U_1, \\ldots, U_n$" /> are open\n    <PH id="9" original="\\[&#10;                \\bigcap\\limits_{i=1}^{n} \\, U_i \\text{" /> is open.<PH id="10" original="}&#10;            \\]" /> \n    The <PH id="11" original="\\underline{" />finite<PH id="12" original="}" /> intersection of open sets is open.\n    <PH id="13" original="\\end{enumerate}&#10;    \\begin{enumerate}&#10;        \\item " />Let <PH id="14" original="$U_i$" />,  <PH id="15" original="$i \\in I$" /> be a collection of closed sets. Then,  <PH id="16" original="$\\cup_{i \\in I} \\,U_i$" />

In [132]:
translated_tex_without_context = reconstruct_from_xml(answer2_without_context, db)

In [133]:
with open("output_without_context.tex", "w") as f:
    f.write(translated_tex_without_context)

In [114]:
answer_gemini_without_context = await _ask_gemini_model(full_question_without_context, "gemini-2.0-flash")

In [117]:
# Drop an optional Markdown fence (```xml ... ```) around the reply. Unlike a
# fixed [6:-3] slice this leaves an unfenced reply intact and is safe to re-run.
answer_gemini_without_context = answer_gemini_without_context.strip().removeprefix("```xml").removesuffix("```")

In [118]:
answer_gemini_without_context

'\n<document><TEXT><PH id="1" original="&#10;\\begin{theorem}" />.\n    <PH id="2" original="\\begin{enumerate}&#10;        \\item " />Нехай <PH id="3" original="$U_i$" />,  <PH id="4" original="$i \\in I$" /> — це колекція відкритих множин. Тоді,  <PH id="5" original="$\\cup_{i \\in I} \\,U_i$" /> є відкритою множиною.<PH id="6" original="\\\\" />\n            Будь-яке об\'єднання відкритих множин є відкритою множиною.\n        <PH id="7" original="\\item " />Якщо <PH id="8" original="$U_1, \\ldots, U_n$" /> — відкриті множини, то\n            <PH id="9" original="\\[&#10;                \\bigcap\\limits_{i=1}^{n} \\, U_i \\text{" /> є відкритою множиною.<PH id="10" original="}&#10;            \\]" /> \n            Перетин <PH id="11" original="\\underline{" />скінченної<PH id="12" original="}" /> кількості відкритих множин є відкритою множиною.\n    <PH id="13" original="\\end{enumerate}&#10;    \\begin{enumerate}&#10;        \\item " />Нехай <PH id="14" original="$U_i$" />,  <PH id=

In [119]:
translated_tex_without_context_gemini = reconstruct_from_xml(answer_gemini_without_context, db)

In [120]:
with open("output_without_context_gemini.tex", "w") as f:
    f.write(translated_tex_without_context_gemini)