In [35]:
import re
import numpy as np
from pathlib import Path
from warnings import warn
from collections import defaultdict

In [189]:
import glob
# Locate every catalogue that follows the ./<version>/<language>/LC_MESSAGES/main.po layout
pos = glob.glob("./*/*/LC_MESSAGES/main.po")
# Derive the "<version>-<language>" names from the paths. Raw strings keep the
# regex escapes (\., \w, \g) out of Python's own string-escape processing.
versions = [re.sub(r"\./(\w+)/(\w+)/.*", r"\g<1>-\g<2>", x) for x in pos]

In [504]:
def _get_header_and_version(po):
    # Extract the header
    header_end = np.where(np.array([bool(re.match("# TEXT VERSIONS\n", x)) for x in po]))[0][0]
    header = po[0:header_end]
    po = po[header_end:]

    # Extract the versions
    version_end = np.where(np.invert([bool(re.match("^#", x)) for x in po]))[0][0]
    versions = [x.removeprefix("# ").removesuffix("\n") for x in po[1:version_end]]
    po = po[version_end:]
    return header, versions, po

def split_allversion_po(file):
    """Split an all-version file into one list of PO lines per text version.

    The file is read, its header and version names are taken from
    _get_header_and_version, and for every version the body is filtered:
    "<version> msgstr" at the start of a line is rewritten to "msgstr", and
    lines holding the msgstr of another version -- together with the lines
    that continue them -- are dropped.

    Parameters:
        file: path of the all-version file.

    Returns:
        dict mapping each version name to its lines (header lines first).

    Warns:
        If some versions had fewer "<version> msgstr" lines than the version
        with the most.
    """
    with open(file, "r") as f:
        po = f.readlines()

    # Extract header and versions
    header, versions, po = _get_header_and_version(po)
    n_lines = len(po)

    # A line is continued on the next one when it ends in a backslash or in an
    # escaped newline followed by the closing quote. Rewriting the msgstr
    # prefix never touches the end of a line, so this is computed only once.
    break_pattern = re.compile(r"""\\\s*\n$|\\n\s*['"]$""")
    breaks = [bool(break_pattern.search(line)) for line in po]

    # Split the different version texts
    texts = {}
    ns = {}
    for version in versions:
        # Escape the name so regex metacharacters in it are matched literally
        name = re.escape(version)
        own_msgstr = re.compile(rf"^{name}\s+msgstr")
        other_msgstr = re.compile(rf"(?<!{name})\s+msgstr")

        ns[version] = sum(bool(own_msgstr.search(line)) for line in po)
        lines = [own_msgstr.sub("msgstr", line) for line in po]

        # Get the lines that should be deleted
        # Lines may be broken and extended by "\"
        others = [bool(other_msgstr.search(line)) for line in lines]
        extended = [i for i in range(n_lines) if others[i] and breaks[i]]

        for i in extended:
            for j in range(i, n_lines):
                if not breaks[j]:
                    break
                # A continued line at the very end of the file has no successor
                if j + 1 < n_lines:
                    others[j + 1] = True

        texts[version] = header + [line for line, drop in zip(lines, others) if not drop]

    ns_val = np.array(list(ns.values()))
    # ns_val is empty when the file lists no versions; max() would fail on it
    if ns_val.size and np.any(ns_val < ns_val.max()):
        warn(f"fewer replacements in version(s) {np.array(list(ns.keys()))[ns_val < ns_val.max()]}")

    return texts

def save_split_po(pos, locales_dir="."):
    """Write each version's lines to <locales_dir>/<version>/<language>/LC_MESSAGES/main.po.

    Parameters:
        pos: mapping of "<version>-<language>" names to the lines to write.
        locales_dir: directory below which the catalogue tree is placed.

    Raises:
        ValueError: if a name does not contain exactly one "-".
    """
    for nam, text in pos.items():
        # Unpacking into two names requires exactly one hyphen in the key
        vers, lang = nam.split("-")
        path = Path(locales_dir) / vers / lang / "LC_MESSAGES" / "main.po"
        # Create the directory tree on demand so a new version or language
        # does not fail with FileNotFoundError
        path.parent.mkdir(parents=True, exist_ok=True)
        with open(path, "w") as f:
            f.writelines(text)

def _default_entry():
    return {"header": None}

def read_single_po(file, version="", entries=None, add_header=False, verbose=False):
    """Read one PO file and store its translations under ``version``.

    Every line starting with "msgid" whose quoted id is non-empty becomes a
    key of ``entries``. The line directly above it is collected in the entry's
    "header" list, and the text of the msgstr on the following line(s) is
    stored under the key ``version``. For a msgstr spread over several lines
    the raw lines are concatenated; the closing quote is removed from a line
    unless that line ends in an escaped newline. Finally ``version`` is added
    to ``entries["_versions_"]`` if it is not listed there yet.

    Parameters:
        file: path of the PO file.
        version: key under which the msgstr texts are stored.
        entries: mapping from an earlier call, to add another version to it.
            A new defaultdict is created when None. Ids are created by plain
            indexing, so a mapping passed in has to supply defaults itself.
        add_header: accepted but not used by this function.
        verbose: warn when an id is preceded by a header line that was not
            recorded for that id before.

    Returns:
        The ``entries`` mapping.

    Raises:
        ValueError: if the line after a msgid is missing or does not start
            with "msgstr".
    """
    if entries is None:
        entries = defaultdict(_default_entry)

    with open(file, "r") as f:
        po = f.readlines()
    n_lines = len(po)

    # Line ends in an escaped newline followed by the closing quote
    escaped_newline_end = re.compile(r"""\\n\s*['"]$""")
    # Line ends in a backslash (optionally followed by whitespace)
    backslash_end = re.compile(r"\\\s*\n$")
    # Closing quote plus any trailing whitespace
    closing_quote = re.compile(r"""['"][\s\n]*$""")

    for idx, line in enumerate(po):
        if not line.startswith("msgid"):
            continue

        # Get the msgid string
        nam = re.sub(r"""^msgid\s*["'](.*)["']\s*\n""", r"\g<1>", line)
        if nam in ["", "\n"]:
            continue

        # The line above the msgid serves as its header. A msgid on the very
        # first line has none; po[-1] would wrongly pick the last line.
        header_line = po[idx - 1] if idx > 0 else ""
        entry = entries[nam]
        if entry["header"] is None:
            entry["header"] = [header_line]
        elif header_line not in entry["header"]:
            if verbose:
                warn(f"multiple headers for entry {nam}")
            entry["header"] += [header_line]

        # Check if the next line exists and has the right structure
        if idx + 1 >= n_lines or not re.search("^msgstr", po[idx + 1]):
            raise ValueError(fr"the msgstr following msgid '{nam}' in line {idx} has the wrong format")

        # Remove the prefix and, if it is not multiline, the suffix
        first = po[idx + 1]
        string = re.sub(r"""^msgstr\s*["']""", "", first)
        if not escaped_newline_end.search(first):
            string = re.sub(r"""["']?\s*\n$""", "", string)
        entry[version] = string

        # Append the continuation lines for as long as the current line is continued
        for i in range(1, n_lines - idx):
            # The last line of the file has no successor to append
            if idx + i + 1 >= n_lines:
                break
            current, following = po[idx + i], po[idx + i + 1]
            if backslash_end.search(current):
                if i == 1:
                    entry[version] += "\n"
                entry[version] += closing_quote.sub("", following)
            elif escaped_newline_end.search(current):
                if not escaped_newline_end.search(following):
                    following = closing_quote.sub("", following)
                entry[version] += following
            else:
                break

    versions = entries.get("_versions_", [])
    if version not in versions:
        entries["_versions_"] = list(entries.get("_versions_", []) + [version])
    return entries

def make_allversion_po(po, versions=None):
    """Render the entries mapping as the text of an all-version file.

    The text opens with three blank lines, the "# TEXT VERSIONS" marker and
    one "# <version>" line per version. Each entry then contributes its
    header lines, its msgid line and one "<version> msgstr" line per version
    (an empty string where the entry has no text for that version), followed
    by a blank line.

    Parameters:
        po: mapping of msgid to entry dict; the key "_versions_" is skipped.
        versions: version names to emit; taken from po["_versions_"] when None.

    Returns:
        The complete text as a single string.
    """
    if versions is None:
        versions = po["_versions_"]

    # Collect the pieces and join once at the end
    chunks = ["\n\n\n# TEXT VERSIONS\n", "# ", "\n# ".join(versions), "\n\n"]
    for nam, items in po.items():
        if nam == "_versions_":
            continue
        chunks.append("".join(items["header"]))
        chunks.append(f'msgid "{nam}"\n')
        chunks.extend(f'{vers} msgstr "{items.get(vers, "")}"\n' for vers in versions)
        chunks.append("\n")
    return "".join(chunks)

def write_allversion_po(po, path = Path("allversion.po"), versions=None):
    """Render the entries with make_allversion_po and write the text to ``path``.

    Parameters:
        po: entries mapping handed on to make_allversion_po.
        path: target file, overwritten if present.
        versions: version names handed on to make_allversion_po.
    """
    # Render first, so the target is only opened once the text exists
    rendered = make_allversion_po(po, versions)
    with open(path, "w") as handle:
        handle.write(rendered)

In [503]:
# Read the catalogue under simple/en into a new entries mapping, stored under the version name "simple-en"
po = read_single_po("simple/en/LC_MESSAGES/main.po", "simple-en")
# Disabled follow-up steps: add the simple/de catalogue to the same mapping and write the combined file
# po = read_single_po("simple/de/LC_MESSAGES/main.po", "simple-de", entries=po, verbose=False)
# write_allversion_po(po)

In [476]:
# Debug check: show the tail of line 208 and what the suffix-stripping regex matches on it.
# Note that this rebinds `po` to the raw list of lines.
with open("simple/en/LC_MESSAGES/main.po", "r") as f:
    po = f.readlines()
    print(po[208][-20:])
    # Raw string so that \s reaches the regex engine instead of being an invalid string escape
    print(re.search(r"""["']?\s*\n$""", po[208]))

dies, including:\n"

<re.Match object; span=(249, 251), match='"\n'>


In [505]:
# Split the combined file; `po` now maps each version name to its list of lines
po = split_allversion_po("allversion.po")

In [508]:
# Inspect lines 230-259 of the split "simple-en" text
po["simple-en"][230:260]

['msgstr "A photograph of a plant of the wild type of the model organism **Arabidopsis thaliana**."\n',
 '\n',
 '#: pages/photosynthesis.py:87\n',
 'msgid "EXPANDER_MODEL_ORGANISM_EXPLANATION"\n',
 'msgstr "<em>Arabidopsis</em> (<em>Arabidopsis thaliana</em>) is referred to as a model organism because it is often used as a model system in plant biology. It has several characteristics that make it well suited for laboratory studies, including:\\n"\n',
 '"1. Small size and fast life cycle: <em>Arabidopsis thaliana</em> has a short life cycle that allows researchers to study multiple generations in a relatively short period of time.\\n"\n',
 '"2. Simple anatomy: The structure of the plant is simple and well-understood, making it easier for researchers to study specific developmental processes.\\n"\n',
 '"3. Genome sequencing: The complete genome of <em>Arabidopsis thaliana</em> was the first plant genome to be sequenced, which has provided researchers with a wealth of genetic information 

In [217]:
# Unpacks `po` into the positional arguments of np.logical_and.
# NOTE(review): this cell's execution count (217) is lower than that of the cells
# above that assign `po` (503, 505), so the recorded output was presumably produced
# with a different `po` -- confirm or remove.
np.logical_and(*po)

True

In [203]:
# Writes `po` to the default path "allversion.po".
# NOTE(review): make_allversion_po reads po["_versions_"], i.e. it expects the entries
# mapping built by read_single_po; in top-to-bottom order `po` is by now the result of
# split_allversion_po -- confirm which object this cell is meant to write.
write_allversion_po(po)