In [None]:
import os

os.system("tlgu -b -c utf-8 ../assets/beta_files/TLG0563.TXT ../assets/beta_files/output.txt")

In [1]:
import re

# Sample beta code text (a short segment for demonstration)
# This will be replaced with the actual content from the file in the final implementation
beta_code_sample = "*A/B(GD*E*F*G*H*I*K*L*M*N*O*P*Q*R*S*T*U*V*W*X*Y*Z"

# Mapping of Greek beta code to Unicode characters
# This is a simplified version based on the provided manual; it can be expanded for full coverage
beta_to_unicode = {
    "*A": "\u0391", "A": "\u03B1",  # Alpha
    "*B": "\u0392", "B": "\u03B2",  # Beta
    "*C": "\u039E", "C": "\u03BE",  # Xi
    "*D": "\u0394", "D": "\u03B4",  # Delta
    "*E": "\u0395", "E": "\u03B5",  # Epsilon
    "*F": "\u03A6", "F": "\u03C6",  # Phi
    "*G": "\u0393", "G": "\u03B3",  # Gamma
    "*H": "\u0397", "H": "\u03B7",  # Eta
    "*I": "\u0399", "I": "\u03B9",  # Iota
    "*K": "\u039A", "K": "\u03BA",  # Kappa
    "*L": "\u039B", "L": "\u03BB",  # Lambda
    "*M": "\u039C", "M": "\u03BC",  # Mu
    "*N": "\u039D", "N": "\u03BD",  # Nu
    "*O": "\u039F", "O": "\u03BF",  # Omicron
    "*P": "\u03A0", "P": "\u03C0",  # Pi
    "*Q": "\u0398", "Q": "\u03B8",  # Theta
    "*R": "\u03A1", "R": "\u03C1",  # Rho
    "*S": "\u03A3", "S": "\u03C3",  # Sigma
    "*T": "\u03A4", "T": "\u03C4",  # Tau
    "*U": "\u03A5", "U": "\u03C5",  # Upsilon
    "*V": "\u03DC", "V": "\u03DD",  # Digamma
    "*W": "\u03A9", "W": "\u03C9",  # Omega
    "*X": "\u03A7", "X": "\u03C7",  # Chi
    "*Y": "\u03A8", "Y": "\u03C8",  # Psi
    "*Z": "\u03A6", "Z": "\u03B6",  # Zeta
    # Add diacritics and other characters as needed
}

# Function to convert beta code to Unicode
def beta_code_to_unicode(beta_text, mapping):
    # Sort the keys by length in descending order to replace longer sequences first
    sorted_keys = sorted(mapping, key=len, reverse=True)
    for key in sorted_keys:
        beta_text = beta_text.replace(key, mapping[key])
    return beta_text

# Convert the sample beta code text to Unicode
decoded_text = beta_code_to_unicode(beta_code_sample, beta_to_unicode)
decoded_text



'Α/β(γδΕΦΓΗΙΚΛΜΝΟΠΘΡΣΤΥϜΩΧΨΦ'

In [2]:
# Expanding the script to handle citations and reserved escape codes based on the new information provided

# Append the new knowledge about citation system to the existing beta to unicode mapping
beta_to_unicode.update({
    # Assuming some example mappings for citation and escape codes (to be replaced with actual mappings)
    "~a": "<author>", "~b": "<work>", "~c": "<abbreviation>",
    "~n": "<non-hierarchical-citation>", "~v": "<hierarchical-field-v>",
    "~z": "<line-number>",  # and so on for other fields
    # Add more mappings for reserved escapes and additional characters
    "$50": "<reserved-papyri-1>", "$59": "<reserved-papyri-10>",  # and so on
    "&50": "<reserved-latin-1>", "&59": "<reserved-latin-10>",  # and so on
    # More mappings as needed
})

# Function to increment values for implicit citation changes
def increment_value(value):
    if value.isdigit():
        return str(int(value) + 1)
    else:
        # Incrementing the ASCII character
        return chr(ord(value[-1]) + 1)

# Updated function to handle citations and implicit value changes
def decode_beta_code(beta_text, mapping):
    # Split the text into lines for processing citations
    lines = beta_text.splitlines()
    decoded_lines = []
    last_citation = {}

    for line in lines:
        if line.startswith("~"):
            # Process citation line
            citation_parts = re.findall(r'~[a-z]"?([^"]*)"?', line)
            for i, part in enumerate(citation_parts):
                field = chr(ord('a') + i)  # Calculate field identifier ('a', 'b', 'c', ...)
                if part:
                    # Explicit change in citation
                    last_citation[field] = part
                else:
                    # Implicit change (increment)
                    last_citation[field] = increment_value(last_citation.get(field, "1"))
            # Construct the citation string (e.g., "<author>1<work>2<line-number>3")
            citation_string = "".join([f"<{mapping.get('~' + k, '')}>{v}" for k, v in last_citation.items()])
            decoded_lines.append(citation_string)
        else:
            # Regular text line
            decoded_line = beta_text
            for key in sorted(mapping, key=len, reverse=True):
                decoded_line = decoded_line.replace(key, mapping[key])
            decoded_lines.append(decoded_line)

    return "\n".join(decoded_lines)

# Sample text including citations (to be replaced with actual file content)
sample_text_with_citations = "~a\"0001\"~b\"001\"~c\"Arg\"\n*A/B(GD*E*F*G*H*I*K*L*M*N*O*P*Q*R*S*T*U*V*W*X*Y*Z"

# Decode the sample text
decoded_text_with_citations = decode_beta_code(sample_text_with_citations, beta_to_unicode)
decoded_text_with_citations



'<<author>>0001<<work>>001<<abbreviation>>Arg\n<author>"0001"<work>"001"<abbreviation>"αrg"\nΑ/β(γδΕΦΓΗΙΚΛΜΝΟΠΘΡΣΤΥϜΩΧΨΦ'

In [3]:
# Expanding the script to include formatting beta codes

# Update the beta to unicode mapping with formatting codes
beta_to_unicode.update({
    "^": "<quarter-space>",
    "@": "<tab>",
    "@1": "<page-end>",
    "@2": "<column-end>",
    "@3": "<omitted-graphic>",
    "@4": "<start-of-table>",
    "@5": "<end-of-table>",
    "@6": "<blank-line>",
    "@7": "<short-horizontal-rule>",
    "@8": "<mid-line-citation>",
    "@9": "<break-in-text>",
    # Add more mappings as needed
    "{": "<speaker-stage-direction>",
    "{1": "<title>",
    "{2": "<marginal-text>",
    # ... and so on for other text formatting codes
    "<": "<combining-overline>", # And other text formatting symbols
    # ... and so on for other text formatting codes
    # Quotation marks and other additional punctuation and characters
    "\"": "<left-double-quotation-mark>",
    "\"1": "<left-low-double-quotation-mark>",
    # ... and so on
})

# We'll also need to handle the brackets, but these are placeholders for now
beta_to_unicode.update({
    "[": "<left-square-bracket>",
    "]": "<right-square-bracket>",
    # ... and so on for other brackets
})

# The updated function now includes handling for formatting beta codes
def decode_beta_code_with_formatting(beta_text, mapping):
    decoded_text = beta_text
    for key in sorted(mapping, key=len, reverse=True):
        decoded_text = decoded_text.replace(key, mapping[key])
    return decoded_text

# Sample text including formatting codes (to be replaced with actual file content)
sample_text_with_formatting = "@1*A/B(GD*E*F*G@4H*I*K*L*M@5N*O*P*Q*R*S*T*U*V*W*X*Y*Z@1"

# Decode the sample text
decoded_text_with_formatting = decode_beta_code_with_formatting(sample_text_with_formatting, beta_to_unicode)
decoded_text_with_formatting



'<combining-overline>page-end>Α/β(γδΕΦΓ<combining-overline>start-of-table>ηΙΚΛΜ<combining-overline>end-of-table>νΟΠΘΡΣΤΥϜΩΧΨΦ<combining-overline>page-end>'

In [None]:
decode_beta_code_with_formatting(../assets/beta_files/TLG0563.TXT ../assets/beta_files/output.txt)

In [19]:
import chardet

def process_file(input_file_path, output_file_path, mapping):
    # Detect the encoding of the file
    with open('../assets/beta_files/TLG0563.TXT', 'rb') as file:
        result = chardet.detect(file.read())

    # Open the file with the detected encoding
    with open('../assets/beta_files/TLG0563.TXT', 'r', encoding=result['encoding']) as file:
        lines = file.read()

    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        for line in lines:
            # Apply the conversion to each line
            decoded_line = decode_beta_code_with_formatting(line, mapping)
            output_file.write(decoded_line + '\n')

In [20]:
# Define the file paths
input_file_path = '../assets/beta_files/TLG0563.TXT'  # Replace with the path to your file
output_file_path = '../assets/beta_files/output.txt'  # Replace with the path to your output file

# Process the entire file
process_file(input_file_path, output_file_path, beta_to_unicode)


In [13]:
import chardet

# Detect the encoding of the file
with open('../assets/beta_files/TLG0563.TXT', 'rb') as file:
    result = chardet.detect(file.read())

# Open the file with the detected encoding
with open('../assets/beta_files/TLG0563.TXT', 'r', encoding=result['encoding']) as file:
    data = file.read()
    
# Convert Beta Code to Unicode
unicode_text = beta_to_uni(beta_code)

print(unicode_text)

NameError: name 'beta_to_uni' is not defined

In [12]:
print(result)

{'encoding': 'MacRoman', 'confidence': 0.643100264793705, 'language': ''}
