In [153]:
!pip install python-tmx
import re
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [154]:
import PythonTmx as tmx
from datetime import UTC, datetime
import lxml.etree as etree

def read_tmx(filepath, target_lang="ko-KR", limit = None):
    tree = etree.parse(filepath, etree.XMLParser(encoding="utf-8"))
    root = tree.getroot()

    tmx_obj = tmx.from_element(root)

    source, target = [], []
    count = 0

    for tu in tmx_obj.tus:
        if limit and count >= limit:
          break

        s, t = None, None
        for tuv in tu.tuvs:
            lang = tuv.lang.lower()
            text = "".join([str(c) for c in tuv.content if isinstance(c, str)])
            if lang.startswith("en"):
                s = text
            elif lang.startswith(target_lang.lower()):
                t = text
        if s and t:
            source.append(s)
            target.append(t)
            count+=1

    return source, target

file1 = "/content/drive/MyDrive/Korean/14 320 - Korean - FamilyHistory - Recent.tmx"
source1, target1 = read_tmx(file1, limit=100000)



In [155]:
file2 = "/content/drive/MyDrive/Korean/14 320 - Korean - Legacy - 1.tmx"
source2, target2 = read_tmx(file2, limit=100000)

In [156]:
file2 = "/content/drive/MyDrive/Korean/14 320 - Korean - Recent.tmx"
source3, target3 = read_tmx(file2, limit=100000)
full_source = source1 + source2 + source3
full_target = target1 + target2 + target3

In [157]:
#step 1
def remove_extra_cr_lf(sources, targets):
  cleaned_sources = []
  cleaned_targets = []

  for source, target in zip(sources, targets):
    source = re.sub(r'[\r\n]+', ' ', source).strip()
    target = re.sub(r'[\r\n]+', ' ', target).strip()

    if source and target:
      cleaned_sources.append(source)
      cleaned_targets.append(target)

  return cleaned_sources, cleaned_targets


In [158]:
def remove_control_characters_and_normalize_whitespace(sources, targets):
  cleaned_sources = []
  cleaned_targets = []

  for source, target in zip(sources, targets):
    source = re.sub(r'\s+', ' ', source).strip()
    target = re.sub(r'\s+', ' ', target).strip()

    if source and target:
      cleaned_sources.append(source)
      cleaned_targets.append(target)

  return cleaned_sources, cleaned_targets

In [159]:
#step 2
def remove_empty_segments(sources, targets):
  filtered_sources = []
  filtered_targets = []

  for source, target in zip(sources, targets):
      if source and target and source.strip() and target.strip():
          filtered_sources.append(source)
          filtered_targets.append(target)

  return filtered_sources, filtered_targets

In [160]:
#step 3
import html
def normalize_escaped_entities_list(sources, targets):
  def normalize_escaped_entities(text):
    if not text:
        return text

    text = re.sub(r"&(\d+);", r"&#\1;", text)

    text = re.sub(r"&\\#", "&#", text)

    text = html.unescape(text)

    helper = r"(?:^|\W)(&#?[A-Za-z0-9]+;?|&#?[a-zA-Z0-9]+)"

    text = re.sub(helper, "", text)

    return html.unescape(text)

  return [normalize_escaped_entities(s) for s in sources], [normalize_escaped_entities(t) for t in targets]

In [161]:
import string
def remove_duplicate_segements(source, target):
    seen = set()
    cleaned_source = []
    cleaned_target = []

    for s, t in zip(source, target):
        s_clean = re.sub(f"[{re.escape(string.punctuation)}]", "", s).lower().strip()
        t_clean = re.sub(f"[{re.escape(string.punctuation)}]", "", t).lower().strip()

        pair_key = (s_clean, t_clean)

        if pair_key not in seen:
            seen.add(pair_key)
            cleaned_source.append(s)
            cleaned_target.append(t)

    return cleaned_source, cleaned_target

In [162]:
def remove_tags(source, target):
  tag_remover = r"<.+?>"
  tag_with_only_digit_remover = r"<.+?>\d+<.+?>"
  div_remover = r".<div.+"
  weird_tag = r".+/>"
  two_weird_signs = r"<>"

  new_source, new_target = [], []
  for s, t in zip(source, target):
    s_cleaned = re.sub(tag_with_only_digit_remover, "", s)
    t_cleaned = re.sub(tag_with_only_digit_remover, "", t)

    s_cleaned = re.sub(tag_remover, "", s_cleaned)
    t_cleaned = re.sub(tag_remover, "", t_cleaned)

    s_cleaned = re.sub(div_remover, "", s_cleaned)
    t_cleaned = re.sub(div_remover, "", t_cleaned)

    s_cleaned = re.sub(weird_tag, "", s_cleaned)
    t_cleaned = re.sub(weird_tag, "", t_cleaned)

    s_cleaned = re.sub(two_weird_signs, "", s_cleaned)
    t_cleaned = re.sub(two_weird_signs, "", t_cleaned)

    new_source.append(s_cleaned.strip())
    new_target.append(t_cleaned.strip())

  return new_source, new_target



In [163]:
def remove_equal_segments(source, target):
    cleaned_sources = []
    cleaned_targets = []

    for s, t in zip(source, target):
        s_clean = re.sub(f"[{re.escape(string.punctuation)}]", "", s).lower().strip()
        t_clean = re.sub(f"[{re.escape(string.punctuation)}]", "", t).lower().strip()

        if s_clean != t_clean:
            cleaned_sources.append(s)
            cleaned_targets.append(t)

    return cleaned_sources, cleaned_targets

In [164]:
def remove_mostly_non_text_segments(source, target):
  cleaned_sources = []
  cleaned_targets = []

  text_helper = r"[A-Za-z\uAC00-\uD7A3]";

  threshold = 0.5

  for s, t in zip(source, target):
    s_text_chars = len(re.findall(text_helper, s))
    t_text_chars = len(re.findall(text_helper, t))
    s_total_chars = len(s.strip())
    t_total_chars = len(t.strip())

    if s_total_chars == 0 or t_total_chars == 0:
      continue

    s_ratio = s_text_chars / s_total_chars
    t_ratio = t_text_chars / t_total_chars

    if s_ratio >= threshold and t_ratio >= threshold:
      cleaned_sources.append(s)
      cleaned_targets.append(t)

  return cleaned_sources, cleaned_targets


In [165]:
def remove_brackets_helper(text):
    helpermap = {")": "(", "]": "[", "}": "{", ">": "<"}
    opener_brackets = set(helpermap.values())
    stack = []
    to_remove = set()

    for i, ch in enumerate(text):
        if ch in opener_brackets:
            stack.append((ch, i))
        elif ch in helpermap:
            if stack and stack[-1][0] == helpermap[ch]:
                stack.pop()
            else:
                to_remove.add(i)

    to_remove.update(i for _, i in stack)
    return "".join(ch for i, ch in enumerate(text) if i not in to_remove)

def remove_unbalanced_brackets(source, target):
  cleaned_sources = []
  cleaned_targets = []

  remove_brackets_regex = r"[\(\)\[\]\{\}\<\>]"

  for s, t in zip(source, target):
    cleaned_sources.append(remove_brackets_helper(s))
    cleaned_targets.append(remove_brackets_helper(t))

  return cleaned_sources, cleaned_targets


In [166]:
def remove_too_long(source, target):
  cleaned_sources = []
  cleaned_targets = []

  max_amount = 100

  for s, t in zip(source, target):
    if len(s.split()) <= max_amount and len(t.split()) <= max_amount:
      cleaned_sources.append(s)
      cleaned_targets.append(t)

  return cleaned_sources, cleaned_targets


In [167]:
def remove_too_short(source, target):
  cleaned_sources = []
  cleaned_targets = []

  min_amount = 3

  for s, t in zip(source, target):
    if len(s.split()) >=min_amount and len(t.split()) >=min_amount:
      cleaned_sources.append(s)
      cleaned_targets.append(t)

  return cleaned_sources, cleaned_targets

In [168]:
def remove_ratio_check(source, target):
  cleaned_sources = []
  cleaned_targets = []

  ratio = 0.3

  for s, t in zip(source, target):
    len_s = len(s)
    len_t = len(t)
    if min(len_s, len_t) / max(len_s, len_t) >= ratio:
      cleaned_sources.append(s)
      cleaned_targets.append(t)
  return cleaned_sources, cleaned_targets

In [169]:
def normalize_quotes(source, target):
  single_quote = r'[‘’‚‛‹›´`]'
  double_quote = r'[«»„“„”「」﹁﹂﹃﹄]'

  cleaned_sources = []
  cleaned_targets = []

  for s, t in zip(source, target):
    normalized_source_double = re.sub(double_quote, '"', s)
    normalized_target_double = re.sub(double_quote, '"', t)

    normalized_source = re.sub(single_quote, "'", normalized_source_double)
    normalized_target = re.sub(single_quote, "'", normalized_target_double)

    cleaned_sources.append(normalized_source)
    cleaned_targets.append(normalized_target)

  return cleaned_sources, cleaned_targets

In [170]:
def clean_up_weird_stuff_extra(source, target):
  cleaned_sources = []
  cleaned_targets = []

  url_pattern = r"(https?://\S+|www\.\S+|[\w-]+\.[a-z]{2,}\S*)"

  valid_start = re.compile(r"^[0-9A-Za-z가-힣]")

  for s, t in zip(source, target):
    new_s = re.sub(r"\\([:;,.!?])", r"\1", s)
    new_t = re.sub(r"\\([:;,.!?])", r"\1", t)

    new_s = re.sub(url_pattern, "", new_s, flags=re.IGNORECASE)
    new_t = re.sub(url_pattern, "", new_t, flags=re.IGNORECASE)

    new_s = re.sub(r"®", "", new_s)
    new_t = re.sub(r"®", "", new_t)

    new_s = re.sub(r"©", "", new_s)
    new_t = re.sub(r"©", "", new_t)

    new_s = re.sub(r"%(?!\d)", "", new_s)
    new_t = re.sub(r"%(?!\d)", "", new_t)

    new_s = re.sub(r"\$\{[^}]+\}", "", new_s)
    new_t = re.sub(r"\$\{[^}]+\}", "", new_t)

    new_s = re.sub(r"^[^0-9A-Za-z가-힣'\"]+", "", new_s)
    new_t = re.sub(r"^[^0-9A-Za-z가-힣'\"]+", "", new_t)

    new_s = re.sub(r"^\s*([0-9a-zA-Z]+)\.\s*", "", new_s)
    new_t = re.sub(r"^\s*([0-9a-zA-Z]+)\.\s*", "", new_t)

    cleaned_sources.append(new_s)
    cleaned_targets.append(new_t)

  return cleaned_sources, cleaned_targets

In [174]:
def remove_unbalanced_double_quotes(text):
    stack = []
    to_remove = set()

    for i, ch in enumerate(text):
        if ch == '"':
            if stack:
                stack.pop()
            else:
                stack.append(i)
    to_remove.update(stack)

    return "".join(ch for i, ch in enumerate(text) if i not in to_remove)

def remove_unbalanced_double_quotes_batch(source, target, only_korean=False):
    cleaned_sources = []
    cleaned_targets = []

    for s, t in zip(source, target):
        s_cleaned = remove_unbalanced_double_quotes(s)

        t_cleaned = remove_unbalanced_double_quotes(t)

        cleaned_sources.append(s_cleaned)
        cleaned_targets.append(t_cleaned)

    return cleaned_sources, cleaned_targets

In [175]:
clean_source, clean_target = remove_extra_cr_lf(full_source, full_target)
clean_source, clean_target = normalize_escaped_entities_list(clean_source, clean_target)
clean_source, clean_target = remove_tags(clean_source, clean_target)
clean_source, clean_target = clean_up_weird_stuff_extra(clean_source, clean_target)
clean_source, clean_target = remove_unbalanced_brackets(clean_source, clean_target)
clean_source, clean_target = normalize_quotes(clean_source, clean_target)
clean_source, clean_target = remove_unbalanced_double_quotes_batch(clean_source, clean_target)
clean_source, clean_target = remove_control_characters_and_normalize_whitespace(clean_source, clean_target)
clean_source, clean_target = remove_duplicate_segements(clean_source, clean_target)
clean_source, clean_target = remove_equal_segments(clean_source, clean_target)
clean_source, clean_target = remove_mostly_non_text_segments(clean_source, clean_target)
clean_source, clean_target = remove_too_long(clean_source, clean_target)
clean_source, clean_target = remove_too_short(clean_source, clean_target)
clean_source, clean_target = remove_ratio_check(clean_source, clean_target)
clean_source, clean_target = remove_empty_segments(clean_source, clean_target)

print(len(clean_source))
print(len(clean_target))
#214255-926
#212866
#3603
#211332

211344
211344


In [176]:
with open("source.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(clean_source))

with open("target.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(clean_target))