## Imports

In [2]:
import re
import os

## Import data and save as source/target sentences

In [27]:
src_path = "uncleaned/thai/en-th.src"
tgt_path = "uncleaned/thai/en-th.tgt"
out_folder_path = "cleaned/"
out_path = "en-th"

if not os.path.exists(out_folder_path):
    os.mkdir(out_folder_path)

source_data = []
target_data = []

with open(src_path) as f:
  for line in f:
    source_data.append(line)

with open(tgt_path) as f:
  for line in f:
    target_data.append(line)

In [28]:
print(len(source_data))

1271197


## Remove Empty Segments

In [29]:
def remove_empty_segments():
    indeces_to_remove = []
    for i in range(len(source_data)):
      if (source_data[i] == "" and target_data[i] == ""):
        indeces_to_remove.append(i)

    for i in range(len(indeces_to_remove)-1, -1, -1):
      del source_data[indeces_to_remove[i]]
      del target_data[indeces_to_remove[i]]

## Helper functions

In [30]:
# Replace function using regular expressions
def normalize(expressions):
  for item in expressions:
    for i in range(len(source_data)):
      #source_data[i] = source_data[i].replace(item[0], item[1])
      source_data[i] = re.sub(item[0], item[1], source_data[i])

    for i in range(len(target_data)):
      #target_data[i] = target_data[i].replace(item[0], item[1])
      target_data[i] = re.sub(item[0], item[1], target_data[i])
        
def printAlignment(indexes):
  for i in indexes:
    print("Src: " + source_data[i] + "\nTgt: " + target_data[i] + "\n")

def printLengths():
  print(str(len(source_data)) +  "\n" + str(len(target_data)))

## Normalize escaped characters/entities, control characters, whitespace, quotes

In [31]:
# Normalize escaped characters/entities
charAndEntities = [(r"&nbsp;?", ""),
                   (r"&reg;?", "®"), 
                   (r"&\\#39;?", "`"),
                   (r"&ndash;?", "–"), 
                   (r"&mdash;?", "—"), 
                   (r"&acute;?", "´"),
                   (r"&\\#8217;?", "’"),
                   (r"&rsquo;?", "’"),
                   (r"&rdquo;?", "”"),
                   (r"&lsquo;?", "‘"),
                   (r"&ldquo;?", "“"),
                   (r"&hellip;?", ""),
                   (r"&trade;?", "™"),
                   (r"&gt;?", ">"),
                   (r"&lt;?", "<"),
                   (r"&amp;?", "&"),
                   (r"#*?", ""),
                   ("\u2013", "-"),
                   ("\u00a0", " ")]

# Normalize certain control characters/Normalize whitespaces
controlChars = [("\t", " "),
                (r"\n|\\n", " "),
                (r"\s{2,}", " "),
                (r"^ *", "")]      # Remove leading whitespace

quoteChars = [(r"`|’|‘", "'"),
              (r"”|“|''\b|\b''", "\""),
              (r"\.''", ".\"")]

tags = [(r"<.*?>", "")]

other = [(r"\|v.*?(\s|$)", ""),
         (r"%%", "")]

brackets = [(r"{{.*}}", ""),
            (r"\\{.*\\}", "")]

footnotes = [(r"(\d|10) ?$", "")]

## Indentify and remove duplicates

In [32]:
def remove_duplicates():
    global source_data
    global target_data
    
    # Put source and target data into one list of tuples
    source_target = []
    for i in range(len(source_data)):
        item = (source_data[i], target_data[i])
        source_target.append(item)

    # Put all data into a set to remove duplicates, then put it back into a list
    source_target = list(set(source_target))

    # Save the items back in the source_data and target_data lists
    new_source = []
    new_target = []
    for i in range(len(source_target)):
        new_source.append(source_target[i][0])
        new_target.append(source_target[i][1])

    source_data = new_source
    target_data = new_target

# Check for segments that are mostly non-text content

In [33]:
def remove_non_text():
    # If 40% or more of the sentence is non-word characters, remove it
    indeces_to_remove = []

    regx = re.compile('\W')
    for i in range(len(source_data)):
        if len(source_data[i]) == 0:
            #print(i)
            continue

        result = regx.findall(source_data[i])
        ratio = len(result) / len(source_data[i])

        if ratio > .4:
            indeces_to_remove.append(i)

    # Remove items
    for i in range(len(indeces_to_remove)-1, -1, -1):
        del source_data[i]
        del target_data[i]

## Remove segments where source=target

In [34]:
def remove_source_equals_target():
    indeces_to_remove = []
    for i in range(len(source_data)):
        if source_data[i] == target_data[i]:
            indeces_to_remove.append(i)

    # Remove items
    for i in range(len(indeces_to_remove)-1, -1, -1):
        del source_data[i]
        del target_data[i]

## Remove segments that are too long (>100 words) or too short (<3)

In [35]:
def remove_long_short():
    indeces_to_remove = []
    for i in range(len(source_data)):
        if len(source_data[i].split()) > 100 or len(source_data[i].split()) < 3:
            indeces_to_remove.append(i)
            
    # Remove items
    for i in range(len(indeces_to_remove)-1, -1, -1):
        del source_data[indeces_to_remove[i]]
        del target_data[indeces_to_remove[i]]

# Data cleaning pipeline

In [36]:
normalize(charAndEntities)
normalize(controlChars)
normalize(quoteChars)
normalize(tags)
normalize(other)
normalize(brackets)
remove_empty_segments()
remove_duplicates()
remove_non_text()
remove_source_equals_target()
remove_long_short()
normalize(footnotes)
remove_empty_segments()

In [37]:
printLengths()

738641
738641


# Write to files

In [38]:
with open(out_folder_path + out_path + ".src", 'w') as f:
    for s in source_data:
        f.write(s + "\n")
        
with open(out_folder_path + out_path + ".tgt", 'w') as f:
    for s in target_data:
        f.write(s + "\n")

In [None]:
#indeces = [i for i in range(len(target_data))]
#printAlignment(indeces)