In [45]:
from indicnlp.tokenize import sentence_tokenize as sentence_tokenize_indic
import hasami

# Language pair of the corpus being processed.
source_lang = "en"
target_lang = "ja"

# Language names used to pick the sentence splitter for each side.
sentencizer_source_language = "english"
sentencizer_target_language = "japanese"

# Which data split to process; must be a key of DOMAINS_BY_SPLIT.
split = "test"

# Domains present in each split of the data set.
DOMAINS_BY_SPLIT = {
    "train": ["telephony", "topical_chat"],
    "test": ["telephony", "topical_chat", "call_center"],
}

# Fail fast on an unknown split. Without this check `domains` would simply not
# be (re)assigned, and on a live kernel a stale value from an earlier run would
# be picked up silently by the following cells.
if split not in DOMAINS_BY_SPLIT:
    raise ValueError(
        f"Unsupported split {split!r}; expected one of {sorted(DOMAINS_BY_SPLIT)}"
    )
domains = list(DOMAINS_BY_SPLIT[split])

In [46]:
from formality.data_loader import load_data

# Load the plain parallel data for both formality levels. Each call also
# returns the source side; the value from the second call is the one kept
# (presumably both calls return the same source texts -- TODO confirm against
# load_data).
source_texts, target_texts_formal, _, _ = load_data(source_lang, target_lang, split, "all", "formal")
source_texts, target_texts_informal, _, _ = load_data(source_lang, target_lang, split, "all", "informal")
target_texts_formal_tagged, target_texts_informal_tagged = [], []

# Neither the directory nor the optional file suffix depends on the domain,
# so both are computed once instead of on every loop iteration.
base_path = f"./formality/CoCoA-MT/{split}/{source_lang}-{target_lang}/"
# For the train split of any non-Japanese target language the annotated file
# names get an extra ".<target_lang>" suffix.
annotated_suffix = f".{target_lang}" if target_lang != "ja" and split == "train" else ""

# Collect the annotated (tagged) target lines of every domain, in domain order.
for domain in domains:
    tagged_formal_path = (
        f"formality-control.{split}.{domain}.{source_lang}-{target_lang}.formal.annotated"
        + annotated_suffix
    )
    tagged_informal_path = (
        f"formality-control.{split}.{domain}.{source_lang}-{target_lang}.informal.annotated"
        + annotated_suffix
    )
    # The files contain non-ASCII text, so decode explicitly as UTF-8 rather
    # than relying on the platform's default encoding.
    with open(base_path + tagged_formal_path, "r", encoding="utf-8") as f:
        target_texts_formal_tagged.extend(line.strip() for line in f)
    with open(base_path + tagged_informal_path, "r", encoding="utf-8") as f:
        target_texts_informal_tagged.extend(line.strip() for line in f)


In [47]:
from nltk import tokenize

def split_into_sentences(text, sentencizer_language):
    """Split a text, or a list of variants of one text, into sentences.

    Args:
        text: A single string, or a list of strings that are variants of the
            same example and are expected to split into the same number of
            sentences.
        sentencizer_language: Selects the sentence splitter: "hindi" uses the
            Indic NLP splitter, "japanese" uses hasami, and any other value is
            passed to NLTK's ``sent_tokenize`` as the language name.

    Returns:
        For a string input, the splitter's result (expected to be a list of
        sentences). For a list input, a list with one entry per sentence
        position; each entry is a list holding that sentence from every
        variant, in input order. An empty input list yields an empty list.
        When the variants disagree on the number of sentences, an error is
        printed and the result is cut to the shortest variant.
    """
    if sentencizer_language == "hindi":
        def sent_tokenize(raw_text):
            return sentence_tokenize_indic.sentence_split(raw_text, "hi")
    elif sentencizer_language == "japanese":
        sent_tokenize = hasami.segment_sentences
    else:
        def sent_tokenize(raw_text):
            return tokenize.sent_tokenize(raw_text, sentencizer_language)

    if not isinstance(text, list):
        return sent_tokenize(text)

    sentencized_examples = []
    num_sentences = None
    for example in text:
        sentences = sent_tokenize(example)
        # Defensive: normalise a splitter that hands back a bare string.
        if isinstance(sentences, str):
            sentences = [sentences]
        sentencized_examples.append(sentences)
        if num_sentences is None:
            num_sentences = len(sentences)
        elif num_sentences != len(sentences):
            print("ERROR: Number of sentences in examples is not consistent.")
            print(text)

    # Transpose variants x sentences into sentences x variants. zip() stops at
    # the shortest variant, so neither an empty input list nor a variant with
    # fewer sentences than the first one can raise IndexError here.
    return [list(group) for group in zip(*sentencized_examples)]


In [48]:
# Sentence-level outputs; only examples whose five versions split into the same
# number of sentences contribute to them.
source_texts_new = []
target_texts_formal_new = []
target_texts_informal_new = []
target_texts_formal_tagged_new = []
target_texts_informal_tagged_new = []
# Examples whose versions disagree on the sentence count are kept here instead.
bad_examples = []

parallel_examples = zip(
    source_texts,
    target_texts_formal,
    target_texts_informal,
    target_texts_formal_tagged,
    target_texts_informal_tagged,
)
for source, target_formal, target_informal, tagged_formal, tagged_informal in parallel_examples:
    source_sentences = split_into_sentences(source, sentencizer_source_language)
    # All four target-side versions are split with the target-language splitter.
    (
        target_formal_sentences,
        target_informal_sentences,
        tagged_formal_sentences,
        tagged_informal_sentences,
    ) = [
        split_into_sentences(target_side_text, sentencizer_target_language)
        for target_side_text in (target_formal, target_informal, tagged_formal, tagged_informal)
    ]

    example_sentences = (
        source_sentences,
        target_formal_sentences,
        target_informal_sentences,
        tagged_formal_sentences,
        tagged_informal_sentences,
    )

    # More than one distinct length means the versions are not sentence-aligned.
    if len({len(version) for version in example_sentences}) != 1:
        bad_examples.append(example_sentences)
        print("Different number of sentences found in the following example:")
        print(*example_sentences, sep="\n")
        print()
        continue

    source_texts_new.extend(source_sentences)
    target_texts_formal_new.extend(target_formal_sentences)
    target_texts_informal_new.extend(target_informal_sentences)
    target_texts_formal_tagged_new.extend(tagged_formal_sentences)
    target_texts_informal_tagged_new.extend(tagged_informal_sentences)

print("Number of bad examples:", len(bad_examples))

Different number of sentences found in the following example:
["but it is crazy, I can't get along with my family; in fact, they had a great Christmas this year, Christmas is really good.", 'You will be doing the holiday stuff']
['が、おかしくて、私は家族と仲良くできません。', '実際、彼らは今年素晴らしいクリスマスを過ごしました。', 'クリスマスって、本当にいいですね。', 'さあクリスマスの準備ですね']
['が、おかしくて、私は家族と仲良くできない。', '実際、彼らは今年素晴らしいクリスマスを過ごした。', 'クリスマスって、本当にいいね。', 'さあクリスマスの準備だね']
['が、おかしくて、私は家族と仲良く[F]できません[/F]。', '実際、彼らは今年素晴らしいクリスマスを[F]過ごしました[/F]。', 'クリスマスって、本当に[F]いいです[/F]ね。', 'さあクリスマスの準備[F]です[/F]ね']
['が、おかしくて、私は家族と仲良く[F]できない[/F]。', '実際、彼らは今年素晴らしいクリスマスを[F]過ごした[/F]。', 'クリスマスって、本当に[F]いい[/F]ね。', 'さあクリスマスの準備[F]だ[/F]ね']

Different number of sentences found in the following example:
["but I think they are, I just think that sooner or later it's gonna crumble."]
['しかし、彼らはそうだと思います。', 'ただ、遅かれ早かれ崩れていくと思います。']
['しかし、彼らはそうだと思う。', 'ただ、遅かれ早かれ崩れていくと思う。']
['しかし、彼らはそうだと[F]思います[/F]。', 'ただ、遅かれ早かれ崩れていくと[F]思います[/F]。']
['しかし、彼らはそうだと[F]思う[/F]。', 'ただ、遅かれ早かれ崩れていくと[F]思う[/F]。']

Dif

In [49]:
# write split texts into files
base_path = f"./formality/CoCoA-MT/{split}/{source_lang}-{target_lang}/"
source_file = f"formality-control.{split}.all.{source_lang}-{target_lang}.{source_lang}"
target_formal_file = f"formality-control.{split}.all.{source_lang}-{target_lang}.formal.{target_lang}"
target_formal_feminine_file = f"formality-control.{split}.all.{source_lang}-{target_lang}.formal.feminine.{target_lang}"
target_informal_file = f"formality-control.{split}.all.{source_lang}-{target_lang}.informal.{target_lang}"
target_informal_feminine_file = f"formality-control.{split}.all.{source_lang}-{target_lang}.informal.feminine.{target_lang}"
target_formal_tagged_file = f"formality-control.{split}.all.{source_lang}-{target_lang}.formal.annotated.{target_lang}"
target_informal_tagged_file = f"formality-control.{split}.all.{source_lang}-{target_lang}.informal.annotated.{target_lang}"


def _write_lines(path, lines):
    """Write every item of `lines` to `path`, each followed by a newline.

    The file is always (re)created, even when `lines` is empty, and is encoded
    as UTF-8 so that non-ASCII text does not depend on the platform's default
    encoding.
    """
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in lines)


def _all_variants(examples):
    """Yield each example as-is, or newline-joined when it is a list of variants."""
    for example in examples:
        yield "\n".join(example) if isinstance(example, list) else example


def _single_variant(examples, index, include_plain):
    """Yield variant `index` of every list example.

    Examples that are not lists are yielded unchanged when `include_plain` is
    true and skipped otherwise.
    """
    for example in examples:
        if isinstance(example, list):
            yield example[index]
        elif include_plain:
            yield example


# Source side: every variant of an example goes on its own line.
_write_lines(base_path + source_file, _all_variants(source_texts_new))

# Plain targets: variant 0 goes to the main file and variant 1 to the
# "feminine" file. Examples without variants only go to the main file, so the
# feminine files stay empty when no example is a list.
_write_lines(base_path + target_formal_file, _single_variant(target_texts_formal_new, 0, True))
_write_lines(base_path + target_formal_feminine_file, _single_variant(target_texts_formal_new, 1, False))
_write_lines(base_path + target_informal_file, _single_variant(target_texts_informal_new, 0, True))
_write_lines(base_path + target_informal_feminine_file, _single_variant(target_texts_informal_new, 1, False))

# Annotated targets: handled like the source side.
_write_lines(base_path + target_formal_tagged_file, _all_variants(target_texts_formal_tagged_new))
_write_lines(base_path + target_informal_tagged_file, _all_variants(target_texts_informal_tagged_new))
