In [14]:
from indicnlp.tokenize import sentence_tokenize as sentence_tokenize_indic
import hasami

# Language pair; these codes are interpolated into the dataset paths and file
# names used by the loading and writing cells.
source_lang = "en"
target_lang = "de"

# Language names handed to the sentence splitter for each side of the pair.
sentencizer_source_language = "english"
sentencizer_target_language = "german"

# Dataset split to process: "train" or "test".
split = "train"

# Domains to read for each split; the test split has the extra "call_center" domain.
DOMAINS_BY_SPLIT = {
    "train": ["telephony", "topical_chat"],
    "test": ["telephony", "topical_chat", "call_center"],
}

# Fail right here on an unsupported split instead of leaving `domains`
# undefined and hitting a NameError in a later cell.
if split not in DOMAINS_BY_SPLIT:
    raise ValueError(
        f"Unsupported split {split!r}; expected one of {sorted(DOMAINS_BY_SPLIT)}"
    )
domains = list(DOMAINS_BY_SPLIT[split])

In [15]:
from formality.data_loader import load_data

source_texts = []
target_texts_formal = []
target_texts_informal = []
target_texts_formal_tagged, target_texts_informal_tagged = [], []

# load_data is called with the "all" selector, so its arguments do not depend on
# the current domain. Calling it once per domain appended the same examples
# len(domains) times; load it a single time instead.
# NOTE(review): alignment with the per-domain annotated files read below assumes
# that "all" yields the examples in the same order as `domains` — confirm
# against formality.data_loader.
all_source_texts, all_target_texts_formal, _, _ = load_data(source_lang, target_lang, split, "all", "formal")
all_source_texts, all_target_texts_informal, _, _ = load_data(source_lang, target_lang, split, "all", "informal")
source_texts.extend(all_source_texts)
target_texts_formal.extend(all_target_texts_formal)
target_texts_informal.extend(all_target_texts_informal)

base_path = f"./formality/CoCoA-MT/{split}/{source_lang}-{target_lang}/"
# The annotated file names get a trailing ".<target_lang>" only for the train
# split of non-Japanese targets.
annotated_suffix = f".{target_lang}" if target_lang != "ja" and split == "train" else ""

for domain in domains:
    tagged_formal_path = f"formality-control.{split}.{domain}.{source_lang}-{target_lang}.formal.annotated" + annotated_suffix
    tagged_informal_path = f"formality-control.{split}.{domain}.{source_lang}-{target_lang}.informal.annotated" + annotated_suffix
    for tagged_path, tagged_texts in (
        (tagged_formal_path, target_texts_formal_tagged),
        (tagged_informal_path, target_texts_informal_tagged),
    ):
        # The annotated references contain non-ASCII characters, so do not rely
        # on the platform default encoding (assumes the dataset files are UTF-8).
        with open(base_path + tagged_path, "r", encoding="utf-8") as f:
            tagged_texts.extend(line.strip() for line in f)

# The later cell zips these lists together, which silently truncates to the
# shortest one; surface a length mismatch here without aborting the run.
if not (
    len(source_texts)
    == len(target_texts_formal)
    == len(target_texts_informal)
    == len(target_texts_formal_tagged)
    == len(target_texts_informal_tagged)
):
    print(
        "WARNING: loaded lists differ in length:",
        len(source_texts),
        len(target_texts_formal),
        len(target_texts_informal),
        len(target_texts_formal_tagged),
        len(target_texts_informal_tagged),
    )


In [19]:
# Sanity check: number of source examples collected by the loading cell.
len(source_texts)

713

In [16]:
import re
# Matches one "[F]...[/F]" span (non-greedy), i.e. a single tagged formality phrase.
# Written as a raw string: "\[" and "\]" are not valid Python string escapes, so
# the non-raw literal triggers an invalid-escape warning on recent Python versions.
FORMALITY_PHRASES = re.compile(r"(\[F\](.*?)\[/F\])")

# Count the tagged formal references that carry more than one formality phrase.
count = sum(
    1
    for text in target_texts_formal_tagged
    if len(FORMALITY_PHRASES.findall(text)) > 1
)

print(count)

179


In [3]:
from nltk import tokenize

def _get_sentence_tokenizer(sentencizer_language):
    """Return a callable that splits one string into sentences for the given language."""
    if sentencizer_language == "hindi":
        def sent_tokenize(text):
            return sentence_tokenize_indic.sentence_split(text, "hi")
    elif sentencizer_language == "japanese":
        sent_tokenize = hasami.segment_sentences
    else:
        # Every other language name is passed straight to NLTK.
        def sent_tokenize(text):
            return tokenize.sent_tokenize(text, sentencizer_language)
    return sent_tokenize


def split_into_sentences(text, sentencizer_language):
    """Split a text, or a list of parallel texts, into sentences.

    Args:
        text: Either a single string or a list of strings that are expected to
            split into the same number of sentences.
        sentencizer_language: "hindi" selects the Indic NLP splitter,
            "japanese" selects hasami, and any other value is forwarded to
            ``nltk.tokenize.sent_tokenize`` as the language name.

    Returns:
        For a single string, whatever the selected tokenizer returns for it.
        For a list, a list with one entry per sentence position, each entry
        being the list of sentences found at that position across the inputs
        (i.e. the per-input sentence lists transposed). An empty input list
        yields an empty list.

        If the inputs do not split into the same number of sentences, an error
        message is printed and the result is truncated to the shortest input,
        instead of raising IndexError when a later input is shorter than the
        first one.
    """
    sent_tokenize = _get_sentence_tokenizer(sentencizer_language)
    if not isinstance(text, list):
        return sent_tokenize(text)

    sentencized_examples = []
    for example in text:
        sentences = sent_tokenize(example)
        # Guard against a tokenizer handing back a bare string for one sentence.
        if isinstance(sentences, str):
            sentences = [sentences]
        sentencized_examples.append(sentences)

    if len({len(sentences) for sentences in sentencized_examples}) > 1:
        print("ERROR: Number of sentences in examples is not consistent.")
        print(text)

    # zip stops at the shortest input, so the transposition is always safe;
    # inner groups are lists because callers test them with isinstance(..., list).
    return [list(group) for group in zip(*sentencized_examples)]


In [12]:
import re
# Matches one "[F]...[/F]" span (non-greedy). Written as a raw string because
# "\[" and "\]" are not valid Python string escapes and the non-raw literal
# triggers an invalid-escape warning on recent Python versions.
FORMALITY_PHRASES = re.compile(r"(\[F\](.*?)\[/F\])")

# Sentence-level outputs; only sentences whose tagged formal reference contains
# a formality phrase are kept.
source_texts_new, target_texts_formal_new, target_texts_informal_new = [], [], []
target_texts_formal_tagged_new, target_texts_informal_tagged_new = [], []
# Examples whose five versions did not split into the same number of sentences.
bad_examples = []
# Examples in which no sentence carried a formality phrase.
examples_with_no_formality_phrase = []

for source, target_formal, target_informal, tagged_formal, tagged_informal in zip(
    source_texts,
    target_texts_formal,
    target_texts_informal,
    target_texts_formal_tagged,
    target_texts_informal_tagged,
):
    source_sentences = split_into_sentences(source, sentencizer_source_language)
    target_formal_sentences = split_into_sentences(target_formal, sentencizer_target_language)
    target_informal_sentences = split_into_sentences(target_informal, sentencizer_target_language)
    tagged_formal_sentences = split_into_sentences(tagged_formal, sentencizer_target_language)
    tagged_informal_sentences = split_into_sentences(tagged_informal, sentencizer_target_language)

    # Positions of sentences without any [F]...[/F] span in the tagged formal text.
    indices_to_delete = []
    for i, tagged_formal_sentence in enumerate(tagged_formal_sentences):
        if FORMALITY_PHRASES.search(tagged_formal_sentence) is None:
            print("No formality phrases found in the following example:", tagged_formal_sentence)
            indices_to_delete.append(i)

    parallel_sentences = (
        source_sentences,
        target_formal_sentences,
        target_informal_sentences,
        tagged_formal_sentences,
        tagged_informal_sentences,
    )

    # Sentence positions can only be matched up when all five versions split
    # into the same number of sentences; otherwise set the example aside.
    if len({len(sentences) for sentences in parallel_sentences}) != 1:
        bad_examples.append(parallel_sentences)
        print("Different number of sentences found in the following example:")
        for sentences in parallel_sentences:
            print(sentences)
        print()
        continue

    # Indices were collected in ascending order; delete from the back so the
    # remaining indices stay valid.
    for i in reversed(indices_to_delete):
        for sentences in parallel_sentences:
            del sentences[i]

    if len(source_sentences) == 0:
        examples_with_no_formality_phrase.append((source, target_formal, target_informal, tagged_formal, tagged_informal))
        continue

    source_texts_new.extend(source_sentences)
    target_texts_formal_new.extend(target_formal_sentences)
    target_texts_informal_new.extend(target_informal_sentences)
    target_texts_formal_tagged_new.extend(tagged_formal_sentences)
    target_texts_informal_tagged_new.extend(tagged_informal_sentences)

print("Number of bad examples:", len(bad_examples))
print("Number of examples with no formality phrase:", len(examples_with_no_formality_phrase))

No formality phrases found in the following example: Es muss eine Kundenservice-Kultur sein und es muss für die Leute ein Erlebnis sein, anstatt einfach reinzukommen
No formality phrases found in the following example: Weil das handgemacht ist oder, ich meine, es ist dazu gemacht, sich im Grunde innerhalb von einem Jahr zu zersetzen.
No formality phrases found in the following example: Sie müssen das, das ist ihr Deal, sie müssen machen, es ist das Gleiche, das mit Home Depot passiert ist.
No formality phrases found in the following example: Nein, auf dem Rückweg von Budapest hatte ich tatsächlich die Möglichkeit, als meine Oma noch lebte.
No formality phrases found in the following example: Sie wuchs in Budapest auf.
Different number of sentences found in the following example:
['No, going back from Budapest, I actually had the opportunity when my grandmother was still alive.']
[['Nein, auf dem Rückweg von Budapest hatte ich tatsächlich die Möglichkeit, als meine Oma noch lebte.', 'Ne

In [13]:
# Write the sentence-level texts into files next to the original dataset files.
base_path = f"./formality/CoCoA-MT/{split}/{source_lang}-{target_lang}/"
source_file = f"formality-control.{split}.only_with_formality.{source_lang}-{target_lang}.{source_lang}"
target_formal_file = f"formality-control.{split}.only_with_formality.{source_lang}-{target_lang}.formal.{target_lang}"
target_formal_feminine_file = f"formality-control.{split}.only_with_formality.{source_lang}-{target_lang}.formal.feminine.{target_lang}"
target_informal_file = f"formality-control.{split}.only_with_formality.{source_lang}-{target_lang}.informal.{target_lang}"
target_informal_feminine_file = f"formality-control.{split}.only_with_formality.{source_lang}-{target_lang}.informal.feminine.{target_lang}"
target_formal_tagged_file = f"formality-control.{split}.only_with_formality.{source_lang}-{target_lang}.formal.annotated.{target_lang}"
target_informal_tagged_file = f"formality-control.{split}.only_with_formality.{source_lang}-{target_lang}.informal.annotated.{target_lang}"


def _write_lines(file_name, lines):
    """Write each string in `lines` followed by a newline to base_path + file_name."""
    # Explicit UTF-8: the texts contain non-ASCII characters and the platform
    # default encoding is not guaranteed to be able to represent them.
    with open(base_path + file_name, "w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in lines)


def _joined(example):
    """Return a list example as newline-joined text; return a string unchanged."""
    return "\n".join(example) if isinstance(example, list) else example


def _first_variant(example):
    """Return the first entry of a list example; return a string unchanged."""
    return example[0] if isinstance(example, list) else example


def _second_variants(examples):
    """Return the second entry of every list example; string examples are skipped.

    NOTE(review): because string examples are skipped, the feminine files can
    end up with fewer lines than the other files if list and string examples
    are mixed — confirm that cannot happen for the data being processed.
    """
    return [example[1] for example in examples if isinstance(example, list)]


_write_lines(source_file, [_joined(example) for example in source_texts_new])

_write_lines(target_formal_file, [_first_variant(example) for example in target_texts_formal_new])
_write_lines(target_formal_feminine_file, _second_variants(target_texts_formal_new))

_write_lines(target_informal_file, [_first_variant(example) for example in target_texts_informal_new])
_write_lines(target_informal_feminine_file, _second_variants(target_texts_informal_new))

_write_lines(target_formal_tagged_file, [_joined(example) for example in target_texts_formal_tagged_new])
_write_lines(target_informal_tagged_file, [_joined(example) for example in target_texts_informal_tagged_new])
