### Simple Python script to postprocess FLAN-T5 translations and build a list of triplet dictionaries (src, mt, ref)

In [1]:
def _read_lines(path):
    """Return every line of the UTF-8 text file at `path`, with the trailing newline removed."""
    with open(path, "r", encoding='utf-8') as handle:
        return [line.rstrip("\n") for line in handle]


# Aligned source file; its lines are split on tabs further down the notebook.
src_lines = _read_lines("DE-EN-alligned.txt")

# The mt/ref pairs were stored in different txt files with the same format <mt \t ref>.
mt_ref_lines_1 = _read_lines("DE-EN-pred-groundtruth_FLANT5_1.txt")
mt_ref_lines_2 = _read_lines("DE-EN-pred-groundtruth_FLANT5_66154.txt")
mt_ref_lines_3 = _read_lines("DE-EN-pred-groundtruth_FLANT5_77505.txt")

# Concatenate into a new list so the per-file lists keep their own contents.
mt_ref_lines = mt_ref_lines_1 + mt_ref_lines_2 + mt_ref_lines_3

In [2]:
# Sanity check: total number of mt/ref lines, then the first ten of them
print(len(mt_ref_lines), mt_ref_lines[:10], sep="\n")

77506
['<pad> 25. Oktober 2000 ENDGÜLTIG A5-0313/2000 *</s>\t25 October 2000 FINAL A5-0313/2000 *', '<pad> about an initiative by the French Republic in the light of the establishment of a framework decision of the Council on the issue of money laundering, evasion, extortion, evasion and the increase in the number of weapons and contracts of war</s>\ton the initiative of the French Republic with a view to adopting a Council Framework Decision on money laundering, the identification, tracing, freezing, seizing and confiscation of instrumentalities and the proceeds from crime', "<pad> Committee on the Freedoms and Rights of the People, Justice and Inner Affairs</s>\tCommittee on Citizens' Freedoms and Rights, Justice and Home Affairs", '<pad> Luis Marinho</s>\tLuis Marinho', '<pad> PE 294.241</s>\tPE 294.241/fin.', '<pad> Erklärung of the used symbols</s>\tSymbols for procedures', '<pad> * Consultation Procedures</s>\t* Consultation procedure', '<pad> The majority of the voted</s>\tmajor

In [13]:
def filter_repeated_lines(lines, threshold=30):
    '''
    Flag lines in which a single whitespace-separated token is repeated
    more than `threshold` times (bugs/hallucinations of the T5 experiments).

    The result has the same length and order as `lines`: a flagged line is
    replaced by the placeholder string 'skip this', every other line is
    passed through unchanged.
    '''
    def _has_runaway_token(text):
        # Count how often each token occurs in this line.
        tally = {}
        for token in text.split():
            tally[token] = tally.get(token, 0) + 1
        # An empty line has no tokens and is therefore never flagged.
        return bool(tally) and max(tally.values()) > threshold

    return ['skip this' if _has_runaway_token(text) else text for text in lines]

def remove_substrings(string):
    '''
    Postprocessing of translated sentences: delete every "</s>" and then
    every "<pad>" marker from `string` and return the result.
    Surrounding whitespace is left as it is.
    '''
    for marker in ("</s>", "<pad>"):
        string = string.replace(marker, "")
    return string

In [14]:
# Create a list of dictionaries, one {"src", "mt", "ref"} triplet per usable line
data = []
filtered_mt_ref = filter_repeated_lines(mt_ref_lines)
# NOTE: zip() stops at the shorter of the two lists, so both are assumed to be
# line-aligned; surplus lines at the end of the longer one are silently ignored.
for mt_ref, src_line in zip(filtered_mt_ref, src_lines):
    stripped = mt_ref.strip()
    # Skip lines flagged by filter_repeated_lines and lines without the
    # <mt \t ref> separator (this includes lines whose mt or ref part is empty,
    # because strip() removes a leading/trailing tab).
    if stripped == "skip this" or '\t' not in stripped:
        continue
    # Split on the FIRST tab only: a stray tab inside the reference must not
    # raise "too many values to unpack".
    mt_pre, _, ref = stripped.partition("\t")
    mt = remove_substrings(mt_pre)
    # Only the first tab-separated field is the source sentence; any content
    # after it is ignored, and a line without a tab is used as a whole.
    src = src_line.partition("\t")[0]
    data.append({
        "src": src.strip(),
        "mt": mt.strip(),
        "ref": ref.strip()
    })

In [16]:
# Sanity check: number of triplets kept, then the first ten of them
print(len(data), data[:10], sep="\n")

76450
[{'src': '25. Oktober 2000 ENDGÜLTIG A5-0313/2000 *', 'mt': '25. Oktober 2000 ENDGÜLTIG A5-0313/2000 *', 'ref': '25 October 2000 FINAL A5-0313/2000 *'}, {'src': 'über eine Initiative der Französischen Republik im Hinblick auf den Erlass eines Rahmenbeschlusses des Rates über Geldwäsche, die Ermittlung, das Einfrieren, die Beschlagnahme und die Einziehung von Tatwerkzeugen und Erträgen aus Straftaten', 'mt': 'about an initiative by the French Republic in the light of the establishment of a framework decision of the Council on the issue of money laundering, evasion, extortion, evasion and the increase in the number of weapons and contracts of war', 'ref': 'on the initiative of the French Republic with a view to adopting a Council Framework Decision on money laundering, the identification, tracing, freezing, seizing and confiscation of instrumentalities and the proceeds from crime'}, {'src': 'Ausschuss für die Freiheiten und Rechte der Bürger, Justiz und innere Angelegenheiten', 'mt

In [None]:
# Persist the triplets: one dictionary per line, written as its str() representation
with open("flant5_dict.txt", "w", encoding='utf-8') as output_file:
    output_file.writelines(str(entry) + "\n" for entry in data)