In [1]:
import os
import time
from datetime import date

from llm_political_analysis.modules.translate import batch_translate_text, detect_language
from llm_political_analysis.modules.store import list_gcs_folder, read_gcs_file, download_gcs_file

In [2]:
def download_translate_output_file(folder: str, filename: str, base_output_folder: str, download_folder: str = "./"):
    """Download the English translation generated for one source file.

    Builds the name of the translated object inside ``base_output_folder`` and a
    local destination path inside ``download_folder``, then hands both to
    ``download_gcs_file``.

    Args:
        folder: Prefix the source file was listed under, e.g.
            ``"plaintext/new_test/"``. A missing trailing slash is tolerated.
        filename: Full name of the source file (expected to start with ``folder``).
        base_output_folder: Root folder that holds one ``translation-<name>``
            sub-folder per translated file.
        download_folder: Local directory the translation is saved into. It is
            created when it does not exist yet.
    """
    # Normalise the prefix so "a/b" and "a/b/" behave the same. Without this a
    # leading "/" stays on the relative name, which makes os.path.join() discard
    # download_folder and puts a "/" into the object name where "_" is expected.
    prefix = folder if (not folder or folder.endswith("/")) else f"{folder}/"
    if filename.startswith(prefix):
        relative_name = filename[len(prefix):]
    else:
        # Unexpected layout: keep the historical substring-removal behaviour.
        relative_name = filename.replace(folder, "")

    # Everything after the first "." is dropped, so names containing extra dots
    # are truncated (unchanged from the previous implementation).
    relative_stem = relative_name.split(".")[0]

    # Same per-file folder name that translate_file_in_folder() writes to:
    # base name without extension, spaces removed, lower-cased.
    folder_key = filename.split("/")[-1].split(".")[0].replace(" ", "").lower()

    # NOTE(review): "llms-as-experts" is presumably the source bucket name that
    # the batch translation service prepends to its output files - confirm.
    translation_output = (
        f"{base_output_folder}/translation-{folder_key}/"
        f"llms-as-experts_{prefix.replace('/', '_')}{relative_stem}_en_translations.txt"
    )

    download_output = os.path.join(
        download_folder,
        f"{relative_stem}_en_translation.txt"
    )
    # Make sure the local target directory exists before downloading.
    os.makedirs(os.path.dirname(download_output) or ".", exist_ok=True)

    print(f"Downloading {download_output} ...")
    download_gcs_file(
        download_output,
        translation_output
    )

def _log_detected_language(log_file: str, filename: str, language_code: str) -> None:
    """Append one ``(filename, language_code)`` record to ``log_file`` (best effort)."""
    try:
        log_dir = os.path.dirname(log_file)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)
        with open(log_file, "a", encoding="utf-8") as f:
            f.write(str((filename, language_code)))
            f.write("\n")
    except OSError as err:
        # Logging must never abort a translation run, but the failure should
        # be visible instead of silently swallowed.
        print(f"Could not write to log file {log_file}: {err}")


def translate_file_in_folder(
        folder: str, base_output_folder: str, log_file:str=None, download_folder: str="./"
):
    """Translate every non-English file listed under ``folder`` and download the result.

    For each name returned by ``list_gcs_folder(folder)`` the file content is
    read, its language is detected, and - unless the detected code is ``"en"`` -
    the file is sent to ``batch_translate_text`` and the translation is fetched
    with ``download_translate_output_file``.

    Args:
        folder: Prefix passed to ``list_gcs_folder``.
        base_output_folder: Root output folder; each file gets its own
            ``translation-<name>`` sub-folder below it.
        log_file: Optional path of a text file. When given, one
            ``(filename, language_code)`` line is appended per translated file.
        download_folder: Local directory the translations are downloaded into.
    """
    for filename in list_gcs_folder(folder):
        if filename == folder or filename.endswith("/"):
            # Skip the folder itself and any other folder placeholder entries
            # (names ending in "/"); they are not translatable files.
            continue
        print(filename)
        # Google batch translate requires the result to be output to an empty folder, so we create a unique folder for each file
        file_stem = filename.split('/')[-1].split('.')[0].replace(' ', '').lower()
        file_output_folder = f"{base_output_folder}/translation-{file_stem}"
        source_language_code = detect_language(
            read_gcs_file(blob_name=filename)
        )
        if source_language_code == "en":
            # Already English: nothing to translate (and nothing is logged).
            print(f"Skip {filename}.")
            continue
        if log_file:
            _log_detected_language(log_file, filename, source_language_code)
        translation_res = batch_translate_text(
            filename,
            file_output_folder,
            source_language_code=source_language_code
        )
        print(translation_res)
        download_translate_output_file(folder, filename, base_output_folder, download_folder)

In [3]:
# Run configuration: the prefixes to translate and where results go.
folders = ["plaintext/new_calibration/", "plaintext/new_test/"]
# Today's date plus the epoch second gives every run its own output root.
base_output_folder_ = f"translation-{date.today()}-{int(time.time())}"
download_folder_ = "../data/translation"
log_file_ = os.path.join(download_folder_, "detected_languages.txt")

# Create the local folder up front: the language log is opened in append mode,
# which fails if the directory is missing.
os.makedirs(download_folder_, exist_ok=True)

for folder_name in folders:
    translate_file_in_folder(folder_name, base_output_folder_, log_file_, download_folder_)

plaintext/new_calibration/Austria - AU 2019 - AU 2019 SDP.txt
Waiting for operation to complete...
Total Characters: 240694
Translated Characters: 240694
total_characters: 240694
translated_characters: 240694
submit_time {
  seconds: 1723162019
}
end_time {
  seconds: 1723162111
}

Downloading ../data/translation\Austria - AU 2019 - AU 2019 SDP_en_translation.txt ...
plaintext/new_calibration/Belgium - BEL 2014 - BEL 2014  New Flemish Alliance N VA.txt
Waiting for operation to complete...
Total Characters: 325639
Translated Characters: 325639
total_characters: 325639
translated_characters: 325639
submit_time {
  seconds: 1723162116
}
end_time {
  seconds: 1723162216
}

Downloading ../data/translation\Belgium - BEL 2014 - BEL 2014  New Flemish Alliance N VA_en_translation.txt ...
plaintext/new_calibration/Czech - CZ 2010 - CZ 2010 Civ Dem ODS.txt
Waiting for operation to complete...
Total Characters: 112325
Translated Characters: 112325
total_characters: 112325
translated_characters: 11