In [None]:
!pip install "unbabel-comet>=2.0.0" -U

In [None]:
!pip install -U huggingface

In [None]:
import os
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login

# Read the Hugging Face access token from the Kaggle secret store.
user_secrets = UserSecretsClient()
token_value = user_secrets.get_secret("HF_TOKEN")

# Authenticate in-process instead of interpolating the token into a shell
# command: the secret never appears in a command line, no shell quoting is
# involved, and a rejected token raises instead of being silently ignored.
login(token=token_value)

In [None]:
import json
import os
from huggingface_hub import HfApi, hf_hub_download
from comet import download_model, load_from_checkpoint
from tqdm import tqdm

# Dataset repository on the Hugging Face Hub and the files to pull from it.
# Only the uncommented entries are downloaded; the commented-out names are
# presumably further files in the same repository that can be enabled.
dataset_repo = "cstr/Capybara-de-snippets"
dataset_files = [
    # "Capybara_de_GPT4.jsonl",
    # "Capybara_de_Claude-3-Opus.jsonl",
    # "Capybara_de_GPT3.5.jsonl",
    # "Capybara_de_deepl.jsonl",
    # "Capybara_de_mixtral.jsonl",
    # "Capybara_de_occiglot.jsonl",
    "Capybara_de_original (english).jsonl",
    "Capybara_de_nbbl.jsonl",
    # "Capybara_de_t5madlad.jsonl",
    # "Capybara_de_discolm.jsonl",
]

# Fetch each selected file and remember where it was stored locally,
# keyed by its name in the repository.
file_paths = {}
for name in dataset_files:
    local_path = hf_hub_download(
        repo_id=dataset_repo,
        filename=name,
        repo_type="dataset",
    )
    file_paths[name] = local_path
    print(f"Downloaded {name} to {local_path}")

# Fetch the COMET checkpoint and load it for scoring.
model_path = download_model("Unbabel/wmt22-cometkiwi-da")
model = load_from_checkpoint(model_path)

# Parse the English source file: one JSON document per line.
original_data = []
with open(file_paths["Capybara_de_original (english).jsonl"], "r", encoding="utf-8") as source_handle:
    for raw_line in source_handle:
        original_data.append(json.loads(raw_line))

import torch  # local import: only used to choose the scoring device

# Score on a GPU when one is visible; otherwise run on CPU (gpus=0) instead
# of failing in CPU-only sessions.
num_gpus = 1 if torch.cuda.is_available() else 0

# A single Hub client is reused for every upload.
api = HfApi()

# Process each translation file
for translation_file in dataset_files:
    if translation_file == "Capybara_de_original (english).jsonl":
        continue  # Skip the original English file

    print(f"Processing {translation_file}...")

    # Open the translation file (one JSON conversation per line)
    with open(file_paths[translation_file], "r", encoding="utf-8") as file:
        translation_data = [json.loads(line) for line in file]

    # Conversations are aligned with the source purely by position, so a
    # translation that is longer than the source cannot be matched up.
    if len(translation_data) > len(original_data):
        raise ValueError(
            f"{translation_file} has {len(translation_data)} conversations "
            f"but the original has only {len(original_data)}"
        )

    # Flatten the whole file into one list of (src, mt) segments: two per
    # turn, input first and output second. Scoring everything in a single
    # predict() call avoids paying the per-call setup cost once per turn.
    comet_data = []
    for conv_idx, conv in enumerate(translation_data):
        original_turns = original_data[conv_idx]["conversation"]
        if len(conv["conversation"]) > len(original_turns):
            raise ValueError(
                f"{translation_file}: conversation {conv_idx} has "
                f"{len(conv['conversation'])} turns but the original has only "
                f"{len(original_turns)}"
            )
        for turn_idx, turn in enumerate(conv["conversation"]):
            original_turn = original_turns[turn_idx]
            comet_data.append({"src": original_turn["input"], "mt": turn["input"]})
            comet_data.append({"src": original_turn["output"], "mt": turn["output"]})

    # predict() returns a Prediction object; the per-segment scores are in
    # its `.scores` attribute, in the same order as the input list.
    # Integer-indexing the Prediction itself yields its fields (score list,
    # system score), not individual segment scores — which is what the
    # previous `comet_scores[0]` / `comet_scores[1]` ended up storing.
    if comet_data:
        segment_scores = model.predict(comet_data, batch_size=8, gpus=num_gpus).scores
    else:
        segment_scores = []  # nothing to score in an empty file

    # Walk the conversations again in the same order and hand each turn its
    # two scores back (input score first, then output score).
    score_iter = iter(segment_scores)
    updated_data = []
    for conv in translation_data:
        updated_conv = {"source": conv["source"], "conversation": []}
        for turn in conv["conversation"]:
            input_score = float(next(score_iter))
            output_score = float(next(score_iter))
            updated_conv["conversation"].append(
                {
                    "input": turn["input"],
                    "output": turn["output"],
                    "input_score": input_score,
                    "output_score": output_score,
                }
            )
        updated_data.append(updated_conv)

    # Save the updated data to a new JSONL file with UTF-8 encoding.
    # splitext drops the ".jsonl" suffix without relying on its length.
    output_file = f"{os.path.splitext(translation_file)[0]}_scored.jsonl"
    with open(output_file, "w", encoding="utf-8") as file:
        for conv in updated_data:
            file.write(json.dumps(conv, ensure_ascii=False) + "\n")

    # Upload the scored JSONL file to the dataset repository
    api.upload_file(
        path_or_fileobj=output_file,
        path_in_repo=output_file,
        repo_id=dataset_repo,
        repo_type="dataset"
    )
    print(f"Uploaded {output_file} to the dataset repository.")

print("Evaluation completed. Scored files have been generated and uploaded.")