In [1]:
import ast
import os

import deepl
import pandas as pd
from langdetect import detect, DetectorFactory

# Pin langdetect's seed so repeated runs give the same detection results
DetectorFactory.seed = 0

# Load the filtered export and, in the same expression, parse every
# 'translation_json' cell from its Python-literal string form exactly once
# (the cells are expected to evaluate to dicts)
df = pd.read_csv('src/deepLCompare/filtered.csv').assign(
    translation_json=lambda frame: frame['translation_json'].apply(ast.literal_eval)
)

def _segment_content(segment, field):
    """Return ``segment[field]['content']``, or None when either level is missing/empty."""
    part = segment.get(field)
    return part.get('content') if part else None


def process_translation_json(translation_dict):
    """Flatten one parsed ``translation_json`` mapping into a DataFrame.

    Each item of ``translation_dict`` maps a key to a segment dict that must
    carry ``'sequenceid'`` and may carry ``'mt'``, ``'source'`` and ``'user'``
    sub-dicts, each with a ``'content'`` entry.

    Args:
        translation_dict: Mapping of key -> segment dict.

    Returns:
        A DataFrame with columns ``id`` (``"<key>_<sequenceid>"``),
        ``mt_content``, ``source_content`` and ``user_content``; one row per
        segment whose user content is not None. The four columns are always
        present, even when no segment qualifies, so downstream column lookups
        do not fail on empty input.

    Raises:
        KeyError: If a kept segment has no ``'sequenceid'``.
    """
    columns = ['id', 'mt_content', 'source_content', 'user_content']

    rows = []
    for key, value in translation_dict.items():
        user_content = _segment_content(value, 'user')
        # Skip segments with no user text at all: a missing 'user' entry or a
        # 'user' entry whose content is None
        if user_content is None:
            continue
        rows.append({
            'id': f"{key}_{value['sequenceid']}",
            'mt_content': _segment_content(value, 'mt'),
            'source_content': _segment_content(value, 'source'),
            'user_content': user_content,
        })

    # Passing `columns` fixes the schema and column order for the empty case too
    return pd.DataFrame(rows, columns=columns)

# Build one frame per CSV row, walking the distinct titles and, for each,
# the rows carrying that title; every extracted segment is tagged with the
# title it came from
title_column = df['title']
all_titles_df = [
    process_translation_json(entry).assign(title=current_title)
    for current_title in title_column.unique()
    for entry in df.loc[title_column == current_title, 'translation_json']
]

# Stack the per-row frames into a single table with a fresh index
final_df = pd.concat(all_titles_df, ignore_index=True)

# Keep only the segments whose user text differs from the source text
content_changed = final_df['source_content'].ne(final_df['user_content'])
edit = final_df.loc[content_changed]

def is_english(content):
    """Return True when langdetect classifies ``content`` as English ('en').

    Missing, non-string or blank input is reported as not English, as is any
    text for which detection raises an error.

    Args:
        content: The value to classify; normally a string.

    Returns:
        bool: True only if ``detect(content)`` returns ``'en'``.
    """
    # Non-text or blank values cannot be classified; answer without calling detect
    if not isinstance(content, str) or not content.strip():
        return False
    try:
        return detect(content) == 'en'
    except Exception:
        # Detection failures count as "not English". Catching Exception rather
        # than using a bare `except:` lets KeyboardInterrupt/SystemExit
        # propagate, so a long-running apply() can still be interrupted.
        return False

# Drop rows whose user_content is detected as English
edit = edit[~edit['user_content'].apply(is_english)]

# Keep only rows whose user_content has at least MIN_USER_WORDS
# whitespace-separated words. Missing or non-string values count as too short
# instead of crashing on `.split()`.
MIN_USER_WORDS = 45
has_enough_words = edit['user_content'].apply(
    lambda text: isinstance(text, str) and len(text.split()) >= MIN_USER_WORDS
).astype(bool)  # force a boolean mask even when there are no rows

# .copy() makes `edit` an independent frame, so adding columns to it later
# does not raise SettingWithCopyWarning
edit = edit[has_enough_words].copy()


# Read the DeepL auth key from the environment instead of storing it in the
# notebook. A key that has been committed should be treated as leaked and
# rotated in the DeepL account.
auth_key = os.environ.get("DEEPL_AUTH_KEY")
if not auth_key:
    raise RuntimeError(
        "Set the DEEPL_AUTH_KEY environment variable to your DeepL API key."
    )
translator = deepl.Translator(auth_key)


def translate_samples(content, target_lang="TR", source_lang="EN"):
    """Translate one piece of text with the module-level DeepL ``translator``.

    Args:
        content: Text to translate; passed to ``translator.translate_text``
            unchanged.
        target_lang: Language code to translate into (default ``"TR"``).
        source_lang: Language code of ``content`` (default ``"EN"``, the value
            that used to be hard-coded).

    Returns:
        The ``text`` attribute of the result returned by ``translate_text``.
    """
    result = translator.translate_text(
        content, source_lang=source_lang, target_lang=target_lang
    )
    return result.text
# Machine-translate the source text (presumably English, matching
# translate_samples' default source language) so the new 'deepl' column can be
# compared with the human translation in 'user_content'. user_content itself
# has already been filtered to text NOT detected as English, so sending it to
# DeepL as English input would not yield a translation of the source.
# Rows without usable source text get None instead of an API call.
# .assign returns a new frame, so no column is set on a filtered slice.
edit = edit.assign(
    deepl=edit['source_content'].apply(
        lambda text: translate_samples(text)
        if isinstance(text, str) and text.strip()
        else None
    )
)


In [3]:
# Write the comparison table to disk, leaving out the DataFrame index
output_csv = 'src/deepLCompare/deepl_vs_human.csv'
edit.to_csv(output_csv, index=False)