In [None]:
import pandas as pd
import torch
import os
from tqdm import tqdm
from transformers import MarianMTModel, MarianTokenizer

# Pick the compute device once; both translation models are placed on it.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Helsinki-NLP English-to-Swahili checkpoint and its tokenizer
helsinki_model_name = "Helsinki-NLP/opus-mt-en-sw"
helsinki_tokenizer = MarianTokenizer.from_pretrained(helsinki_model_name)
helsinki_model = MarianMTModel.from_pretrained(helsinki_model_name)
helsinki_model = helsinki_model.to(device)

# Bildad English-to-Swahili checkpoint, loaded through the same Marian classes
bildad_model_name = "Bildad-Model/en-sw"
bildad_tokenizer = MarianTokenizer.from_pretrained(bildad_model_name)
bildad_model = MarianMTModel.from_pretrained(bildad_model_name)
bildad_model = bildad_model.to(device)

# Source CSV to translate, and the CSV that receives the translated rows
input_file = "/content/sample_data/Fake Clean.csv"
output_file = "/content/sample_data/fake_Translated_Swahili.csv"
df = pd.read_csv(input_file)

def load_existing_translations(file_path, df_part):
    """Attach previously saved translations to ``df_part`` so a run can resume.

    Args:
        file_path: Path of the CSV written by an earlier run. It may not exist.
        df_part: DataFrame to annotate. It is modified in place.

    Returns:
        ``df_part`` with a ``translated_text`` column. The column is copied
        from the saved file (aligned on the row index) when the file exists
        and contains that column; otherwise every row gets an empty string.
        Rows with no saved translation come back as NaN.
    """
    saved = None
    if os.path.exists(file_path):
        try:
            # dtype=str stops pandas from inferring a numeric dtype for a
            # column that is entirely empty or made of numeric-looking text;
            # the column must hold strings for later string operations.
            saved = pd.read_csv(file_path, dtype=str)
        except pd.errors.EmptyDataError:
            # A zero-byte file (for example a save that was interrupted)
            # holds nothing to resume from.
            saved = None

    if saved is not None and "translated_text" in saved.columns:
        df_part["translated_text"] = saved["translated_text"]
    else:
        df_part["translated_text"] = ""  # Ensure the column exists
    return df_part

# Resume support: pull in any translations already present in the output file
df = load_existing_translations(file_path=output_file, df_part=df)

def _translate_with_confidence(text, model, tokenizer):
    """Translate ``text`` with one model and return ``(translation, confidence)``.

    ``confidence`` is the smallest probability the model assigned to any token
    of the returned sequence (the "minimum token confidence").
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(model.device)
    outputs = model.generate(**inputs, return_dict_in_generate=True, output_scores=True)
    translation = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)

    # ``outputs.scores`` holds one (rows, vocab_size) tensor per generation
    # step, so it cannot be reduced to per-token confidences directly.
    # ``compute_transition_scores`` picks out the log-probability of each token
    # that was actually generated. Beam search already yields log-probabilities
    # and needs ``beam_indices``; greedy/sampling output has no beam indices
    # and its raw logits must be normalized first.
    beam_indices = getattr(outputs, "beam_indices", None)
    token_log_probs = model.compute_transition_scores(
        outputs.sequences,
        outputs.scores,
        beam_indices,
        normalize_logits=beam_indices is None,
    )[0]

    # Padded positions after the end of a beam hypothesis are reported as 0
    # (probability 1), so they never lower the minimum.
    return translation, token_log_probs.exp().min().item()


def get_confident_translation(text, model1, tokenizer1, model2, tokenizer2, confidence_threshold=0.5):
    """Translate with two models and keep the translation with the higher minimum token confidence.

    Args:
        text: Source text. NaN/None and blank strings are returned unchanged.
        model1, tokenizer1: First translation model and its tokenizer.
        model2, tokenizer2: Second translation model and its tokenizer.
        confidence_threshold: Minimum per-token probability (0..1) a
            translation must reach to be accepted.

    Returns:
        The accepted translation. When both models reach the threshold the one
        with the strictly higher confidence wins (ties go to the second model).
        When neither does, the marker string
        ``"Low confidence translation. Needs review."`` is returned. If
        translation raises, the error is printed and the original ``text`` is
        returned.
    """
    if pd.isna(text) or text.strip() == "":
        return text  # Nothing to translate

    try:
        translation1, score1 = _translate_with_confidence(text, model1, tokenizer1)
        translation2, score2 = _translate_with_confidence(text, model2, tokenizer2)
    except Exception as e:
        # Best effort: one failing row must not abort the whole batch.
        print(f"Error translating: {e}")
        return text  # Returns original text if translation fails

    first_ok = score1 >= confidence_threshold
    second_ok = score2 >= confidence_threshold
    if first_ok and second_ok:
        return translation1 if score1 > score2 else translation2
    if first_ok:
        return translation1
    if second_ok:
        return translation2
    return "Low confidence translation. Needs review."

def translate_and_save(df_part, file_path, save_every=1):
    """Translate every row that has no translation yet and checkpoint to CSV.

    Args:
        df_part: DataFrame with a ``text`` column. A ``translated_text``
            column is created if missing. The frame is modified in place.
        file_path: Destination CSV. It is rewritten at every checkpoint.
        save_every: Number of newly processed rows between checkpoints. The
            default of 1 saves after each row.

    Rows whose ``translated_text`` is already a non-blank string are skipped,
    so an interrupted run can be resumed. Pending progress is also written if
    the loop is interrupted by an exception.
    """
    if "translated_text" not in df_part.columns:
        df_part["translated_text"] = ""
    save_every = max(1, int(save_every))

    def checkpoint():
        # Write to a sibling temp file and swap it in, so an interrupted write
        # cannot leave a truncated checkpoint behind.
        tmp_path = f"{file_path}.tmp"
        df_part.to_csv(tmp_path, index=False)
        os.replace(tmp_path, file_path)

    unsaved = 0
    try:
        for index, row in tqdm(df_part.iterrows(), total=len(df_part), desc="Translating"):
            existing = row["translated_text"]
            if isinstance(existing, str) and existing.strip():
                continue  # Already translated

            df_part.at[index, "translated_text"] = get_confident_translation(
                row["text"],
                helsinki_model,
                helsinki_tokenizer,
                bildad_model,
                bildad_tokenizer,
            )
            unsaved += 1
            if unsaved >= save_every:
                checkpoint()
                unsaved = 0
    finally:
        # Flush rows processed since the last checkpoint, including on error.
        if unsaved:
            checkpoint()

# Translate the rows that are not translated yet and write them to the output file
translate_and_save(df_part=df, file_path=output_file)

print("Dataset successfully translated and saved!")
