In [1]:
import pandas as pd
import torch
import os
from tqdm import tqdm
from transformers import MarianMTModel, MarianTokenizer

# Load the Helsinki-NLP OPUS-MT English-to-Swahili checkpoint and its tokenizer.
model_name = "Helsinki-NLP/opus-mt-en-sw"
tokenizer = MarianTokenizer.from_pretrained(model_name)
# Place the model on the GPU when CUDA is available, otherwise keep it on the CPU.
model = MarianMTModel.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")

# Source dataset (input_file) and the CSV that receives the translated rows
# (output_file); the source is read into a DataFrame up front.
input_file = "/content/sample_data/True Clean.csv"
output_file = "/content/sample_data/true_Translated_Swahili_updated.csv"
df = pd.read_csv(input_file)

def load_existing_translations(file_path, df_part):
    """Attach previously saved translations to ``df_part``.

    The "translated_text" column of ``df_part`` is (re)created in place. Rows
    that have a saved translation in ``file_path`` receive it; every other row
    receives the empty string, so the column always holds ``str`` values.

    Args:
        file_path: Path of the CSV written by an earlier run. It may be
            missing, empty, or lack a "translated_text" column.
        df_part: DataFrame to annotate; modified in place.

    Returns:
        The same ``df_part`` object, with a "translated_text" column.
    """
    existing = None
    if os.path.exists(file_path):
        try:
            # dtype=str + keep_default_na=False: blank cells come back as ""
            # instead of NaN, and an all-blank column is not parsed as float64.
            translated_df = pd.read_csv(file_path, dtype=str, keep_default_na=False)
        except pd.errors.EmptyDataError:
            # A zero-byte file (e.g. an interrupted write) holds no translations.
            translated_df = None
        if translated_df is not None and "translated_text" in translated_df.columns:
            existing = translated_df["translated_text"]

    if existing is None:
        df_part["translated_text"] = ""  # Ensuring column exists
    else:
        # Align on df_part's index; rows absent from the saved file get ""
        # so the placeholder matches the one used when no file exists.
        df_part["translated_text"] = existing.reindex(df_part.index, fill_value="")
    return df_part

# Seed df["translated_text"] from the output file of a previous run when one
# exists, so rows that already have a translation can be skipped later.
df = load_existing_translations(output_file, df)

def translate_text(text):
    """Translate a single string with the module-level tokenizer and model.

    NaN and blank (empty or whitespace-only) inputs are returned unchanged.
    If tokenization, generation, or decoding raises, the error is printed and
    the original ``text`` is returned instead of a translation.
    """
    has_content = not pd.isna(text) and text.strip() != ""
    if has_content:
        try:
            encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
            # Move the input tensors to whichever device the model lives on.
            encoded = encoded.to(model.device)
            generated = model.generate(**encoded)
            return tokenizer.decode(generated[0], skip_special_tokens=True)
        except Exception as err:
            print(f"Error translating: {err}")
    # Reached for empty input and for failed translations alike.
    return text

def translate_and_save(df_part, file_path, save_every=1):
    """Translate the untranslated rows of ``df_part`` and checkpoint them to CSV.

    A row is translated when its "translated_text" value is NaN or a blank
    string; any other value is treated as an existing translation and left
    alone. Each translation is written back into ``df_part`` in place and the
    whole frame is checkpointed to ``file_path``.

    Args:
        df_part: DataFrame with a "text" column. A "translated_text" column
            is created if it is missing. Modified in place.
        file_path: Destination CSV path for the checkpoints.
        save_every: Number of newly translated rows between checkpoints.
            The default of 1 saves after every translation.
    """
    tqdm.pandas()  # Kept from the original: registers tqdm's pandas helpers.

    if "translated_text" not in df_part.columns:
        df_part["translated_text"] = ""
    # A column read back from CSV as all-NaN is float64; use object dtype so
    # string translations can be assigned into it.
    df_part["translated_text"] = df_part["translated_text"].astype(object)

    save_every = max(1, int(save_every))

    def _checkpoint():
        # Write to a sibling temp file and swap it in, so an interrupt during
        # the write cannot leave a truncated checkpoint at file_path.
        tmp_path = f"{file_path}.tmp"
        df_part.to_csv(tmp_path, index=False)
        os.replace(tmp_path, file_path)

    unsaved = 0
    try:
        for index, row in tqdm(df_part.iterrows(), total=len(df_part)):
            current = row["translated_text"]
            is_blank = isinstance(current, str) and current.strip() == ""
            if not (pd.isna(current) or is_blank):
                continue  # Already translated.
            df_part.at[index, "translated_text"] = translate_text(row["text"])
            unsaved += 1
            if unsaved >= save_every:
                _checkpoint()
                unsaved = 0
    finally:
        # Flush translations not yet on disk, including when the loop is
        # interrupted (e.g. KeyboardInterrupt) between checkpoints.
        if unsaved:
            _checkpoint()

# Translate every row of df that still lacks a translation, writing the
# results to output_file as it goes.
translate_and_save(df, output_file)

print("Dataset successfully translated and saved!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/821k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/813k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.33M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/300M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/300M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

KeyboardInterrupt: 