### Script translation text(en) to malagasy(mg)

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import pandas as pd
import torch
from google.colab import drive
from tqdm.auto import tqdm

In [None]:
from google.colab import drive

drive.mount("/content/drive")

In [1]:
# Définir les index de début et de fin pour cette exécution
start_index = 0
end_index = 200  # Exclut cette ligne
batch_size = 32  # Taille du lot pour la traduction

# Vérifier si un GPU est disponible et le définir comme device, sinon utiliser le CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Utilisation de l'appareil : {device}")

# modèle MADLAD-400-7B-MT-BT
model_name = "google/madlad400-7b-mt-bt"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# Monter Google Drive pour le chargement et les sauvegardes des données
drive.mount('/content/drive')

file_path = '/content/drive/MyDrive/hate_speech/data_combined.csv'
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Erreur: Le fichier '{file_path}' n'a pas été trouvé.")
    exit()

# Vérifier que la colonne 'text' existe
if 'text' not in df.columns:
    print("Erreur: La colonne 'text' n'existe pas dans le fichier CSV.")
    exit()

# Sélectionner la portion du DataFrame à traiter
df_subset = df[start_index:end_index]
print(f"Traitement des lignes de {start_index} à {end_index - 1} par lots de {batch_size}.")

def translate_batch(texts):
    input_texts = [f"<2mg> {text}" for text in texts]  # Ajouter le préfixe de langue
    inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        generated_tokens = model.generate(
            **inputs,
            max_length=512
        )
    translated_texts = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    return translated_texts

translations = [''] * len(df_subset)  # Pré-allouer la liste des traductions

# Appliquer la traduction par lots avec une barre de progression
for i in tqdm(range(0, len(df_subset), batch_size), desc="Traduction par lots"):
    batch = df_subset['text'][i:i + batch_size].tolist()
    translated_batch = translate_batch(batch)
    translations[i:i + batch_size] = translated_batch

# Ajouter la liste des traductions comme une nouvelle colonne au sous-ensemble
df_subset['translated'] = translations

# Concaténer le sous-ensemble traduit avec le reste du DataFrame
df.loc[start_index:end_index - 1, 'translated'] = df_subset['translated']

output_file_path = f'/content/drive/MyDrive/hate_speech/hate_speech_mg_translated_part_{start_index}_{end_index - 1}.csv'
df.to_csv(output_file_path, index=False)
print(f"\nLes traductions pour les lignes {start_index} à {end_index - 1} ont été sauvegardées sous '{output_file_path}'")

Utilisation de l'appareil : cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Traitement des lignes de 0 à 199 par lots de 32.


  df = pd.read_csv(file_path)


Traduction par lots:   0%|          | 0/7 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['translated'] = translations



Les traductions pour les lignes 0 à 199 ont été sauvegardées sous '/content/drive/MyDrive/hate_speech/hate_speech_mg_translated_part_0_199.csv'


In [2]:
pd.read_csv(output_file_path)

  pd.read_csv(output_file_path)


Unnamed: 0,text,label,translated
0,!!! RT @mayasolovely: As a woman you shouldn't...,0,!!! RT @mayasolovely: Amin'ny maha-vehivavy an...
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1,!!!!! RT @mleew17: zazalahy dats mangatsiaka.....
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,1,!!!!!!!!! RT @C_G_Anderson: @viva_based toa mp...
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,1,!!!!!!!!!!!!! RT @ShenikaRoberts: Mety ho mari...
...,...,...,...
534659,we should forcibly remove all the foreigners f...,1,
534660,we should forcibly remove all the immigrants f...,1,
534661,dalits are unworthy of the Gods love,1,
534662,dalits are worthless,1,
