<a href="https://colab.research.google.com/github/Baroka-wp/dive_ML/blob/master/waama_to_french.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install sacremoses
%pip install datasets
%pip install transformers[torch]
%pip install accelerate -U
%pip install datasets

In [None]:
import pandas as pd
import transformers
import accelerate
from sklearn.model_selection import train_test_split
from transformers import MarianMTModel, MarianTokenizer
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset, Dataset, DatasetDict
from huggingface_hub import login


In [None]:
print(transformers.__version__)
print(accelerate.__version__)

In [11]:
# Lecture du fichier et extraction des paires
file_path = 'waama.txt'

waama_phrases = []
french_phrases = []

with open(file_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()
    for line in lines:
        if '\t' in line:  # Les phrases sont séparées par une tabulation
            waama, french = line.strip().split('\t')
            waama_phrases.append(waama)
            french_phrases.append(french)


In [12]:
# Création du DataFrame
data = {'waama': waama_phrases, 'french': french_phrases}
df = pd.DataFrame(data)

# Vérification des données
print(df.head())

          waama                french
0            aa                   non
1  ammɛnɛ yuubu  le régime de bananes
2  ammɛnɛ yuuna  le régime de bananes
3         arima                  mais
4          ayaa             vaut rien


In [13]:
df.to_csv('waama_french.csv', index=False)


In [15]:
# Utilisation de votre jeton d'accès
login(token="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")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [24]:

# Création du DataFrame
data = {'waama': waama_phrases, 'french': french_phrases}
df = pd.DataFrame(data)


# Division des données en ensembles d'entraînement et de validation
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Création des datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

datasets = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})

datasets

DatasetDict({
    train: Dataset({
        features: ['waama', 'french', '__index_level_0__'],
        num_rows: 56
    })
    validation: Dataset({
        features: ['waama', 'french', '__index_level_0__'],
        num_rows: 15
    })
})

In [25]:
# Chargement des données
dataset = load_dataset('csv', data_files={'train': 'waama_french.csv'})

# Tokenizer et modèle
model_name = 'Helsinki-NLP/opus-mt-mul-en'
tokenizer = MarianTokenizer.from_pretrained(model_name, token="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") # Added use_auth_token
model = MarianMTModel.from_pretrained(model_name, token="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") # Added use_auth_token




In [26]:
# Préparation des données pour l'entraînement
def preprocess_function(examples):
    inputs = [example for example in examples['waama']]
    targets = [example for example in examples['french']]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding='max_length')
    labels = tokenizer(targets, max_length=128, truncation=True, padding='max_length').input_ids
    model_inputs['labels'] = labels
    return model_inputs


tokenized_datasets = datasets.map(preprocess_function, batched=True)

# Entraînement

training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer
)

trainer.train()

# Évaluation
results = trainer.evaluate()
print(results)

# Sauvegarde du modèle
model.save_pretrained('./waama_to_french_model')
tokenizer.save_pretrained('./waama_to_french_tokenizer')


Map:   0%|          | 0/56 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,No log,0.66446
2,No log,0.455838
3,No log,0.348459
4,No log,0.310885
5,No log,0.297658
6,No log,0.28882
7,No log,0.281651
8,No log,0.276282
9,No log,0.273016
10,No log,0.271789


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[64171]], 'forced_eos_token_id': 0}


{'eval_loss': 0.2717890739440918, 'eval_runtime': 0.0998, 'eval_samples_per_second': 150.317, 'eval_steps_per_second': 20.042, 'epoch': 10.0}


('./waama_to_french_tokenizer/tokenizer_config.json',
 './waama_to_french_tokenizer/special_tokens_map.json',
 './waama_to_french_tokenizer/vocab.json',
 './waama_to_french_tokenizer/source.spm',
 './waama_to_french_tokenizer/target.spm',
 './waama_to_french_tokenizer/added_tokens.json')

In [27]:
from transformers import MarianMTModel, MarianTokenizer

# Charger le modèle et le tokenizer
model_path = './waama_to_french_model'
tokenizer_path = './waama_to_french_tokenizer'
model = MarianMTModel.from_pretrained(model_path)
tokenizer = MarianTokenizer.from_pretrained(tokenizer_path)

# Phrase de test en Waama
test_phrases = [
    "baari",
    "baa wan",
    "baaka"
    # Ajoutez d'autres phrases de test ici
]

# Traduire les phrases
def translate(phrases):
    # Tokenization des phrases en Waama
    inputs = tokenizer(phrases, return_tensors='pt', padding=True, truncation=True, max_length=128)
    # Génération de la traduction
    translated_tokens = model.generate(**inputs)
    # Décodage des tokens en français
    translations = [tokenizer.decode(t, skip_special_tokens=True) for t in translated_tokens]
    return translations

# Traduction des phrases de test
translated_phrases = translate(test_phrases)

# Affichage des résultats
for waama, french in zip(test_phrases, translated_phrases):
    print(f"Waama: {waama}\nFrench: {french}\n")



Waama: baari
French: yes

Waama: baa wan
French: le

Waama: baaka
French: le

