## GPU

In [None]:
import gc

import torch

# Collect unreachable Python objects first so the tensors they own are freed,
# then ask PyTorch to release its cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

# Only query GPU properties when CUDA is actually usable; otherwise
# get_device_name(0) raises and the rest of the notebook cannot run.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    gpu_name = torch.cuda.get_device_name(0)
    print('We will use the GPU:', gpu_name)
else:
    device = torch.device("cpu")
    gpu_name = None
    print('No GPU available, falling back to the CPU.')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
# Tesla V100-SXM2-16GB
# NVIDIA A100-SXM4-40GB

# Install required packages

In [None]:
# %pip installs into the environment of the running kernel.
# `transformers[sentencepiece]` already pulls in transformers and sentencepiece,
# so each requirement is listed only once.
%pip install datasets transformers[sentencepiece] sacrebleu torch

In [None]:
# Upgrade accelerate to the latest release, using %pip so the install targets the running kernel.
# NOTE(review): presumably needed by the Trainer in the transformers version used here — confirm.
%pip install -U accelerate

In [None]:
import transformers
# Print the installed transformers version so it is recorded next to the results.
print(transformers.__version__)

4.30.2


In [None]:
from transformers import MarianMTModel, MarianTokenizer

# Dataset

In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive/ (only available on Google Colab).
drive.mount('/content/drive/')

In [None]:
import datasets
from datasets import DatasetDict
from datasets import load_dataset, load_metric

# Read the semicolon-separated CSV. keep_default_na=False is passed to the
# CSV loader; presumably it keeps empty cells as empty strings instead of
# NaN — TODO confirm.
csv_dataset = datasets.load_dataset(
    "csv",
    data_files="deepl_translated_df_106.csv",
    keep_default_na=False,
    delimiter=';',
)

# The loader puts everything in a single 'train' split; rename the columns to
# the language codes used by the rest of the notebook.
dataset = csv_dataset['train'].rename_columns({'text': 'ja', 'translation': 'fr'})

# Hold out 10% of the rows for evaluation (fixed seed for reproducibility).
dataset_train_test = dataset.train_test_split(test_size=0.1, seed=42)

# Re-wrap both splits in a DatasetDict under the same split names.
dataset = datasets.DatasetDict(
    {split_name: dataset_train_test[split_name] for split_name in ('train', 'test')}
)



  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Load the "sacrebleu" metric; compute_metrics() uses it to score generated translations.
# NOTE(review): load_metric is deprecated in recent `datasets` releases in favour of
# the `evaluate` package — confirm against the installed version.
metric = load_metric("sacrebleu")

# Preprocessing

In [None]:
from transformers import AutoTokenizer

# Hub checkpoint identifier; the same name is used later to load the model weights.
model_checkpoint = "Helsinki-NLP/opus-mt-ja-fr"
# Load the tokenizer that ships with that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# tokenizer = MarianTokenizer.from_pretrained(model_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/788k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/828k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]



In [None]:
# Text prepended to every source sentence (empty here).
prefix = ""
# Token limits applied (with truncation) to source and target sentences.
max_input_length = 256
max_target_length = 256
# Dataset column names holding the source and target sentences.
source_lang = "ja"
target_lang = "fr"

def preprocess_function(examples):
    """Tokenize a batch of sentence pairs for seq2seq training.

    Args:
        examples: Batch mapping with a list of source sentences under
            ``source_lang`` and a list of target sentences under
            ``target_lang``.

    Returns:
        The tokenizer output for the source sentences, with the target token
        ids added under the ``"labels"`` key.
    """
    inputs = [prefix + ex for ex in examples[source_lang]]
    targets = list(examples[target_lang])
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # `text_target=` tokenizes with the target-language settings; it replaces
    # the deprecated `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Tokenize every split; batched=True hands preprocess_function batches of examples.
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/128142 [00:00<?, ? examples/s]

Map:   0%|          | 0/14239 [00:00<?, ? examples/s]

## Fine-tuning

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
# Load the pretrained seq2seq model for the checkpoint named in model_checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

To instantiate a Seq2SeqTrainer, we will need to define three more things. The most important is the [Seq2SeqTrainingArguments](https://huggingface.co/transformers/main_classes/trainer.html#transformers.Seq2SeqTrainingArguments), which is a class that contains all the attributes to customize the training. It requires one folder name, which will be used to save the checkpoints of the model, and all other arguments are optional:

In [None]:
# Per-device batch size shared by training and evaluation.
batch_size = 16
# Last path component of the checkpoint identifier.
model_name = model_checkpoint.rsplit("/", 1)[-1]

args = Seq2SeqTrainingArguments(
    # Folder that receives the checkpoints.
    output_dir="model-106df-10epochs",
    # Optimisation schedule.
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate once per epoch; predict_with_generate presumably makes the
    # evaluation loop call generate() so BLEU is computed on real outputs
    # — TODO confirm.
    evaluation_strategy="epoch",
    predict_with_generate=True,
    # Upper bound on the number of checkpoints kept on disk.
    save_total_limit=5,
)

In [None]:
# Batch collator for seq2seq training; it is given both the tokenizer and the model.
# NOTE(review): presumably pads inputs and labels per batch — confirm in the transformers docs.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
import numpy as np

def postprocess_text(preds, labels):
    """Strip surrounding whitespace and shape the texts for the metric.

    Args:
        preds: Iterable of predicted strings.
        labels: Iterable of reference strings.

    Returns:
        A ``(preds, labels)`` tuple where ``preds`` is a list of stripped
        strings and ``labels`` is a list of one-element lists, each holding
        the stripped reference.
    """
    cleaned_preds = []
    for prediction in preds:
        cleaned_preds.append(prediction.strip())

    wrapped_labels = []
    for reference in labels:
        # One reference per prediction, wrapped in its own list.
        wrapped_labels.append([reference.strip()])

    return cleaned_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Decode predictions and labels and score them with the loaded metric.

    Args:
        eval_preds: ``(preds, labels)`` pair of token-id arrays. ``preds`` may
            itself be a tuple, in which case its first element is used.

    Returns:
        A dict ``{"bleu": score}`` where ``score`` is the ``"score"`` entry of
        the metric result.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a masking value, not a token id, and cannot be decoded. Replace
    # it with the pad token in the predictions as well as in the labels.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

In [None]:
# Shuffled (seed 42) 1000-example subsets of the train and test splits.
# NOTE(review): the Seq2SeqTrainer cell passes the full tokenized splits, so these
# subsets appear to be unused in this notebook.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))



In [None]:
# Wire the model, training arguments, full tokenized splits, collator,
# tokenizer and metric callback into a single trainer object.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

We can now fine-tune our model by simply calling the `train` method:

In [None]:
# 2h - 10 epochs / 47 df
# 10h - 10 epochs / 106 df

In [None]:
# Launch fine-tuning; the returned TrainOutput is displayed as the cell's output.
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu
1,1.5314,1.463513,8.782944
2,1.318,1.393841,15.079684
3,1.1401,1.367178,16.048774
4,1.0304,1.352096,18.84485
5,0.9492,1.346642,17.647195
6,0.8662,1.345413,23.277036
7,0.8132,1.34771,19.500789
8,0.7649,1.348009,18.745416


Epoch,Training Loss,Validation Loss,Bleu
1,1.5314,1.463513,8.782944
2,1.318,1.393841,15.079684
3,1.1401,1.367178,16.048774
4,1.0304,1.352096,18.84485
5,0.9492,1.346642,17.647195
6,0.8662,1.345413,23.277036
7,0.8132,1.34771,19.500789
8,0.7649,1.348009,18.745416
9,0.7441,1.348443,21.755882
10,0.7159,1.350373,22.460444


TrainOutput(global_step=80090, training_loss=0.9946027221565376, metrics={'train_runtime': 29113.7065, 'train_samples_per_second': 44.014, 'train_steps_per_second': 2.751, 'total_flos': 8690815660326912.0, 'train_loss': 0.9946027221565376, 'epoch': 10.0})

In [None]:
# Save the fine-tuned model under the Google Drive folder.
# NOTE(review): the path is relative, so it only lands on the mounted Drive
# (/content/drive/) when the working directory is /content — confirm.
trainer.save_model("drive/MyDrive/models/model-106df-10epochs_bis/")

In [None]:
# import shutil
# import os

# folder_path = 'model-106df-10epochs/'

# for folder_name in os.listdir(folder_path):
#     if folder_name.startswith('checkpoint-'):
#         checkpoint_number = int(folder_name.split('-')[1])
#         if checkpoint_number < 30000:
#             folder_to_delete = os.path.join(folder_path, folder_name)
#             shutil.rmtree(folder_to_delete)

# print("Deletion completed.")

Deletion completed.


In [None]:
import zipfile
import os

def zip_folder(folder_path, output_path):
    """Write every file found under ``folder_path`` into a deflated zip archive.

    Args:
        folder_path: Directory that is walked recursively.
        output_path: Path of the zip file to create (overwritten if present).

    Entries are stored under their path relative to ``folder_path``; only
    files are added, directories get no entry of their own.
    """
    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as archive:
        for current_dir, _subdirs, filenames in os.walk(folder_path):
            for filename in filenames:
                absolute_path = os.path.join(current_dir, filename)
                # Name inside the archive: path relative to the zipped folder.
                archive_name = os.path.relpath(absolute_path, folder_path)
                archive.write(absolute_path, archive_name)

# Folder to archive: same name as the output directory given to Seq2SeqTrainingArguments.
folder_to_zip = 'model-106df-10epochs/'
# The archive is written next to that folder, in the current working directory.
zip_output_path = 'model-106df-10epochs.zip'

zip_folder(folder_to_zip, zip_output_path)

In [None]:
# for folder_name in os.listdir(folder_path):
#     if folder_name.startswith('checkpoint-'):
#         checkpoint_number = int(folder_name.split('-')[1])
#         if checkpoint_number < 60000 or checkpoint_number % 1000 == 500:
#             folder_to_delete = os.path.join(folder_path, folder_name)
#             shutil.rmtree(folder_to_delete)

# folder_to_zip = 'model-106df-10epochs/'
# zip_output_path = 'model-106df-10epochs-checkpoints-sup-60k-without-500.zip'
# zip_folder(folder_to_zip, zip_output_path)

# Load and predict

In [None]:
from transformers import MarianMTModel, MarianTokenizer, AutoTokenizer
# Single source sentence used as the translation input.
text = ['記||ま|んま|じしulい還以']
# original_texts =  {0: '革族だから', 1: '同日にし とか っいでなも', 2: 'わ', 3: '記||ま|んま|じしulい還以', 4: 'とでりきあまえあ#'}


# Local checkpoint directory to load for inference. This rebinds `model_name`
# and `tokenizer`, which were set earlier in the notebook.
# NOTE(review): this directory differs from the output_dir used for training in
# this notebook ("model-106df-10epochs") — confirm it is the intended checkpoint.
model_name = 'opus-mt-ja-fr-finetuned-ja-to-fr/checkpoint-36500'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Load the seq2seq weights from the checkpoint directory named in model_name.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Tokenize `text` into padded PyTorch tensors and generate translations.
# NOTE(review): repetition_penalty=20. is a very strong penalty — confirm the value is intended.
translated = model.generate(**tokenizer(text, return_tensors="pt", padding=True), repetition_penalty=20.)
# Decode each generated sequence; as the last expression, the list is the cell's output.
[tokenizer.decode(t, skip_special_tokens=True) for t in translated]

# Compare predictions

In [None]:
# Reference French sentences; the scoring cell uses them as the references
# for both sets of translations below.
fr_valid = [
    "Je vous le dis !",
    "Je ne sais pas de quoi vous parlez !",
    "je ne vous dois rien !",
    "Eh bien, je suis désolé...",
    "nous avons des preuves ici...",
    "Vous savez que nous avons besoin que vous nous remboursiez...",
    "Comment le saurais-je ?",
    "Comment puis-je connaître la dette de mon père dans les casinos...",
    "Vous êtes membre de la famille Tortillano ?",
    "nous avons une conversation ici ! Perdez la tête !"
 ]

# Translations scored as the "basic model" in the scoring cell
# (presumably produced by the checkpoint before fine-tuning — confirm).
basic_translations = [
    "C'est pour ça que...",
    "Je te l'ai dit, je ne sais pas.",
    "Je n'arrive pas à croire que tu me dois ça!",
    "Je n'ai pas dit ça.",
    "J'ai une carte de crédit.",
    "Il a dit qu'il n'avait pas besoin de récupérer l'argent que j'avais emprunté à Trujano.",
    "Je ne sais pas.",
    "Je ne peux pas croire que mon père ait créé une dette dans un casino.",
    "C'est la famille Turtiliano.",
    "Je suis en train de m'en occuper. Rebrousse-la!"
 ]

# Translations scored as the "finetuned model" in the scoring cell; entries are
# aligned by position with fr_valid and basic_translations.
finetuned_translations = [
    "C'est pour cela que...",
    "Je vous dis que je ne sais pas.",
    "Je n'arrive pas à croire que vous ayez une telle dette!",
    "Je n'ai pas dit ça.",
    "J'ai un compte de dépôt ici.",
    "L'argent que j'ai prêté à Trujano, je dois le rendre.",
    "Je ne sais pas.",
    "Je n'arrive pas à croire que mon père ait créé une dette dans un casino.",
    "Si c'est le cas, est-ce la famille Turtiliano?",
    "Je suis en train de le faire. Répare-toi!"
 ]

In [None]:
from sacrebleu import CHRF
import sacrebleu

# `datasets` is already a dependency of this notebook. It is imported here so
# the cell does not rely on the `evaluate` package, which is never imported.
from datasets import load_metric

# sacrebleu takes the references as a list of reference streams, each stream
# being a list with one reference per hypothesis. With a single reference set
# that is [fr_valid]; a bare list of strings is not a valid references value.
reference_streams = [fr_valid]

bleu_score_sb = sacrebleu.corpus_bleu(finetuned_translations, reference_streams).score

chrf = load_metric("chrf")

bleu_basic = sacrebleu.corpus_bleu(basic_translations, reference_streams)
bleu_finetuned = sacrebleu.corpus_bleu(finetuned_translations, reference_streams)

print(f'SacreBLEU score for basic model: {bleu_basic.score}')
print(f'SacreBLEU score for finetuned model: {bleu_finetuned.score}')

chrf_basic = sacrebleu.corpus_chrf(basic_translations, reference_streams)
chrf_finetuned = sacrebleu.corpus_chrf(finetuned_translations, reference_streams)

print(f'CHRF score for basic model: {chrf_basic.score}')
print(f'CHRF score for finetuned model: {chrf_finetuned.score}')

# The metric wrapper takes one list of references per prediction, which is the
# transposed layout compared with the sacrebleu calls above.
reference_lists = [[reference] for reference in fr_valid]

# Same chrF score again through the metric wrapper, as a cross-check.
print(f"CHRF score for basic model: {chrf.compute(predictions=basic_translations, references=reference_lists)['score']}")
print(f"CHRF score for finetuned model: {chrf.compute(predictions=finetuned_translations, references=reference_lists)['score']}")

## Model.tar.gz

In [None]:
import shutil
# copytree() copies a whole directory
# Copy the zipped fine-tuned model from the mounted Drive into /content.
shutil.copy('/content/drive/MyDrive/opus-mt-ja-fr-finetuned-ja-to-fr_10epochs.zip', '/content')

In [None]:
# Duplicate the zip archive under a "-copy" name in the current working directory.
shutil.copy('opus-mt-ja-fr-finetuned-ja-to-fr_10epochs.zip', 'opus-mt-ja-fr-finetuned-ja-to-fr_10epochs-copy.zip')

'opus-mt-ja-fr-finetuned-ja-to-fr_10epochs-copy.zip'

In [None]:
# Duplicate the model tarball next to the original.
# NOTE(review): /content/model/model.tar.gz is the default output of compress(),
# which is defined in a later cell — this cell appears to depend on an earlier
# run of it; confirm.
shutil.copy('/content/model/model.tar.gz', '/content/model/model_copy.tar.gz')

'/content/model/model_copy.tar.gz'

In [None]:
# -o overwrites existing files without asking. Without it, unzip stops at an
# interactive "replace?" prompt on re-runs, which blocks a notebook cell.
!unzip -o /content/opus-mt-ja-fr-finetuned-ja-to-fr_10epochs.zip -d /

Archive:  /content/opus-mt-ja-fr-finetuned-ja-to-fr_10epochs.zip
replace /content/opus-mt-ja-fr-finetuned-ja-to-fr_10epochs/runs/Jun24_17-13-25_5e38ed92f9b2/events.out.tfevents.1687626811.5e38ed92f9b2.3573.0? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
# !cd /content/opus-mt-ja-fr-finetuned-ja-to-fr_10epochs && tar zcvf /content/model.tar.gz *
# !cd /content/opus-mt-ja-fr-finetuned-ja-to-fr_10epochs/checkpoint-36500 && tar zcvf /content/checkpoint-36500.tar.gz *

In [None]:
import shutil
import tarfile
import os

# Folder to duplicate and the name of its copy (both relative to the working directory).
source_folder = 'model/'
destination_folder = 'model_copy/'

# Recursively copy the folder. By default shutil.copytree raises FileExistsError
# when the destination already exists, so model_copy/ must be removed before re-running.
shutil.copytree(source_folder, destination_folder)

def compress(tar_dir=None, output_file="/content/model/model.tar.gz"):
    """Pack the top-level entries of ``tar_dir`` into a gzip-compressed tarball.

    Args:
        tar_dir: Directory whose entries are archived. ``None`` means the
            current working directory.
        output_file: Path of the archive to create. A relative path is
            resolved against the working directory at call time. Missing
            parent directories are created.

    Each entry is stored under its bare name (no leading directory), and
    sub-directories are added recursively. The name of every top-level entry
    is printed as it is added.

    The working directory is never changed, so a failure part-way through
    cannot leave the process in a different directory.
    """
    output_path = os.path.abspath(output_file)
    source_dir = os.path.abspath(os.getcwd() if tar_dir is None else tar_dir)

    # List first, so a missing source directory fails before any file is created.
    items = os.listdir(source_dir)
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    with tarfile.open(output_path, "w:gz") as tar:
        for item in items:
            item_path = os.path.join(source_dir, item)
            if item_path == output_path:
                # Never pack a stale copy of the archive into itself.
                continue
            print(item)
            tar.add(item_path, arcname=item)

# Archive the fine-tuned model folder into compress()'s default output, /content/model/model.tar.gz.
compress("/content/opus-mt-ja-fr-finetuned")

config.json
tokenizer_config.json
vocab.json
training_args.bin
pytorch_model.bin
source.spm
target.spm
special_tokens_map.json
generation_config.json
