In [37]:
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset_builder
from datasets import load_dataset
import numpy as np
import evaluate
import torch

In [38]:
tokenizer = AutoTokenizer.from_pretrained("KETI-AIR-Downstream/long-ke-t5-base-translation-aihub-ko2en",model_max_length=128)
model = AutoModelForSeq2SeqLM.from_pretrained("KETI-AIR-Downstream/long-ke-t5-base-translation-aihub-ko2en")

Generate config GenerationConfig {
  "_from_model_config": true,
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.31.0"
}

All model checkpoint weights were used when initializing LongT5ForConditionalGeneration.

All the weights of LongT5ForConditionalGeneration were initialized from the model checkpoint at KETI-AIR-Downstream/long-ke-t5-base-translation-aihub-ko2en.
If your task is similar to the task the model of the checkpoint was trained on, you can already use LongT5ForConditionalGeneration for predictions without further training.
Generation config file not found, using a generation config created from the model config.


In [None]:
def prepare_dataset(data):
    source_language = [value['ko'] for key, value in data.items()]
    target_language = [value['en'] for key, value in data.items()]
    return source_language, target_language

In [None]:
train  = load_dataset("Moo/korean-parallel-corpora", split="train")
#test = load_dataset("Moo/korean-parallel-corpora", split="test")
validation = load_dataset("Moo/korean-parallel-corpora", split="validation")

In [None]:
train[0]

{'ko': '개인용 컴퓨터 사용의 상당 부분은 "이것보다 뛰어날 수 있느냐?"',
 'en': 'Much of personal computing is about "can you top this?"'}

In [None]:
inputs = tokenizer(train['ko'],return_tensors="pt", max_length=128, truncation=True,padding=True)
outputs = tokenizer(train['en'],return_tensors="pt",max_length=128, truncation=True,padding=True)

In [None]:
inputs2 = tokenizer(validation['ko'],return_tensors="pt", max_length=128, truncation=True,padding=True)
outputs2 = tokenizer(validation['en'],return_tensors="pt",max_length=128, truncation=True,padding=True)

In [None]:
dataset = torch.utils.data.TensorDataset(inputs.input_ids, inputs.attention_mask, outputs.input_ids, outputs.attention_mask)

In [None]:
dataset2 = torch.utils.data.TensorDataset(inputs2.input_ids, inputs2.attention_mask, outputs2.input_ids, outputs2.attention_mask)

In [None]:
dataset[0]

(tensor([  381, 11023,   832,    54,     5,  1310,   202,    12, 20004, 20023,
          6704,   121,  2265,    33,  1739,    19, 20016,  1577, 20787,     1,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [None]:
from transformers import TrainingArguments, Trainer

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    fp16=True,
    remove_unused_columns=False,
    logging_dir="./logs",
)

def data_collator(batch):
        return {
            "input_ids": torch.stack([item[0] for item in batch]),
            "attention_mask": torch.stack([item[1] for item in batch]),
            "labels": torch.stack([item[2] for item in batch]),
        }

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset2,
    tokenizer=tokenizer,
    data_collator=data_collator,
)


import transformers
transformers.logging.set_verbosity_info()

trainer.train()

# Save the trained model
output_dir = "./train_translatorKO_EN"
trainer.save_model(output_dir)

Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 96,215
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 6,014
  Number of trainable parameters = 296,696,448


Epoch,Training Loss,Validation Loss


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Configuration saved in ./results/checkpoint-500/generation_config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
Deleting older checkpoint [results/checkpoint-5000] due to args.save_total_limit
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Configuration saved in ./results/checkpoint-1000/generation_config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1000/special_tokens_map.json
Deleting older checkpoint [results/checkpoint-5500] due to args.save_total_limit
Saving model checkpoin

Epoch,Training Loss,Validation Loss
1,0.6462,0.811003


Saving model checkpoint to ./results/checkpoint-4500
Configuration saved in ./results/checkpoint-4500/config.json
Configuration saved in ./results/checkpoint-4500/generation_config.json
Model weights saved in ./results/checkpoint-4500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-4500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-4500/special_tokens_map.json
Deleting older checkpoint [results/checkpoint-3000] due to args.save_total_limit
Saving model checkpoint to ./results/checkpoint-5000
Configuration saved in ./results/checkpoint-5000/config.json
Configuration saved in ./results/checkpoint-5000/generation_config.json
Model weights saved in ./results/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-5000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-5000/special_tokens_map.json
Deleting older checkpoint [results/checkpoint-3500] due to args.save_total_limit
Saving model che

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [39]:
!ls "/content/drive/My Drive"

 1950972215_3-V2_2023-06_Document_Inscription_20221114.pdf
'ADHERENT(1).sql'
'AP2 ALGO.zip'
 Book.gslides
 Book.pptx
'cahierdescharges_ap2 (1).gdoc'
 cahierdescharges_ap2.gdoc
 ColabNotebooks
 cours
'Depense 2022 - 2023.gsheet'
 E4
 eclipse-workspace
 léonie
'Mod�le attestation de stage - SIO.gdoc'
 OPEN
'php array.php'
 poursuite-etudes-inge.pdf
 poursuite-etudes-pas-inge.pdf
 Releve_de_Notes_195097221500002.pdf
 TD4.zip
 UTILISATEUR.sql
