<a href="https://colab.research.google.com/github/Cicciokr/latin-ai-model/blob/main/Fine_Tuning_Transformer_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets
!pip install evaluate
!pip install rouge_score

import pandas as pd
import numpy as np
from datasets import load_dataset, load_from_disk, Dataset, DatasetDict
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling, DataCollatorForWholeWordMask, AutoTokenizer, AutoModelForMaskedLM, TrainerCallback
from sklearn.model_selection import train_test_split
import math
import torch
import evaluate
rouge = evaluate.load('rouge')

#print(torch.cuda.device_count())
#print(torch.cuda.get_device_name(0))
#dataset testo
#dataset = load_dataset('text', data_files='la.txt')
#dataset parquet
#dataset = load_dataset("Cicciokr/CC-100-Latin", revision="refs/convert/parquet")
MODEL_NAME = "ClassCat/roberta-base-latin-v2"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.empty_cache()

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)

def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)

# Applicare la tokenizzazione
model.to(device)
#dataset = load_dataset("parquet", data_dir="./parquet", trust_remote_code=True)
#tokenized_dataset = dataset.map(preprocess_function, batched=True, num_proc=4)
#tokenized_dataset.save_to_disk("./dataset/tokenized_dataset")
#tokenized_dataset = load_from_disk("./dataset_light/tokenized_dataset")
dataset = load_dataset("pstroe/cc100-latin", data_files="la.nolorem.tok.latalphabetonly.v2.json", field="train")
#dataset_split_train = dataset['train'][:100]
#dataset_split = dataset['test'][:100]
dataset_split = dataset['train'].train_test_split(test_size=0.001, shuffle=True)
print(dataset_split['test'])
dataset_split_train = dataset['train'].train_test_split(test_size=0.01, shuffle=True)
print(dataset_split_train['test'])
tokenized_datasets_test = dataset_split['test'].map(preprocess_function, batched=True, num_proc=4)
tokenized_datasets_train = dataset_split_train['test'].map(preprocess_function, batched=True, num_proc=4)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,                   # Abilita il mascheramento
    mlm_probability=0.15        # Percentuale di token da mascherare
)

#il 20% dei dati viene usato come test e l'80% viene usato come train, per evitare overfitting
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Colab Notebooks/TrainingLog",
    save_strategy="steps",
    eval_strategy="steps",
    save_steps=500,
    eval_steps=500,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    max_grad_norm=1.0,
    logging_dir="/content/drive/MyDrive/Colab Notebooks/TrainingLog/logs",
    fp16=True,
    gradient_accumulation_steps=4,
    eval_accumulation_steps=24,
    logging_steps=100,
    warmup_steps=1000,
    save_total_limit=2,
    greater_is_better=False,
    load_best_model_at_end=True,
    overwrite_output_dir=True,
    optim="adamw_torch"
)

#metric = evaluate.load("accuracy")
def remove_values_from_list(the_list, val):
   return [value for value in the_list if value != val]

def compute_metrics(pred):

    labels_ids = pred.label_ids
    pred_ids = pred.predictions[0]

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    min_length = min(len(pred_str), len(label_str))
    pred_str = pred_str[:min_length]
    label_str = label_str[:min_length]

    rouge_output = rouge.compute(
        predictions=pred_str,
        references=label_str,
        rouge_types=["rouge1", "rouge2", "rougeL", "rougeLsum"],
    )

    return {
        "R1": round(rouge_output["rouge1"], 4),
        "R2": round(rouge_output["rouge2"], 4),
        "f1": round(rouge_output["rougeL"], 4),
        "RLsum": round(rouge_output["rougeLsum"], 4),
    }

def preprocess_logits_for_metrics(logits, labels):
    pred_ids = torch.argmax(logits, dim=-1)
    return pred_ids, labels


class MyTrainer(Trainer):
    def training_step(self, model, inputs, optimizer): # Aggiungi optimizer
        # Esegui il training step standard
        outputs = super().training_step(model, inputs, optimizer) # Aggiungi optimizer
        # Svuota la cache della GPU
        torch.cuda.empty_cache()
        return outputs


trainer = MyTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets_train,
    eval_dataset=tokenized_datasets_test,
    data_collator=data_collator,
#    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
#    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

trainer.train()

model.save_pretrained("/content/drive/MyDrive/Colab Notebooks/latin-mlm-roberta")
tokenizer.save_pretrained("/content/drive/MyDrive/Colab Notebooks/latin-mlm-roberta")

results_eval = trainer.evaluate()
print(results_eval)

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/431 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/845k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/505k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.20M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/238 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


la.nolorem.tok.latalphabetonly.v2.json:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['text'],
    num_rows: 9331
})
Dataset({
    features: ['text'],
    num_rows: 93301
})


Map (num_proc=4):   0%|          | 0/9331 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/93301 [00:00<?, ? examples/s]

  trainer = MyTrainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss
500,14.4094,3.478824
1000,14.149,3.442185
1500,13.7853,3.388824
2000,13.6589,3.316238
2500,13.4697,3.329963


There were missing keys in the checkpoint model loaded: ['lm_head.decoder.weight', 'lm_head.decoder.bias'].


{'eval_loss': 3.3744475841522217, 'eval_runtime': 57.5128, 'eval_samples_per_second': 162.242, 'eval_steps_per_second': 6.764, 'epoch': 3.0}
