<a href="https://colab.research.google.com/github/Cicciokr/latin-ai-model/blob/main/Fine_Tuning_Transformer_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
!pip install datasets
!pip install evaluate
!pip install rouge_score

import pandas as pd
import numpy as np
from datasets import load_dataset, load_from_disk, Dataset, DatasetDict
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling, DataCollatorForWholeWordMask, AutoTokenizer, AutoModelForMaskedLM, TrainerCallback
from sklearn.model_selection import train_test_split
import math
import torch
import evaluate
rouge = evaluate.load('rouge')

#print(torch.cuda.device_count())
#print(torch.cuda.get_device_name(0))
#dataset testo
#dataset = load_dataset('text', data_files='la.txt')
#dataset parquet
#dataset = load_dataset("Cicciokr/CC-100-Latin", revision="refs/convert/parquet")
MODEL_NAME = "ClassCat/roberta-base-latin-v2"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.empty_cache()

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)

def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=256)

# Applicare la tokenizzazione
model.to(device)
#dataset = load_dataset("parquet", data_dir="./parquet", trust_remote_code=True)
#tokenized_dataset = dataset.map(preprocess_function, batched=True, num_proc=4)
#tokenized_dataset.save_to_disk("./dataset/tokenized_dataset")
#tokenized_dataset = load_from_disk("./dataset_light/tokenized_dataset")
dataset = load_dataset("pstroe/cc100-latin", data_files="la.nolorem.tok.latalphabetonly.v2.json", field="train")
#dataset_split_train = dataset['train'][:100]
#dataset_split = dataset['test'][:100]
dataset_split = dataset['train'].train_test_split(test_size=0.001, shuffle=True)
print(dataset_split)
dataset_split_train = dataset['train'].train_test_split(test_size=0.01, shuffle=True)
print(dataset_split_train)
tokenized_datasets_test = dataset_split['test'].map(preprocess_function, batched=True, num_proc=4)
tokenized_datasets_train = dataset_split_train['test'].map(preprocess_function, batched=True, num_proc=4)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,                   # Abilita il mascheramento
    mlm_probability=0.15        # Percentuale di token da mascherare
)

#il 20% dei dati viene usato come test e l'80% viene usato come train, per evitare overfitting
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Colab Notebooks/TrainingLog",
    save_strategy="steps",
    eval_strategy="steps",
    save_steps=500,
    eval_steps=500,
    learning_rate=2e-5,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    num_train_epochs=3,
    weight_decay=0.01,
    max_grad_norm=1.0,
    logging_dir="/content/drive/MyDrive/Colab Notebooks/TrainingLog/logs",
    fp16=True,
    gradient_accumulation_steps=4,
    eval_accumulation_steps=24,
    logging_steps=100,
    warmup_steps=1000,
    save_total_limit=2,
    greater_is_better=False,
    load_best_model_at_end=True,
    overwrite_output_dir=True,
    optim="adamw_torch"
)

#metric = evaluate.load("accuracy")
def remove_values_from_list(the_list, val):
   return [value for value in the_list if value != val]

def compute_metrics(pred):

    labels_ids = pred.label_ids
    pred_ids = pred.predictions[0]

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    min_length = min(len(pred_str), len(label_str))
    pred_str = pred_str[:min_length]
    label_str = label_str[:min_length]

    rouge_output = rouge.compute(
        predictions=pred_str,
        references=label_str,
        rouge_types=["rouge1", "rouge2", "rougeL", "rougeLsum"],
    )

    return {
        "R1": round(rouge_output["rouge1"], 4),
        "R2": round(rouge_output["rouge2"], 4),
        "f1": round(rouge_output["rougeL"], 4),
        "RLsum": round(rouge_output["rougeLsum"], 4),
    }

def preprocess_logits_for_metrics(logits, labels):
    pred_ids = torch.argmax(logits, dim=-1)
    return pred_ids, labels


class MyTrainer(Trainer):
    def training_step(self, model, inputs, optimizer): # Aggiungi optimizer
        # Esegui il training step standard
        outputs = super().training_step(model, inputs, optimizer) # Aggiungi optimizer
        # Svuota la cache della GPU
        torch.cuda.empty_cache()
        return outputs


trainer = MyTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets_train,
    eval_dataset=tokenized_datasets_test,
    data_collator=data_collator,
#    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
#    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

trainer.train()

model.save_pretrained("/content/drive/MyDrive/Colab Notebooks/latin-mlm-roberta")
tokenizer.save_pretrained("/content/drive/MyDrive/Colab Notebooks/latin-mlm-roberta")

results_eval = trainer.evaluate()
print(results_eval)



Repo card metadata block was not found. Setting CardData to empty.


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 9325372
    })
    test: Dataset({
        features: ['text'],
        num_rows: 4666
    })
})
DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 9320707
    })
    test: Dataset({
        features: ['text'],
        num_rows: 9331
    })
})


Map (num_proc=4):   0%|          | 0/4666 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/9331 [00:00<?, ? examples/s]

  trainer = MyTrainer(


Epoch,Training Loss,Validation Loss,R1,R2,F1,Rlsum
1,16.2662,3.719595,0.0225,0.0012,0.0216,0.0216


{'eval_loss': 3.7445733547210693, 'eval_R1': 0.0232, 'eval_R2': 0.0012, 'eval_f1': 0.0222, 'eval_RLsum': 0.0223, 'eval_runtime': 37.2019, 'eval_samples_per_second': 125.424, 'eval_steps_per_second': 7.849, 'epoch': 1.0}
