<a href="https://colab.research.google.com/github/Cicciokr/latin-ai-model/blob/main/Fine_Tuning_Transformer_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets
#!pip install evaluate
#!pip install rouge_score
!pip install wandb
!pip install transformers==4.35.2
#!pip install datasets==2.15.0
#!pip install evaluate==0.4.1
!pip install accelerate==0.25.0
#!pip install tqdm==4.66.1
!pip install "numpy<2.0"


import pandas as pd
import numpy as np
from datasets import load_dataset, Dataset, DatasetDict
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling, AutoTokenizer, RobertaTokenizerFast, XLMRobertaTokenizerFast, XLMRobertaForMaskedLM, RobertaForMaskedLM, TrainerCallback, EarlyStoppingCallback, get_scheduler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
#from numba import cuda
import math
import torch
import wandb
import os
from accelerate import notebook_launcher

os.environ["WANDB_PROJECT"] = "Training-Perseus"
# SECURITY: never commit the Weights & Biases API key to source control.
# The key is read from the WANDB_API_KEY environment variable (set it through
# the notebook platform's secrets mechanism). When the variable is unset,
# key=None lets wandb.login() fall back to its own credential lookup/prompt.
# The key that used to be hard-coded on this line should be revoked.
wandb.login(key=os.environ.get("WANDB_API_KEY"))
wandb.init()
#import evaluate
#rouge = evaluate.load('rouge')

# plain-text dataset
#dataset = load_dataset('text', data_files='la.txt')
# parquet dataset
#dataset = load_dataset("Cicciokr/CC-100-Latin", revision="refs/convert/parquet")
# Alternative base checkpoints that were tried:
#MODEL_NAME = "ClassCat/roberta-base-latin-v2"
#MODEL_NAME = "FacebookAI/xlm-roberta-base"
# Checkpoint loaded by notebook_train_launch() for both tokenizer and model.
MODEL_NAME = "Cicciokr/XLM-Roberta-Base-Latin-Uncased"
#device_cuda = cuda.get_current_device()
#device_cuda.reset()

#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#torch.cuda.set_per_process_memory_fraction(0.95)
#torch.cuda.empty_cache()


def notebook_train_launch():
    """Fine-tune the MODEL_NAME masked-language model on the Perseus text corpus.

    Meant to be executed by ``notebook_launcher`` in every worker process.
    The function:

    1. loads the tokenizer and the masked-LM model named by ``MODEL_NAME``;
    2. reads the corpus text file and splits it 80% train / 20% evaluation;
    3. tokenizes both splits to fixed-length (512 token) sequences;
    4. trains with ``Trainer`` using a 15% masking collator, evaluating and
       checkpointing every 200 steps with early stopping on ``eval_loss``;
    5. saves the final model and tokenizer (main process only) and closes
       the wandb run.

    Takes no arguments and returns ``None``; all paths and hyper-parameters
    are fixed in the body.
    """
    max_length = 512
    # Fixed seed so the train/eval split is reproducible and is guaranteed to
    # be the same in every worker process (otherwise evaluation rows of one
    # process could be training rows of another).
    split_seed = 42

    tokenizer = XLMRobertaTokenizerFast.from_pretrained(MODEL_NAME, do_lower_case=True, model_max_length=max_length)
    # No device_map here: device_map='auto' shards a single model across all
    # visible GPUs, which conflicts with the one-replica-per-process layout
    # set up by notebook_launcher(..., num_processes=2). Trainer moves the
    # model to the device of the current process on its own.
    model = XLMRobertaForMaskedLM.from_pretrained(MODEL_NAME)
    model.gradient_checkpointing_enable()

    def preprocess_function(examples):
        """Tokenize a batch of raw text rows into padded/truncated id lists."""
        # Plain Python lists are returned; set_format("torch") below converts
        # them to tensors when rows are read, so return_tensors is not needed.
        inputs = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=max_length)
        return {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
        }

    dataset = load_dataset("text", data_files="/kaggle/input/the-latin-library/perseus_complete_cleaned.txt")
    print(dataset)

    # Shuffle, then keep 80% for training and 20% for evaluation.
    # (The sizes were previously swapped, training on only 20% of the corpus.)
    dataset_split = dataset['train'].train_test_split(
        test_size=0.2,
        train_size=0.8,
        shuffle=True,
        seed=split_seed,
    )

    # Apply the tokenization; the raw "text" column is dropped in the same
    # pass because the collator only needs input_ids / attention_mask.
    tokenized_datasets_test = dataset_split['test'].map(
        preprocess_function, batched=True, num_proc=4, remove_columns=["text"]
    )
    tokenized_datasets_train = dataset_split['train'].map(
        preprocess_function, batched=True, num_proc=4, remove_columns=["text"]
    )
    tokenized_datasets_test.set_format("torch")
    tokenized_datasets_train.set_format("torch")

    print(tokenized_datasets_train)
    print(tokenized_datasets_test)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=True,                   # enable token masking
        mlm_probability=0.15        # fraction of tokens to mask
    )

    # 20% of the data is used for evaluation and 80% for training; the
    # held-out loss drives checkpoint selection and early stopping.
    training_args = TrainingArguments(
        output_dir="/kaggle/working/TrainingLog",
        run_name="latinroberta",
        save_strategy="steps",
        evaluation_strategy="steps",
        save_steps=200,
        eval_steps=200,
        learning_rate=2e-5,
        auto_find_batch_size=True,
        num_train_epochs=3,
        weight_decay=0.05,
        max_grad_norm=1.0,
        logging_dir="/kaggle/working/TrainingLog/logs",
        fp16=True,
        gradient_checkpointing=True,
        gradient_accumulation_steps=8,
        eval_accumulation_steps=16,
        logging_steps=200,
        warmup_steps=200,
        save_total_limit=2,
        greater_is_better=False,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        overwrite_output_dir=True,
        report_to="wandb",
        optim="adamw_torch",
        lr_scheduler_type="linear",
        save_safetensors=False
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets_train,
        eval_dataset=tokenized_datasets_test,
        data_collator=data_collator,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
        tokenizer=tokenizer
    )

    trainer.train()

    # Only the main process writes the final artifacts; otherwise every
    # worker would write the same files into the same directory at once.
    if trainer.is_world_process_zero():
        model.save_pretrained("/kaggle/working/xlmlatinroberta")
        tokenizer.save_pretrained("/kaggle/working/xlmlatinroberta")

    wandb.finish()

# Run the training entry point in two worker processes
# (presumably one per available GPU — confirm against the runtime).
notebook_launcher(
    notebook_train_launch,
    num_processes=2,
)