<a href="https://colab.research.google.com/github/Cicciokr/latin-ai-model/blob/main/Fine_Tuning_Transformer_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import streamlit as st
import numpy as np
from datasets import load_dataset, load_from_disk, Dataset, DatasetDict
from transformers import RobertaTokenizerFast, RobertaConfig, RobertaForMaskedLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, DataCollatorForWholeWordMask
from sklearn.model_selection import train_test_split
import torch

# Page heading shown when the script is served through Streamlit.
st.title("Training")

# Optional GPU diagnostics (disabled):
#print(torch.cuda.device_count())
#print(torch.cuda.get_device_name(0))

# Alternative data sources (disabled):
#   plain-text dataset
#dataset = load_dataset('text', data_files='la.txt')
#   parquet dataset
#dataset = load_dataset("Cicciokr/CC-100-Latin", revision="refs/convert/parquet")

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Tokenizer built from the local vocabulary and merges files.
tokenizer = RobertaTokenizerFast(vocab_file="./latinroberta-vocab.json", merges_file="./latinroberta-merges.txt")

# Model architecture; the vocabulary size is taken from the tokenizer.
config = RobertaConfig(
    num_hidden_layers=6,
    num_attention_heads=12,
    hidden_size=768,
    max_position_embeddings=514,  # maximum sequence length
    type_vocab_size=1,
    vocab_size=len(tokenizer),
)

# Show which mask token the tokenizer uses.
print(tokenizer.mask_token)

def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Every sequence is truncated and padded to exactly 512 tokens.

    Args:
        examples: Mapping with a ``'text'`` entry holding the raw text(s).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    texts = examples['text']
    encoded = tokenizer(texts, max_length=512, padding='max_length', truncation=True)
    return encoded

# Instantiate the masked-LM from the config and move it to the selected device.
model = RobertaForMaskedLM(config=config).to(device)

# Earlier data-loading / caching variants (disabled):
#dataset = load_dataset("parquet", data_dir="./parquet", trust_remote_code=True)
#tokenized_dataset = dataset.map(preprocess_function, batched=True, num_proc=4)
#tokenized_dataset.save_to_disk("./dataset/tokenized_dataset")
#tokenized_dataset = load_from_disk("./dataset_light/tokenized_dataset")

# Load the corpus; the records live under the "train" field of the JSON file.
dataset = load_dataset(
    "pstroe/cc100-latin",
    data_files="la.nolorem.tok.latalphabetonly.v2.json",
    field="train",
)

# Draw a small shuffled sample: 0.03% of the rows for training, 0.01% for evaluation.
dataset_split = dataset['train'].train_test_split(
    train_size=0.0003,
    test_size=0.0001,
    shuffle=True,
)
print(dataset_split)

# Apply the tokenization to both splits (batched, 4 worker processes).
tokenized_datasets_test = dataset_split['test'].map(
    preprocess_function, num_proc=4, batched=True
)
tokenized_datasets_train = dataset_split['train'].map(
    preprocess_function, num_proc=4, batched=True
)

# Collator that applies the masked-LM masking, selecting 15% of the tokens.
# NOTE(review): the whole-word collator is presumably written for BERT-style
# "##" word pieces; confirm it behaves as intended with this RoBERTa tokenizer.
data_collator = DataCollatorForWholeWordMask(
    tokenizer=tokenizer,
    mlm_probability=0.15,  # fraction of tokens to mask
    mlm=True,              # enable masking
)

# Training hyper-parameters. The held-out split is evaluated once per epoch so
# that overfitting can be spotted; the split sizes themselves are chosen where
# the dataset is split, not here.
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    # Mixed precision needs a CUDA device. The script already falls back to
    # the CPU when no GPU is present, so only enable fp16 when CUDA is usable
    # instead of failing during argument validation on CPU-only machines.
    fp16=torch.cuda.is_available(),
    gradient_accumulation_steps=2,
    logging_steps=100,
    save_total_limit=2,
    metric_for_best_model="perplexity",
    # Lower perplexity is better; without this the highest value would be
    # tracked as the "best" checkpoint.
    greater_is_better=False,
    optim="adamw_torch"
)

#metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    """Compute masked-LM perplexity from evaluation predictions.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` has the vocabulary
            on its last axis and ``labels`` holds one class id per position;
            positions labelled ``-100`` are excluded from the loss. NumPy
            arrays and torch tensors are both accepted.

    Returns:
        ``{"perplexity": float}`` — the exponential of the mean cross-entropy
        over the non-ignored positions (``nan`` if every position is ignored,
        ``inf`` if the loss is too large to exponentiate).
    """
    logits, labels = eval_pred
    # The evaluation loop may hand over NumPy arrays, which have no
    # tensor-style .view()/.size(); normalise both inputs to tensors first.
    logits = torch.as_tensor(logits, dtype=torch.float32)
    labels = torch.as_tensor(labels).long()
    # Flatten to (positions, vocab) and (positions,) for the loss.
    flat_logits = logits.reshape(-1, logits.shape[-1])
    flat_labels = labels.reshape(-1)
    loss = torch.nn.functional.cross_entropy(
        flat_logits, flat_labels, ignore_index=-100, reduction='mean'
    )
    # Exponentiate in float64 with torch: avoids the missing ``math`` import
    # and yields inf rather than raising on overflow.
    perplexity = torch.exp(loss.double()).item()
    return {"perplexity": perplexity}

# Wire the model, data, collator and metric function into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets_train,
    eval_dataset=tokenized_datasets_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # the comma was missing here (SyntaxError)
    tokenizer=tokenizer,
)

#dataset1 = tokenized_dataset['train'].select(range(0, 50000))
# Run the training loop.
trainer.train()

# Persist the trained weights and the tokenizer side by side so the
# "lat-roberta" directory can be loaded as one unit.
model.save_pretrained("lat-roberta")
tokenizer.save_pretrained("lat-roberta")

# Final evaluation on the held-out split.
results = trainer.evaluate()
print(results)