In [1]:
# !pip install -q --upgrade torch accelerate kernels
# !pip install -q git+https://github.com/huggingface/transformers triton==3.4 git+https://github.com/triton-lang/triton.git@main#subdirectory=python/triton_kernels
# !pip uninstall -q torchvision torchaudio -y

In [2]:
gpt = False

In [3]:
import sys
sys.path.append('../src')
import paths

/home/user/mnlp/notebooks/../src/paths.py


In [4]:
if gpt:
    from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, Mxfp4Config

    model_id = "openai/gpt-oss-20b"

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    config = AutoConfig.from_pretrained(model_id)

    quantization_config=Mxfp4Config.from_dict(config.quantization_config)

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=quantization_config,
        torch_dtype="auto",
        device_map="cuda",
    )

In [5]:
if gpt:
    import pandas as pd
    import dataset
    import importlib
    importlib.reload(dataset)
    import gptoss_sent_split
    importlib.reload(gptoss_sent_split)
    from gptoss_sent_split import BOSConfig, read_token_label_file, build_bos_jobs_by_n_sentences, run_bos_labeling, sentences_from_word_seq

    cfg = BOSConfig(max_new_tokens=256)

    pairs = read_token_label_file(paths.data/"manzoni_train_tokens.csv")
    jobs = build_bos_jobs_by_n_sentences(pairs, tokenizer, cfg)
    y_pred = run_bos_labeling(jobs, model, tokenizer, cfg)

    tokens = [t for (t, _) in pairs]
    gold = [y for (_, y) in pairs]

    # Align lengths, just in case
    n = min(len(tokens), len(y_pred))
    tokens, gold, y_pred = tokens[:n], gold[:n], y_pred[:n]
    sents = sentences_from_word_seq(tokens, y_pred)
    import pickle
    with open(paths.results/'gptpredtrain.pkl', 'wb') as f:
        pickle.dump(y_pred, f)

In [6]:
if gpt:
    import pandas as pd
    import dataset
    import importlib
    importlib.reload(dataset)
    import gptoss_sent_split
    importlib.reload(gptoss_sent_split)
    from gptoss_sent_split import BOSConfig, read_token_label_file, build_bos_jobs_by_n_sentences, run_bos_labeling, sentences_from_word_seq

    cfg = BOSConfig(max_new_tokens=256)

    pairs = read_token_label_file(paths.data/"manzoni_dev_tokens.csv")
    jobs = build_bos_jobs_by_n_sentences(pairs, tokenizer, cfg)
    y_pred = run_bos_labeling(jobs, model, tokenizer, cfg)

    tokens = [t for (t, _) in pairs]
    gold = [y for (_, y) in pairs]

    # Align lengths, just in case
    n = min(len(tokens), len(y_pred))
    tokens, gold, y_pred = tokens[:n], gold[:n], y_pred[:n]
    sents = sentences_from_word_seq(tokens, y_pred)
    import pickle
    with open(paths.results/'gptpredval.pkl', 'wb') as f:
        pickle.dump(y_pred, f)

In [7]:
from gptoss_sent_split import BOSConfig, read_token_label_file, build_bos_jobs_by_n_sentences, run_bos_labeling, sentences_from_word_seq, SPECIAL_MARKER
from minerva_lora import load_tokenizer_and_model

MINERVA7B = "sapienzanlp/Minerva-7B-base-v1.0"
bf16 = True
tokenizer, model = load_tokenizer_and_model(MINERVA7B, qlora=True, use_bf16=bf16)

  from .autonotebook import tqdm as notebook_tqdm


Loading checkpoint shards: 100%|██████████| 3/3 [00:27<00:00,  9.08s/it]
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [8]:
import minerva_lora
import importlib
importlib.reload(minerva_lora)
from minerva_lora import build_examples_from_pairs, make_splits, lora_cfg

pairs = read_token_label_file(paths.data/"manzoni_dev_tokens.csv")
jobs = build_examples_from_pairs(pairs, 5, 1)
ds = make_splits(jobs, 0.1)
ds

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 73
    })
    validation: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 8
    })
})

In [None]:
from transformers import EarlyStoppingCallback   # NEW
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig
import torch
import paths

# --- Training config ---
from transformers import EarlyStoppingCallback, TrainerCallback

class ConsoleLogger(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if not logs: 
            return
        # drop the huge/boring keys
        drop = {"total_flos","train_runtime","train_samples_per_second","train_steps_per_second"}
        clean = {k: v for k, v in logs.items() if k not in drop}
        print(f"[step {state.global_step}/{state.max_steps}] {clean}")

cfg = SFTConfig(
    output_dir=paths.chekpoints/"minerva",
    num_train_epochs=2,
    per_device_train_batch_size=5,
    gradient_accumulation_steps=1,
    learning_rate=2e-4,
    # <- logging every optimizer step
    logging_strategy="steps",
    logging_steps=1,
    logging_first_step=True,
    # <- eval + early stopping
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # make stdout prints instead of only a tqdm bar:
    disable_tqdm=True,
    log_level="info",
    report_to=None,  # or "none"
    gradient_checkpointing=True,
    bf16=True,
    dataset_num_proc=2,
    dataset_kwargs={"prompt_column":"prompt","completion_column":"completion"},
    completion_only_loss=True,
)

peft_config = lora_cfg()

trainer = SFTTrainer(
    model=model,
    peft_config=peft_config,
    train_dataset=ds["train"],
    eval_dataset=ds.get("validation"),
    args=cfg,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.0),
        ConsoleLogger(),
    ],
)
trainer.train()

# Save PEFT adapters + tokenizer
trainer.model.save_pretrained(paths.chekpoints/"minerva")
tokenizer.save_pretrained(paths.chekpoints/"minerva")


Adding EOS to train dataset (num_proc=2): 100%|██████████| 73/73 [00:00<00:00, 215.51 examples/s]
Tokenizing train dataset (num_proc=2): 100%|██████████| 73/73 [00:00<00:00, 117.51 examples/s]
Truncating train dataset (num_proc=2): 100%|██████████| 73/73 [00:00<00:00, 189.94 examples/s]
Adding EOS to eval dataset (num_proc=2): 100%|██████████| 8/8 [00:00<00:00, 22.73 examples/s]
Tokenizing eval dataset (num_proc=2): 100%|██████████| 8/8 [00:00<00:00, 14.11 examples/s]
Truncating eval dataset (num_proc=2): 100%|██████████| 8/8 [00:00<00:00, 21.88 examples/s]


RuntimeError: TensorBoardCallback requires tensorboard to be installed. Either update your PyTorch version or install tensorboardX.

In [None]:
# %% [markdown]
# ### Quick inference check with the 7B LoRA adapter

import transformers, torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from gptoss_sent_split import SPECIAL_MARKER, SYSTEM_PROMPT

base_id = MINERVA7B
adapter_dir = "/home/user/mnlp/checkpoints/minerva"

# 1) Load tokenizer that includes <BOS> (saved during training)
tok = AutoTokenizer.from_pretrained(adapter_dir, use_fast=True)

# 2) Load base model
bnb_cfg = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4",
                             bnb_4bit_use_double_quant=True, bnb_4bit_compute_dtype=torch.bfloat16)
base = AutoModelForCausalLM.from_pretrained(
    base_id,
    quantization_config=bnb_cfg,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# 3) If sizes differ, resize embeddings to tokenizer size (adds the new row)
if base.get_input_embeddings().weight.shape[0] != len(tok):
    base.resize_token_embeddings(len(tok))
    try:
        base.tie_weights()   # safe if the model ties lm_head <-> embeddings
    except Exception:
        pass

# 4) Now load the LoRA adapter
model = PeftModel.from_pretrained(base, adapter_dir)
model.eval()

def generate_bos(text: str, max_new_tokens: int = 2048):
    prompt = (
        "### System\n" + SYSTEM_PROMPT + "\n"
        "### User\n" + text + "\n"
        "### Assistant\n"
    )
    inputs = tok(prompt, return_tensors="pt").to(model.device)
    out = model.generate(**inputs, max_new_tokens=2048, do_sample=False)
    gen = tok.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    return gen



Loading checkpoint shards: 100%|██████████| 3/3 [00:27<00:00,  9.05s/it]


In [None]:
def generate_bos(text: str, max_new_tokens: int = 2048):
    prompt = (
        "### System\n" + SYSTEM_PROMPT + "\n"
        "### User\n" + text + "\n"
        "### Assistant\n"
    )
    inputs = tok(prompt, return_tensors="pt").to(model.device)
    out = model.generate(**inputs, max_new_tokens=2048, do_sample=False)
    gen = tok.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    return gen

sample = 'Ci vuol degli uomini fatti apposta.» Altre volte Renzo si risolveva d\' andar di nascosto "," travestito "," e con un nome finto. Ma anche da questo "," Bortolo seppe svolgerlo ogni volta "," con ragioni troppo facili a indovinarsi. Scoppiata poi la peste nel milanese "," e appunto "," come abbiam detto "," sul confine del bergamasco "," non tardò molto a passarlo; e… non vi sgomentate "," ch\' io non vi voglio raccontar la storia anche di questa: chi la volesse "," la c\' è "," scritta per ordine pubblico da un certo Lorenzo Ghirardelli: libro raro però e sconosciuto "," quantunque contenga forse più roba che tutte insieme le descrizioni più celebri di pestilenze: da tante cose dipende la celebrità de\' libri! Quel ch\' io volevo dire è che Renzo prese anche lui la peste "," si curò da sé "," cioè non fece nulla; ne fu in fin di morte "," ma la sua buona complessione vinse la forza del male: in pochi giorni "," si trovò fuor di pericolo.'
print(generate_bos(sample))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<B>Ci vuol degli uomini fatti apposta.» Altre volte Renzo si risolveva d' andar di nascosto "," travestito "," e con un nome finto. Ma anche da qu<B>esto "," Bortolo seppe svolgerlo ogni volta "," con ragioni troppo facili a indovinarsi. Scoppiata poi la peste<B> nel milanese "," e appunto "," come abbiam detto "," sul confine del bergamasco "," non tardò molto a passarlo; e... non vi sgomentate "," ch' io non vi voglio raccontar la storia anche di questa: chi la
