<a href="https://colab.research.google.com/github/24p11/recode-icd/blob/main/final_finetuning_different_backbone.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Environment setup (disabled): uncomment these lines to install the notebook's
# dependencies. Note that `gliner` and `transformers` are version-pinned while
# `accelerate` is simply upgraded to the latest release.
# !pip install gliner==0.2.21
# !pip install accelerate -U
# !pip install transformers==4.48.0

In [None]:
# Hugging Face access token consumed by `login(token=HF_TOKEN)` in the upload cell.
# It is read from the HF_TOKEN environment variable so the real secret never has
# to be pasted into (and saved with) the notebook. "xxx" is the original
# placeholder and is kept only as the fallback when the variable is unset.
import os

HF_TOKEN = os.environ.get("HF_TOKEN", "xxx")

In [None]:
import os
# Set before `transformers` is imported further down.
# NOTE(review): presumably toggles thread parallelism in the Hugging Face
# tokenizers library — confirm "true" is the intended value for this runtime.
os.environ["TOKENIZERS_PARALLELISM"] = "true"
import argparse
import random
# Seed Python's global RNG once, up front, for reproducibility.
random.seed(42)
import json

from transformers import AutoTokenizer
import torch

from gliner import GLiNERConfig, GLiNER
from gliner.training import Trainer, TrainingArguments
from gliner.data_processing.collator import DataCollatorWithPadding, DataCollator
from gliner.utils import load_config_as_namespace
from gliner.data_processing import WordsSplitter, GLiNERDataset

from transformers import TrainerCallback

In [None]:
# Optional trainer callback (currently disabled).
# When enabled it records, per logging event, (global_step, loss) and
# (global_step, grad_norm) pairs, and appends the "eval_loss" of every
# evaluation to a list.
# To enable it, uncomment this class together with the
# `collector = MetricCollector()` and `callbacks=[collector]` lines in the
# cell that builds the Trainer.
# class MetricCollector(TrainerCallback):
#     def __init__(self):
#         self.train_losses = []
#         self.grad_norms = []
#         self.eval_losses = []

#     def on_log(self, args, state, control, logs=None, **kwargs):
#         if logs:
#             if "loss" in logs:
#                 self.train_losses.append((state.global_step, float(logs["loss"])))
#             if "grad_norm" in logs:
#                 self.grad_norms.append((state.global_step, float(logs["grad_norm"])))

#     def on_evaluate(self, args, state, control, metrics=None, **kwargs):
#         if metrics is not None:
#             if "eval_loss" in metrics:
#                 self.eval_losses.append(metrics["eval_loss"])

In [None]:
# Load the NER dataset from Google Drive and build a 90/10 train/dev split.
from google.colab import drive
drive.mount('/content/drive')

data_path = "/content/drive/MyDrive/Scenario/modern_data_1000_ner.json"

print ("Loading data...")
with open(data_path, 'r', encoding="utf-8") as f:
    data = json.load(f)
print ("Data loaded!!!")

print('Dataset size:', len(data))
print ("Shuffling data...")
# Shuffle with a dedicated generator seeded right here instead of the global
# `random` state: the global RNG advances on every call, so re-running this
# cell used to yield a different train/dev split each time. A fresh
# Random(42) gives the same order on every run (and the same order as a
# global RNG seeded with 42 that has not been consumed since seeding).
random.Random(42).shuffle(data)
print ("Data shuffled!!!")

print ("Dividing data...")
# Compute the boundary once so both slices are guaranteed to use the same index.
split_index = int(len(data)*0.9)
train_data = data[:split_index]
dev_data = data[split_index:]
print ("Data divided!!!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading data...
Data loaded!!!
Dataset size: 995
Shuffling data...
Data shuffled!!!
Dividing data...
Data divided!!!


In [None]:
# --- Run configuration -------------------------------------------------------
config = load_config_as_namespace("config-different-backbone.yaml")
config.log_dir = "gliner_modernbert/"
# Quick smoke-test overrides (disabled):
# config.num_steps = 4
# config.train_batch_size = 4
# config.eval_every = 2
# The output directory name encodes the main hyper-parameters of the run.
config.output_dir = (
    f"{config.log_dir}"
    f"lossgamma{config.loss_gamma}_lrencoder{config.lr_encoder}_lrothers{config.lr_others}"
)

# --- Model, tokenizer and word splitter --------------------------------------
model_config = GLiNERConfig(**vars(config))
tokenizer = AutoTokenizer.from_pretrained(model_config.model_name, add_prefix_space=True)
words_splitter = WordsSplitter(model_config.words_splitter_type)
model = GLiNER(model_config, tokenizer=tokenizer, words_splitter=words_splitter)

# --- Register the entity / separator marker tokens ---------------------------
# The class-token index is taken from the tokenizer length BEFORE the two
# marker tokens are added; the vocabulary size is read AFTER adding them.
marker_tokens = (model_config.ent_token, model_config.sep_token)
model_config.class_token_index = len(tokenizer)
tokenizer.add_tokens(list(marker_tokens), special_tokens=True)
model_config.vocab_size = len(tokenizer)
# The tokenizer was already extended above, so the model is told not to add
# the tokens again nor to reset the class-token index.
model.resize_token_embeddings(
    list(marker_tokens),
    set_class_token_index=False,
    add_tokens_to_tokenizer=False,
)

# Enable gradients on the wrapped `bert_layer.model` module
# (presumably the encoder backbone — confirm against the GLiNER model layout).
model.model.token_rep_layer.bert_layer.model.requires_grad_(True)

# Collator that batches raw examples with the model's own data processor and
# prepares labels as part of collation.
data_collator = DataCollator(
    model.config,
    data_processor=model.data_processor,
    prepare_labels=True,
)

In [None]:
# Training hyper-parameters. Everything tunable comes from `config`; the
# encoder and the remaining ("others") parameters get separate learning rates
# and weight decays.
training_args = TrainingArguments(
    output_dir=config.output_dir,
    # Optimisation
    learning_rate=float(config.lr_encoder),
    weight_decay=float(config.weight_decay_encoder),
    others_lr=float(config.lr_others),
    others_weight_decay=float(config.weight_decay_others),
    focal_loss_gamma=float(config.loss_gamma),
    focal_loss_alpha=float(config.loss_alpha),
    loss_reduction=config.loss_reduction,
    lr_scheduler_type=config.scheduler_type,
    warmup_ratio=config.warmup_ratio,
    per_device_train_batch_size=config.train_batch_size,
    # Evaluation uses twice the training batch size.
    per_device_eval_batch_size=config.train_batch_size*2,
    max_grad_norm=config.max_grad_norm,
    max_steps=config.num_steps,
    # Evaluate and checkpoint on the same step cadence.
    eval_strategy="steps",
    eval_steps=config.eval_every,
    save_strategy="steps",
    save_steps=config.eval_every,
    logging_strategy="steps",
    logging_steps=100,
    # logging_steps=1,
    save_total_limit=config.save_total_limit,
    dataloader_num_workers=0,
    use_cpu=False,
    report_to="tensorboard",
    seed=42,
    eval_do_concat_batches=True,
    # Run one evaluation before the first training step to get a baseline.
    eval_on_start=True,
    save_only_model=True,
)

# collector = MetricCollector()

# `tokenizer=` is deprecated for `Trainer.__init__` (it emits a FutureWarning
# and is scheduled for removal in transformers v5); `processing_class=` is the
# replacement and receives the same tokenizer object.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=dev_data,
    processing_class=tokenizer,
    data_collator=data_collator,
    # callbacks=[collector],
)

  trainer = Trainer(


In [None]:
# Run the fine-tuning loop on the trainer built above; the two banners
# bracket the (long-running) call.
print("Training...")
trainer.train()
print("Done training!!!")

Training...


Step,Training Loss,Validation Loss
0,No log,11686057.0
4000,352.206800,1035.311646
8000,89.007600,2783.84082
12000,30.985600,4544.87207


KeyboardInterrupt: 

In [None]:
# Cleanup helper (disabled): uncomment to delete the local `gliner_modernbert`
# output directory, including any checkpoints saved under it.
# !rm -rf gliner_modernbert

In [None]:
# Export the training curves to <output_dir>/metrics.json.
#
# The values are rebuilt from `trainer.state.log_history` (the list of log
# dicts the Trainer keeps, each tagged with its "step") instead of from
# `collector`: that name is only defined when the MetricCollector callback is
# enabled, so with the callback commented out this cell raised NameError.
# The JSON layout is unchanged:
#   train_losses / grad_norms -> list of [step, value] pairs
#   eval_losses               -> list of eval-loss values
log_history = trainer.state.log_history
metrics = {
    "train_losses": [
        (entry.get("step"), float(entry["loss"]))
        for entry in log_history
        if "loss" in entry
    ],
    "grad_norms": [
        (entry.get("step"), float(entry["grad_norm"]))
        for entry in log_history
        if "grad_norm" in entry
    ],
    "eval_losses": [
        entry["eval_loss"]
        for entry in log_history
        if "eval_loss" in entry
    ],
}

# The directory may not exist yet if training stopped before the first checkpoint.
os.makedirs(config.output_dir, exist_ok=True)
with open(f"{config.output_dir}/metrics.json", "w", encoding="utf-8") as f:
    json.dump(metrics, f)

In [None]:
# Recursively bundle the `gliner_modernbert` output directory into
# gliner_modernbert.zip, the archive that the upload cell pushes to the Hub.
!zip -r gliner_modernbert.zip gliner_modernbert

  adding: gliner_modernbert/ (stored 0%)
  adding: gliner_modernbert/lossgamma0.0_lrencoder1e-5_lrothers5e-5/ (stored 0%)
  adding: gliner_modernbert/lossgamma0.0_lrencoder1e-5_lrothers5e-5/metrics.json (deflated 61%)
  adding: gliner_modernbert/lossgamma0.0_lrencoder1e-5_lrothers5e-5/checkpoint-8000/ (stored 0%)
  adding: gliner_modernbert/lossgamma0.0_lrencoder1e-5_lrothers5e-5/checkpoint-8000/tokenizer.json (deflated 82%)
  adding: gliner_modernbert/lossgamma0.0_lrencoder1e-5_lrothers5e-5/checkpoint-8000/tokenizer_config.json (deflated 95%)
  adding: gliner_modernbert/lossgamma0.0_lrencoder1e-5_lrothers5e-5/checkpoint-8000/gliner_config.json (deflated 64%)
  adding: gliner_modernbert/lossgamma0.0_lrencoder1e-5_lrothers5e-5/checkpoint-8000/special_tokens_map.json (deflated 79%)
  adding: gliner_modernbert/lossgamma0.0_lrencoder1e-5_lrothers5e-5/checkpoint-8000/pytorch_model.bin (deflated 7%)
  adding: gliner_modernbert/lossgamma0.0_lrencoder1e-5_lrothers5e-5/checkpoint-8000/trainer_s

In [None]:
# Authenticate against the Hugging Face Hub with HF_TOKEN, then push the
# zipped run to the model repository under the same file name.
from huggingface_hub import HfApi, login

login(token=HF_TOKEN)
api = HfApi()

upload_kwargs = dict(
    path_or_fileobj="gliner_modernbert.zip",
    path_in_repo="gliner_modernbert.zip",
    repo_id="anhthuw01/gliner_onco",
    repo_type="model",
)
api.upload_file(**upload_kwargs)