# FineTune first task

In [1]:
!pip install gliner==0.1.12
!pip install omegaconf
!pip install boto3 --upgrade


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting boto3
  Downloading boto3-1.34.148-py3-none-any.whl.metadata (6.6 kB)
Collecting botocore<1.35.0,>=1.34.148 (from boto3)
  Downloading botocore-1.34.148-py3-none-any.whl.metadata (5.7 kB)
Downloading boto3-1.34.148-py3-none-any.whl (139 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.2/139.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading botocore-1.34.148-py3-none-any.whl (12.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
# utils
import copy
import random
import os
from omegaconf import OmegaConf
from matplotlib import pyplot as plt

# GLiNER: convenient high-level API for the model
from gliner import GLiNER
from construct_dataset import construct_dataset, get_span_tags

# training utilities
import torch
from tqdm import tqdm
from transformers import get_cosine_schedule_with_warmup

%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Custom resolver: "${switcher: cond, a, b}" evaluates to `a` when `cond` is truthy,
# otherwise to `b`. replace=True allows this cell to be re-run in the same kernel.
OmegaConf.register_new_resolver('switcher', lambda swtch, a, b: a if swtch else b, replace=True)

# setup config
config = OmegaConf.create({
    # global constraints
    'seed': 42,
    'use_large_context': False,
    # model name and context length both follow `use_large_context`
    'model': "${switcher: ${use_large_context}, 'numind/NuNerZero_long_context', 'numind/NuNerZero'}",
    'model_context_len': "${switcher: ${use_large_context}, 2048, 384}",
    'train_path': '/kaggle/input/unified-context-analysis/swda_parsed.json',
    # str -> path to a separate test file; float -> fraction of the train set held out for testing
    'test_path': 0.1,
    'max_labels': 10,

    # training config
    'log_dir': './logs',
    'device': 'cuda',
    # < 1 -> fraction of the total number of steps; otherwise an absolute number of warmup steps
    'warmup_ratio': 0.1,
    'epoches': 100,
    # evaluate and save a checkpoint every `trace_on_each` epochs
    'trace_on_each': 10,
    'train_batch_size': 2,

    # train instances
    'lr_encoder': 1e-5,
    'lr_others': 5e-5,
    'freeze_token_rep': False,
    'shuffle_types': True,
    'random_drop': True,
    'max_types': 3,
    'max_neg_type_ratio': 1,
})

# Create the log directory (including missing parents); a no-op when it already exists.
os.makedirs(config.log_dir, exist_ok=True)

print(OmegaConf.to_yaml(config, resolve=True))

use_large_context: false
model: numind/NuNerZero
model_context_len: 384
train_path: ../datasets/sciERC/scierc_train.json
test_path: ../datasets/sciERC/scierc_test.json
log_dir: ./logs
device: cuda
warmup_ratio: 0.1
epoches: 50
trace_on_each: 5
train_batch_size: 2
lr_encoder: 1.0e-05
lr_others: 5.0e-05
freeze_token_rep: false
shuffle_types: true
random_drop: true
max_types: 1
max_neg_type_ratio: 1



In [4]:
# change config if you wish here :)
# config.model = 'urchade/gliner_multi-v2.1'
# config.model_context_len = 384

In [4]:
# Load the pretrained GLiNER model identified by config.model
model = GLiNER.from_pretrained(config.model)



In [5]:
# wrapper over model tokenizer
token_parser_func = lambda s: model.token_splitter(s)
dataset_train = construct_dataset(
    config.train_path, 
    config.model_context_len, 
    token_parser_func, 
    max_labels=config.max_labels, 
    verbose=True
)

if isinstance(config.test_path, str):
    dataset_test  = construct_dataset(config.test_path, config.model_context_len, token_parser_func, verbose=True)
    span_tags = get_span_tags(config.test_path)
else:
    # shuffle train dataset for train construction
    random.seed(config.seed)
    random.shuffle(dataset_train)
    
    # get amount of objects to train dataset
    fraction = config.test_path if isinstance(config.test_path, float) else 0.2
    test_size = int(len(dataset_train) * fraction)
    
    # splitting dataset on train and test
    dataset_train, dataset_test = dataset_train[:-test_size], dataset_train[-test_size:]
    span_tags = get_span_tags(config.train_path)

dataset proccessing: 100%|██████████| 350/350 [00:00<00:00, 4773.99it/s]
dataset proccessing: 100%|██████████| 350/350 [00:00<00:00, 4765.22it/s]


In [None]:
def train(
    model: torch.nn.Module,
    train_dt: list,
    eval_dt: dict | None = None,
):
    """Fine-tune `model` on `train_dt`, periodically evaluating and saving checkpoints.

    All hyperparameters are read from the notebook-level `config` (device, sampling
    parameters, batch size, learning rates, number of epochs, warmup, log directory).

    Args:
        model: model to train; it is moved to `config.device` and must provide
            `set_sampling_params`, `create_dataloader`, `get_optimizer`, `evaluate`
            and `save_pretrained`.
        train_dt: training samples, passed as-is to `model.create_dataloader`.
        eval_dt: optional dict with keys "samples" and "entity_types". When given,
            every `config.trace_on_each` epochs the model is evaluated on it and a
            checkpoint is written to `{config.log_dir}/finetuned_{epoch}`.

    Returns:
        None. Progress is reported through the tqdm bar description and `print`.
    """
    model = model.to(config.device)
    model.train()

    # Set sampling parameters from config
    model.set_sampling_params(
        max_types=config.max_types,
        shuffle_types=config.shuffle_types,
        random_drop=config.random_drop,
        max_neg_type_ratio=config.max_neg_type_ratio,
        max_len=config.model_context_len
    )

    # Create dataloader instance
    train_loader = model.create_dataloader(train_dt, batch_size=config.train_batch_size, shuffle=True)
    num_steps = config.epoches * len(train_loader)

    # Create optimizer instance
    optimizer = model.get_optimizer(config.lr_encoder, config.lr_others, config.freeze_token_rep)

    # Create learning scheduler: warmup_ratio < 1 is a fraction of all steps,
    # anything else is taken as an absolute number of warmup steps
    if config.warmup_ratio < 1:
        num_warmup_steps = int(num_steps * config.warmup_ratio)
    else:
        num_warmup_steps = int(config.warmup_ratio)

    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_steps
    )

    tqdm_display = tqdm(range(config.epoches))
    for epoch in tqdm_display:
        accum_loss = 0.0
        # Number of batches that actually contributed to accum_loss (NaN batches are skipped)
        valid_steps = 0

        for x in train_loader:
            # Work on a copy so the batch object yielded by the loader is left untouched
            x_copy = copy.deepcopy(x)
            for k, v in x_copy.items():
                if isinstance(v, torch.Tensor):
                    x_copy[k] = v.to(config.device)

            loss = model(x_copy)  # Forward pass

            # Skip the update entirely when the loss is NaN
            if torch.isnan(loss):
                continue

            optimizer.zero_grad()  # Reset gradients
            loss.backward()        # Compute gradients
            optimizer.step()       # Update parameters
            scheduler.step()       # Update learning rate schedule

            # adding batch_loss on step
            accum_loss += loss.item()
            valid_steps += 1

        # Average over the batches that were really used; dividing by len(train_loader)
        # would understate the loss when batches were skipped and fail on an empty loader.
        mean_loss = accum_loss / valid_steps if valid_steps else float("nan")
        description = f"epoch: {epoch} loss: {mean_loss:.2f}"
        tqdm_display.set_description(description)

        if (epoch + 1) % config.trace_on_each == 0 and eval_dt is not None:
            model.eval()

            # The second value returned by evaluate() is not used here
            results, _ = model.evaluate(eval_dt["samples"], flat_ner=True, threshold=0.5, batch_size=12,
                                        entity_types=eval_dt["entity_types"])
            print(f"Epoch={epoch}\n{results}")

            model.save_pretrained(f"{config.log_dir}/finetuned_{epoch}")

            # Back to training mode for the next epoch
            model.train()


In [None]:
# Bundle the label set and the held-out samples under the keys train() reads for evaluation
eval_ds = dict(entity_types=span_tags, samples=dataset_test)

train(model, dataset_train, eval_ds)

In [74]:
# Hand-written sample utterances used for a quick manual check of the model
lines = [
    'Artem Artem Fedorov, hellow, how Artem Fedorov Artem Fedorov are you, Artem Fedorov',
    'Im fine bro whats the matter',
    'oh yeah man'
]

# Labels passed to the wrapper together with `lines` (presumably the entity types to extract)
labels = ['name']

In [75]:
# NOTE(review): this import sits mid-notebook; consider moving it to the imports cell
from model_wrapping import wrap_unifed_dataset
# Apply the model to `lines` with `labels`; the returned structure is displayed as the cell output
wrap_unifed_dataset(model, labels, lines)

{'assessors': ['model'],
 'dataset': {'span_tags': ['name'],
  'relation_tags': [],
  'markups': [{'assessor': 0,
    'text': 'Artem Artem Fedorov, hellow, how Artem Fedorov Artem Fedorov are you, Artem Fedorov',
    'relations': [],
    'spans': [{'begin': 0, 'end': 19, 'id': 0, 'tags': [0]},
     {'begin': 33, 'end': 60, 'id': 1, 'tags': [0]},
     {'begin': 70, 'end': 83, 'id': 2, 'tags': [0]}]},
   {'assessor': 0,
    'text': 'Im fine bro whats the matter',
    'relations': [],
    'spans': [{'begin': 8, 'end': 11, 'id': 0, 'tags': [0]}]},
   {'assessor': 0,
    'text': 'oh yeah man',
    'relations': [],
    'spans': [{'begin': 8, 'end': 11, 'id': 0, 'tags': [0]}]}]}}