# Дообучение предобученной MLM-модели

In [None]:
import os
import sys
import json
import random
import warnings
import torch

sys.path.append("../../..")

from tqdm import tqdm
from torch import nn
from torch.utils.data import DataLoader
from transformers import BertTokenizerFast, BertModel, get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup

import local_trainer.utils as utils
import numpy as np
import pandas as pd
import mlflow
import mlflow.pytorch
from datetime import datetime
from local_trainer.func import get_metrics, ClassificationTrainer, HFDataset, HFModel, ManualScheduler, HFCollator, DefaultTextProcessing, DummyAugmentation
from transformers import AutoModel, AutoTokenizer, RobertaForSequenceClassification, RobertaModel

# Suppress every Python warning for the rest of the session (this also hides
# deprecation notices, so disable it when debugging library issues).
warnings.filterwarnings(action="ignore")

# Ask the HF tokenizers backend not to use its own thread pool.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

# Permit TF32 math in both CUDA matmuls and cuDNN kernels.
for tf32_backend in (torch.backends.cuda.matmul, torch.backends.cudnn):
    tf32_backend.allow_tf32 = True

In [5]:
# Fixed seed for the whole run, passed to the project's seeding helper
# (presumably seeds python/numpy/torch RNGs; see local_trainer.utils).
SEED = 1337
utils.set_seed(SEED)

In [None]:
# Resolve checkpoint locations up front. os.environ[...] raises a KeyError that
# names the missing variable, whereas os.getenv() would silently hand None to
# from_pretrained() and fail with a much less obvious error.
roberta_path = os.environ["ROBERTA_PATH"]
tokenizer_path = os.environ["TOKENIZER_PATH"]

# Backbone encoder; hidden states of all layers are requested from the model.
roberta = RobertaModel.from_pretrained(roberta_path, output_hidden_states=True)
# Over-long inputs are truncated from the left, i.e. the end of the text is kept.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, truncation_side='left')

# Keyword arguments forwarded to the tokenizer through run_config["dataset"].
tokenizer_args = {
    'padding': True,
    'truncation': True,
    'max_length': 256,
}

Some weights of the model checkpoint at ../../../mnt/datastore/u_m25hx/distillation_dir were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ../../../mnt/datastore/u_m25hx/distillation_dir and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRA

In [None]:
# Templates for the MLflow experiment and run names.
EXPERIMENT_NAME_TEMPLATE: str = "{model}"
RUN_NAME_TEMPLATE: str = "{date}"

# Kept under its own name so that re-running the cell never formats an
# already-formatted experiment name.
base_experiment_name: str = "context_jaicp_model"

# %H (zero-padded 24-hour clock) is a standard strftime directive; the previous
# %k is a platform-specific extension that is rejected on some platforms and
# space-pads single-digit hours.
current_date: str = datetime.now().strftime("%Y-%m-%d %H:%M")

experiment_name: str = EXPERIMENT_NAME_TEMPLATE.format(model=base_experiment_name)
run_name: str = RUN_NAME_TEMPLATE.format(date=current_date)

# Select the experiment; left as the last expression so the cell displays it.
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='s3://mlflow/953', creation_time=1720979156829, experiment_id='953', last_update_time=1720979156829, lifecycle_stage='active', name='context_jaicp_model', tags={}>

In [None]:
def _metric_spec(value, **args):
    """Build one entry of run_config["metrics"].

    Args:
        value: metric name handed to get_metrics() (presumably resolved to a
            scikit-learn metric function; confirm in local_trainer.func).
        **args: extra keyword arguments stored under "args" for that metric.

    Returns:
        dict with the keys "value", "output_key", "target_key" and "args".
    """
    return {
        "value": value,
        "output_key": "output",
        "target_key": "target",
        "args": args,
    }


# Single source of truth for the run: consumed by the dataset, dataloader,
# metric, model, optimizer and trainer cells below.
run_config = {
    "exp_name": experiment_name,
    "run_name": run_name,
    # Same CUDA/CPU fallback as the SWA cell, so the trainer and the averaged
    # model always agree on the device instead of hard-coding "cuda:0".
    "device": "cuda:0" if torch.cuda.is_available() else "cpu",
    # Keyword arguments for utils.get_datasets / HFDataset.
    "dataset": {
        "inference": False,
        "aug": DummyAugmentation(),
        "aug_type": "add_symbols",
        "tokenizer": tokenizer,
        "tokenizer_args": tokenizer_args,
        "tokenizer_output_keys": tokenizer.model_input_names,
        "text_preprocessing": DefaultTextProcessing()
    },
    # Keyword arguments for utils.get_loaders.
    "dataloader": {
        "batch_size": 32,
        "sequence_bucketing": True,
        "num_buckets": 10,
        "collate_fn": HFCollator(
            model_input_names=tokenizer.model_input_names,
            pad_token_id=tokenizer.pad_token_id,
            inference=False,
        ),
        "pin_memory": True,
        "num_workers": 8
    },
    # Metric name -> specification understood by get_metrics().
    "metrics": {
        "acc": _metric_spec("accuracy_score"),
        "weighted_f0.5": _metric_spec("fbeta_score", average="weighted", beta=0.5),
        "f1": _metric_spec("f1_score", average="macro"),
    },
    # Metric the trainer treats as the primary one; must be a key of "metrics".
    "main_metric": "weighted_f0.5",
    # Keyword arguments for HFModel (semantics of the string options are
    # defined in local_trainer.func).
    "model": {
        "avg": "mean-attention",
        "backbone": roberta,
        "init_": nn.init.xavier_uniform_,
        "dropout_after_backbone": 0.5,
        "attn_avg": "default_attn_avg",
        'proj_pair': "default_proj_pair",
        'out_layers': "default_layers",
        'layer_wise_token_pooling': 4
    },
    # Name of the torch.optim class used by the optimizer factory.
    "optimizer": {
        "name": "AdamW"
    }
}

In [None]:
# os.environ[...] fails immediately with the variable name when the data path
# is not configured; os.getenv() would pass None on to the loader instead.
data_path = os.environ["DATA_PATH_DEFAULT"]

# Read the train/valid splits and wrap them into HFDataset objects configured
# by run_config["dataset"].
dfs = utils.load_dataframes(data_path, parts=["train", "valid"])
datasets = utils.get_datasets(dfs, HFDataset, **run_config["dataset"])

train info:
	shape: (356303, 5)
	nintents: 293
valid info:
	shape: (62877, 5)
	nintents: 293


100%|██████████| 356303/356303 [09:10<00:00, 647.36it/s]


Map:   0%|          | 0/356303 [00:00<?, ? examples/s]

100%|██████████| 62877/62877 [01:42<00:00, 613.29it/s]


Map:   0%|          | 0/62877 [00:00<?, ? examples/s]

In [10]:
# Build the data loaders (indexed by split name further down) from the
# datasets, using the options declared in run_config["dataloader"].
dataloader_kwargs = run_config["dataloader"]
loaders = utils.get_loaders(datasets, **dataloader_kwargs)

In [11]:
import math

# Training-loop hyperparameters; all three are handed to the trainer, and
# EPOCHS / accumulation_steps also size the LR schedule.
EPOCHS = 10
accumulation_steps = 1
grad_norm = 1.0

# Metric callables built from the declarative specs in run_config.
metrics = get_metrics(run_config["metrics"])

# The classifier head is sized by the number of intents in the training set.
num_labels = len(datasets["train"].intents)
model = HFModel(label_size=num_labels, **run_config['model'])

In [None]:
def roberta_base_AdamW_grouped_LLRD(model, run_config):
    """Create an optimizer with grouped layer-wise learning-rate decay (LLRD).

    Every named parameter of ``model`` gets its own parameter group:

    * names starting with ``BACKBONE.embeddings`` / ``BACKBONE.encoder`` get
      ``init_lr * mu**k``, where ``k`` is 1 for encoder layers 9-11, 2 for
      layers 6-8, 3 for layers 3-5 and 4 for everything else in the backbone
      (embeddings and layers 0-2);
    * names starting with ``proj``, ``BACKBONE.pooler``, ``OUT`` or ``ATTN``
      get the undecayed ``init_lr``;
    * any other parameter is printed (so that unexpected names are noticed)
      and also trained with ``init_lr``.

    Parameters whose name contains ``bias``, ``LayerNorm.bias`` or
    ``LayerNorm.weight`` use weight decay 0.0, all others 0.01.

    Args:
        model: module whose ``named_parameters()`` are optimized.
        run_config: mapping; ``run_config["optimizer"]["name"]`` is the name of
            the ``torch.optim`` class to instantiate.

    Returns:
        The instantiated optimizer with one parameter group per parameter.

    Raises:
        AttributeError: if ``torch.optim`` has no class with the given name.
    """
    init_lr = 3e-5
    mu = 0.95
    no_decay = ("bias", "LayerNorm.bias", "LayerNorm.weight")

    # (markers, number of decay steps). The trailing dot keeps e.g. "layer.3."
    # from also matching "layer.30." in deeper backbones.
    layer_groups = (
        (("layer.9.", "layer.10.", "layer.11."), 1),
        (("layer.6.", "layer.7.", "layer.8."), 2),
        (("layer.3.", "layer.4.", "layer.5."), 3),
    )
    default_backbone_steps = 4  # embeddings and encoder layers 0-2
    backbone_prefixes = ("BACKBONE.embeddings", "BACKBONE.encoder")
    head_prefixes = ("proj", "BACKBONE.pooler", "OUT", "ATTN")

    def _decayed_lr(steps):
        # Repeated multiplication gives exactly init_lr * mu * ... * mu.
        lr = init_lr
        for _ in range(steps):
            lr *= mu
        return lr

    opt_parameters = []
    for name, params in model.named_parameters():
        weight_decay = 0.0 if any(marker in name for marker in no_decay) else 0.01

        if name.startswith(backbone_prefixes):
            steps = default_backbone_steps
            for markers, group_steps in layer_groups:
                if any(marker in name for marker in markers):
                    steps = group_steps
                    break
            lr = _decayed_lr(steps)
        elif name.startswith(head_prefixes):
            lr = init_lr
        else:
            # Report parameters not covered by the rules above. They still need
            # an explicit lr: previously `lr` was left unassigned here, which
            # either raised UnboundLocalError or silently reused the lr of the
            # preceding parameter.
            print(name)
            lr = init_lr

        opt_parameters.append({"params": params,
                               "weight_decay": weight_decay,
                               "lr": lr})

    optimizer_cls = getattr(torch.optim, run_config["optimizer"]["name"])
    return optimizer_cls(opt_parameters, lr=init_lr)

def criterion(output, batch):
    """Cross-entropy loss between the model output and the batch targets.

    Args:
        output: mapping whose "output" entry holds the class scores.
        batch: mapping whose "target" entry holds the reference labels.

    Returns:
        The loss tensor produced by ``nn.functional.cross_entropy``.
    """
    logits = output["output"]
    labels = batch["target"]
    return nn.functional.cross_entropy(input=logits, target=labels)

# Number of scheduler steps for the whole run: batches per epoch, reduced by
# gradient accumulation, times the number of epochs.
updates_per_epoch = len(loaders["train"]) // accumulation_steps
t_total = updates_per_epoch * EPOCHS

# Share of t_total spent on warm-up.
warmup_ratio = 0.1
warmup_steps = math.ceil(t_total * warmup_ratio)

optimizer = roberta_base_AdamW_grouped_LLRD(model, run_config)

# Cosine learning-rate schedule with warm-up over the first warmup_steps steps.
scheduler = get_cosine_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=t_total,
)

In [None]:
from torch.optim.swa_utils import AveragedModel

# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

# Averaged copy of the model, placed on the chosen device.
swa_model = AveragedModel(model)
swa_model = swa_model.to(device)

# SWA settings handed to the trainer ("start_swa" is presumably the epoch from
# which averaging begins; confirm in ClassificationTrainer).
swa = dict(start_swa=7, model=swa_model)

In [None]:
# Run the whole training inside one MLflow run so that parameters and the
# trainer's metric logging land in the same run.
with mlflow.start_run(run_name=run_config["run_name"]):

    artifact_uri = mlflow.get_artifact_uri(run_config["exp_name"])
    print(f"artifact uri: {artifact_uri}")

    mlflow.log_params({
        'Epochs': EPOCHS,
        'BatchSize': run_config['dataloader']['batch_size'],
        # The scheduler built above is get_cosine_schedule_with_warmup; the
        # previously logged value 'linear' did not match it. The key spelling
        # is left unchanged so existing runs of the experiment stay comparable.
        'Sheduler': 'cosine',
        'Avg': run_config['model']['avg'],
    })

    trainer = ClassificationTrainer(
        exp_name=run_config["exp_name"],
        dataloaders=loaders,
        keys_to_device=datasets["train"].keys_to_device,
        target_keys=datasets["train"].target_keys,
        model=model,
        criterion=criterion,
        metrics=metrics,
        main_metric=run_config["main_metric"],
        optimizer=optimizer,
        scheduler=scheduler,
        # Per-step scheduling, matching t_total which counts optimizer updates.
        scheduler_style="step",
        swa=swa,
        accumulation_steps=accumulation_steps,
        grad_norm=grad_norm,
        logging_step=5,
        device=run_config["device"],
        logging="mlflow",
        save_on_device=False
    )

    trainer.train(EPOCHS)

artifact uri: s3://mlflow/953/2385ad6331db40548722322d242c3401/artifacts/context_jaicp_model


Train 1/10, loss 1.1353, acc 0.8092, weighted_f0.5 0.8073, f1 0.7593, lr_group_0 0.0000300: 100%|██████████| 11134/11134 [57:34<00:00,  3.22it/s] 


Valid 1/10, loss 0.1846, acc 0.9658, weighted_f0.5 0.9649, f1 0.8298



Train 2/10, loss 0.1394, acc 0.9706, weighted_f0.5 0.9710, f1 0.9448, lr_group_0 0.0000291: 100%|██████████| 11134/11134 [58:05<00:00,  3.19it/s] 


Valid 2/10, loss 0.1168, acc 0.9783, weighted_f0.5 0.9783, f1 0.9206



Train 3/10, loss 0.0854, acc 0.9816, weighted_f0.5 0.9820, f1 0.9652, lr_group_0 0.0000265: 100%|██████████| 11134/11134 [57:36<00:00,  3.22it/s] 


Valid 3/10, loss 0.1010, acc 0.9826, weighted_f0.5 0.9826, f1 0.9330



Train 4/10, loss 0.0577, acc 0.9872, weighted_f0.5 0.9876, f1 0.9758, lr_group_0 0.0000241:  64%|██████▍   | 7149/11134 [38:22<21:18,  3.12it/s]  IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Train 5/10, loss 0.0432, acc 0.9904, weighted_f0.5 0.9907, f1 0.9817, lr_group_0 0.0000179:  95%|█████████▍| 10544/11134 [53:54<03:12,  3.06it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Train 7/10, loss 0.0193, acc 0.9959, weighted_f0.5 0.9961, f1 0.9921, lr_

Valid 9/10, loss 0.1007, acc 0.9867, weighted_f0.5 0.9867, f1 0.9685



Train 10/10, loss 0.0149, acc 0.9968, weighted_f0.5 0.9969, f1 0.9938, lr_group_0 0.0000124:  70%|███████   | 7815/11134 [45:58<24:55,  2.22it/s]  IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [None]:
# Recompute batch-norm statistics of the averaged model with one pass over the
# training loader.
torch.optim.swa_utils.update_bn(loader=loaders['train'], model=swa_model)