##  Импорт библиотек

In [6]:
import json
import os
import random
import sys
from typing import Callable, Dict, Iterable, Tuple, Type

import pandas as pd
import torch
from sentence_transformers import InputExample, SentenceTransformer, losses
from sentence_transformers.evaluation import SentenceEvaluator
from sentence_transformers.model_card_templates import ModelCardTemplate
from sentence_transformers.util import batch_to_device, fullname
from torch import nn
from torch.optim import Optimizer
from torch.utils.data import DataLoader
from tqdm import tqdm
from tqdm.autonotebook import trange
from transformers import is_torch_npu_available

sys.path.append(os.path.pardir)

from project_consts import PROJECT_ROOT

##  Подключаем wandb

In [7]:
import wandb

wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [30]:
wandb.init(
    project="intents_classifier",
)

VBox(children=(Label(value='1816.707 MB of 1816.707 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train_TripletLoss,██▇▇█▇▇▆▆▆▅▅▅▄▄▅▃▃▆▄▃▄▃▄▃▄▂▂▄▁▃▄▂▄▁▃▃▁▂▂

0,1
train_TripletLoss,4.0821


## Константы

In [31]:
FILE_NAME = "intents_chat_bot_pervaya_lin-1000067115-HTA.json"
RAW_DATA_PATH = os.path.join(PROJECT_ROOT, "data", "raw", FILE_NAME)
PROCESSED_FOLDER_PATH = os.path.join(PROJECT_ROOT, "data", "processed")

##  Получение данных

In [32]:
intents_df = pd.read_json(os.path.join(PROCESSED_FOLDER_PATH, "intents_prepared.json"))
intents_df.head()

Unnamed: 0,intent_id,intent_path,phrase
0,24174474,/Пересекающиеся/Продлить полис,продлить полис осаго
1,24174474,/Пересекающиеся/Продлить полис,мне нужно продлить полис
2,24174474,/Пересекающиеся/Продлить полис,нам нужно продлить полис страхования
3,24174474,/Пересекающиеся/Продлить полис,каким образом я могу продлить полис осаго от
4,24174474,/Пересекающиеся/Продлить полис,помогите продлить страховку


## Файнтюним модель на своих данных

Так как sentence-transformers все еще не позволяет нормально подключать логгеры, немного перепишем код и добавим логирование лоссов в wandb

In [33]:
class CustomSentenceTransformer(SentenceTransformer):
    def fit(
        self,
        train_objectives: Iterable[Tuple[DataLoader, nn.Module]],
        evaluator: SentenceEvaluator = None,
        epochs: int = 1,
        steps_per_epoch=None,
        scheduler: str = "WarmupLinear",
        warmup_steps: int = 10000,
        optimizer_class: Type[Optimizer] = torch.optim.AdamW,
        optimizer_params: Dict[str, object] = {"lr": 2e-5},
        weight_decay: float = 0.01,
        evaluation_steps: int = 0,
        output_path: str = None,
        save_best_model: bool = True,
        max_grad_norm: float = 1,
        use_amp: bool = False,
        callback: Callable[[float, int, int], None] = None,
        show_progress_bar: bool = True,
        checkpoint_path: str = None,
        checkpoint_save_steps: int = 500,
        checkpoint_save_total_limit: int = 0,
    ):

        ##Add info to model card
        # info_loss_functions = "\n".join(["- {} with {} training examples".format(str(loss), len(dataloader)) for dataloader, loss in train_objectives])
        info_loss_functions = []
        for dataloader, loss in train_objectives:
            info_loss_functions.extend(
                ModelCardTemplate.get_train_objective_info(dataloader, loss)
            )
        info_loss_functions = "\n\n".join([text for text in info_loss_functions])

        info_fit_parameters = json.dumps(
            {
                "evaluator": fullname(evaluator),
                "epochs": epochs,
                "steps_per_epoch": steps_per_epoch,
                "scheduler": scheduler,
                "warmup_steps": warmup_steps,
                "optimizer_class": str(optimizer_class),
                "optimizer_params": optimizer_params,
                "weight_decay": weight_decay,
                "evaluation_steps": evaluation_steps,
                "max_grad_norm": max_grad_norm,
            },
            indent=4,
            sort_keys=True,
        )
        self._model_card_text = None
        self._model_card_vars["{TRAINING_SECTION}"] = (
            ModelCardTemplate.__TRAINING_SECTION__.replace(
                "{LOSS_FUNCTIONS}", info_loss_functions
            ).replace("{FIT_PARAMETERS}", info_fit_parameters)
        )

        if use_amp:
            if is_torch_npu_available():
                scaler = torch.npu.amp.GradScaler()
            else:
                scaler = torch.cuda.amp.GradScaler()
        self.to(self.device)

        dataloaders = [dataloader for dataloader, _ in train_objectives]

        # Use smart batching
        for dataloader in dataloaders:
            dataloader.collate_fn = self.smart_batching_collate

        loss_models = [loss for _, loss in train_objectives]
        for loss_model in loss_models:
            loss_model.to(self.device)

        self.best_score = -9999999

        if steps_per_epoch is None or steps_per_epoch == 0:
            steps_per_epoch = min([len(dataloader) for dataloader in dataloaders])

        num_train_steps = int(steps_per_epoch * epochs)

        # Prepare optimizers
        optimizers = []
        schedulers = []
        for loss_model in loss_models:
            param_optimizer = list(loss_model.named_parameters())

            no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
            optimizer_grouped_parameters = [
                {
                    "params": [
                        p
                        for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)
                    ],
                    "weight_decay": weight_decay,
                },
                {
                    "params": [
                        p for n, p in param_optimizer if any(nd in n for nd in no_decay)
                    ],
                    "weight_decay": 0.0,
                },
            ]

            optimizer = optimizer_class(
                optimizer_grouped_parameters, **optimizer_params
            )
            scheduler_obj = self._get_scheduler(
                optimizer,
                scheduler=scheduler,
                warmup_steps=warmup_steps,
                t_total=num_train_steps,
            )

            optimizers.append(optimizer)
            schedulers.append(scheduler_obj)

        global_step = 0
        data_iterators = [iter(dataloader) for dataloader in dataloaders]

        num_train_objectives = len(train_objectives)

        skip_scheduler = False
        for epoch in trange(epochs, desc="Epoch", disable=not show_progress_bar):
            training_steps = 0

            for loss_model in loss_models:
                loss_model.zero_grad()
                loss_model.train()

            for _ in trange(
                steps_per_epoch,
                desc="Iteration",
                smoothing=0.05,
                disable=not show_progress_bar,
            ):
                for train_idx in range(num_train_objectives):
                    loss_model = loss_models[train_idx]
                    optimizer = optimizers[train_idx]
                    scheduler = schedulers[train_idx]
                    data_iterator = data_iterators[train_idx]

                    try:
                        data = next(data_iterator)
                    except StopIteration:
                        data_iterator = iter(dataloaders[train_idx])
                        data_iterators[train_idx] = data_iterator
                        data = next(data_iterator)

                    features, labels = data
                    labels = labels.to(self.device)
                    features = list(
                        map(lambda batch: batch_to_device(batch, self.device), features)
                    )

                    if use_amp:
                        with torch.autocast(device_type=self.device.type):
                            loss_value = loss_model(features, labels)

                        scale_before_step = scaler.get_scale()
                        scaler.scale(loss_value).backward()
                        scaler.unscale_(optimizer)
                        torch.nn.utils.clip_grad_norm_(
                            loss_model.parameters(), max_grad_norm
                        )
                        scaler.step(optimizer)
                        scaler.update()

                        skip_scheduler = scaler.get_scale() != scale_before_step
                    else:
                        loss_value = loss_model(features, labels)
                        wandb.log(
                            {f"train_{loss_model.__class__.__name__}": loss_value}
                        )
                        loss_value.backward()
                        torch.nn.utils.clip_grad_norm_(
                            loss_model.parameters(), max_grad_norm
                        )
                        optimizer.step()

                    optimizer.zero_grad()

                    if not skip_scheduler:
                        scheduler.step()

                training_steps += 1
                global_step += 1

                if evaluation_steps > 0 and training_steps % evaluation_steps == 0:
                    self._eval_during_training(
                        evaluator,
                        output_path,
                        save_best_model,
                        epoch,
                        training_steps,
                        callback,
                    )

                    for loss_model in loss_models:
                        loss_model.zero_grad()
                        loss_model.train()

                if (
                    checkpoint_path is not None
                    and checkpoint_save_steps is not None
                    and checkpoint_save_steps > 0
                    and global_step % checkpoint_save_steps == 0
                ):
                    self._save_checkpoint(
                        checkpoint_path, checkpoint_save_total_limit, global_step
                    )

            self._eval_during_training(
                evaluator, output_path, save_best_model, epoch, -1, callback
            )

        if (
            evaluator is None and output_path is not None
        ):  # No evaluator, but output path: save final model version
            self.save(output_path)

        if checkpoint_path is not None:
            self._save_checkpoint(
                checkpoint_path, checkpoint_save_total_limit, global_step
            )

In [34]:
word_embedding_model = CustomSentenceTransformer("sentence-transformers/LaBSE")

In [35]:
train_examples = []
for idx, row in tqdm(intents_df.iterrows()):
    negative_sentences = intents_df[intents_df["intent_path"] != row["intent_path"]][
        "phrase"
    ].tolist()
    random.shuffle(negative_sentences)
    positive_sentences = intents_df[intents_df["intent_path"] == row["intent_path"]][
        "phrase"
    ].tolist()
    random.shuffle(positive_sentences)
    train_examples.append(
        InputExample(
            texts=[row["phrase"], positive_sentences[0], negative_sentences[:10]]
        )
    )

7964it [01:01, 130.05it/s]


In [36]:
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)

In [37]:
train_loss = losses.TripletLoss(
    model=word_embedding_model, distance_metric=losses.TripletDistanceMetric.COSINE
)

In [38]:
word_embedding_model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10)

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/249 [00:00<?, ?it/s]

Iteration:   0%|          | 0/249 [00:00<?, ?it/s]

Iteration:   0%|          | 0/249 [00:00<?, ?it/s]

Iteration:   0%|          | 0/249 [00:00<?, ?it/s]

Iteration:   0%|          | 0/249 [00:00<?, ?it/s]

Iteration:   0%|          | 0/249 [00:00<?, ?it/s]

Iteration:   0%|          | 0/249 [00:00<?, ?it/s]

Iteration:   0%|          | 0/249 [00:00<?, ?it/s]

Iteration:   0%|          | 0/249 [00:00<?, ?it/s]

Iteration:   0%|          | 0/249 [00:00<?, ?it/s]

In [39]:
word_embedding_model.save("./labse-intents-triplet-batch")

In [40]:
wandb.log_model("./labse-intents-triplet-batch")

[34m[1mwandb[0m: Adding directory to artifact (./labse-intents-triplet-batch)... Done. 16.8s
