In [2]:
import warnings
warnings.filterwarnings("ignore")
import random
import gc
from ast import literal_eval

import numpy as np
import pandas as pd

from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    losses
)
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import TranslationEvaluator
from transformers.integrations import TensorBoardCallback  
from transformers import AutoTokenizer
from datasets import Dataset

import torch
from torch.optim import AdamW
from torch.utils.tensorboard import SummaryWriter


from sklearn.model_selection import train_test_split

torch.cuda.empty_cache()
gc.collect()





67

In [3]:
model = SentenceTransformer('sentence-transformers/LaBSE')
tokenizer = AutoTokenizer.from_pretrained('C:/Users/Vladimir/PycharmProjects/perevod/mansi-translator/data/tokenizers/tokenizer_v1')
model.tokenizer = tokenizer
model._first_module().auto_model.resize_token_embeddings(len(tokenizer))


Embedding(501710, 768, padding_idx=0)

In [4]:
def setup_random_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    return None


setup_random_seed(777)

In [5]:
dataset = pd.read_csv('data_train.csv')
dataset['similarities'] = dataset['similarities'].apply(literal_eval)
dataset = dataset.loc[dataset['target'].drop_duplicates(keep='last').index]
dataset = dataset.loc[dataset['source'].drop_duplicates(keep='last').index]
dataset.dropna(inplace=True)
dataset.head()

Unnamed: 0,target,source,similarities,negative_examples_mans,negative_examples_rus
0,Та пыгрисит маим вармаль э̄рнэ поратэт ат верм...,Те мальчики не выполнят задание в назначенный ...,"{0: 1.0, 52122: 0.9042518734931946, 18930: 0.8...","Пыгрищит ка̄таныл иӈ ва̄гталыт, ты та̄рвитыӈ р...",У мальчиков еще нет сил выполнять тяжелую рабо...
1,"Ха̄йтыматэ тӯр ва̄тан ёхтыс, вит ва̄тан ха̄йтыс.","Бегая к берегу озера пришла, к воде подбежала.","{1: 0.9999999403953552, 4463: 0.97095388174057...","Ха̄йтыме̄т ос тӯр ва̄та ка̄салас, тув ха̄йтыс.","Бежала, увидела берег озера, туда направилась."
2,Вит са̄мыл сунсым о̄нтыс,Вода прибывала на глазах,"{2: 0.9999999403953552, 33249: 0.9213534593582...",Вӣт ветраныл ма̄н сюргыс.,Вода текла на землю.
3,"Атаявев, акваг лылынг тагл ворн та тотавев.","Обнюхивает нас, живыми на кладбище уносит.","{3: 0.9999997019767761, 12914: 0.9601884484291...","Атаяве̄в, акваг лылыӈ та̄гыл во̄рн та тотаве̄в",Она обнюхивает нас и на кладбище уносит
4,"Ман ты пӣлтал, веськат хумиюв нэтхуньт ат ёру...",Мы никогда не забудем этого честного человека.,"{4: 1.0, 55368: 0.9368814826011658, 61451: 0.9...",Ёмас хо̄тпа хотта мус номуӈкве патылӯв.,Хорошего человека будем помнить долго.


In [6]:
train_set, test_set = train_test_split(dataset, test_size=7_500, random_state=777, shuffle=True)

In [7]:
train_set.shape, test_set.shape

((62772, 5), (7500, 5))

In [8]:
train_dataset = Dataset.from_dict(
    {
        "anchor": train_set['source'].to_list(),
        "positive": train_set['target'].to_list(),
        "negative": train_set['negative_examples_mans'].to_list()
    }
)

test_dataset = Dataset.from_dict(
    {
        "anchor": test_set['source'].to_list(),
        "positive": test_set['target'].to_list(),
        "negative": test_set['negative_examples_mans'].to_list()
    }
)



In [9]:
class LaBSEFineTuner:
    
    def __init__(self, model):
        self.model = model
        self.bert_model = model._first_module().auto_model
        
        self.pooling = self.model[1]
        self.dense = self.model[2]
        self.normalize = self.model[3]
        
        self.unfreeze_before_attention()
        self.unfreeze_last_head()
        self.unfreeze_pooling_dense_normalize()

    def unfreeze_before_attention(self):
        for name, param in self.bert_model.named_parameters():
            if 'encoder.layer.0.attention' in name:
                break
            param.requires_grad = True
            print(f"Unfroze {name}")

    def unfreeze_last_head(self):
        for name, param in self.bert_model.named_parameters():
            param.requires_grad = False

        last_layer_idx = len(self.bert_model.encoder.layer) - 1
        for name, param in self.bert_model.named_parameters():
            if f'encoder.layer.{last_layer_idx}.attention' in name or f'encoder.layer.{last_layer_idx}.output' in name:
                param.requires_grad = True
                print(f"Unfroze {name}")

        for name, param in self.bert_model.named_parameters():
            if 'pooler' in name or 'cls' in name:
                param.requires_grad = True
                print(f"Unfroze {name}")
                
    def unfreeze_pooling_dense_normalize(self):
        for name, param in self.pooling.named_parameters():
            param.requires_grad = True
            print(f"Unfroze Pooling layer: {name}")
        
        for name, param in self.dense.named_parameters():
            param.requires_grad = True
            print(f"Unfroze Dense layer: {name}")
        
        for name, param in self.normalize.named_parameters():
            param.requires_grad = True
            print(f"Unfroze Normalize layer: {name}")
        
    def count_trainable_params(self):
        trainable_params = sum(p.numel() for p in self.bert_model.parameters() if p.requires_grad)
        trainable_params += sum(p.numel() for p in self.pooling.parameters() if p.requires_grad)
        trainable_params += sum(p.numel() for p in self.dense.parameters() if p.requires_grad)
        trainable_params += sum(p.numel() for p in self.normalize.parameters() if p.requires_grad)
        return print(f"Trainable parameters: {trainable_params}")
    
    def to(self, device):
        return self.bert_model.to(device=device)


model = SentenceTransformer('sentence-transformers/LaBSE')
LaBSE = LaBSEFineTuner(model)


Unfroze embeddings.word_embeddings.weight
Unfroze embeddings.position_embeddings.weight
Unfroze embeddings.token_type_embeddings.weight
Unfroze embeddings.LayerNorm.weight
Unfroze embeddings.LayerNorm.bias
Unfroze encoder.layer.11.attention.self.query.weight
Unfroze encoder.layer.11.attention.self.query.bias
Unfroze encoder.layer.11.attention.self.key.weight
Unfroze encoder.layer.11.attention.self.key.bias
Unfroze encoder.layer.11.attention.self.value.weight
Unfroze encoder.layer.11.attention.self.value.bias
Unfroze encoder.layer.11.attention.output.dense.weight
Unfroze encoder.layer.11.attention.output.dense.bias
Unfroze encoder.layer.11.attention.output.LayerNorm.weight
Unfroze encoder.layer.11.attention.output.LayerNorm.bias
Unfroze encoder.layer.11.output.dense.weight
Unfroze encoder.layer.11.output.dense.bias
Unfroze encoder.layer.11.output.LayerNorm.weight
Unfroze encoder.layer.11.output.LayerNorm.bias
Unfroze pooler.dense.weight
Unfroze pooler.dense.bias
Unfroze Dense layer: lin

In [10]:
assert sum([p.numel() for _, p in LaBSE.model[2].named_parameters() if p.requires_grad == True]) == 590592

In [11]:
optimizer_grouped_parameters = [
    {
        'params': [
            *LaBSE.model[0].auto_model.embeddings.word_embeddings.parameters(),
            *LaBSE.model[0].auto_model.embeddings.position_embeddings.parameters(),
            *LaBSE.model[0].auto_model.embeddings.token_type_embeddings.parameters(),
            *LaBSE.model[0].auto_model.embeddings.LayerNorm.parameters()
        ],
        'lr': 5e-3,
        'weight_decay': 0.05
    },
    {
        'params': LaBSE.model[0].auto_model.encoder.layer[11].parameters(),
        'lr': 3e-5,
        'weight_decay': 0.01
    },
    {
        'params': LaBSE.model[0].auto_model.pooler.parameters(),
        'lr': 3e-5,
        'weight_decay': 0.01
    }
]

optimizer = AdamW(optimizer_grouped_parameters, eps=1e-8)

lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', factor=0.08,
    patience=2,
    verbose=True
)



# Конфигурация обучения 


In [23]:
loss_function = losses.CachedMultipleNegativesRankingLoss(model=LaBSE.model, mini_batch_size=128).to(device='cuda:0')


args = SentenceTransformerTrainingArguments(
    output_dir="models",  
    num_train_epochs=30,  
    per_device_train_batch_size=64,  
    per_device_eval_batch_size=64, 
    max_grad_norm=1.1,
    # lr_scheduler_type=transformers.get_cosine_schedule_with_warmup(
    #     optimizer, 
    #     num_warmup_steps=len(train_dataset) // 64, 
    #     num_training_steps=len(train_dataset) * 30
    # ),
    restore_callback_states_from_checkpoint=True,
    load_best_model_at_end=True,
    use_cpu=False,
    seed=777,
    fp16=False,  
    bf16=False,
    disable_tqdm=False,
    remove_unused_columns=True,
    greater_is_better=True,
    metric_for_best_model='eval_Perevod_mean_accuracy',
    ignore_data_skip=False,
    # dataloader_persistent_workers=True,
    batch_sampler=BatchSamplers.NO_DUPLICATES,  
    eval_strategy="epoch",  
    save_strategy="epoch",  
    save_total_limit=2
)

evaluator = TranslationEvaluator(
    source_sentences=test_dataset["anchor"],
    show_progress_bar=True,
    target_sentences=test_dataset["positive"],
    name="Perevod" 
)

In [None]:
trainer = SentenceTransformerTrainer(
    model=model,
    optimizers=(optimizer, lr_scheduler),
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    loss=loss_function,
    evaluator=evaluator,
    # callbacks=[TensorBoardCallback(SummaryWriter(log_dir="logs/tensorboard"))]  
)
trainer.train()

Epoch,Training Loss,Validation Loss,Perevod Src2trg Accuracy,Perevod Trg2src Accuracy,Perevod Mean Accuracy
1,1.4688,1.250345,0.363733,0.310533,0.337133


Batches:   0%|          | 0/469 [00:00<?, ?it/s]

Batches:   0%|          | 0/469 [00:00<?, ?it/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [21]:
def evaluate_and_print_metrics(trainer):
    metrics = trainer.evaluate()
    print("Available metrics:")
    for key in metrics.keys():
        print(key)

# После создания тренера и выполнения тренировки
evaluate_and_print_metrics(trainer)

Batches:   0%|          | 0/469 [00:00<?, ?it/s]

Batches:   0%|          | 0/469 [00:00<?, ?it/s]

Available metrics:
eval_loss
eval_Perevod_src2trg_accuracy
eval_Perevod_trg2src_accuracy
eval_Perevod_mean_accuracy
