In [1]:
import pandas as pd

dataset = pd.read_csv("data1.csv")
dataset["risk_score"] = dataset["risk_score"] / dataset["risk_score"].max()
dataset.head(5)

Unnamed: 0,text,label,risk_score
0,Предоставление права использования КИП в пред...,Если Пользователь не успеет зарегистрировать д...,0.571429
1,Предоставление права использования КИП в пред...,Если Пользователь не успеет зарегистрировать д...,0.571429
2,Предоставление права использования КИП в пред...,Если Пользователь не успеет зарегистрировать д...,0.571429
3,г. Москва,neutral,0.0
4,"ООО «Киберклуб», именуемое в дальнейшем Правоо...",neutral,0.0


In [2]:
dataset.shape

(65, 3)

In [3]:
import re

def text_processing(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'(\d)\s+(?=\d)', r'\1', text)
    return text

def clean_label(label):
    if pd.isna(label):
        return 'unknown'
    
    cleaned = str(label).strip().replace('"', '').replace('”', '')

    typo_corrections = {
        'nol_risk': 'no_risk',
        'gal_risk': 'legal_risk',
    }
    
    return typo_corrections.get(cleaned, cleaned)

dataset["text"] = dataset["text"].apply(text_processing)
# dataset['label'] = dataset['label'].apply(clean_label)
# dataset = dataset[dataset["label"] != "unknown"]

In [4]:
dataset.head(5)

Unnamed: 0,text,label,risk_score
0,предоставление права использования кип в пред...,Если Пользователь не успеет зарегистрировать д...,0.571429
1,предоставление права использования кип в пред...,Если Пользователь не успеет зарегистрировать д...,0.571429
2,предоставление права использования кип в пред...,Если Пользователь не успеет зарегистрировать д...,0.571429
3,г москва,neutral,0.0
4,ооо киберклуб именуемое в дальнейшем правообла...,neutral,0.0


In [5]:
from sklearn.model_selection import train_test_split

# train_data, val_data = train_test_split(dataset, test_size=0.2, stratify=dataset['label'], random_state=32)
train_data = dataset
X_train_text = train_data["text"]
y_train = train_data["label"]
X_train_score = train_data["risk_score"]

# X_val_text = val_data["text"]
# y_val = val_data["label"]
# X_val_score = val_data["risk_score"]


In [6]:
from transformers import BertModel, BertTokenizer, BertConfig, BertPreTrainedModel
import torch.nn as nn
import torch


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class DualBert(BertPreTrainedModel):
    def __init__(self, config, num_classes):
        super().__init__(config)
        self.bert = BertModel(config)
        self.classifier = nn.Linear(config.hidden_size, num_classes)
        self.regressor = nn.Linear(config.hidden_size, 1)
        self.init_weights()
    
    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert(input_ids, attention_mask)
        pooled_output = outputs.pooler_output
        
        logits = self.classifier(pooled_output)
        risk_score = self.regressor(pooled_output).squeeze(-1)
        
        return logits, risk_score

def loss_fn(logits, risk_pred, labels, risk_score, alpha=0.4):
    clf_fn = nn.CrossEntropyLoss()
    reg_fn = nn.MSELoss()
    
    loss_clf = clf_fn(logits, labels)
    loss_reg = reg_fn(risk_pred, risk_score)
    
    total_loss = alpha * loss_clf + (1 - alpha) * loss_reg

    return total_loss


model_id = "DeepPavlov/rubert-base-cased"

unique_classes = dataset["label"].unique()

num_labels = len(unique_classes)

tokenizer = BertTokenizer.from_pretrained(model_id)
config = BertConfig.from_pretrained(model_id, num_labels=num_labels)
model = DualBert.from_pretrained(model_id, config=config, num_classes=num_labels).to(device)


  from .autonotebook import tqdm as notebook_tqdm
Some weights of DualBert were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'regressor.bias', 'regressor.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
model.cuda()

DualBert(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_aff

In [8]:
unique_classes

array(['Если Пользователь не успеет зарегистрировать договор до 15 апреля 2025 г., договор считается незаключенным (ст. 1028 ГК РФ). Риск: действия Пользователя будут незаконными, включая использование товарного знака.',
       'neutral',
       'Запрет открывать клубы в радиусе 15 км в течение 5 лет может быть признано ограничивающим  конкуренцию (п. 3 ст. 1033 ГК РФ).',
       'Установление фиксированных цен Правообладателем нарушает ст. 1033 ГК РФ, которая запрещает антиконкурентные условия. Это может привести к спорам с антимонопольной службой.',
       'Требование передавать базы данных клиентов без их согласия нарушает закон № 152-ФЗ о персональных данных. Пользователь рискует штрафами до 75 000 руб. (ст. 13.11 КоАП РФ).',
       'Указание «10% от выручки» без уточнения (валовая/чистая) противоречит ст. 424 ГК РФ (цена должна быть определена четко). Суд может признать условие несогласованным.',
       'Уменьшение гибкости ведения бизнеса, хотя такой запрет не противоречит законод

In [9]:
X_train_text = tokenizer(X_train_text.tolist(), max_length=256, padding=True, truncation=True, return_tensors="pt")
# X_val_text = tokenizer(X_val_text.tolist(), max_length=256, padding=True, truncation=True, return_tensors="pt")

In [10]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(unique_classes)
y_train = label_encoder.transform(y_train)
# y_val = label_encoder.transform(y_val)

In [11]:
from torch.utils.data import DataLoader, Dataset

class TextDataset(Dataset):
    def __init__(self, inputs, labels, masks, risk_scores):
        self.inputs = inputs
        self.labels = labels
        self.mask = masks
        self.risk_scores = risk_scores

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        return {
            "input_ids": torch.tensor(self.inputs[index]),
            "attention_mask": torch.tensor(self.mask[index]),
            "labels": torch.tensor(self.labels[index]),
            "risk_scores": torch.tensor(self.risk_scores[index])
        }

train_dataset = TextDataset(X_train_text["input_ids"], y_train, X_train_text["attention_mask"], X_train_score.tolist())
# val_dataset = TextDataset(X_val_text["input_ids"], y_val, X_val_text["attention_mask"], X_val_score.tolist())

batch_size = 32

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [12]:
from tqdm import tqdm
import numpy as np


def train(model, optimizer, loss_fn, train_loader, lr=1e-5, num_epochs=15):
    best_val_acc = 0.0
    for epoch in range(num_epochs):
        model.train()
        epoch_losses = []
        if num_epochs > 8:
            optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)

        for batch in tqdm(train_loader, desc=f"Training epoch: {epoch}"):
            inputs = {
                "input_ids": batch["input_ids"].to(device),
                "attention_mask": batch["attention_mask"].to(device)
            }
            labels = batch["labels"].to(device)
            risk_scores = batch["risk_scores"].to(device)
            model.zero_grad()
            logits, pred = model(**inputs)
            loss = loss_fn(logits, pred, labels, risk_scores)
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            epoch_losses.append(loss.item())

        train_acc, train_mse, _ = eval(model, loss_fn, train_loader)
        # val_acc, val_mse, val_loss = eval(model, loss_fn, val_loader)

        ep_loss = sum(epoch_losses) / len(epoch_losses)
        print(f"Train loss: {ep_loss}, accuracy on train: {train_acc}, mse on train: {train_mse}")
        # print(f"Val loss: {val_loss}, accuracy on val: {val_acc}, mse on val: {val_mse}")

        # if val_acc > best_val_acc and epoch > 6:
        #     best_val_acc = val_acc
        #     torch.save(model.state_dict(), "best_model.pth")

    return model

def eval(model, loss_fn,data_loader):
    model.eval()
    correct_clf = 0
    total_clf = 0
    total_loss = 0

    risk_preds = []
    risk_targets = []
    
    with torch.no_grad():
        for batch in data_loader:
            inputs = {
                "input_ids": batch["input_ids"].to(device),
                "attention_mask": batch["attention_mask"].to(device)
            }
            labels = batch["labels"].to(device)
            risk_scores = batch["risk_scores"].to(device)
            
            logits, risk_pred = model(**inputs)
            
            preds = torch.argmax(logits, dim=1)
            correct_clf += (preds == labels).sum().item()
            total_clf += labels.size(0)
            
            risk_preds.extend(risk_pred.cpu().numpy())
            risk_targets.extend(risk_scores.cpu().numpy())
            loss = loss_fn(logits, risk_pred, labels, risk_scores)
            
            total_loss += loss.item()
    
    clf_accuracy = correct_clf / total_clf
    
    risk_preds = np.array(risk_preds)
    risk_targets = np.array(risk_targets)
    

    mse = np.mean((risk_preds - risk_targets)**2)
    
    return clf_accuracy, mse, total_loss

In [13]:
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)

model = train(model, optimizer, loss_fn, train_loader, lr=2e-5, num_epochs=40)

  "input_ids": torch.tensor(self.inputs[index]),
  "attention_mask": torch.tensor(self.mask[index]),
Training epoch: 0: 100%|██████████| 3/3 [00:02<00:00,  1.31it/s]


Train loss: 1.1124444802602131, accuracy on train: 0.2153846153846154, mse on train: 0.11699166148900986


Training epoch: 1: 100%|██████████| 3/3 [00:01<00:00,  1.51it/s]


Train loss: 0.9725470940272013, accuracy on train: 0.46153846153846156, mse on train: 0.13222892582416534


Training epoch: 2: 100%|██████████| 3/3 [00:01<00:00,  1.55it/s]


Train loss: 0.9988273978233337, accuracy on train: 0.5076923076923077, mse on train: 0.09660076349973679


Training epoch: 3: 100%|██████████| 3/3 [00:01<00:00,  1.54it/s]


Train loss: 0.9357498089472452, accuracy on train: 0.5538461538461539, mse on train: 0.0704205259680748


Training epoch: 4: 100%|██████████| 3/3 [00:01<00:00,  1.66it/s]


Train loss: 0.7561969757080078, accuracy on train: 0.5538461538461539, mse on train: 0.054788678884506226


Training epoch: 5: 100%|██████████| 3/3 [00:01<00:00,  1.64it/s]


Train loss: 0.7179586887359619, accuracy on train: 0.5692307692307692, mse on train: 0.024510834366083145


Training epoch: 6: 100%|██████████| 3/3 [00:01<00:00,  1.65it/s]


Train loss: 0.743049144744873, accuracy on train: 0.8615384615384616, mse on train: 0.022821828722953796


Training epoch: 7: 100%|██████████| 3/3 [00:01<00:00,  1.65it/s]


Train loss: 0.5733995934327444, accuracy on train: 0.8153846153846154, mse on train: 0.025433020666241646


Training epoch: 8: 100%|██████████| 3/3 [00:01<00:00,  1.65it/s]


Train loss: 0.5000570217768351, accuracy on train: 0.9384615384615385, mse on train: 0.02278243564069271


Training epoch: 9: 100%|██████████| 3/3 [00:01<00:00,  1.65it/s]


Train loss: 0.5539787809054056, accuracy on train: 0.9384615384615385, mse on train: 0.04654422402381897


Training epoch: 10: 100%|██████████| 3/3 [00:01<00:00,  1.65it/s]


Train loss: 0.5163969894250234, accuracy on train: 0.9692307692307692, mse on train: 0.017219390720129013


Training epoch: 11: 100%|██████████| 3/3 [00:01<00:00,  1.65it/s]


Train loss: 0.3472173511981964, accuracy on train: 0.9384615384615385, mse on train: 0.014152961783111095


Training epoch: 12: 100%|██████████| 3/3 [00:01<00:00,  1.66it/s]


Train loss: 0.4475487271944682, accuracy on train: 1.0, mse on train: 0.010677721351385117


Training epoch: 13: 100%|██████████| 3/3 [00:01<00:00,  1.64it/s]


Train loss: 0.4293930729230245, accuracy on train: 1.0, mse on train: 0.017615804448723793


Training epoch: 14: 100%|██████████| 3/3 [00:01<00:00,  1.67it/s]


Train loss: 0.3753760556379954, accuracy on train: 1.0, mse on train: 0.03974276781082153


Training epoch: 15: 100%|██████████| 3/3 [00:01<00:00,  1.66it/s]


Train loss: 0.3890705506006877, accuracy on train: 1.0, mse on train: 0.023963933810591698


Training epoch: 16: 100%|██████████| 3/3 [00:01<00:00,  1.65it/s]


Train loss: 0.2981776048739751, accuracy on train: 1.0, mse on train: 0.01519633550196886


Training epoch: 17: 100%|██████████| 3/3 [00:01<00:00,  1.66it/s]


Train loss: 0.1753117342789968, accuracy on train: 1.0, mse on train: 0.012309006415307522


Training epoch: 18: 100%|██████████| 3/3 [00:01<00:00,  1.66it/s]


Train loss: 0.20897195736567178, accuracy on train: 1.0, mse on train: 0.027528652921319008


Training epoch: 19: 100%|██████████| 3/3 [00:01<00:00,  1.65it/s]


Train loss: 0.22970154881477356, accuracy on train: 1.0, mse on train: 0.01867770403623581


Training epoch: 20: 100%|██████████| 3/3 [00:01<00:00,  1.64it/s]


Train loss: 0.1450513762732347, accuracy on train: 1.0, mse on train: 0.014629151672124863


Training epoch: 21: 100%|██████████| 3/3 [00:01<00:00,  1.66it/s]


Train loss: 0.12665135165055594, accuracy on train: 1.0, mse on train: 0.012446352280676365


Training epoch: 22: 100%|██████████| 3/3 [00:01<00:00,  1.67it/s]


Train loss: 0.11725532015164693, accuracy on train: 1.0, mse on train: 0.0204494446516037


Training epoch: 23: 100%|██████████| 3/3 [00:01<00:00,  1.66it/s]


Train loss: 0.16181852916876474, accuracy on train: 1.0, mse on train: 0.01112767867743969


Training epoch: 24: 100%|██████████| 3/3 [00:01<00:00,  1.67it/s]


Train loss: 0.09159843934079011, accuracy on train: 1.0, mse on train: 0.012721551582217216


Training epoch: 25: 100%|██████████| 3/3 [00:01<00:00,  1.66it/s]


Train loss: 0.0895612562696139, accuracy on train: 1.0, mse on train: 0.013912160880863667


Training epoch: 26: 100%|██████████| 3/3 [00:01<00:00,  1.64it/s]


Train loss: 0.09476255625486374, accuracy on train: 1.0, mse on train: 0.004280734807252884


Training epoch: 27: 100%|██████████| 3/3 [00:01<00:00,  1.64it/s]


Train loss: 0.11392596860726674, accuracy on train: 1.0, mse on train: 0.006450483575463295


Training epoch: 28: 100%|██████████| 3/3 [00:01<00:00,  1.66it/s]


Train loss: 0.0793311595916748, accuracy on train: 1.0, mse on train: 0.005372348241508007


Training epoch: 29: 100%|██████████| 3/3 [00:01<00:00,  1.65it/s]


Train loss: 0.11500745763381322, accuracy on train: 1.0, mse on train: 0.017419887706637383


Training epoch: 30: 100%|██████████| 3/3 [00:01<00:00,  1.65it/s]


Train loss: 0.09158824632565181, accuracy on train: 1.0, mse on train: 0.007475986145436764


Training epoch: 31: 100%|██████████| 3/3 [00:01<00:00,  1.65it/s]


Train loss: 0.05633488359550635, accuracy on train: 1.0, mse on train: 0.02049056813120842


Training epoch: 32: 100%|██████████| 3/3 [00:01<00:00,  1.66it/s]


Train loss: 0.053387588200469814, accuracy on train: 1.0, mse on train: 0.01621992699801922


Training epoch: 33: 100%|██████████| 3/3 [00:01<00:00,  1.67it/s]


Train loss: 0.0814598153034846, accuracy on train: 1.0, mse on train: 0.015604463405907154


Training epoch: 34: 100%|██████████| 3/3 [00:01<00:00,  1.65it/s]


Train loss: 0.07619776328404744, accuracy on train: 1.0, mse on train: 0.024805203080177307


Training epoch: 35: 100%|██████████| 3/3 [00:01<00:00,  1.65it/s]


Train loss: 0.0706695206463337, accuracy on train: 1.0, mse on train: 0.012100568041205406


Training epoch: 36: 100%|██████████| 3/3 [00:01<00:00,  1.65it/s]


Train loss: 0.04109942478438219, accuracy on train: 1.0, mse on train: 0.02379259280860424


Training epoch: 37: 100%|██████████| 3/3 [00:01<00:00,  1.66it/s]


Train loss: 0.06392987569173177, accuracy on train: 1.0, mse on train: 0.011593868024647236


Training epoch: 38: 100%|██████████| 3/3 [00:01<00:00,  1.65it/s]


Train loss: 0.051768604665994644, accuracy on train: 1.0, mse on train: 0.014717102982103825


Training epoch: 39: 100%|██████████| 3/3 [00:01<00:00,  1.65it/s]


Train loss: 0.07717095439632733, accuracy on train: 1.0, mse on train: 0.013753755949437618


In [14]:
torch.save(model.state_dict(), "best_model.pth")

In [15]:
def test(model, tokenizer, text):
    input = tokenizer(text, max_length=256, padding=True, truncation=True, return_tensors="pt", return_token_type_ids=False).to(device)
    logits, pred = model(**input)
    return logits, pred


model.load_state_dict(torch.load('best_model.pth'))

texts = [
    "Любая из сторон во всякое время вправе досрочно отказаться от договора, уведомив об этом другую сторону не позднее чем за тридцать дней и уплатив ей отступное в размере 360000 руб.",
    "За несвоевременную подачу документов на регистрацию в Роспатент Правообладатель выплачивает Пользователю штраф в размере 1000 руб. за каждый день просрочки.",
    "Правообладатель вправе каждый день приходить в киберклуб"
]


for text in texts:
    text = text_processing(text)

    print(text)

    out, pred = test(model, tokenizer, text)
    pred_label_id = torch.argmax(out, dim=1)
    pred_label_id.cpu()
    label = label_encoder.inverse_transform([pred_label_id.item()])[0]
    print(label, f"{pred.item():.2f}")

любая из сторон во всякое время вправе досрочно отказаться от договора уведомив об этом другую сторону не позднее чем за тридцать дней и уплатив ей отступное в размере 360000 руб
neutral 0.17
за несвоевременную подачу документов на регистрацию в роспатент правообладатель выплачивает пользователю штраф в размере 1000 руб за каждый день просрочки
neutral 0.04
правообладатель вправе каждый день приходить в киберклуб
neutral 0.04
