In [2]:
# !pip install datasets

In [1]:
from datasets import load_dataset
HF_DATASET = "merionum/ru_paraphraser"
train_pd = load_dataset(HF_DATASET, split="train").to_pandas()
test_pd = load_dataset(HF_DATASET, split="test").to_pandas()

Downloading readme:   0%|          | 0.00/3.03k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 2.17M/2.17M [00:00<00:00, 8.83MB/s]
Downloading data: 100%|██████████| 605k/605k [00:00<00:00, 1.49MB/s]


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [2]:
from transformers import AutoTokenizer, AutoModel
import numpy as np
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# bert_tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased-conversational")
# bert_model = AutoModel.from_pretrained("DeepPavlov/rubert-base-cased-conversational").to("cuda")
bert_tokenizer = AutoTokenizer.from_pretrained("ai-forever/sbert_large_nlu_ru")
bert_model = AutoModel.from_pretrained("ai-forever/sbert_large_nlu_ru").to("cuda")
bert_model.eval()

def get_bert_embeddings(
	text: str
) -> np.ndarray:
    tokens = bert_tokenizer.encode(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=512
    )
    n = 512  # max длина вектора
    padded = torch.LongTensor(
    	[
            np.array(tokens + [0] * (n - len(tokens)))
        ]
    )
    attention_mask = torch.LongTensor(
        np.where(
            padded != 0, 1, 0
        )
    )
    with torch.no_grad():
        batch_embeddings = bert_model(padded.to("cuda"), attention_mask=attention_mask.to("cuda"))[0][:, 0, :].cpu().numpy()
    return batch_embeddings

tokenizer_config.json:   0%|          | 0.00/323 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [3]:
labels = train_pd["class"].to_numpy()

In [4]:
train_pd

Unnamed: 0,id,id_1,id_2,text_1,text_2,class
0,1,201,8159,Полицейским разрешат стрелять на поражение по ...,Полиции могут разрешить стрелять по хулиганам ...,0
1,2,202,8158,Право полицейских на проникновение в жилище ре...,Правила внесудебного проникновения полицейских...,0
2,3,273,8167,Президент Египта ввел чрезвычайное положение в...,Власти Египта угрожают ввести в стране чрезвыч...,0
3,4,220,8160,Вернувшихся из Сирии россиян волнует вопрос тр...,Самолеты МЧС вывезут россиян из разрушенной Си...,-1
4,5,223,8160,В Москву из Сирии вернулись 2 самолета МЧС с р...,Самолеты МЧС вывезут россиян из разрушенной Си...,0
...,...,...,...,...,...,...
7222,25514,34622,34633,Путин освободил от должности ряд генералов,Путин снял с должностей более 20 руководителей...,0
7223,25524,34566,34654,Облака над Москвой в День Победы разгонят девя...,Путеводитель по Дню Победы: как провести 9 мая...,-1
7224,25548,34519,34681,Любляна отпразднует День Победы вместе с Москвой,В Москве ограничат движение в связи с Днем Победы,-1
7225,25549,34565,34681,Девять самолетов ВВС разгонят облака над Москв...,В Москве ограничат движение в связи с Днем Победы,-1


In [5]:
import tqdm

def get_data(pd):
    embeddings = []
    labels = []
    for index, row in tqdm.tqdm(pd.iterrows()):
        # Use BERT sentence separation token
        text = row["text_1"] + "[SEP]" + row["text_2"]
        embeddings.append(get_bert_embeddings(text))
        labels.append(int(int(row["class"])>-1))
    return embeddings, labels

In [19]:
train_embeddings, train_labels = get_data(train_pd)

7227it [07:54, 15.25it/s]


In [20]:
test_embeddings, test_labels = get_data(test_pd)

1924it [02:06, 15.23it/s]


In [21]:
train_embeddings = np.array(train_embeddings).squeeze()
test_embeddings = np.array(test_embeddings).squeeze()
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

In [9]:
!pip install catboost

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [23]:
import catboost
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
import optuna
from sklearn import metrics

In [24]:
X_train, X_valid, Xtrain_labels, valid_labels = train_test_split(train_embeddings, train_labels, test_size=0.2, random_state=42, shuffle=True, stratify=train_labels)

In [26]:
def objective(trial):
    global X_train, Xtrain_labels, X_valid, valid_labels

    param = {
        "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "used_ram_limit": "3gb",
    }

    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    gbm = CatBoostClassifier(**param)

    gbm.fit(X_train, Xtrain_labels, eval_set=[(X_valid, valid_labels)], verbose=0, early_stopping_rounds=100)

#     preds = gbm.predict(X_valid)
#     pred_labels = np.rint(preds)
#     accuracy = accuracy_score(valid_labels, pred_labels)
    return metrics.f1_score(gbm.predict(X_valid), valid_labels, average='macro')


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)
print(study.best_trial)

[I 2024-05-24 13:57:00,620] A new study created in memory with name: no-name-792140a9-c682-43a1-94b1-6ccc92f7c53b
[I 2024-05-24 14:01:15,165] Trial 0 finished with value: 0.7386387110101219 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.08839913927414812, 'depth': 12, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 0 with value: 0.7386387110101219.
[I 2024-05-24 14:01:53,614] Trial 1 finished with value: 0.7849730921996401 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.03885110236660856, 'depth': 5, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.7866297266569934}. Best is trial 1 with value: 0.7849730921996401.
[I 2024-05-24 14:01:58,443] Trial 2 finished with value: 0.7802097583219334 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.031336554896371785, 'depth': 3, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.4965850100369473}. Best is trial 1 with value: 0

FrozenTrial(number=8, state=TrialState.COMPLETE, values=[0.7886813922356091], datetime_start=datetime.datetime(2024, 5, 24, 14, 8, 21, 155491), datetime_complete=datetime.datetime(2024, 5, 24, 14, 8, 30, 890118), params={'objective': 'Logloss', 'colsample_bylevel': 0.0119009553264922, 'depth': 7, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.5900465528180336}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'objective': CategoricalDistribution(choices=('Logloss', 'CrossEntropy')), 'colsample_bylevel': FloatDistribution(high=0.1, log=False, low=0.01, step=None), 'depth': IntDistribution(high=12, log=False, low=1, step=1), 'boosting_type': CategoricalDistribution(choices=('Ordered', 'Plain')), 'bootstrap_type': CategoricalDistribution(choices=('Bayesian', 'Bernoulli', 'MVS')), 'subsample': FloatDistribution(high=1.0, log=False, low=0.1, step=None)}, trial_id=8, value=None)


In [28]:
param = {
        "objective": "Logloss",
        "colsample_bylevel": 0.0119,
        "depth": 7,
        "boosting_type": "Plain",
        "bootstrap_type": "Bernoulli",
        'subsample': 0.59
}
clf = CatBoostClassifier(**param, verbose=0)
clf.fit(train_embeddings, train_labels)

<catboost.core.CatBoostClassifier at 0x7de736e97130>

In [30]:
preds = clf.predict(test_embeddings)

# Evaluate the model
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(test_labels, preds))

Accuracy: 0.7177754677754677


In [31]:
from sklearn import metrics
f1_score_micro = metrics.f1_score(test_labels, preds, average='micro')
f1_score_macro = metrics.f1_score(test_labels, preds, average='macro')
print(f"Accuracy Score = {accuracy_score(test_labels, preds)}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.7177754677754677
F1 Score (Micro) = 0.7177754677754677
F1 Score (Macro) = 0.6786957544667365
