In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv('train.csv')
train = train.sample(frac=0.4, random_state=42)
sample_submission = pd.read_csv('sample_submission.csv')

In [3]:
data = pd.read_parquet('items.parquet')

In [4]:
train

Unnamed: 0,leftItemId,rightItemId,target
20987,4646476163753910898,5123122680727853126,relevant_minus
7077,179602477240964301,13519372438923722859,relevant_minus
23064,5413851602206336754,5343826051963097072,relevant
36501,12591732453231246389,12597193763191104739,relevant_minus
126719,13542769079320751337,886550916145567852,relevant
...,...,...,...
205708,12601449588782387507,10167081963908444521,no_relevant
12918,3755409024178116673,6996526768143679659,no_relevant
177852,12983183246897348976,16972574557361610135,relevant
7066,16839436918394481486,12534343399457166925,relevant


In [5]:
# merge для left
train = train.merge(
    data.add_suffix("_left").rename(columns={"itemId_left": "itemId"}),
    left_on="leftItemId",
    right_on="itemId",
    how="left"
)

# merge для right
train = train.merge(
    data.add_suffix("_right").rename(columns={"itemId_right": "itemId"}),
    left_on="rightItemId",
    right_on="itemId",
    how="left"
)


In [6]:
train.drop(columns=['leftItemId', 'rightItemId', 'itemId_x', 'itemId_y'], inplace=True)

In [7]:
train.drop(
    columns=['authorId_left', 'authorId_right'],
    inplace=True
)

In [8]:
train = train[train['target'].isin(['relevant_plus', 'relevant', 'relevant_minus', 'no_relevant'])]

In [9]:
from sklearn.model_selection import train_test_split



X_train, X_val, y_train, y_val = train_test_split(
    train.drop(columns=['target']),
    train['target'],
    test_size=0.2,
    random_state=42
)

In [10]:
y_train.value_counts()

target
relevant_minus    24023
relevant          23052
no_relevant       13143
relevant_plus      6497
Name: count, dtype: int64

In [11]:
from transformers import TrainingArguments
from sklearn.metrics import f1_score


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = np.argmax(predictions, axis=-1)
    return {
        "weighted_f1": f1_score(labels, preds, average="weighted")
    }



# используйте эти параметры обучения
training_args = TrainingArguments(
    # Основные параметры
    output_dir='./intfloat',  # Директория для сохранения
    
    # Параметры обучения
    num_train_epochs=1,                     # Количество эпох
    per_device_train_batch_size=1,         # Размер батча для обучения
    per_device_eval_batch_size=16,          # Размер батча для валидации
    learning_rate=1e-5,                     # Learning rate
    warmup_ratio = 0.1,                     # 10% от общего числа шагов для вармапа или warmup_steps = int(0.1 * total_training_steps)
    lr_scheduler_type = 'cosine',           # Можете посмотреть на них в 
                                            # https://www.kaggle.com/code/snnclsr/learning-rate-schedulers 
                                            # соответсвующий ему будет get_cosine_schedule_with_warmup
    gradient_accumulation_steps=32,
    # Сохранение и логирование
    logging_dir='./logs',                   # Директория для логов
    logging_steps=20,                      # Частота логирования
    save_steps=200,                         # Частота сохранения
    save_total_limit=2,                     # Максимум чекпоинтов
    save_strategy='steps',                  # Стратегия сохранения
    
    # Валидация
    eval_strategy='steps',
    eval_steps=200,            # Стратегия валидации
    load_best_model_at_end=True,            # Загружать лучшую модель
    metric_for_best_model='weighted_f1',             # Метрика для выбора лучшей
    greater_is_better=True,                 # Больше значение = лучше
    # воспроизводимость
    seed=42,                                # Seed для воспроизводимости
)

In [12]:
from transformers import Trainer


model_name = "intfloat/multilingual-e5-large-instruct"

In [13]:
from datasets import Dataset, ClassLabel
from transformers import AutoTokenizer


train_dataset = Dataset.from_dict({"right_title": X_train['title_right'].tolist(), "left_title": X_train['title_left'].tolist(), "label": y_train.tolist()})
val_dataset  = Dataset.from_dict({"right_title": X_val['title_right'].tolist(), "left_title": X_val['title_left'].tolist(), "label": y_val.tolist()})

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True)
tokenizer.padding_side = "right"

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

def preprocess(examples):
    return tokenizer(
        text=examples["right_title"],
        text_pair=examples["left_title"],
        truncation=True,
        padding="max_length",
        max_length=200
    )

train_dataset = train_dataset.map(preprocess, batched=True)
val_dataset = val_dataset.map(preprocess, batched=True)

Map:   0%|          | 0/66715 [00:00<?, ? examples/s]

Map:   0%|          | 0/16679 [00:00<?, ? examples/s]

In [14]:
labels = ClassLabel(names=["relevant_plus", "relevant", "relevant_minus", "no_relevant"])

train_dataset = train_dataset.map(lambda x: {"label": labels.str2int(x["label"])})
val_dataset   = val_dataset.map(lambda x: {"label": labels.str2int(x["label"])})

Map:   0%|          | 0/66715 [00:00<?, ? examples/s]

Map:   0%|          | 0/16679 [00:00<?, ? examples/s]

In [15]:
train_dataset = train_dataset.remove_columns(["right_title", "left_title"])
val_dataset   = val_dataset.remove_columns(["right_title", "left_title"])

In [16]:
train_dataset[0]

{'label': 1,
 'input_ids': [0,
  4805,
  90520,
  134354,
  4476,
  419,
  77,
  118763,
  4,
  252,
  1203,
  23590,
  87152,
  114299,
  35,
  16721,
  40519,
  32,
  6,
  89077,
  155031,
  44333,
  10090,
  10573,
  743,
  46,
  56756,
  227,
  35,
  127353,
  227,
  5,
  2,
  2,
  2443,
  20351,
  194682,
  49,
  77005,
  19448,
  12,
  121124,
  1794,
  103,
  35397,
  4,
  38511,
  9526,
  245,
  19789,
  32889,
  35,
  121592,
  29,
  117037,
  53173,
  105,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,

In [27]:
from transformers import AutoModelForSequenceClassification
import os

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=4,
    trust_remote_code=True
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at intfloat/multilingual-e5-large-instruct and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
model

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=1024, ou

In [18]:
for param in model.parameters():
    param.requires_grad = True

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

Step,Training Loss,Validation Loss,Weighted F1
200,1.2283,1.177123,0.358746
400,1.0853,1.08044,0.506529
600,1.0632,1.040879,0.529878
800,1.1288,1.036119,0.534159
1000,1.0217,1.050485,0.529744
1200,1.0429,1.04848,0.532914
1400,0.9978,1.027639,0.540098
1600,1.0395,1.01286,0.545442
1800,1.0358,1.018664,0.544619
2000,1.0058,1.015321,0.545605


TrainOutput(global_step=2084, training_loss=1.0685231171963083, metrics={'train_runtime': 5688.5981, 'train_samples_per_second': 11.728, 'train_steps_per_second': 0.366, 'total_flos': 2.42769760671744e+16, 'train_loss': 1.0685231171963083, 'epoch': 0.9995952934122762})

Эксперименты 

sergeyzh/BERTA - 0.41

BAAI/bge-m3 - 0.48

intfloat/multilingual-e5-large-instruct - 0.51


sentence-transformers/paraphrase-multilingual-mpnet-base-v2 - 0.39


sergeyzh/BERTA - 0.2 датасета 1e-5 4 bs 4 ga - 0.51


sergeyzh/BERTA - 0.4 датасета 1e-5 32 bs - 0.517

sergeyzh/BERTA - 0.8 датасета 1e-5 32 bs - 0.53


In [16]:
test = pd.read_csv('test.csv')

In [17]:
# merge для left
test = test.merge(
    data.add_suffix("_left").rename(columns={"itemId_left": "itemId"}),
    left_on="leftItemId",
    right_on="itemId",
    how="left"
)

# merge для right
test = test.merge(
    data.add_suffix("_right").rename(columns={"itemId_right": "itemId"}),
    left_on="rightItemId",
    right_on="itemId",
    how="left"
)

In [18]:
test.drop(columns=['leftItemId', 'rightItemId', 'itemId_x', 'itemId_y'], inplace=True)
test.drop(
    columns=['authorId_left', 'authorId_right', 'content_right', 'content_left'],
    inplace=True
)

In [19]:
from datasets import Dataset


test = Dataset.from_dict({"right_title": test['title_right'].tolist(), "left_title": test['title_left'].tolist()})

In [20]:
def preprocess_test(examples):
    return tokenizer(
        text=examples["right_title"],
        text_pair=examples["left_title"],
        truncation=True,
        padding="max_length",
        max_length=200
    )
test_dataset = test.map(preprocess_test, batched=True)

Map:   0%|          | 0/51636 [00:00<?, ? examples/s]

In [21]:
test_dataset = test_dataset.remove_columns(["right_title", "left_title"])

In [22]:
from transformers import TrainingArguments, AutoModelForSequenceClassification

checkpoint_dir = "intfloat/checkpoint-2084"

model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)
model.eval()

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=1024, ou

In [23]:

test_args = TrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=16
)

trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [24]:
trainer.model.eval()  # ставим модель в eval-режим (отключает dropout и т.п.)
predictions = trainer.predict(test_dataset)
logits = predictions.predictions

In [25]:
preds = np.argmax(logits, axis=-1)

In [26]:
predictions = pd.DataFrame(preds, columns=['target'])
predictions['target'] = predictions['target'].map(lambda x: labels.int2str(x))
predictions

Unnamed: 0,target
0,relevant
1,relevant
2,relevant_plus
3,no_relevant
4,relevant_minus
...,...
51631,relevant_minus
51632,relevant
51633,relevant
51634,no_relevant


In [27]:
predictions.to_csv('predictions_intfloat.csv', index=True)

**АНСАМБЛИРУЕМ ДВЕ МОДЕЛИ**

In [43]:
berta_logits = np.load("logits_BERTA.npy")
intfloat_logits = np.load("logits_intfloat.npy")
logits = berta_logits * 0.4 + intfloat_logits * 0.6

In [44]:
preds = np.argmax(logits, axis=-1)

In [45]:
predictions = pd.DataFrame(preds, columns=['target'])
predictions['target'] = predictions['target'].map(lambda x: labels.int2str(x))
predictions

Unnamed: 0,target
0,relevant
1,relevant_minus
2,relevant_plus
3,no_relevant
4,relevant_minus
...,...
51631,relevant_minus
51632,relevant
51633,relevant
51634,no_relevant


In [47]:
predictions.to_csv('predictions_ansamble_04.csv', index=True)

**КАТБУСТ НА ЭМБЕДДИНГАХ**

In [60]:
import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers import default_data_collator

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # выключаем dropout

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(55083, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [59]:
test_loader = DataLoader(test_dataset, batch_size=32, collate_fn=default_data_collator)
train_loader = DataLoader(train_dataset, batch_size=32, collate_fn=default_data_collator)
val_loader = DataLoader(val_dataset, batch_size=32, collate_fn=default_data_collator)

In [62]:
test_embeddings = []
train_embeddings = []
val_embeddings = []

In [None]:
from tqdm import tqdm



with torch.no_grad():
    for batch in tqdm(test_loader):
        # если датасет HuggingFace, у него обычно есть input_ids, attention_mask, ...
        inputs = {k: v.to(device) for k, v in batch.items() if k in ["input_ids","attention_mask","token_type_ids"]}

        # Получаем все скрытые состояния
        outputs = model.bert(**inputs, output_hidden_states=True, return_dict=True)

        # Берём последний слой ([-1]) -> (batch, seq_len, 768)
        last_hidden = outputs.hidden_states[-1]

        # Пуллим в один вектор (например CLS-токен)
        cls_emb = last_hidden[:, 0, :]   # (batch, 768)
        test_embeddings.append(cls_emb.cpu().numpy())


In [63]:
from tqdm import tqdm



with torch.no_grad():
    for batch in tqdm(train_loader):
        # если датасет HuggingFace, у него обычно есть input_ids, attention_mask, ...
        inputs = {k: v.to(device) for k, v in batch.items() if k in ["input_ids","attention_mask","token_type_ids"]}

        # Получаем все скрытые состояния
        outputs = model.bert(**inputs, output_hidden_states=True, return_dict=True)

        # Берём последний слой ([-1]) -> (batch, seq_len, 768)
        last_hidden = outputs.hidden_states[-1]

        # Пуллим в один вектор (например CLS-токен)
        cls_emb = last_hidden[:, 0, :]   # (batch, 768)
        train_embeddings.append(cls_emb.cpu().numpy())


with torch.no_grad():
    for batch in tqdm(val_loader):
        # если датасет HuggingFace, у него обычно есть input_ids, attention_mask, ...
        inputs = {k: v.to(device) for k, v in batch.items() if k in ["input_ids","attention_mask","token_type_ids"]}

        # Получаем все скрытые состояния
        outputs = model.bert(**inputs, output_hidden_states=True, return_dict=True)

        # Берём последний слой ([-1]) -> (batch, seq_len, 768)
        last_hidden = outputs.hidden_states[-1]

        # Пуллим в один вектор (например CLS-токен)
        cls_emb = last_hidden[:, 0, :]   # (batch, 768)
        val_embeddings.append(cls_emb.cpu().numpy())

100%|██████████| 4170/4170 [08:39<00:00,  8.03it/s]
100%|██████████| 1043/1043 [02:08<00:00,  8.11it/s]


In [64]:
train_embeddings = np.concatenate(train_embeddings, axis=0)
val_embeddings = np.concatenate(val_embeddings, axis=0)

In [None]:
# Объединяем и сохраняем
test_embeddings = np.concatenate(test_embeddings, axis=0)

Saved: (51636, 768)


In [None]:
X_train_processed = pd.DataFrame(train_embeddings)
X_val_processed = pd.DataFrame(val_embeddings)
X_test_processed = pd.DataFrame(test_embeddings)

In [67]:
from catboost import CatBoostClassifier, Pool


train_pool = Pool(X_train_processed, label=y_train)
val_pool = Pool(X_val_processed, label=y_val)

catboost_model = CatBoostClassifier(
    iterations=5000,
    depth=5,
    learning_rate=0.05,
    early_stopping_rounds=200,
    loss_function="MultiClass",
    eval_metric="TotalF1:average=Weighted",
    verbose=100,
    random_seed=42,
    task_type="GPU" if torch.cuda.is_available() else "CPU"
)

catboost_model.fit(train_pool, eval_set=val_pool, use_best_model=True, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))



0:	learn: 0.4922321	test: 0.4831985	best: 0.4831985 (0)	total: 38ms	remaining: 3m 9s
100:	learn: 0.5414123	test: 0.5251432	best: 0.5251432 (100)	total: 2.41s	remaining: 1m 56s
200:	learn: 0.5472344	test: 0.5268458	best: 0.5277029 (181)	total: 4.68s	remaining: 1m 51s
300:	learn: 0.5522035	test: 0.5278320	best: 0.5282948 (250)	total: 6.89s	remaining: 1m 47s
400:	learn: 0.5558517	test: 0.5273309	best: 0.5282948 (250)	total: 9.04s	remaining: 1m 43s
bestTest = 0.5282948337
bestIteration = 250
Shrink model to first 251 iterations.


<catboost.core.CatBoostClassifier at 0x264554cb5d0>

In [68]:
logits = catboost_model.predict_proba(X_test_processed)
logits

array([[0.0485918 , 0.5035331 , 0.38963356, 0.05824154],
       [0.04530167, 0.43056493, 0.48583563, 0.03829777],
       [0.00377576, 0.16880043, 0.02049642, 0.80692739],
       ...,
       [0.04653209, 0.43652009, 0.47987317, 0.03707466],
       [0.27982797, 0.18537426, 0.5140263 , 0.02077147],
       [0.02541443, 0.58608656, 0.24048502, 0.148014  ]])

In [71]:
BERTA_logits = np.load('logits_BERTA.npy')

In [72]:
mean_logits = (BERTA_logits + logits) / 2
mean_logits

array([[-0.54381147,  0.80700903,  0.76759381, -0.44102645],
       [-0.71404672,  0.77461168,  0.92037181, -0.67592782],
       [ 1.31953462,  0.56312398, -0.56972424, -1.00657048],
       ...,
       [-0.51949075,  0.86156499,  0.82769739, -0.64948548],
       [-0.97698074,  0.24418196,  0.88159854,  0.443408  ],
       [ 0.11198893,  0.93156876,  0.37651033, -0.94059185]])

In [74]:
mean_predictions = np.argmax(mean_logits, axis=1)

In [75]:
predictions = pd.DataFrame(mean_predictions, columns=['target'])
predictions['target'] = predictions['target'].map(lambda x: labels.int2str(x))
predictions

Unnamed: 0,target
0,relevant
1,relevant_minus
2,relevant_plus
3,no_relevant
4,relevant_minus
...,...
51631,relevant_minus
51632,relevant
51633,relevant
51634,relevant_minus


In [76]:
predictions.to_csv('mean_predictions_BERTA_CatBoost.csv', index=True)