In [1]:
from tqdm import tqdm
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset
import pandas as pd

# Загрузка данных

In [3]:
# Read the train and test splits from the Kaggle input dataset.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/nlp-classif-datasets/train.csv",
        "/kaggle/input/nlp-classif-datasets/test.csv",
    )
)

# Candidate categories. The list order matters: it is later enumerated to
# build the label <-> id mappings, so do not reorder it casually.
categories = [
    "бытовая техника",
    "обувь",
    "одежда",
    "посуда",
    "текстиль",
    "товары для детей",
    "украшения и аксессуары",
    "электроника",
    "нет товара",
]

# Инициализация zero-shot классификатора


In [4]:
# Zero-shot classification pipeline; used below to pseudo-label the
# training texts against the candidate categories.
classifier = pipeline(
    task="zero-shot-classification",
    model="joeddav/xlm-roberta-large-xnli",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/734 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Device set to use cuda:0


# Автоматическая разметка train-данных, разбиение на train/val и формирование Dataset

In [5]:
# Pseudo-labelling settings.
CONFIDENCE_THRESHOLD = 0.5    # top-label score below this -> fallback label
FALLBACK_LABEL = "нет товара"  # must be one of `categories`
SPLIT_SEED = 42               # fixes the train/val partition across runs

train_texts = train_df['text'].tolist()
train_labels = []
train_confidences = []

for text in tqdm(train_texts):
    res = classifier(text, candidate_labels=categories, multi_label=False)
    # The pipeline returns labels sorted by descending score, so index 0
    # is the most likely category.
    label = res['labels'][0]
    score = res['scores'][0]
    # Low-confidence predictions are relabelled with the fallback category.
    # The stored confidence is still the score of the original top label.
    if score < CONFIDENCE_THRESHOLD:
        label = FALLBACK_LABEL
    train_labels.append(label)
    train_confidences.append(score)

# Attach the pseudo-labels to the dataframe.
train_df['category'] = train_labels
train_df['confidence'] = train_confidences

# Map category names to integer ids (and back) in the order of `categories`.
label2id = {cat: idx for idx, cat in enumerate(categories)}
id2label = {idx: cat for cat, idx in label2id.items()}
train_df['label'] = train_df['category'].map(label2id)
# Every assigned category must be a known one, otherwise the map yields NaN.
assert train_df['label'].notna().all(), "unmapped category in pseudo-labels"

# Persist the pseudo-labelled data for inspection.
train_df[['text', 'category', 'confidence']].to_csv("train_labeled.csv", index=False, encoding="utf-8-sig")
print("train_labeled.csv сохранён")

# Build the HuggingFace Dataset and split it 80% train / 20% validation.
# The split is seeded so validation metrics are comparable between runs.
dataset = Dataset.from_pandas(train_df[['text','label']])
dataset = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
print("train data is marked up and saved")


  1%|          | 10/1818 [00:02<04:27,  6.76it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 1818/1818 [04:44<00:00,  6.38it/s]


train_labeled.csv сохранён
train data is marked up and saved


# Загружаем модель для классификации, конфигурация LoRA


In [None]:
from transformers import TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
import torch
import numpy as np
from sklearn.metrics import f1_score

model_name = "DeepPavlov/rubert-base-cased"

# Sequence-classification model with one output per category; the
# label <-> id mappings are stored in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(categories),
    id2label=id2label,
    label2id=label2id,
)

# LoRA configuration: adapters are attached to the attention
# "query" and "value" modules.
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["query", "value"],
)
model = get_peft_model(model, peft_config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Токенизация, метрики, параметры обучения

In [None]:
# Tokenizer matching the base model.
tokenizer = AutoTokenizer.from_pretrained(model_name)


def preprocess(batch):
    """Tokenize the 'text' field of a batch.

    Every sequence is truncated and padded to exactly 256 tokens.
    """
    texts = batch['text']
    return tokenizer(
        texts,
        max_length=256,
        padding='max_length',
        truncation=True,
    )


# Tokenize both splits; the 'test' split of `dataset` serves as validation.
train_encodings = dataset['train'].map(preprocess, batched=True)
val_encodings = dataset['test'].map(preprocess, batched=True)

def compute_metrics(eval_pred):
    """Compute the weighted F1 score for an evaluation pass.

    Reads `eval_pred.predictions` (per-class scores, argmax taken over
    axis 1) and `eval_pred.label_ids` (reference labels).
    Returns a dict with the single key "weighted_f1".
    """
    class_scores = eval_pred.predictions
    references = eval_pred.label_ids
    predicted_ids = np.argmax(class_scores, axis=1)
    return {"weighted_f1": f1_score(references, predicted_ids, average='weighted')}

# Training configuration. Evaluation and checkpointing both happen once per
# epoch, so every epoch leaves a checkpoint that can be reloaded later.
training_args = TrainingArguments(
    # Fixed typo ("model_outut"): the checkpoint-loading cell reads from
    # ".../model_output/checkpoint-<step>", so the directory must match.
    output_dir="model_output",
    num_train_epochs=20,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-4,
    save_strategy="epoch",
    eval_strategy="epoch",
    logging_dir="logs",
    disable_tqdm=False,
    report_to="none",  # no external experiment trackers
    logging_steps=100
)

# The tokenizer is passed to the Trainer alongside the model and the
# tokenized train/validation datasets.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encodings,
    eval_dataset=val_encodings,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Обучение модели

In [None]:
# Fine-tune the model; the two messages bracket the training run.
print("Training")
trainer.train()
print("Trained")

Training


Epoch,Training Loss,Validation Loss,Weighted F1
1,1.292,1.040365,0.567869
2,0.9364,0.855897,0.625369
3,0.7991,0.760621,0.752629
4,0.6531,0.729918,0.750897
5,0.5728,0.738861,0.763773
6,0.5236,0.706752,0.764761
7,0.4777,0.793073,0.771045
8,0.452,0.727988,0.779808
9,0.4054,0.79804,0.775634
10,0.3354,0.819023,0.777092


Trained


После 13-й эпохи получена лучшая метрика weighted F1 = 0.815 на валидационной выборке, поэтому для предсказаний на тесте выбираем веса модели после 13-й эпохи.

# Загрузка лучшей версии модели

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
from peft import PeftModel, PeftConfig

# Checkpoint chosen for inference (described in the notebook as the weights
# after epoch 13).
# NOTE(review): with 1818 labelled rows, an 80/20 split and batch size 8, one
# epoch is about 182 optimizer steps, which would make step 3094 epoch 17
# rather than 13 -- confirm which checkpoint is actually intended.
checkpoint_dir = "/kaggle/working/model_output/checkpoint-3094"

# Base model with a classification head sized for our categories.
base_model = AutoModelForSequenceClassification.from_pretrained(
    "DeepPavlov/rubert-base-cased",
    num_labels=len(categories),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

# Tokenizer files are read from the checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)

# Wrap the base model with the LoRA configuration and weights stored in
# the checkpoint.
model = PeftModel.from_pretrained(base_model, checkpoint_dir)

# Simple confirmation that loading finished without raising.
print("LoRA модель успешно загружена")

pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LoRA модель успешно загружена


# Предсказания для тестовой выборки

In [None]:
import time

print('predicting')
# Tokenize the whole test set up front. max_length=256 mirrors the limit used
# by `preprocess` at training time, so inference sees inputs truncated the
# same way the model was trained on.
test_encodings = tokenizer(
    test_df['text'].tolist(),
    truncation=True,
    max_length=256,
    padding=True,
    return_tensors="pt",
)
test_dataset = torch.utils.data.TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'])
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
use_cuda = device.type == "cuda"
model.to(device)
model.eval()
all_preds = []

# CUDA events only exist on GPU runtimes; fall back to a wall-clock timer on
# CPU so the cell does not crash there.
if use_cuda:
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
else:
    wall_start = time.perf_counter()

with torch.no_grad():
    for input_ids, attention_mask in tqdm(test_loader):
        # Move tensors to the selected device instead of hard-coding .cuda().
        outputs = model(
            input_ids=input_ids.to(device),
            attention_mask=attention_mask.to(device),
        )
        # .tolist() yields plain Python ints, matching the id2label keys.
        all_preds.extend(torch.argmax(outputs.logits, dim=1).cpu().tolist())

if use_cuda:
    end.record()
    torch.cuda.synchronize()  # wait for the GPU before reading the timer
    total_ms = start.elapsed_time(end)
else:
    total_ms = (time.perf_counter() - wall_start) * 1000.0

elapsed_time = total_ms / len(test_df)  # average time per example (ms)
print(f"Avg inference time per example: {elapsed_time:.2f} ms")

# Build the submission file with the predicted category names.
test_labels = [id2label[p] for p in all_preds]
submission = pd.DataFrame({"category": test_labels})
submission.to_csv("submission.csv", index=False)
print('predicted')
print("THE END")

predicting


100%|██████████| 455/455 [00:53<00:00,  8.57it/s]

Avg inference time per example: 7.30 ms
predicted
THE END



