In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizerFast, BertForSequenceClassification, AdamW
from sklearn.preprocessing import LabelEncoder


In [2]:
# Функция для кодирования предложений
def encode_sentences(tokenizer, sentences, max_length):
    input_ids, attention_masks = [], []
    for sentence in sentences:
        encoding = tokenizer.encode_plus(sentence, max_length=max_length, padding='max_length', truncation=True)
        input_ids.append(encoding['input_ids'])
        attention_masks.append(encoding['attention_mask'])
    return torch.tensor(input_ids), torch.tensor(attention_masks)

# Функция для подготовки датасета
def prepare_dataset(sentences, labels, max_length):
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')
    input_ids, attention_masks = encode_sentences(tokenizer, sentences, max_length)
    le = LabelEncoder()
    labels = le.fit_transform(labels)  # кодируем метки
    labels = torch.tensor(labels)
    return TensorDataset(input_ids, attention_masks, labels)


In [3]:
# Загрузка данных
train_df = pd.read_csv('data/train_supervised_dataset.csv')
valid_df = pd.read_csv('data/train_unsupervised_dataset.csv')

# Преобразование датафреймов в датасеты
max_length = 512  # или любое другое значение в зависимости от ваших потребностей
train_dataset = prepare_dataset(train_df['name'].tolist(), train_df['good'].tolist(), max_length)
# Так как valid_df не содержит 'good', мы можем использовать пустой список для меток
valid_dataset = prepare_dataset(valid_df['name'].tolist(), [], max_length)


AssertionError: Size mismatch between tensors

In [None]:
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased')
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=1e-5)


In [None]:
model.train()
for epoch in range(3):  # выберите подходящее количество эпох
    for i, batch in enumerate(train_loader):
        optimizer.zero_grad()
        input_ids, attention_mask, labels = [item.to(device) for item in batch]
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
