In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import time
# Read the training dataset.
df = pd.read_csv('/content/train.csv (1).csv')

# Replace missing texts with empty strings -- the same clean-up the test file
# receives before inference -- so the tokenizer is never handed a float NaN.
texts = df['text'].fillna('').tolist()
# One target column per trend: trend_id_res0 ... trend_id_res49
# (presumably 0/1 indicators, since they are later fed to a BCE loss).
trend_labels = df[[f'trend_id_res{i}' for i in range(50)]].values

# NOTE(review): test_size=0.001 keeps only a handful of rows for evaluation;
# the logged test accuracy moves in steps of 0.2, which is consistent with a
# 5-row hold-out. Confirm that such a small validation split is intended.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, trend_labels, test_size=0.001, random_state=42)

tokenizer = AutoTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')

class TrendDataset(Dataset):
    """Map-style dataset pairing each text with its multi-label target row.

    Args:
        texts: Indexable collection of raw input texts.
        labels: Indexable collection of label rows, aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus`` (a Hugging Face tokenizer
            in this file).
        max_len: Length every encoded sequence is truncated/padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        # The sample count is dictated by the texts alone.
        return len(self.texts)

    def __getitem__(self, index):
        """Return the tokenized text and float label tensor for ``index``.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors and a
            float ``labels`` tensor.
        """
        sample_text = self.texts[index]
        target = torch.tensor(self.labels[index], dtype=torch.float32)

        # Fixed-length encoding: truncate long texts, pad short ones.
        tokenizer_options = {
            'truncation': True,
            'add_special_tokens': True,
            'max_length': self.max_len,
            'return_token_type_ids': False,
            'padding': 'max_length',
            'return_attention_mask': True,
            'return_tensors': 'pt',
        }
        encoded = self.tokenizer.encode_plus(sample_text, **tokenizer_options)

        # The tokenizer output carries a leading batch axis; flatten it away
        # so the DataLoader can stack samples itself.
        item = {
            key: torch.flatten(encoded[key])
            for key in ('input_ids', 'attention_mask')
        }
        item['labels'] = target
        return item

def create_data_loader(texts, labels, tokenizer, batch_size, shuffle=True, max_len=128):
    """Wrap texts and labels in a ``TrendDataset`` and return a ``DataLoader``.

    Args:
        texts: Raw input texts.
        labels: Label rows aligned with ``texts``.
        tokenizer: Tokenizer handed to ``TrendDataset``.
        batch_size: Number of samples per batch.
        shuffle: Whether the loader shuffles the samples. Defaults to ``True``,
            which was previously hard-coded; pass ``False`` for evaluation
            loaders, where a stable order is preferable.
        max_len: Sequence length forwarded to ``TrendDataset``. Defaults to
            128, the dataset's own default.

    Returns:
        A ``DataLoader`` over the constructed dataset.
    """
    ds = TrendDataset(texts, labels, tokenizer, max_len=max_len)
    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle)

# Mini-batch size shared by the training and hold-out loaders.
batch_size = 16
# Both splits go through the same factory, in train-then-test order.
train_data_loader, test_data_loader = [
    create_data_loader(split_texts, split_labels, tokenizer, batch_size)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (test_texts, test_labels),
    )
]

class ModelWithDropout(nn.Module):
    """Wrap a sequence-classification model and pass its logits through dropout.

    The wrapped model is called with ``input_ids`` and ``attention_mask`` and
    must return an object exposing a ``logits`` attribute.

    NOTE(review): dropout is applied to the output logits themselves, so in
    training mode some class logits are zeroed before the loss is computed and
    before they are returned. Confirm this is intended rather than dropout on
    the features feeding the classifier.

    Args:
        base_model: The model producing the logits.
        dropout_rate: Drop probability handed to ``nn.Dropout``.
    """

    def __init__(self, base_model, dropout_rate=0.3):
        super().__init__()
        self.base_model = base_model
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and optionally compute the multi-label loss.

        Args:
            input_ids: Token id tensor forwarded to the wrapped model.
            attention_mask: Attention mask forwarded to the wrapped model.
            labels: Optional float targets with the same shape as the logits.

        Returns:
            dict with ``"loss"`` (mean binary cross-entropy with logits, or
            ``None`` when no labels are given) and ``"logits"`` (the logits
            after dropout).
        """
        raw_logits = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).logits
        logits = self.dropout(raw_logits)

        if labels is None:
            loss = None
        else:
            # The loss is taken on the dropped-out logits, not the raw ones.
            loss = nn.functional.binary_cross_entropy_with_logits(logits, labels)

        return {"loss": loss, "logits": logits}

# Pretrained RuBERT checkpoint loaded with a 50-output classification head.
base_model = AutoModelForSequenceClassification.from_pretrained(
    'DeepPavlov/rubert-base-cased',
    num_labels=50,
)

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Wrap the base model with logit dropout (p=0.3) and move it to the device.
model = ModelWithDropout(base_model, dropout_rate=0.3)
model = model.to(device)

# NOTE(review): this is the AdamW exported by transformers (it accepts
# correct_bias); it is reportedly deprecated upstream in favour of
# torch.optim.AdamW -- confirm against the installed transformers version.
optimizer = AdamW(
    model.parameters(),
    lr=1e-5,
    weight_decay=1.5e-2,
    correct_bias=False,
)

def train_epoch(model, data_loader, optimizer, device):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Module returning a dict with ``'loss'`` and ``'logits'``.
        data_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        ``(avg_loss, accuracy)``: the mean of the per-batch losses, and
        ``accuracy_score`` of the 0.5-thresholded sigmoid outputs against the
        labels (for multi-label indicator inputs scikit-learn documents this
        as subset accuracy, i.e. every label of a row must match).
    """
    model.train()
    running_loss = 0
    prob_rows, label_rows = [], []

    for batch in data_loader:
        input_ids, attention_mask, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        )

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        # Plain model loss; no extra regularisation term is added here.
        batch_loss = outputs['loss']
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()

        # Metrics reuse the logits of this same training-mode forward pass;
        # they are brought back to the CPU as NumPy rows.
        probabilities = outputs['logits'].detach().sigmoid()
        prob_rows.extend(probabilities.cpu().numpy())
        label_rows.extend(labels.cpu().numpy())

    avg_loss = running_loss / len(data_loader)
    hard_preds = np.greater(np.array(prob_rows), 0.5).astype(int)
    accuracy = accuracy_score(np.array(label_rows), hard_preds)

    return avg_loss, accuracy

def eval_model(model, data_loader, device):
    """Score ``model`` on ``data_loader`` without updating any weights.

    Args:
        model: Module returning a dict with a ``'logits'`` entry.
        data_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        ``accuracy_score`` of the 0.5-thresholded sigmoid outputs against the
        labels.
    """
    model.eval()
    prob_rows, label_rows = [], []

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch in data_loader:
            input_ids, attention_mask, labels = (
                batch[key].to(device)
                for key in ('input_ids', 'attention_mask', 'labels')
            )
            # Labels are deliberately not passed, so no loss is computed.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            probabilities = outputs['logits'].detach().sigmoid()
            prob_rows.extend(probabilities.cpu().numpy())
            label_rows.extend(labels.cpu().numpy())

    hard_preds = np.greater(np.array(prob_rows), 0.5).astype(int)
    return accuracy_score(np.array(label_rows), hard_preds)

def train_model(model, train_loader, test_loader, optimizer, device, epochs):
    """Train for ``epochs`` epochs, evaluating and logging after each one.

    Args:
        model: Model passed through to ``train_epoch`` and ``eval_model``.
        train_loader: Loader used for the optimisation pass of every epoch.
        test_loader: Loader used for the evaluation pass of every epoch.
        optimizer: Optimizer passed through to ``train_epoch``.
        device: Device passed through to both passes.
        epochs: Number of epochs to run.

    Returns:
        The highest test accuracy seen across the epochs (``0.0`` when no
        epoch runs). This value was previously tracked but discarded.
    """
    best_accuracy = 0.0
    for epoch in range(epochs):
        start_time = time.time()

        train_loss, train_accuracy = train_epoch(model, train_loader, optimizer, device)
        test_accuracy = eval_model(model, test_loader, device)

        # The reported time covers both the training and the evaluation pass.
        epoch_time = time.time() - start_time

        print(f'Epoch {epoch + 1}/{epochs} | Train Loss: {train_loss:.4f} | Train Accuracy: {train_accuracy:.4f} | Test Accuracy: {test_accuracy:.4f} | Time: {epoch_time:.2f} seconds')

        if test_accuracy > best_accuracy:
            best_accuracy = test_accuracy

    return best_accuracy

# First training run: 10 epochs with the loaders and optimizer built above.
train_model(
    model=model,
    train_loader=train_data_loader,
    test_loader=test_data_loader,
    optimizer=optimizer,
    device=device,
    epochs=10,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10 | Train Loss: 0.3084 | Train Accuracy: 0.0000 | Test Accuracy: 0.0000 | Time: 109.25 seconds
Epoch 2/10 | Train Loss: 0.2783 | Train Accuracy: 0.0058 | Test Accuracy: 0.0000 | Time: 108.52 seconds
Epoch 3/10 | Train Loss: 0.2682 | Train Accuracy: 0.0981 | Test Accuracy: 0.2000 | Time: 108.34 seconds
Epoch 4/10 | Train Loss: 0.2620 | Train Accuracy: 0.1784 | Test Accuracy: 0.4000 | Time: 108.35 seconds
Epoch 5/10 | Train Loss: 0.2559 | Train Accuracy: 0.2369 | Test Accuracy: 0.4000 | Time: 108.28 seconds
Epoch 6/10 | Train Loss: 0.2508 | Train Accuracy: 0.2793 | Test Accuracy: 0.4000 | Time: 108.35 seconds
Epoch 7/10 | Train Loss: 0.2467 | Train Accuracy: 0.3116 | Test Accuracy: 0.4000 | Time: 108.25 seconds
Epoch 8/10 | Train Loss: 0.2440 | Train Accuracy: 0.3291 | Test Accuracy: 0.4000 | Time: 108.34 seconds
Epoch 9/10 | Train Loss: 0.2401 | Train Accuracy: 0.3579 | Test Accuracy: 0.4000 | Time: 108.38 seconds
Epoch 10/10 | Train Loss: 0.2371 | Train Accuracy: 0.3735 | Test

In [4]:
# Continue training the same model and optimizer for up to 30 more epochs.
train_model(
    model=model,
    train_loader=train_data_loader,
    test_loader=test_data_loader,
    optimizer=optimizer,
    device=device,
    epochs=30,
)

Epoch 1/30 | Train Loss: 0.2367 | Train Accuracy: 0.4004 | Test Accuracy: 0.4000 | Time: 109.45 seconds
Epoch 2/30 | Train Loss: 0.2330 | Train Accuracy: 0.3974 | Test Accuracy: 0.4000 | Time: 108.73 seconds
Epoch 3/30 | Train Loss: 0.2309 | Train Accuracy: 0.4158 | Test Accuracy: 0.4000 | Time: 108.54 seconds
Epoch 4/30 | Train Loss: 0.2306 | Train Accuracy: 0.4214 | Test Accuracy: 0.8000 | Time: 108.47 seconds
Epoch 5/30 | Train Loss: 0.2288 | Train Accuracy: 0.4459 | Test Accuracy: 0.6000 | Time: 108.39 seconds
Epoch 6/30 | Train Loss: 0.2274 | Train Accuracy: 0.4554 | Test Accuracy: 0.6000 | Time: 108.50 seconds
Epoch 7/30 | Train Loss: 0.2244 | Train Accuracy: 0.4755 | Test Accuracy: 0.6000 | Time: 108.48 seconds
Epoch 8/30 | Train Loss: 0.2231 | Train Accuracy: 0.4879 | Test Accuracy: 0.8000 | Time: 108.52 seconds
Epoch 9/30 | Train Loss: 0.2223 | Train Accuracy: 0.4822 | Test Accuracy: 0.6000 | Time: 108.40 seconds
Epoch 10/30 | Train Loss: 0.2222 | Train Accuracy: 0.5026 | Test

In [5]:
# Further fine-tuning of the same model and optimizer: up to 15 more epochs.
train_model(
    model=model,
    train_loader=train_data_loader,
    test_loader=test_data_loader,
    optimizer=optimizer,
    device=device,
    epochs=15,
)

Epoch 1/15 | Train Loss: 0.2110 | Train Accuracy: 0.5777 | Test Accuracy: 0.6000 | Time: 108.80 seconds
Epoch 2/15 | Train Loss: 0.2116 | Train Accuracy: 0.5840 | Test Accuracy: 0.8000 | Time: 108.55 seconds
Epoch 3/15 | Train Loss: 0.2113 | Train Accuracy: 0.5886 | Test Accuracy: 0.8000 | Time: 108.41 seconds
Epoch 4/15 | Train Loss: 0.2112 | Train Accuracy: 0.5912 | Test Accuracy: 0.8000 | Time: 108.35 seconds
Epoch 5/15 | Train Loss: 0.2101 | Train Accuracy: 0.5920 | Test Accuracy: 0.8000 | Time: 108.37 seconds
Epoch 6/15 | Train Loss: 0.2107 | Train Accuracy: 0.5717 | Test Accuracy: 0.6000 | Time: 108.28 seconds
Epoch 7/15 | Train Loss: 0.2107 | Train Accuracy: 0.6005 | Test Accuracy: 0.8000 | Time: 108.32 seconds
Epoch 8/15 | Train Loss: 0.2109 | Train Accuracy: 0.5871 | Test Accuracy: 0.6000 | Time: 108.33 seconds
Epoch 9/15 | Train Loss: 0.2106 | Train Accuracy: 0.5951 | Test Accuracy: 0.8000 | Time: 108.46 seconds
Epoch 10/15 | Train Loss: 0.2110 | Train Accuracy: 0.5909 | Test

# Saving model

In [11]:
# Persist only the state dict of the wrapper model, not the module object.
torch.save(obj=model.state_dict(), f='model_weights.pth')

# Prediction on the test set

In [12]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
# Read the test file.
df2 = pd.read_csv('/content/test.csv (2).csv')

# Substitute an empty string for every missing text, then print how many
# missing values remain in the column.
df2['text'] = df2['text'].fillna(value='')
remaining_missing = df2['text'].isna().sum()
print(remaining_missing)

# Same checkpoint name as the one used for the training tokenizer.
tokenizer = AutoTokenizer.from_pretrained(
    'DeepPavlov/rubert-base-cased',
)

class TestDataset(Dataset):
    """Map-style dataset of unlabeled texts, tokenized on access.

    Args:
        texts: Indexable collection of raw input texts.
        tokenizer: Object exposing ``encode_plus`` (a Hugging Face tokenizer
            in this file).
        max_len: Length every encoded sequence is truncated/padded to.
    """

    def __init__(self, texts, tokenizer, max_len=128):
        self.texts = texts
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        """Return the tokenized text at ``index``.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors.
        """
        # Fixed-length encoding: truncate long texts, pad short ones.
        tokenizer_options = {
            'truncation': True,
            'add_special_tokens': True,
            'max_length': self.max_len,
            'return_token_type_ids': False,
            'padding': 'max_length',
            'return_attention_mask': True,
            'return_tensors': 'pt',
        }
        encoded = self.tokenizer.encode_plus(self.texts[index], **tokenizer_options)

        # Flatten away the leading batch axis added by the tokenizer.
        return {
            key: torch.flatten(encoded[key])
            for key in ('input_ids', 'attention_mask')
        }

def create_test_data_loader(texts, tokenizer, batch_size=16):
    """Build an unshuffled ``DataLoader`` over a ``TestDataset`` of ``texts``.

    Args:
        texts: Raw input texts.
        tokenizer: Tokenizer handed to ``TestDataset``.
        batch_size: Number of samples per batch (16 by default).

    Returns:
        A ``DataLoader`` created with ``shuffle=False``.
    """
    return DataLoader(
        TestDataset(texts, tokenizer),
        batch_size=batch_size,
        shuffle=False,
    )

# Plain Python list of the test texts, then the loader built from it.
texts_df2 = df2['text'].to_list()
test_data_loader_df2 = create_test_data_loader(texts=texts_df2, tokenizer=tokenizer)

def predict(model, data_loader, device, threshold=0.5):
    """Return thresholded multi-label predictions for every batch in ``data_loader``.

    Args:
        model: Callable accepting ``input_ids`` and ``attention_mask`` keyword
            arguments and returning a mapping with a ``'logits'`` entry. Its
            ``eval()`` method is called before inference.
        data_loader: Iterable of batches with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        device: Device the batch tensors are moved to.
        threshold: A label is predicted when its sigmoid probability is
            strictly greater than this value. Defaults to 0.5, the value that
            was previously hard-coded.

    Returns:
        A NumPy boolean array with one row per sample, in loader order. An
        empty loader yields an empty array.
    """
    model.eval()
    all_preds = []

    # Inference only: no gradients are needed.
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            probabilities = torch.sigmoid(outputs['logits'])

            # Strict comparison: a probability equal to the threshold is negative.
            all_preds.extend((probabilities > threshold).cpu().numpy())

    return np.array(all_preds)

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# One row of per-label decisions for every test text.
predictions_df2 = predict(model, test_data_loader_df2, device)

label_columns = [f'trend_id_res{i}' for i in range(50)]
predictions_df2_df = pd.DataFrame(predictions_df2, columns=label_columns)

# For every row, the space-separated positions of the predicted labels;
# a row with no predicted label becomes an empty string.
predictions_with_indices = [
    ' '.join(str(label_idx) for label_idx in np.where(row == 1)[0].tolist())
    for row in predictions_df2
]

df2['target'] = predictions_with_indices

# Only the 'index' and 'target' columns are written out.
output_df = df2[['index', 'target']]
file_path = 'result.csv'
output_df.to_csv(file_path, index=False)

print(f"DataFrame saved to {file_path}")

0




DataFrame saved to result.csv
