In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import time

df = pd.read_csv('/content/train.csv (1).csv')

texts = df['text'].tolist()
trend_labels = df[[f'trend_id_res{i}' for i in range(50)]].values

train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, trend_labels, test_size=0.2, random_state=42)

tokenizer = AutoTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')

class TrendDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        labels = torch.tensor(self.labels[index], dtype=torch.float)

        encoding = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': labels
        }

def create_data_loader(texts, labels, tokenizer, batch_size):
    ds = TrendDataset(texts, labels, tokenizer)
    return DataLoader(ds, batch_size=batch_size, shuffle=True)

batch_size = 16
train_data_loader = create_data_loader(train_texts, train_labels, tokenizer, batch_size)
test_data_loader = create_data_loader(test_texts, test_labels, tokenizer, batch_size)

class ModelWithDropout(nn.Module):
    def __init__(self, base_model, dropout_rate=0.3):
        super(ModelWithDropout, self).__init__()
        self.base_model = base_model
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        dropped_logits = self.dropout(logits)

        loss = None
        if labels is not None:
            loss_fn = nn.BCEWithLogitsLoss()
            loss = loss_fn(dropped_logits, labels)

        return {"loss": loss, "logits": dropped_logits}

base_model = AutoModelForSequenceClassification.from_pretrained('DeepPavlov/rubert-base-cased', num_labels=50)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ModelWithDropout(base_model, dropout_rate=0.3).to(device)

optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=1.5e-2, correct_bias=False)

def train_epoch(model, data_loader, optimizer, device):
    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []

    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        loss = outputs['loss']

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        logits = outputs['logits']
        all_preds.extend(logits.sigmoid().detach().cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    avg_loss = total_loss / len(data_loader)
    all_preds = (np.array(all_preds) > 0.5).astype(int)
    accuracy = accuracy_score(np.array(all_labels), all_preds)

    return avg_loss, accuracy

def eval_model(model, data_loader, device):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs['logits']

            all_preds.extend(logits.sigmoid().detach().cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    all_preds = (np.array(all_preds) > 0.5).astype(int)
    accuracy = accuracy_score(np.array(all_labels), all_preds)

    return accuracy

def train_model(model, train_loader, test_loader, optimizer, device, epochs):
    best_accuracy = 0.0
    for epoch in range(epochs):
        start_time = time.time()

        train_loss, train_accuracy = train_epoch(model, train_loader, optimizer, device)

        test_accuracy = eval_model(model, test_loader, device)

        end_time = time.time()
        epoch_time = end_time - start_time

        print(f'Epoch {epoch + 1}/{epochs} | Train Loss: {train_loss:.4f} | Train Accuracy: {train_accuracy:.4f} | Test Accuracy: {test_accuracy:.4f} | Time: {epoch_time:.2f} seconds')

        if test_accuracy > best_accuracy:
            best_accuracy = test_accuracy

train_model(model, train_data_loader, test_data_loader, optimizer, device, epochs=40)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/40 | Train Loss: 0.3139 | Train Accuracy: 0.0000 | Test Accuracy: 0.0000 | Time: 87.58 seconds
Epoch 2/40 | Train Loss: 0.2847 | Train Accuracy: 0.0000 | Test Accuracy: 0.0000 | Time: 88.73 seconds
Epoch 3/40 | Train Loss: 0.2777 | Train Accuracy: 0.0165 | Test Accuracy: 0.1319 | Time: 89.49 seconds
Epoch 4/40 | Train Loss: 0.2699 | Train Accuracy: 0.0919 | Test Accuracy: 0.1492 | Time: 89.77 seconds
Epoch 5/40 | Train Loss: 0.2652 | Train Accuracy: 0.1412 | Test Accuracy: 0.2854 | Time: 89.76 seconds
Epoch 6/40 | Train Loss: 0.2593 | Train Accuracy: 0.1825 | Test Accuracy: 0.3276 | Time: 89.63 seconds
Epoch 7/40 | Train Loss: 0.2553 | Train Accuracy: 0.2226 | Test Accuracy: 0.3438 | Time: 89.59 seconds
Epoch 8/40 | Train Loss: 0.2509 | Train Accuracy: 0.2677 | Test Accuracy: 0.3946 | Time: 89.76 seconds
Epoch 9/40 | Train Loss: 0.2481 | Train Accuracy: 0.2953 | Test Accuracy: 0.4292 | Time: 89.67 seconds
Epoch 10/40 | Train Loss: 0.2449 | Train Accuracy: 0.3275 | Test Accuracy

In [4]:
train_model(model, train_data_loader, test_data_loader, optimizer, device, epochs=10)

Epoch 1/10 | Train Loss: 0.2135 | Train Accuracy: 0.5673 | Test Accuracy: 0.5200 | Time: 87.56 seconds
Epoch 2/10 | Train Loss: 0.2139 | Train Accuracy: 0.5714 | Test Accuracy: 0.5286 | Time: 89.83 seconds
Epoch 3/10 | Train Loss: 0.2136 | Train Accuracy: 0.5800 | Test Accuracy: 0.5200 | Time: 89.95 seconds
Epoch 4/10 | Train Loss: 0.2118 | Train Accuracy: 0.5909 | Test Accuracy: 0.5351 | Time: 89.83 seconds
Epoch 5/10 | Train Loss: 0.2124 | Train Accuracy: 0.5773 | Test Accuracy: 0.5157 | Time: 89.82 seconds
Epoch 6/10 | Train Loss: 0.2119 | Train Accuracy: 0.5895 | Test Accuracy: 0.5211 | Time: 89.80 seconds
Epoch 7/10 | Train Loss: 0.2106 | Train Accuracy: 0.5792 | Test Accuracy: 0.5308 | Time: 89.75 seconds
Epoch 8/10 | Train Loss: 0.2122 | Train Accuracy: 0.6014 | Test Accuracy: 0.5276 | Time: 89.61 seconds
Epoch 9/10 | Train Loss: 0.2118 | Train Accuracy: 0.5933 | Test Accuracy: 0.5405 | Time: 89.74 seconds
Epoch 10/10 | Train Loss: 0.2123 | Train Accuracy: 0.5811 | Test Accuracy

In [5]:
train_model(model, train_data_loader, test_data_loader, optimizer, device, epochs=10)

Epoch 1/10 | Train Loss: 0.2124 | Train Accuracy: 0.5982 | Test Accuracy: 0.5330 | Time: 89.83 seconds
Epoch 2/10 | Train Loss: 0.2109 | Train Accuracy: 0.5782 | Test Accuracy: 0.5286 | Time: 89.84 seconds
Epoch 3/10 | Train Loss: 0.2113 | Train Accuracy: 0.6044 | Test Accuracy: 0.5330 | Time: 89.90 seconds
Epoch 4/10 | Train Loss: 0.2112 | Train Accuracy: 0.5882 | Test Accuracy: 0.5373 | Time: 89.88 seconds
Epoch 5/10 | Train Loss: 0.2101 | Train Accuracy: 0.5884 | Test Accuracy: 0.5319 | Time: 89.91 seconds
Epoch 6/10 | Train Loss: 0.2102 | Train Accuracy: 0.5903 | Test Accuracy: 0.5319 | Time: 89.94 seconds
Epoch 7/10 | Train Loss: 0.2099 | Train Accuracy: 0.5887 | Test Accuracy: 0.5265 | Time: 89.87 seconds
Epoch 8/10 | Train Loss: 0.2102 | Train Accuracy: 0.5922 | Test Accuracy: 0.5232 | Time: 89.86 seconds
Epoch 9/10 | Train Loss: 0.2111 | Train Accuracy: 0.5919 | Test Accuracy: 0.5351 | Time: 89.89 seconds
Epoch 10/10 | Train Loss: 0.2102 | Train Accuracy: 0.5909 | Test Accuracy

In [6]:
train_model(model, train_data_loader, test_data_loader, optimizer, device, epochs=10)

Epoch 1/10 | Train Loss: 0.2110 | Train Accuracy: 0.6044 | Test Accuracy: 0.5416 | Time: 89.82 seconds
Epoch 2/10 | Train Loss: 0.2104 | Train Accuracy: 0.6025 | Test Accuracy: 0.5265 | Time: 89.87 seconds
Epoch 3/10 | Train Loss: 0.2107 | Train Accuracy: 0.5890 | Test Accuracy: 0.5286 | Time: 89.95 seconds
Epoch 4/10 | Train Loss: 0.2102 | Train Accuracy: 0.5919 | Test Accuracy: 0.5297 | Time: 89.85 seconds
Epoch 5/10 | Train Loss: 0.2093 | Train Accuracy: 0.5936 | Test Accuracy: 0.5384 | Time: 89.97 seconds
Epoch 6/10 | Train Loss: 0.2099 | Train Accuracy: 0.5922 | Test Accuracy: 0.5319 | Time: 89.88 seconds
Epoch 7/10 | Train Loss: 0.2086 | Train Accuracy: 0.6106 | Test Accuracy: 0.5265 | Time: 89.78 seconds
Epoch 8/10 | Train Loss: 0.2090 | Train Accuracy: 0.6033 | Test Accuracy: 0.5276 | Time: 89.84 seconds
Epoch 9/10 | Train Loss: 0.2111 | Train Accuracy: 0.5952 | Test Accuracy: 0.5211 | Time: 89.82 seconds
Epoch 10/10 | Train Loss: 0.2100 | Train Accuracy: 0.5984 | Test Accuracy

In [11]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
df2 = pd.read_csv('/content/test.csv (2).csv')

df2['text'] = df2['text'].fillna('')

print(df2['text'].isna().sum())

tokenizer = AutoTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')

class TestDataset(Dataset):
    def __init__(self, texts, tokenizer, max_len=128):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]

        encoding = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }

def create_test_data_loader(texts, tokenizer, batch_size=16):
    ds = TestDataset(texts, tokenizer)
    return DataLoader(ds, batch_size=batch_size, shuffle=False)

texts_df2 = df2['text'].tolist()
test_data_loader_df2 = create_test_data_loader(texts_df2, tokenizer)

def predict(model, data_loader, device):
    model.eval()
    all_preds = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs['logits']

            preds = (torch.sigmoid(logits) > 0.5).cpu().numpy()
            all_preds.extend(preds)

    return np.array(all_preds)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

predictions_df2 = predict(model, test_data_loader_df2, device)

predictions_df2_df = pd.DataFrame(predictions_df2, columns=[f'trend_id_res{i}' for i in range(50)])

predictions_with_indices = []
for pred in predictions_df2:
    indices_of_ones = np.where(pred == 1)[0].tolist()
    indices_as_str = ' '.join(map(str, indices_of_ones))
    predictions_with_indices.append(indices_as_str)

df2['target'] = predictions_with_indices

output_df = df2[['index', 'target']]

file_path = 'result.csv'
output_df.to_csv(file_path, index=False)

print(f"DataFrame saved to {file_path}")

0




DataFrame saved to result.csv
