In [3]:
!pip install wandb



In [4]:
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm.notebook import tqdm
import wandb

In [5]:
# Demo: iterate over ten dummy items with a tqdm progress bar.
import time

# Build a list to iterate over
items = list(range(10))

# Wrap the iterable in tqdm to render a progress bar
for item in tqdm(items, desc='Прогресс'):
    # Simulate a delay so the progress is visible
    time.sleep(0.5)


Прогресс:   0%|          | 0/10 [00:00<?, ?it/s]

In [6]:
# Authenticate with Weights & Biases; as the cell output shows, this may prompt
# for an API key interactively.
wandb.login()
# Start a run in the "base001" project so that the wandb.log calls made during
# training have a run to report to.
wandb.init(project="base001")

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33myaeooa[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [7]:
# Load the semicolon-separated dataset and give its two columns stable names.
data = pd.read_csv('/kaggle/input/zxcvbn/pars_data.csv', delimiter=';')
data.columns = ['text', 'target']

# Drop incomplete rows by assignment rather than in place, so re-running the
# cell always starts from the freshly read frame.
data = data.dropna()

# Work on a reproducible subsample. The size is capped at the number of
# available rows because DataFrame.sample raises ValueError when asked for
# more rows than exist (without replacement).
data = data.sample(n=min(15000, len(data)), random_state=42)

# Class balance of the subsample; also used later to derive loss weights.
class_counts = data['target'].value_counts()
print("Количество примеров в каждом классе:")
print(class_counts)

Количество примеров в каждом классе:
target
0    10923
1     4077
Name: count, dtype: int64


In [8]:
def tokenize_text(texts, tokenizer, max_length):
    """Tokenize texts one by one and stack the encodings into two tensors.

    Args:
        texts: Iterable of strings to encode (for example a list or a pandas
            Series of documents).
        tokenizer: Hugging Face-style tokenizer. It is called directly
            (``tokenizer(text, ...)``), which is the supported replacement
            for the deprecated ``encode_plus`` method.
        max_length: Every sequence is truncated or padded to exactly this
            many tokens.

    Returns:
        A tuple ``(input_ids, attention_masks)``. Each is the row-wise
        concatenation of the per-text encodings, i.e. one row per input text.
        For empty ``texts`` two empty ``(0, max_length)`` int64 tensors are
        returned instead of letting ``torch.cat`` fail on an empty list.
    """
    input_ids = []
    attention_masks = []

    for text in tqdm(texts, desc="Tokenization Progress"):
        encoded_text = tokenizer(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids.append(encoded_text['input_ids'])
        attention_masks.append(encoded_text['attention_mask'])

    if not input_ids:
        # torch.cat raises on an empty list; return well-formed empty tensors.
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

# Two-stage split: hold out 20% of the data, then halve the hold-out into
# validation and test sets (80/10/10 overall).
# NOTE(review): the splits are not stratified although the classes are
# imbalanced — consider passing stratify= if class ratios should match.
train_texts, holdout_texts, train_labels, holdout_labels = train_test_split(
    data['text'],
    data['target'],
    test_size=0.2,
    random_state=42,
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    holdout_texts,
    holdout_labels,
    test_size=0.5,
    random_state=42,
)

tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

# Every sequence is padded/truncated to this many tokens.
max_length = 512

# Encode each split with the same tokenizer and sequence length.
train_input_ids, train_attention_masks = tokenize_text(
    train_texts, tokenizer, max_length
)
val_input_ids, val_attention_masks = tokenize_text(
    val_texts, tokenizer, max_length
)
test_input_ids, test_attention_masks = tokenize_text(
    test_texts, tokenizer, max_length
)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

Tokenization Progress:   0%|          | 0/12000 [00:00<?, ?it/s]

Tokenization Progress:   0%|          | 0/1500 [00:00<?, ?it/s]

Tokenization Progress:   0%|          | 0/1500 [00:00<?, ?it/s]

In [9]:
batch_size = 16


def _to_dataset(input_ids, attention_masks, labels):
    """Bundle token ids, attention masks and a label Series into a TensorDataset."""
    return TensorDataset(input_ids, attention_masks, torch.tensor(labels.values))


train_dataset = _to_dataset(train_input_ids, train_attention_masks, train_labels)
val_dataset = _to_dataset(val_input_ids, val_attention_masks, val_labels)
test_dataset = _to_dataset(test_input_ids, test_attention_masks, test_labels)

# Training batches are drawn in random order; validation and test batches are
# read sequentially.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    sampler=RandomSampler(train_dataset),
)
val_dataloader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    sampler=SequentialSampler(val_dataset),
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    sampler=SequentialSampler(test_dataset),
)

In [10]:
# Prefer the GPU, but fall back to the CPU instead of crashing when CUDA is
# unavailable.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# ALBERT base with a freshly initialised two-class classification head.
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=2)
model.to(device)

# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are set explicitly to the values the transformers optimizer
# used by default (1e-6 and 0.0); the torch defaults are 1e-8 and 0.01.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
epochs = 3

# Weight class 0 by the ratio count(class 1) / count(class 0) and class 1 by
# 1.0, so the more frequent class 0 contributes less to the loss per example.
class_weights = torch.tensor(
    [class_counts[1] / class_counts[0], 1.0], dtype=torch.float, device=device
)
loss_function = torch.nn.CrossEntropyLoss(weight=class_weights)

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Move batches to whatever device the model lives on, rather than assuming CUDA.
device = next(model.parameters()).device

for epoch in range(epochs):
    # ---- Training pass ----
    model.train()
    train_losses = []

    for batch in tqdm(train_dataloader, desc="Epoch: {}".format(epoch + 1)):
        input_ids, attention_mask, labels = (t.to(device) for t in batch)
        optimizer.zero_grad()
        # Labels are deliberately not passed to the model: its built-in loss
        # is unweighted and would be thrown away. The class-weighted
        # loss_function is applied to the logits instead.
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        loss = loss_function(logits, labels)
        train_losses.append(loss.item())
        loss.backward()
        optimizer.step()

    train_loss = np.mean(train_losses)

    # ---- Validation pass ----
    model.eval()
    val_losses = []
    val_predictions = []
    val_true_labels = []

    # No gradients are needed for evaluation, including the loss computation.
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = (t.to(device) for t in batch)
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            val_losses.append(loss_function(logits, labels).item())
            val_predictions.extend(torch.argmax(logits, dim=1).tolist())
            val_true_labels.extend(labels.tolist())

    val_loss = np.mean(val_losses)
    val_accuracy = accuracy_score(val_true_labels, val_predictions)
    val_precision = precision_score(val_true_labels, val_predictions)
    val_recall = recall_score(val_true_labels, val_predictions)
    val_f1 = f1_score(val_true_labels, val_predictions)

    # Log train and validation metrics in a single call so they share one
    # W&B step per epoch and line up on the same x-axis position.
    wandb.log({"Epoch": epoch + 1,
               "Train Loss": train_loss,
               "Val Loss": val_loss,
               "Val Accuracy": val_accuracy,
               "Val Precision": val_precision,
               "Val Recall": val_recall,
               "Val F1": val_f1})

    print("Epoch {} - Train Loss: {:.4f}, Val Loss: {:.4f}, Val Accuracy: {:.4f}, Val Precision: {:.4f}, Val Recall: {:.4f}, Val F1: {:.4f}".format(
        epoch + 1, train_loss, val_loss, val_accuracy, val_precision, val_recall, val_f1))

Epoch: 1:   0%|          | 0/750 [00:00<?, ?it/s]

Epoch 1 - Train Loss: 0.6164, Val Loss: 0.5495, Val Accuracy: 0.7493, Val Precision: 0.5305, Val Recall: 0.7220, Val F1: 0.6116


Epoch: 2:   0%|          | 0/750 [00:00<?, ?it/s]

Epoch 2 - Train Loss: 0.5280, Val Loss: 0.5197, Val Accuracy: 0.7713, Val Precision: 0.5682, Val Recall: 0.6805, Val F1: 0.6193


Epoch: 3:   0%|          | 0/750 [00:00<?, ?it/s]

Epoch 3 - Train Loss: 0.4615, Val Loss: 0.4889, Val Accuracy: 0.7693, Val Precision: 0.5556, Val Recall: 0.7805, Val F1: 0.6491


In [13]:
# Move batches to whatever device the model lives on, rather than assuming CUDA.
device = next(model.parameters()).device

# Switch to evaluation mode explicitly so the result does not depend on the
# mode left behind by whichever cell ran last (dropout must be disabled).
model.eval()

test_predictions = []
test_true_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask, labels = (t.to(device) for t in batch)
        # Only the logits are needed here, so labels are not passed to the model.
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        test_predictions.extend(torch.argmax(logits, dim=1).tolist())
        test_true_labels.extend(labels.tolist())

# Metrics on the held-out test split (precision/recall/F1 refer to class 1,
# the default positive label of the sklearn scorers).
test_accuracy = accuracy_score(test_true_labels, test_predictions)
test_precision = precision_score(test_true_labels, test_predictions)
test_recall = recall_score(test_true_labels, test_predictions)
test_f1 = f1_score(test_true_labels, test_predictions)

print("Test Accuracy: {:.4f}, Test Precision: {:.4f}, Test Recall: {:.4f}, Test F1: {:.4f}".format(
    test_accuracy, test_precision, test_recall, test_f1))

Test Accuracy: 0.7500, Test Precision: 0.5144, Test Recall: 0.7315, Test F1: 0.6040
