In [2]:
!pip install wandb
!pip install 'transformers[torch]' 
!pip install datasets 
!pip install evaluate

Collecting evaluate
  Obtaining dependency information for evaluate from https://files.pythonhosted.org/packages/70/63/7644a1eb7b0297e585a6adec98ed9e575309bb973c33b394dae66bc35c69/evaluate-0.4.1-py3-none-any.whl.metadata
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.1


In [3]:
import wandb
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from datasets import load_dataset
from transformers import AutoTokenizer, AlbertForSequenceClassification, AlbertConfig, Trainer, TrainingArguments, AutoModelForSequenceClassification
from transformers import default_data_collator

import evaluate

from torch.utils.data import DataLoader, TensorDataset

from tqdm import tqdm

from IPython.display import clear_output

from sklearn.model_selection import train_test_split

In [4]:
TEST_SIZE = 0.3
SPLIT_RANDOM_SEED = 42

In [10]:
def encode(examples):
    result = tokenizer(examples["text"], truncation=True, max_length=512, padding="max_length")
    return result

@torch.no_grad()
def test(model, loader, device, tqdm_desc):
    loss_log = []
    acc_log = []
    model.eval()
    loss_func = nn.CrossEntropyLoss()

    for input_ids, attention_mask, labels in loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        out = model(input_ids, attention_mask=attention_mask)
        loss = loss_func(out.logits, labels)

        loss_log.append(loss.item())

        pred = torch.argmax(out.logits, dim=1)
        acc_log.append((pred == labels).detach().cpu().numpy().sum() / len(pred))

    return loss_log, acc_log


def train(model, optimizer, n_epochs, train_loader, val_loader, batch_size, scheduler=None):
    train_loss = []
    train_acc = []
    
    run = wandb.init(project='html classificator', reinit=True)
    wandb.watch(model, nn.CrossEntropyLoss(), log="all", log_freq=100)
    model.train()

    batch = 0
    loss_func = nn.CrossEntropyLoss()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    for epoch in range(n_epochs):
        for input_ids, attention_mask, labels in tqdm(train_loader, desc=f'Training {epoch}/{n_epochs}'):
            batch += 1
            
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            out = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = loss_func(out.logits, labels)
            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

            pred = torch.argmax(out.logits, dim=1)
            train_acc.append((pred == labels).detach().cpu().numpy().sum() / len(pred))
            
            if batch == 100:
                batch = 0
                
                val_loss, val_acc = test(model, val_loader, device, tqdm_desc='Validating')
                model.train()
                
                wandb.log({"train": {"acc": np.mean(train_acc), "loss": np.mean(train_loss)}, "val": {"acc": np.mean(val_acc), "loss": np.mean(val_loss)}})
                
                # clear_output()
                print(f"Next 100 batches:")
                print(f" train loss: {np.mean(train_loss)}, train acc: {np.mean(train_acc)}")
                print(f" val loss: {np.mean(val_loss)}, val acc: {np.mean(val_acc)}\n")
                train_loss = []
                train_acc = []

        if scheduler is not None:
            scheduler.step()
            
    # последние батчи
    val_loss, val_acc = test(model, val_loader, device, tqdm_desc='Validating')
    wandb.log({"train": {"acc": np.mean(train_acc), "loss": np.mean(train_loss)}, "val": {"acc": np.mean(val_acc), "loss": np.mean(val_loss)}})
    print(f"Last batches:")
    print(f" train loss: {np.mean(train_loss)}, train acc: {np.mean(train_acc)}")
    print(f" val loss: {np.mean(val_loss)}, val acc: {np.mean(val_acc)}\n")

    wandb.unwatch()
    run.finish()
    return train_loss_log, train_acc_log, val_loss_log, val_acc_log

In [6]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [7]:
# tokenizer = AutoTokenizer.from_pretrained("XSY/albert-base-v2-imdb-calssification")
# model = AlbertForSequenceClassification(AlbertConfig()).cuda()

# Load model directly

tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
model = AlbertForSequenceClassification.from_pretrained("albert/albert-base-v2").to(device)

metric = evaluate.load("accuracy")

# tokenizer = AutoTokenizer.from_pretrained("XSY/albert-base-v2-imdb-calssification")
# model = AutoModelForSequenceClassification.from_pretrained("XSY/albert-base-v2-imdb-calssification").to(device)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [8]:
dataset = load_dataset("imdb")
tokenized_datasets = dataset.map(encode, batched=True, remove_columns="text")

input_ids_t, input_ids_v, attention_mask_t, attention_mask_v, label_t, label_v = train_test_split(torch.tensor(tokenized_datasets['train']['input_ids']), torch.tensor(tokenized_datasets['train']['attention_mask']), torch.tensor(tokenized_datasets['train']['label']), 
                 random_state=SPLIT_RANDOM_SEED, test_size=TEST_SIZE, shuffle=True)

train_dataset = TensorDataset(input_ids_t, attention_mask_t, label_t)
val_dataset = TensorDataset(input_ids_v, attention_mask_v, label_v)
test_dataset = TensorDataset(torch.tensor(tokenized_datasets['test']['input_ids']), torch.tensor(tokenized_datasets['test']['attention_mask']), torch.tensor(tokenized_datasets['test']['label']))

batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

Downloading builder script:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

In [8]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
})

In [9]:
wandb.login(key="eba16103be2afd0b5c96243771d60f5d7e562f68")

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [28]:
# показания модели imdb (преобученная)
# model = model.to(device)
val_loss, val_acc = test(model, val_loader, device, tqdm_desc='Test')
print(f'Без обучения на датасете (чисто модель с параметрами) loss = {np.mean(val_loss)}, accuracy = {np.mean(val_acc)}')

Test: 100%|██████████| 1563/1563 [15:43<00:00,  1.66it/s]

Без обучения на датасете (чисто модель с параметрами) loss = 0.1985105358434201, accuracy = 0.9360204734484965





In [12]:
import gc
torch.cuda.empty_cache()
gc.collect()

0

In [11]:
optimizer = optim.Adam(model.parameters(), lr=2e-5, weight_decay=0.01)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=3, eta_min=3e-10)
train_loss_log, train_acc_log, val_loss_log, val_acc_log = train(model, optimizer, 1, train_loader, val_loader, batch_size, scheduler)

[34m[1mwandb[0m: Currently logged in as: [33mrodion-chernomordin[0m. Use [1m`wandb login --relogin`[0m to force relogin


Training 0/1:   9%|▉         | 100/1094 [06:56<22:26:35, 81.28s/it]

Next 100 batches:
 train loss: 0.40254384763538836, train acc: 0.829375
 val loss: 0.2803314509295197, val acc: 0.898676261549396



Training 0/1:  18%|█▊        | 200/1094 [13:53<20:10:56, 81.27s/it]

Next 100 batches:
 train loss: 0.27852990984916687, train acc: 0.895
 val loss: 0.32157292686450456, val acc: 0.8734452736318408



Training 0/1:  27%|██▋       | 300/1094 [20:50<17:56:28, 81.35s/it]

Next 100 batches:
 train loss: 0.2514940486475825, train acc: 0.903125
 val loss: 0.22795579036765262, val acc: 0.9184434968017058



Training 0/1:  37%|███▋      | 400/1094 [27:46<15:40:08, 81.28s/it]

Next 100 batches:
 train loss: 0.24557729426771402, train acc: 0.90625
 val loss: 0.26205476823010676, val acc: 0.8992537313432836



Training 0/1:  46%|████▌     | 500/1094 [34:43<13:24:47, 81.29s/it]

Next 100 batches:
 train loss: 0.22316523604094982, train acc: 0.91125
 val loss: 0.2117769074107983, val acc: 0.9211975835110164



Training 0/1:  55%|█████▍    | 600/1094 [41:41<11:09:41, 81.34s/it]

Next 100 batches:
 train loss: 0.2266567359957844, train acc: 0.91
 val loss: 0.21679778061886587, val acc: 0.9191098081023454



Training 0/1:  64%|██████▍   | 700/1094 [48:38<8:54:10, 81.35s/it] 

Next 100 batches:
 train loss: 0.2382584900967777, train acc: 0.915625
 val loss: 0.2278970120383351, val acc: 0.9107587064676617



Training 0/1:  73%|███████▎  | 800/1094 [55:35<6:38:38, 81.36s/it]

Next 100 batches:
 train loss: 0.20685897704213857, train acc: 0.920625
 val loss: 0.205471274842109, val acc: 0.9266613361762616



Training 0/1:  82%|████████▏ | 900/1094 [1:02:32<4:22:57, 81.33s/it]

Next 100 batches:
 train loss: 0.2258072620816529, train acc: 0.915625
 val loss: 0.19755653966305606, val acc: 0.9251066098081023



Training 0/1:  91%|█████████▏| 1000/1094 [1:09:29<2:07:27, 81.36s/it]

Next 100 batches:
 train loss: 0.244444946013391, train acc: 0.9
 val loss: 0.22660695995364996, val acc: 0.912091329068941



Training 0/1: 100%|██████████| 1094/1094 [1:11:51<00:00,  3.94s/it]  


Last batches:
 train loss: 0.2126205377280712, train acc: 0.9188829787234043
 val loss: 0.2024343078956008, val acc: 0.923818407960199



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

NameError: name 'train_loss_log' is not defined

In [12]:
test_loss, test_acc = test(model, test_loader, device, tqdm_desc='Test')
print(f'loss = {np.mean(test_loss)}, accuracy = {np.mean(test_acc)}')

loss = 0.1889111399695263, accuracy = 0.9254638515674984


In [10]:
# та же модель, но логирование раз в эпоху
optimizer = optim.Adam(model.parameters(), lr=2e-5, weight_decay=0.01)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5, eta_min=3e-10)
train_loss_log, train_acc_log, val_loss_log, val_acc_log = train(model, optimizer, 5, train_loader, val_loader, batch_size, scheduler)

[34m[1mwandb[0m: Currently logged in as: [33mrodion-chernomordin[0m. Use [1m`wandb login --relogin`[0m to force relogin


Training 0/5: 100%|██████████| 1094/1094 [32:46<00:00,  1.80s/it]
Validating 0/5: 100%|██████████| 469/469 [06:18<00:00,  1.24it/s]


Epoch 1
 train loss: 0.24377623107028917, train acc: 0.9031649908592322
 val loss: 0.23198189499027438, val acc: 0.9111140724946695



Training 1/5: 100%|██████████| 1094/1094 [32:49<00:00,  1.80s/it]
Validating 1/5: 100%|██████████| 469/469 [06:15<00:00,  1.25it/s]


Epoch 2
 train loss: 0.15352085591302383, train acc: 0.9429273308957953
 val loss: 0.19266452457207733, val acc: 0.9263059701492538



Training 2/5: 100%|██████████| 1094/1094 [32:45<00:00,  1.80s/it]
Validating 2/5: 100%|██████████| 469/469 [06:15<00:00,  1.25it/s]


Epoch 3
 train loss: 0.10570448501566783, train acc: 0.963989183424741
 val loss: 0.19919305919572267, val acc: 0.9287046908315565



Training 3/5: 100%|██████████| 1094/1094 [32:46<00:00,  1.80s/it]
Validating 3/5: 100%|██████████| 469/469 [06:15<00:00,  1.25it/s]


Epoch 4
 train loss: 0.04956031080909581, train acc: 0.9848606032906764
 val loss: 0.21317417312042117, val acc: 0.93363539445629



Training 4/5: 100%|██████████| 1094/1094 [32:44<00:00,  1.80s/it]
Validating 4/5: 100%|██████████| 469/469 [06:15<00:00,  1.25it/s]


Epoch 5
 train loss: 0.020495343316119536, train acc: 0.9954296160877514
 val loss: 0.2581087981553268, val acc: 0.933590973702914



VBox(children=(Label(value='0.226 MB of 0.226 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [11]:
test_loss, test_acc = test(model, test_loader, device, tqdm_desc='Test')
print(f'loss = {np.mean(test_loss)}, accuracy = {np.mean(test_acc)}')

Test: 100%|██████████| 1563/1563 [15:37<00:00,  1.67it/s]

loss = 0.24936352291772165, accuracy = 0.9337811900191939



