# Добавки

#### Что отдельно пройти

* TensorBoard https://pytorch.org/tutorials/intermediate/tensorboard_tutorial.html
* https://pytorch.org/tutorials/recipes/recipes/tensorboard_with_pytorch.html
* torchvision
* torchaudio
* torchtext
* Debug
 * anomaly detection: torch.autograd.detect_anomaly or torch.autograd.set_detect_anomaly(True)
 * profiler related: torch.autograd.profiler.emit_nvtx, torch.autograd.profiler.profile
 * autograd gradcheck: torch.autograd.gradcheck or torch.autograd.gradgradcheck

In [None]:
# from __future__ import print_function
import torch
import numpy as np

# что разумно сразу импортировать
from torch import nn
from torch import optim
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor, Lambda, Compose
import matplotlib.pyplot as plt

# Einsum

In [1]:
import torch
# Two small integer matrices shared by all einsum examples below.
X = torch.tensor([[11, 12, 13], [21, 22, 23], [31, 32, 33]])
Y = torch.tensor([[1, 1, 1], [0, 2, 1], [0, 0, 3]])
print(X)

# Element-wise product: both operands and the result use the same indices.
# NOTE: this is NOT a matrix product; a matrix product contracts the shared
# index and would be written 'ij, jk -> ik'.
print(torch.einsum('ij, ij -> ij', X, Y))

print(torch.einsum('ii -> i', X))  # main diagonal

print(torch.einsum('ij, ij -> ij', X, Y))  # Hadamard (element-wise) product

print(torch.einsum('ii -> ', X))  # trace

print(torch.einsum('ij -> ji', X))  # transpose

print(torch.einsum('ij -> j', X))  # sum over the first axis (column sums)

print(torch.einsum('ij -> ', X))  # sum of all elements

# A related library with a similar notation:
# import einops
# einops.repeat(x, 'm n -> m k n', k=K)

tensor([[11, 12, 13],
        [21, 22, 23],
        [31, 32, 33]])
tensor([[11, 12, 13],
        [ 0, 44, 23],
        [ 0,  0, 99]])
tensor([11, 22, 33])
tensor([[11, 12, 13],
        [ 0, 44, 23],
        [ 0,  0, 99]])
tensor(66)
tensor([[11, 21, 31],
        [12, 22, 32],
        [13, 23, 33]])
tensor([63, 66, 69])
tensor(198)


# AlexNet

In [None]:
class AlexNet(nn.Module):
    """AlexNet-style convolutional image classifier.

    The network has three stages:

    * ``features``   - five convolutions with ReLU and three max-pools;
    * ``avgpool``    - adaptive average pooling to a fixed 6x6 map;
    * ``classifier`` - two dropout-regularised hidden linear layers and
      the output layer.

    Args:
        num_classes: Number of outputs of the last linear layer.
    """

    def __init__(self, num_classes=1000):
        super().__init__()

        conv_stack = [
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        ]
        self.features = nn.Sequential(*conv_stack)

        # Fixed 6x6 output, which is what the first linear layer expects.
        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))

        head = [
            nn.Dropout(),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Return the class scores for a batch of images ``x``."""
        pooled = self.avgpool(self.features(x))
        # Keep the batch dimension, flatten everything else.
        flat = torch.flatten(pooled, 1)
        return self.classifier(flat)

NameError: name 'nn' is not defined

In [None]:
# ??



from torch.hub import load_state_dict_from_url

# Download locations of pretrained checkpoints, keyed by architecture name.
model_urls = {
    'alexnet': 'https://download.pytorch.org/models/alexnet-owt-4df8aa71.pth',
}


def alexnet(pretrained=False, progress=True, **kwargs):
    """Build an ``AlexNet``, optionally loading the published weights.

    Args:
        pretrained: When true, download the checkpoint listed in
            ``model_urls`` and load it into the model.
        progress: Forwarded to ``load_state_dict_from_url``.
        **kwargs: Forwarded unchanged to the ``AlexNet`` constructor.

    Returns:
        The constructed model.
    """
    net = AlexNet(**kwargs)
    if not pretrained:
        return net

    weights = load_state_dict_from_url(model_urls['alexnet'],
                                       progress=progress)
    net.load_state_dict(weights)
    return net



## LR find

fast.ai

In [None]:
def find_lr(model, loss_fn, optimizer, train_loader, init_value=1e-8, final_value=10.0, device="cpu"):
    """Run a learning-rate range test over one pass of ``train_loader``.

    The learning rate of the optimizer's first parameter group starts at
    ``init_value`` and is multiplied by a constant factor after every batch
    so that it reaches ``final_value`` on the last batch. The model is
    really trained while this happens, so its weights are modified.

    The sweep stops early when the loss becomes non-finite or exceeds four
    times the best loss seen so far.

    Args:
        model: Module being trained.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
        optimizer: Optimizer whose ``param_groups[0]["lr"]`` is swept.
        train_loader: Sized iterable yielding ``(inputs, targets)`` batches.
        init_value: First learning rate.
        final_value: Learning rate targeted on the last batch.
        device: Device the batches are moved to.

    Returns:
        ``(lrs, losses)``: two equally long lists with the learning rate
        used for each recorded batch (the raw value, not its logarithm) and
        the loss as a Python float. When more than 20 points were recorded,
        the first 10 and the last 5 are dropped.
    """
    import math  # local import keeps this notebook cell self-contained

    def _trimmed(lrs, loss_values):
        # Drop the first 10 and last 5 points once enough were collected.
        if len(lrs) > 20:
            return lrs[10:-5], loss_values[10:-5]
        return lrs, loss_values

    # max(..., 1): a one-batch loader would otherwise divide by zero below.
    number_in_epoch = max(len(train_loader) - 1, 1)
    update_step = (final_value / init_value) ** (1 / number_in_epoch)
    lr = init_value
    optimizer.param_groups[0]["lr"] = lr
    best_loss = 0.0
    losses = []
    log_lrs = []
    for batch_num, (inputs, targets) in enumerate(train_loader, start=1):
        inputs = inputs.to(device)
        targets = targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)
        # Work with a plain float so that no autograd graph is kept alive
        # in ``best_loss`` between iterations.
        loss_value = loss.item()

        # Stop if the loss explodes (or is NaN/inf, which the comparison
        # alone would never detect).
        exploded = batch_num > 1 and loss_value > 4 * best_loss
        if exploded or not math.isfinite(loss_value):
            return _trimmed(log_lrs, losses)

        # Record the best loss
        if batch_num == 1 or loss_value < best_loss:
            best_loss = loss_value

        # Store the values
        losses.append(loss_value)
        log_lrs.append(lr)

        # Do the backward pass and optimize
        loss.backward()
        optimizer.step()

        # Update the lr for the next step
        lr *= update_step
        optimizer.param_groups[0]["lr"] = lr
    return _trimmed(log_lrs, losses)

## Ансамблирование

In [None]:
# Two independently initialised ResNet-50 networks form the ensemble.
# eval() switches BatchNorm to its running statistics for inference.
models_ensemble = [models.resnet50().to(device).eval() for _ in range(2)]

# Every member must score the SAME input, otherwise averaging the
# predictions is meaningless. (The original used a 224x244 tensor and a new
# random tensor per model.)
sample = torch.rand(1, 3, 224, 224).to(device)
with torch.no_grad():  # inference only, no autograd graph needed
    # Softmax over the class dimension turns logits into probabilities.
    predictions = [torch.softmax(m(sample), dim=1) for m in models_ensemble]

# Average the probabilities over the models, then take the most likely class.
avg_prediction = torch.stack(predictions).mean(0).argmax()

# Иллюстрация

In [None]:
# Dummy values to get code to run in the next cells
from torch.utils.data import DataLoader

# Placeholder objects so that the training-loop cells below can run.
n_epochs = 1
model = nn.Linear(10, 10)

# One random (input, target) pair; the list holds 20 references to it.
_pair = (torch.rand(10), torch.rand(10))
dataset = [_pair for _ in range(20)]

# The same data backs the train, validation and test loaders.
train_dataloader = DataLoader(dataset)
val_dataloader = DataLoader(dataset)
test_dataloader = DataLoader(dataset)

criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)

In [None]:
# Bare-bones train / validate / test skeleton.
for epoch in range(n_epochs):

    # Training: one optimizer step per batch.
    for input, targets in train_dataloader:
        optimizer.zero_grad()
        output = model(input)
        train_loss = criterion(output, targets)
        train_loss.backward()
        optimizer.step()

    # Validation: gradients are not needed, so autograd is switched off.
    with torch.no_grad():
        for input, targets in val_dataloader:
            output = model(input)
            val_loss = criterion(output, targets)

# Test: runs once, after all epochs.
with torch.no_grad():
    for input, targets in test_dataloader:
        output = model(input)
        test_loss = criterion(output, targets)



In [None]:
# Training loop with loss tracking, train/eval modes and a mid-run
# optimizer change.
for epoch in range(n_epochs):
    # Running totals for this epoch, kept as plain Python floats.
    total_train_loss = 0.0
    total_val_loss = 0.0

    # Rebuild the optimizer halfway through training. The original test
    # `epoch == epoch//2` is only true for epoch 0; the midpoint has to be
    # derived from the total number of epochs.
    if epoch == n_epochs // 2:
        optimizer = optim.SGD(model.parameters(),
                              lr=0.001)

    # Training
    model.train()  # enable training behaviour (dropout, batch-norm updates)
    for inputs, targets in train_dataloader:
        optimizer.zero_grad()
        output = model(inputs)
        train_loss = criterion(output, targets)
        train_loss.backward()
        optimizer.step()
        # .item() detaches the value; adding the tensor itself would keep
        # every batch's autograd graph alive.
        total_train_loss += train_loss.item()

    # Validation
    model.eval()  # switch dropout / batch-norm to inference behaviour
    with torch.no_grad():
        for inputs, targets in val_dataloader:
            output = model(inputs)
            val_loss = criterion(output, targets)
            total_val_loss += val_loss.item()

    print("""Epoch: {}
          Train Loss: {}
          Val Loss {}""".format(
         epoch, total_train_loss,
         total_val_loss))

# Test
model.eval()
with torch.no_grad():
    for inputs, targets in test_dataloader:
        output = model(inputs)
        test_loss = criterion(output, targets)

# TPU

Обучение на TPU напрямую в PyTorch не поддерживается: нужно использовать пакет PyTorch/XLA (XLA — Accelerated Linear Algebra).

см. https://github.com/pytorch/xla/

Как можно запустить в колабе:

In [None]:
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
!python pytorch-xla-env-setup.py --version "nightly"

import torch_xla.core.xla_model as xm
device = xm.xla_device()

# Dummy values to get code to run in the next cells
import torch
from torch import nn, optim
from torch.utils.data import DataLoader

n_epochs = 1
# Move the model to the XLA device BEFORE building the optimizer, so the
# optimizer is created from the parameters that live on the device.
# NOTE(review): moving afterwards may leave the optimizer holding stale CPU
# parameters - confirm against the torch_xla version in use.
model = nn.Linear(10, 10).to(device)
# Plain data tensors: inputs and targets do not need gradients. (The
# original set requires_grad=True, which only hid the loss bug fixed below.)
dataset = [(torch.rand(10), torch.rand(10))] * 20

trainloader = DataLoader(dataset)
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)


for epoch in range(n_epochs):
    for inputs, labels in trainloader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()

        output = model(inputs)
        # Compare the model OUTPUT with the labels. The original compared
        # the input with the labels, so the model was never trained.
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        # XLA executes lazily; mark_step() ends the current graph so the
        # step is actually run on the device.
        # NOTE(review): newer torch_xla releases offer other spellings of
        # this call - confirm for the installed version.
        xm.mark_step()

print(output.device) # out: xla:1

# Multiple GPUs (Single Machine)

Есть несколько видов распараллеливания:

- data parallel processing
- model parallel processing (не будем описывать)

(но это не проверить в колабе)

In [None]:
# Data-parallel processing (option 1)
gpu_count = torch.cuda.device_count()
if gpu_count > 1:
    print("This machine has", gpu_count, "GPUs available.")
    # Wrap the model before sending it to the device.
    model = nn.DataParallel(model)
model.to("cuda")


In [None]:
# Data-parallel processing (option 2 - the preferred one)

import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
import torch.optim as optim
# The original aliased this as `DD` but used `DDP` below (NameError).
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader


def dist_training_loop(rank, world_size, dataloader, model, loss_fn, optimizer, n_epochs=1):
    """Train ``model`` with DistributedDataParallel in one worker process.

    Intended as the target of ``torch.multiprocessing.spawn``, which passes
    the process index as the first argument.

    Args:
        rank: Index of this process; also used as the device index.
        world_size: Total number of processes.
        dataloader: Iterable yielding ``(input, labels)`` batches.
            NOTE(review): every rank iterates the same loader here; a
            ``DistributedSampler`` is presumably wanted for real data.
        model: Module to train; moved to device ``rank``.
        loss_fn: Callable ``loss_fn(outputs, labels)``.
        optimizer: Optimizer *factory* (e.g. ``optim.SGD``), called as
            ``optimizer(params, lr=0.001)``.
        n_epochs: Number of passes over ``dataloader``.
    """
    # The default env:// rendezvous needs an address and port; setdefault
    # keeps values that a launcher has already provided.
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "12355")
    dist.init_process_group("gloo", rank=rank, world_size=world_size)
    try:
        model = model.to(rank)
        ddp_model = DDP(model, device_ids=[rank])
        optimizer = optimizer(ddp_model.parameters(), lr=0.001)
        for _ in range(n_epochs):
            for input, labels in dataloader:
                input = input.to(rank)
                labels = labels.to(rank)
                optimizer.zero_grad()
                outputs = ddp_model(input)
                loss = loss_fn(outputs, labels)
                loss.backward()
                optimizer.step()
    finally:
        # Always release the process group, even if training raised.
        dist.destroy_process_group()


if __name__ == "__main__":
    world_size = 2  # one process per GPU; requires two visible GPUs
    demo_model = nn.Linear(10, 10)
    demo_loader = DataLoader([(torch.rand(10), torch.rand(10))] * 20)
    # spawn() supplies `rank`; the remaining positional arguments of
    # dist_training_loop must all be passed through `args`.
    mp.spawn(
        dist_training_loop,
        args=(world_size, demo_loader, demo_model, nn.MSELoss(), optim.SGD),
        nprocs=world_size,
        join=True,
    )

# Quantization

In [None]:
import torch
from torch import nn
import torch.nn.functional as F

class LeNet5(nn.Module):
    """LeNet-5 style CNN: two conv + max-pool stages and three linear layers.

    ``conv1`` takes 3-channel images and ``fc1`` expects 16 * 5 * 5
    features, which is what 32x32 inputs produce after the two
    convolution/pooling stages. The network outputs 10 scores per sample.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Return the 10 class scores for each image in batch ``x``."""
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        # Flatten everything except the batch dimension. The original used
        # nelement() / shape[0], a float division that raises
        # ZeroDivisionError on an empty batch.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


model = LeNet5()

In [None]:
# Show the dtype of every parameter tensor of the model.
for param_name, param in model.named_parameters():
    print(param_name, ": ", param.dtype)

conv1.weight :  torch.float32
conv1.bias :  torch.float32
conv2.weight :  torch.float32
conv2.bias :  torch.float32
fc1.weight :  torch.float32
fc1.bias :  torch.float32
fc2.weight :  torch.float32
fc2.bias :  torch.float32
fc3.weight :  torch.float32
fc3.bias :  torch.float32


In [None]:
# Simplest approach: cast the model to half precision (float16).
model = model.half()

# Print the parameter dtypes again to see the effect of the cast.
for param_name, param in model.named_parameters():
    print(param_name, ": ", param.dtype)

conv1.weight :  torch.float16
conv1.bias :  torch.float16
conv2.weight :  torch.float16
conv2.bias :  torch.float16
fc1.weight :  torch.float16
fc1.bias :  torch.float16
fc2.weight :  torch.float16
fc2.bias :  torch.float16
fc3.weight :  torch.float16
fc3.bias :  torch.float16


Есть ещё другие способы квантизации:

- dynamic quantization
- post-training static quantization
- quantization-aware training (QAT)

Что-то может поддерживаться только для CPU (читайте документацию).

In [None]:
# Dynamic quantization

import torch.quantization

# Start from a fresh float32 network: `model` was cast to float16 by the
# half() cell above, while the quantization workflow is documented for
# float models.
float_model = LeNet5()

# Arguments: the model, the set of layer types to quantize, and the target
# dtype of the quantized weights.
quantized_model = torch.quantization.quantize_dynamic(
    float_model, {torch.nn.Linear}, dtype=torch.qint8
)

In [None]:
# Post-training static quantization

static_quant_model = LeNet5()
static_quant_model.eval()  # post-training quantization works on an inference-mode model
static_quant_model.qconfig = torch.quantization.get_default_qconfig('fbgemm')

# Insert observers that record activation ranges.
torch.quantization.prepare(static_quant_model, inplace=True)

# Calibration: run data through the prepared model so the observers see
# real activation ranges. Without this step convert() has no statistics to
# derive scales and zero-points from.
with torch.no_grad():
    # TODO: replace the random batch with representative calibration data.
    static_quant_model(torch.rand(8, 3, 32, 32))

torch.quantization.convert(static_quant_model, inplace=True) # to quantize the model.
# NOTE(review): LeNet5 has no QuantStub/DeQuantStub, so the converted model
# presumably cannot be called on float inputs directly - confirm before use.

In [None]:
# Quantization-aware training (QAT)

qat_model = LeNet5()
qat_model.train()  # prepare_qat expects a model in training mode
# The original assigned to the misspelled name `qat_mode` (NameError).
qat_model.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')

# Insert fake-quantization modules that simulate quantization in training.
torch.quantization.prepare_qat(qat_model, inplace=True)

# TODO: fine-tune `qat_model` here with the usual training loop so the
# fake-quantization modules can learn the activation ranges.

qat_model.eval()  # convert an inference-mode model
torch.quantization.convert(qat_model, inplace=True)

# Pruning

Можно, кстати, задать и свой метод пранинга.

In [None]:
import torch.nn.utils.prune as prune

# Randomly prune 25% of the entries of conv1's weight tensor.
prune.random_unstructured(model.conv1, name="weight", amount=0.25)

In [None]:
# Prune layer by layer, with an amount that depends on the layer type.

model = LeNet5().to(device)

for module in model.modules():
    if isinstance(module, torch.nn.Conv2d):
        fraction = 0.3  # convolutions: prune 30% of the weights
    elif isinstance(module, torch.nn.Linear):
        fraction = 0.5  # linear layers: prune 50% of the weights
    else:
        continue  # containers and the model itself are left alone
    prune.random_unstructured(module, name='weight', amount=fraction)

In [None]:
# Global pruning: one pruning budget shared across all listed tensors.

model = LeNet5().to(device)

# (module, parameter name) pairs that take part in the global pruning.
prunable_layers = ('conv1', 'conv2', 'fc1', 'fc2', 'fc3')
parameters_to_prune = tuple(
    (getattr(model, layer_name), 'weight') for layer_name in prunable_layers
)

prune.global_unstructured(
    parameters_to_prune,
    pruning_method=prune.L1Unstructured,
    amount=0.25,
)

## TensorBoard

In [None]:
pip install tensorboard
conda install tensorboard

# TensorBoard can then be started on the command line:
tensorboard --logdir=runs
# You can then go to http://[your-machine]:6006

In [None]:
from torch.utils.tensorboard import SummaryWriter

# Created without arguments, so the writer uses its default log location.
writer = SummaryWriter()
# Log a single scalar value under the tag 'example'.
writer.add_scalar(tag='example', scalar_value=3)

In [None]:
import random

# Log a random walk that starts at 10: every step adds a value drawn
# uniformly from [-0.5, 0.5).
value = 10
writer.add_scalar('test_loop', value, 0)
for step in range(1, 10000):
    value += random.random() - 0.5
    writer.add_scalar('test_loop', value, step)

In [None]:
import torch
import torchvision
from torch.utils.tensorboard import SummaryWriter
from torchvision import datasets, transforms,models

writer = SummaryWriter()
model = models.resnet18(pretrained=False)  # randomly initialised network

# add_graph needs an example input to trace the model with.
dummy_batch = torch.rand([1, 3, 224, 224])
writer.add_graph(model, dummy_batch)

def train(model, optimizer, loss_fn, train_data_loader, test_data_loader, epochs=20):
    """Train ``model`` and log loss and accuracy to TensorBoard.

    Uses the module-level ``writer`` (a ``SummaryWriter``) for logging.

    Args:
        model: Module to train.
        optimizer: Optimizer already bound to ``model``'s parameters.
        loss_fn: Callable ``loss_fn(output, target)`` returning a scalar tensor.
        train_data_loader: Iterable of ``(input, target)`` training batches.
        test_data_loader: Iterable of ``(input, target)`` evaluation batches.
        epochs: Number of passes over the training data.
    """
    # Global batch counter used as the TensorBoard step of the loss curve.
    # The original logged the loss with the epoch as step, so all batches
    # of an epoch shared one x position.
    iteration = 0

    for epoch in range(epochs):
        model.train()
        for inputs, target in train_data_loader:
            optimizer.zero_grad()
            output = model(inputs)
            loss = loss_fn(output, target)
            writer.add_scalar('loss', loss.item(), iteration)
            loss.backward()
            optimizer.step()
            iteration += 1

        model.eval()
        num_correct = 0
        num_examples = 0

        with torch.no_grad():  # evaluation needs no autograd graph
            for inputs, target in test_data_loader:
                output = model(inputs)
                # The arg-max of the logits equals the arg-max of their
                # softmax, so no softmax is needed to pick the class.
                correct = torch.eq(output.argmax(dim=1), target).view(-1)
                num_correct += torch.sum(correct).item()
                num_examples += correct.shape[0]

        # Guard against an empty evaluation loader.
        accuracy = num_correct / num_examples if num_examples else 0.0
        print("Epoch {}, accuracy = {:.2f}".format(epoch, accuracy))
        writer.add_scalar('accuracy', accuracy, epoch)


##  Hooks

Хуки — это функции, которые можно прикрепить к модулю или тензору; они вызываются при прямом или обратном проходе.

In [None]:
def print_hook(module, input, output):
    """Forward hook that prints the shape of the module's first input.

    PyTorch calls forward hooks as ``hook(module, input, output)``; the
    original extra leading ``self`` parameter made the hook fail when called.

    Args:
        module: Module the hook is registered on.
        input: Tuple with the positional inputs of the forward call.
        output: Result of the forward call (unused).
    """
    # `input` is a tuple of the positional arguments, so take its first item.
    print(f"Shape of input is {input[0].shape}")

model = models.resnet18()

# Attach the hook to the final fully connected layer.
# For the backward pass use register_backward_hook() instead.
hook_ref = model.fc.register_forward_hook(print_hook)
model(torch.rand([1, 3, 224, 224]))  # hook is active for this call

# After removal the hook is no longer called.
hook_ref.remove()
model(torch.rand([1, 3, 224, 224]))

In [None]:
def send_stats(i, module, input, output):
    """Forward hook that sends output statistics to TensorBoard.

    Meant to be bound with ``functools.partial(send_stats, i)`` so that the
    remaining ``(module, input, output)`` match the forward-hook signature.
    Logging goes through the module-level ``writer``.

    Args:
        i: Label used as the prefix of the scalar tags.
        module: Module the hook is registered on (unused).
        input: Inputs of the forward call (unused).
        output: Output tensor whose mean and standard deviation are logged.
    """
    activations = output.detach()
    # The original logged std() under the "-mean" tag as well.
    writer.add_scalar(f"{i}-mean", activations.mean())
    writer.add_scalar(f"{i}-stddev", activations.std())

In [None]:
from functools import partial

# Attach a statistics hook to every direct child of the model; partial()
# binds the child's index as the first argument of send_stats.
for child_index, child in enumerate(model.children()):
    child.register_forward_hook(partial(send_stats, child_index))

In [None]:
# Class Activation Mapping

class SaveActivations():
    """Capture the output of a module through a forward hook.

    After each forward pass of the hooked module its output is available
    as ``features`` (and under the alias ``activations``). Both are
    ``None`` until the first forward pass. Call ``remove()`` to detach the
    hook.

    Args:
        m: Module whose output should be captured.
    """
    # Class-level defaults so both names can be read before any forward
    # pass. The original declared only `activations` but stored the output
    # in `features`.
    features = None
    activations = None

    def __init__(self, m):
        self.hook = m.register_forward_hook(self.hook_fn)

    def hook_fn(self, module, input, output):
        """Store the detached output of the hooked module."""
        self.features = self.activations = output.detach()

    def remove(self):
        """Detach the hook; the last captured output stays available."""
        self.hook.remove()


model = models.resnet18(pretrained=True)
model.eval()

# torchvision's ResNet calls its last residual stage `layer4`; the
# attribute `layer_4` used originally does not exist.
x_activations = SaveActivations(model.layer4)
# NOTE(review): `x` is presumably a single preprocessed image tensor
# without a batch dimension, defined elsewhere - confirm.
with torch.no_grad():
    prediction = model(x.unsqueeze(0))
# Softmax over the class dimension, named explicitly.
pred_probabilities = torch.softmax(prediction, dim=1).squeeze()
x_activations.remove()
torch.topk(pred_probabilities, 1)


# ?
# NOTE(review): unfinished fragment. `sf`, `idx`, `log_prob`, `to_np` and
# `dx` are not defined in the visible part of this notebook - presumably
# copied from a class-activation-map example; TODO confirm or remove.
fts = sf[0].features[idx]
# Exponentiate - presumably turns log-probabilities into probabilities.
prob = np.exp(to_np(log_prob))
preds = np.argmax(prob[idx])
fts_np = to_np(fts)
# Move axis 0 of the feature array to position 3 (the end for a 3-D array),
# then take the dot product with prob[idx].
f2=np.dot(np.rollaxis(fts_np,0,3), prob[idx])
# Shift the minimum to 0 and divide by the maximum.
f2-=f2.min()
f2/=f2.max()
f2
plt.imshow(dx)
# NOTE(review): scipy.misc.imresize is not available in recent SciPy
# releases - TODO replace the resize call.
plt.imshow(scipy.misc.imresize(f2, dx.shape), alpha=0.5, cmap='jet');


In [2]:
# By default autograd stores gradients only for the leaves of the graph.
import torch

x = torch.tensor([2.], requires_grad=True)
y = x.mul(x)
z = y.mul(y)

# Print the gradient with respect to y during backward.
y.register_hook(lambda grad: print(grad))

# y is not a leaf: without retain_grad() its .grad would not be kept.
y.retain_grad()

z.backward()

print(y.grad)

tensor([8.])
tensor([8.])


Реализация L1-регуляризации с помощью хуков

См. https://stackoverflow.com/questions/42704283/l1-l2-regularization-in-pytorch

In [2]:
# Usage: layer = L1(torch.nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3), weight_decay=1e-3)

class L1(torch.nn.Module):
    """Wrap a module and add an L1 weight penalty to its gradients.

    A full backward hook on the wrapped module adds
    ``weight_decay * sign(param)`` (the gradient of
    ``weight_decay * |param|``) to the gradient of each trainable parameter.

    Args:
        module: Module whose parameters are regularised.
        weight_decay: Strength of the L1 penalty.
    """

    def __init__(self, module, weight_decay):
        super().__init__()
        self.module = module
        self.weight_decay = weight_decay

        # Backward hook is registered on the specified module
        self.hook = self.module.register_full_backward_hook(self._weight_decay_hook)

    # The hook arguments (module, grad_input, grad_output) are not needed.
    def _weight_decay_hook(self, *_):
        with torch.no_grad():
            for param in self.module.parameters():
                if not param.requires_grad:
                    continue
                penalty = self.regularize(param)
                # ADD the penalty instead of overwriting the gradient.
                # Overwriting throws away a data gradient that is already
                # stored in param.grad (e.g. when accumulating gradients).
                # NOTE(review): whether the hook runs before or after the
                # parameter gradients are accumulated is not established
                # here; adding is correct in both cases.
                if param.grad is None:
                    param.grad = penalty
                else:
                    param.grad.add_(penalty)

    def regularize(self, parameter):
        """Return the L1 penalty gradient ``weight_decay * sign(parameter)``."""
        return self.weight_decay * torch.sign(parameter.detach())

    def forward(self, *args, **kwargs):
        """Forward all positional and keyword arguments to the wrapped module."""
        return self.module(*args, **kwargs)

In [None]:
# L1-регуляризация активаций!
import torch


class OutputHook(list):
    """List that doubles as a forward hook.

    Calling the instance with the forward-hook arguments stores ``output``
    at the end of the list; ``module`` and ``input`` are ignored.
    """

    def __call__(self, module, input, output):
        self.extend((output,))


class MLP(torch.nn.Module):
    """Three-layer perceptron (128 -> 32 -> 16 -> 2) with ReLU activations.

    The ReLU is a single shared module instance, so one forward hook
    registered on ``relu`` sees the output of both hidden layers.
    """

    def __init__(self):
        super().__init__()
        self.linear1 = torch.nn.Linear(128, 32)
        self.linear2 = torch.nn.Linear(32, 16)
        self.linear3 = torch.nn.Linear(16, 2)
        # A module (not the functional form) so that hooks can be attached.
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        """Return the two output scores for each row of ``x``."""
        hidden = self.relu(self.linear1(x))
        hidden = self.relu(self.linear2(hidden))
        return self.linear3(hidden)


batch_size = 4
l1_lambda = 0.01

model = MLP()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

# One hook instance on the shared ReLU module collects the output of every
# ReLU call made during a forward pass. Larger networks will usually need
# the hooks to be placed more selectively.
output_hook = OutputHook()
model.relu.register_forward_hook(output_hook)

inputs = torch.rand(batch_size, 128)
targets = torch.ones(batch_size).long()

optimizer.zero_grad()
outputs = model(inputs)
cross_entropy_loss = torch.nn.functional.cross_entropy(outputs, targets)

# L1 penalty: the 1-norms of all captured ReLU outputs, summed and scaled
# by l1_lambda.
l1_penalty = 0.
for captured in output_hook:
    l1_penalty = l1_penalty + torch.norm(captured, 1)
l1_penalty = l1_penalty * l1_lambda

loss = cross_entropy_loss + l1_penalty
loss.backward()
optimizer.step()
# Empty the hook so the next forward pass starts from a clean list.
output_hook.clear()

In [None]:
# WITHOUT HOOKS
# + hand-written L1 and L2 regularization

import torch.nn.functional as F

# Regularization strengths (example values; tune them for a real task).
lambda1, lambda2 = 0.5, 0.01

optimizer.zero_grad()
# MLP.forward returns only the final output, so there is nothing to unpack.
outputs = model(inputs)
cross_entropy_loss = F.cross_entropy(outputs, targets)

# L1 penalty on the first layer, L2 penalty on the second layer.
all_linear1_params = torch.cat([p.view(-1) for p in model.linear1.parameters()])
all_linear2_params = torch.cat([p.view(-1) for p in model.linear2.parameters()])
l1_regularization = lambda1 * torch.norm(all_linear1_params, 1)
l2_regularization = lambda2 * torch.norm(all_linear2_params, 2)

loss = cross_entropy_loss + l1_regularization + l2_regularization
loss.backward()
optimizer.step()

# Alternative: penalise every parameter tensor of the model.

optimizer.zero_grad()
outputs = model(inputs)
cross_entropy_loss = F.cross_entropy(outputs, targets)

# sum() builds new tensors instead of adding in place into an integer
# tensor, which cannot hold a floating-point norm.
# L1: sum of absolute values (not squared); L2: sum of squares.
l1_regularization = sum(torch.norm(param, 1) for param in model.parameters())
l2_regularization = sum(torch.norm(param, 2) ** 2 for param in model.parameters())

loss = cross_entropy_loss + lambda1 * l1_regularization + lambda2 * l2_regularization
loss.backward()
optimizer.step()

## Label Smoothing

In [None]:
class LabelSmoothingCrossEntropyLoss(nn.Module):
    """Cross-entropy loss with label smoothing.

    The result is ``epsilon * u / num_classes + (1 - epsilon) * nll`` where
    ``u`` is the batch mean of the negative log-probabilities summed over
    all classes and ``nll`` is the usual negative log-likelihood of the
    target class.

    Args:
        epsilon: Weight of the smoothing term.
    """

    def __init__(self, epsilon=0.1):
        super().__init__()
        self.epsilon = epsilon

    def forward(self, output, target):
        """Return the smoothed loss for logits ``output`` and class indices ``target``."""
        eps = self.epsilon
        class_count = output.size()[-1]
        log_probs = F.log_softmax(output, dim=-1)

        # Negative log-probabilities summed over the classes, batch mean.
        smoothing_term = (-log_probs.sum(dim=-1)).mean()
        # Standard negative log-likelihood of the target class.
        target_term = F.nll_loss(log_probs, target)

        return eps * smoothing_term / class_count + (1 - eps) * target_term


## FGSM

In [None]:
def fgsm(input_tensor, labels, loss_function, model, epsilon=0.02):
    """Compute a Fast Gradient Sign Method perturbation.

    The parameter order matches the call in the next cell
    (``fgsm(image, labels, loss_function, model)``); ``epsilon`` moved to
    the end because a parameter with a default cannot precede required ones.

    Args:
        input_tensor: Batch of inputs to attack; it is not modified.
        labels: Targets passed to ``loss_function``.
        loss_function: Callable ``loss_function(outputs, labels)``.
        model: Model under attack.
        epsilon: Magnitude of the perturbation.

    Returns:
        A tensor shaped like ``input_tensor`` equal to ``epsilon`` times the
        sign of the loss gradient with respect to the input.
    """
    # Work on a detached copy that tracks gradients, so the caller's tensor
    # is left untouched.
    adv_input = input_tensor.clone().detach().requires_grad_(True)
    outputs = model(adv_input)
    loss = loss_function(outputs, labels)
    # Only the gradient w.r.t. the input is needed; autograd.grad avoids
    # accumulating gradients into the model parameters.
    (input_grad,) = torch.autograd.grad(loss, adv_input)
    return torch.sign(input_grad) * epsilon

In [None]:
model_to_break = ...  # TODO: load the model to attack here
# unsqueeze(0) adds the batch dimension in front, matching the squeeze(0)
# below; the original unsqueeze(-1) appended a trailing dimension instead.
adversarial_mask = fgsm(frog_image.unsqueeze(0),
                        batch_labels,
                        loss_function,
                        model_to_break)
# Add the perturbation to the original image.
adversarial_image = adversarial_mask.squeeze(0) + frog_image

## Другое

In [None]:
! nvidia-smi

"nvidia-smi" не является внутренней или внешней
командой, исполняемой программой или пакетным файлом.


In [None]:
import gc
# Remove this name's reference to the tensor so it can be reclaimed.
del tensor_to_be_deleted
# Run a garbage-collection pass right away instead of waiting for the next
# automatic one.
gc.collect()



# Показ изображений

In [None]:
# Show an image

# NOTE(review): assumes `tensor_image` is laid out (C, H, W); imshow wants
# the channel axis last - confirm against the data.
plt.imshow(tensor_image.permute(1, 2, 0))  # permute does not allocate new memory

plt.imshow(transforms.ToPILImage()(image), interpolation="bicubic")
# transforms.ToPILImage()(image).show()  # alternatively


def show(img):
    """Display the tensor ``img`` after moving its first axis to the end.

    The original definition was indented at the top level of the cell,
    which is an IndentationError.
    """
    # detach().cpu() makes .numpy() work for tensors that require grad or
    # live on a GPU as well.
    npimg = img.detach().cpu().numpy()
    plt.imshow(np.transpose(npimg, (1, 2, 0)), interpolation='nearest')


##### test_image = Image.open(test_image_name).convert('RGB')