In [1]:

from copy import deepcopy

import torch.optim as optim
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, transforms
from tqdm import tqdm
import time

In [2]:
from senmodel.model.utils import *
from senmodel.metrics.nonlinearity_metrics import *
from senmodel.metrics.edge_finder import *

In [3]:
# Seed PyTorch's global RNG so that weight initialisation and the later
# `random_split` / shuffling are repeatable on a fresh "Run All".
torch.manual_seed(0)
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
# def get_params_amount(model, eps=1e-8):
#     amount = 0
#     for linear in model.embed_linears:
#         amount += linear.weight_values.shape[0]
#     amount += model.weight_values.shape[0]
#     return amount

def get_params_amount(model):
    """Count the weights stored in the direct child layers of ``model``.

    Only immediate children (``model.named_children()``) are inspected:

    * an ``ExpandingLinear`` child contributes the length of its own
      ``weight_values`` plus the length of ``weight_values`` of every entry in
      its ``embed_linears``;
    * an ``nn.Linear`` child contributes ``in_features * out_features``
      (the bias is not counted);
    * children of any other type are ignored.

    Args:
        model: module whose direct children are scanned.

    Returns:
        int: total number of counted weights (0 if no child matches).
    """
    amount = 0
    for _, layer in model.named_children():
        if isinstance(layer, ExpandingLinear):
            amount += layer.weight_values.shape[0]
            amount += sum(embedded.weight_values.shape[0] for embedded in layer.embed_linears)
        elif isinstance(layer, nn.Linear):
            # Read the sizes from the dense layer being visited. The previous code
            # used `linear`, the inner-loop variable of the other branch, which is
            # either unbound or points at an unrelated embedded layer.
            amount += layer.in_features * layer.out_features
    return amount

In [5]:
# def get_zero_params_amount(model, eps=1e-8):
#     amount = 0
#     for linear in model.embed_linears:
#         amount += linear.weight_values[linear.weight_values.abs() < eps].shape[0]
#     amount += model.weight_values[model.weight_values.abs() < eps].shape[0]
#     return amount

def get_zero_params_amount(model, eps=1e-8):
    """Count the weights of ``model``'s direct children whose magnitude is below ``eps``.

    Only immediate children (``model.named_children()``) are inspected:

    * an ``ExpandingLinear`` child contributes the near-zero entries of its own
      ``weight_values`` and of ``weight_values`` of every entry in ``embed_linears``;
    * an ``nn.Linear`` child contributes the near-zero entries of ``weight``
      (the bias is not inspected);
    * children of any other type are ignored.

    Args:
        model: module whose direct children are scanned.
        eps: a weight ``w`` counts as zero when ``abs(w) < eps``.

    Returns:
        int: number of counted near-zero weights.
    """
    amount = 0
    for _, layer in model.named_children():
        if isinstance(layer, ExpandingLinear):
            for embedded in layer.embed_linears:
                amount += embedded.weight_values[embedded.weight_values.abs() < eps].shape[0]
            amount += layer.weight_values[layer.weight_values.abs() < eps].shape[0]
        elif isinstance(layer, nn.Linear):
            # Inspect the dense layer being visited. The previous code used
            # `linear`, the inner-loop variable of the other branch, which is
            # either unbound or points at an unrelated embedded layer.
            amount += layer.weight[layer.weight.abs() < eps].numel()
    return amount

In [6]:
def train_sparse_recursive(model, train_loader, val_loader, num_epochs, metric, window_size=5, threshold=0.2,
                           lr=1e-4, layer_name='fc1', choose_threshold=0.3, aggregation_mode='mean',
                           min_epochs_between_replacements=8):
    """Train ``model`` and run an edge replacement on one layer whenever validation loss plateaus.

    Every epoch performs one pass over ``train_loader`` (Adam, cross-entropy loss),
    evaluates on ``val_loader``, prints a one-line summary and logs metrics to
    ``wandb``. Batches are passed to the model exactly as the loaders yield them;
    no device transfer is done here.

    Plateau rule: once more than ``window_size`` validation losses are recorded and
    more than ``min_epochs_between_replacements`` epochs have passed since the last
    replacement (counted from epoch 0 before the first one), the mean absolute
    change between consecutive validation losses over the last ``window_size``
    steps is computed. If it is below ``threshold``,
    ``edge_replacement_func_new_layer`` is called for ``layer_name`` with an
    all-``True`` mask shaped like that layer's ``weight_values``.

    Args:
        model: network to train; must expose ``layer_name`` as an attribute whose
            value has ``weight_values``.
        train_loader: iterable of ``(inputs, targets)`` batches supporting ``len()``.
        val_loader: iterable of ``(inputs, targets)`` batches supporting ``len()``.
        num_epochs: number of epochs to run.
        metric: edge metric forwarded to ``edge_replacement_func_new_layer``.
        window_size: number of recent validation-loss changes averaged for the plateau test.
        threshold: plateau is declared when the averaged change is below this value.
        lr: Adam learning rate.
        layer_name: attribute name of the layer whose edges are replaced.
        choose_threshold: selection threshold forwarded to the edge replacement.
        aggregation_mode: aggregation mode forwarded to the edge replacement.
        min_epochs_between_replacements: cooldown, in epochs, between two replacements.

    Returns:
        None. Results are reported through ``print`` and ``wandb.log``; ``wandb``
        must be importable as a global name when this function is called.
    """
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    last_replace_epoch = 0
    val_losses = []
    for epoch in range(num_epochs):
        epoch_start = time.time()

        # ---- training pass ----
        model.train()
        train_loss = 0
        for inputs, targets in tqdm(train_loader):
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        train_loss /= len(train_loader)
        train_time = time.time() - epoch_start

        # ---- validation pass ----
        model.eval()
        val_loss = 0
        all_targets = []
        all_preds = []
        with torch.no_grad():
            for inputs, targets in val_loader:
                outputs = model(inputs)
                val_loss += criterion(outputs, targets).item()

                all_targets.extend(targets.cpu().tolist())
                all_preds.extend(torch.argmax(outputs, dim=1).cpu().tolist())

        val_loss /= len(val_loader)
        val_accuracy = accuracy_score(all_targets, all_preds)

        print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, "
              f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")
        val_losses.append(val_loss)

        # ---- plateau detection and edge replacement ----
        window_filled = len(val_losses) > window_size
        cooldown_over = epoch - last_replace_epoch > min_epochs_between_replacements
        if window_filled and cooldown_over:
            recent_changes = [abs(val_losses[i] - val_losses[i - 1]) for i in range(-window_size, 0)]
            avg_change = sum(recent_changes) / window_size
            if avg_change < threshold:
                layer = getattr(model, layer_name)
                # Every existing edge of the layer is a candidate for replacement.
                mask = torch.ones_like(layer.weight_values, dtype=bool)
                len_choose = edge_replacement_func_new_layer(
                    model, layer_name, mask, optimizer, val_loader, metric,
                    choose_threshold, aggregation_mode)

                wandb.log({'len_choose': len_choose})
                last_replace_epoch = epoch

        # ---- per-epoch logging ----
        params_amount = get_params_amount(model)
        zero_params_amount = get_zero_params_amount(model)
        # Guard against models in which no layer is counted (params_amount == 0).
        if params_amount:
            params_ratio = (params_amount - zero_params_amount) / params_amount
        else:
            params_ratio = 0.0
        wandb.log({'val loss': val_loss, 'val accuracy': val_accuracy,
                   'train loss': train_loss, 'params amount': params_amount,
                   'zero params amount': zero_params_amount, 'train time': train_time,
                   'params ratio': params_ratio,
                   'lr': optimizer.param_groups[0]['lr']})

def edge_replacement_func_new_layer(model, layer_name, mask, optim, val_loader, metric, choose_threshold, aggregation_mode='mean'):
    """Pick edges of one layer with an ``EdgeFinder`` and replace them.

    An ``EdgeFinder`` built from ``metric``, ``val_loader``, the global ``device``
    and ``aggregation_mode`` selects edges via ``choose_edges_threshold``. The
    selection is unpacked into ``layer.replace_many(...)``, which is called even
    when nothing was selected. When at least one edge was selected, the
    ``weight_values`` of the last entry in ``layer.embed_linears`` and the layer's
    own ``weight_values`` are added to the optimizer as new parameter groups.

    Args:
        model: network that exposes the target layer as attribute ``layer_name``.
        layer_name: attribute name of the layer to modify.
        mask: mask forwarded unchanged to ``choose_edges_threshold``.
        optim: optimizer instance (note: inside this function the name shadows the
            ``torch.optim`` module alias imported at the top of the notebook).
        val_loader: data loader handed to the ``EdgeFinder``.
        metric: edge metric handed to the ``EdgeFinder``.
        choose_threshold: threshold forwarded to ``choose_edges_threshold``.
        aggregation_mode: aggregation mode handed to the ``EdgeFinder``.

    Returns:
        int: ``len(chosen_edges[0])``, i.e. the number of selected edges.
    """
    # getattr() is the supported way to look an attribute up by name; calling the
    # `__getattr__` dunder directly only works on objects that define that hook.
    layer = getattr(model, layer_name)
    edge_finder = EdgeFinder(metric, val_loader, device, aggregation_mode)
    chosen_edges = edge_finder.choose_edges_threshold(model, layer_name, choose_threshold, mask)
    chosen_count = len(chosen_edges[0])
    print("Chosen edges:", chosen_edges, chosen_count)
    layer.replace_many(*chosen_edges)

    if chosen_count > 0:
        # NOTE(review): presumably replace_many appends a new embedded layer and
        # rebuilds `layer.weight_values`, so both need registering - confirm.
        optim.add_param_group({'params': layer.embed_linears[-1].weight_values})
        optim.add_param_group({'params': layer.weight_values})
    print(chosen_count)
    return chosen_count

# def edge_replacement_func_new_layer(model, optim, val_loader, metric, choose_threshold, aggregation_mode='mean', len_choose=None):
#     layer = get_model_last_layer(model)
#     ef = EdgeFinder(metric, val_loader, device, aggregation_mode)
#     vals = ef.calculate_edge_metric_for_dataloader(model, len_choose, False)
#     print("Edge metrics:", vals, max(vals, default=0), sum(vals))
#     chosen_edges = ef.choose_edges_threshold(model, choose_threshold, len_choose)
#     print("Chosen edges:", chosen_edges, len(chosen_edges[0]))
#     layer.replace_many(*chosen_edges)

#     if len(chosen_edges[0]) > 0:
#         optim.add_param_group({'params': layer.embed_linears[-1].weight_values})
#         # optim.add_param_group({'params': layer.weight_values})
#     else:
#         print("Empty metric")

#     return {'max': max(vals, default=0), 'sum': sum(vals), 'len': len(vals), 'len_choose': layer.count_replaces[-1]}

In [7]:
class SimpleFCN(nn.Module):
    """Two-layer fully connected classifier: ``Linear -> ReLU -> Linear``.

    Args:
        input_size: number of input features (defaults to ``28 * 28``).
        hidden_size: width of the hidden layer.
        num_classes: number of output scores produced by the last layer
            (defaults to 10, the previously hard-coded value).
    """

    def __init__(self, input_size=28 * 28, hidden_size=16, num_classes=10):
        super().__init__()
        # Creation order is kept (fc0, fc1, act) so that child ordering and the
        # random initialisation sequence stay the same as before.
        self.fc0 = nn.Linear(input_size, hidden_size)
        self.fc1 = nn.Linear(hidden_size, num_classes)
        self.act = nn.ReLU()

    def forward(self, x):
        """Return the output of ``fc1`` applied to the ReLU-activated output of ``fc0``."""
        hidden = self.act(self.fc0(x))
        return self.fc1(hidden)

In [8]:
# Dataset and dataloaders.
# Each image is converted to a tensor and then flattened into a 1-D vector so it
# can be fed directly to the fully connected model.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Lambda(lambda x: x.view(-1))
])

# Load the MNIST training split (downloaded into ./data when requested by
# `download=True`) and divide it 80% / 20% into train and validation subsets.
dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
# No explicit generator is passed, so the split depends on the global torch RNG
# state (seeded with torch.manual_seed(0) earlier in the notebook).
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Training batches are reshuffled every epoch; validation order is kept fixed.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

In [9]:
# Loss handed to the edge metrics below (the training loop builds its own criterion).
criterion = nn.CrossEntropyLoss()
# Candidate edge metrics; only the uncommented entries are active, and the
# training call elsewhere in the notebook uses `metrics[0]`.
metrics = [
    MagnitudeL2Metric(criterion),
    # SNIPMetric(criterion),
    # GradientMeanEdgeMetric(criterion),
    # PerturbationSensitivityEdgeMetric(criterion),
]
# Dense baseline network, then its sparse counterpart built from both linear layers.
# NOTE(review): whether convert_dense_to_sparse_network modifies `model` in place or
# returns an independent copy is not visible from this notebook - confirm.
model = SimpleFCN()
sparse_model = convert_dense_to_sparse_network(model, layers=[model.fc0, model.fc1])

In [10]:
# NOTE(review): `wandb` is used inside `train_sparse_recursive`, which is defined in
# an earlier cell; this import has to run before that function is called. Consider
# moving it to the import cell at the top of the notebook.
import wandb

# Authenticate this session with Weights & Biases.
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mvanyamironov[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [11]:
# Start the W&B run that receives the `wandb.log` calls made during training.
run = wandb.init(
    project="self-expanding-nets",
    name="trash",  # plain literal: the f-prefix had no placeholders to format
)

In [12]:
# Train the sparse model for 64 epochs, using the first entry of `metrics` as the edge metric.
train_sparse_recursive(sparse_model, train_loader, val_loader, 64, metrics[0])

  0%|          | 0/750 [00:00<?, ?it/s]

100%|██████████| 750/750 [00:03<00:00, 245.15it/s]


Epoch 1/64, Train Loss: 1.6563, Val Loss: 1.1339, Val Accuracy: 0.7594


100%|██████████| 750/750 [00:03<00:00, 233.52it/s]


Epoch 2/64, Train Loss: 0.8690, Val Loss: 0.6870, Val Accuracy: 0.8444


100%|██████████| 750/750 [00:02<00:00, 254.62it/s]


Epoch 3/64, Train Loss: 0.5849, Val Loss: 0.5184, Val Accuracy: 0.8729


100%|██████████| 750/750 [00:03<00:00, 244.60it/s]


Epoch 4/64, Train Loss: 0.4708, Val Loss: 0.4410, Val Accuracy: 0.8859


100%|██████████| 750/750 [00:02<00:00, 271.39it/s]


Epoch 5/64, Train Loss: 0.4129, Val Loss: 0.3975, Val Accuracy: 0.8948


100%|██████████| 750/750 [00:03<00:00, 236.27it/s]


Epoch 6/64, Train Loss: 0.3778, Val Loss: 0.3700, Val Accuracy: 0.9004


100%|██████████| 750/750 [00:02<00:00, 278.29it/s]


Epoch 7/64, Train Loss: 0.3539, Val Loss: 0.3488, Val Accuracy: 0.9042


100%|██████████| 750/750 [00:02<00:00, 286.08it/s]


Epoch 8/64, Train Loss: 0.3362, Val Loss: 0.3345, Val Accuracy: 0.9082


100%|██████████| 750/750 [00:02<00:00, 272.42it/s]


Epoch 9/64, Train Loss: 0.3226, Val Loss: 0.3229, Val Accuracy: 0.9113


100%|██████████| 750/750 [00:02<00:00, 270.85it/s]


Epoch 10/64, Train Loss: 0.3116, Val Loss: 0.3136, Val Accuracy: 0.9138
shapes torch.Size([160]) torch.Size([2, 43])
Chosen edges: tensor([[ 0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  2,  2,  2,  2,  3,  3,  4,
          4,  4,  4,  4,  4,  4,  5,  5,  5,  6,  6,  6,  6,  6,  6,  7,  7,  7,
          7,  8,  9,  9,  9,  9,  9],
        [ 0,  1,  5,  9, 10, 13,  0,  4,  7,  8, 10,  2, 11, 13, 15, 11, 15,  0,
          1,  2,  3,  5, 14, 15,  4, 12, 13,  1,  2,  3,  9, 11, 15,  4,  5, 12,
         14,  0,  2,  6,  9, 10, 12]]) 43
43


100%|██████████| 750/750 [00:03<00:00, 228.68it/s]


Epoch 11/64, Train Loss: 0.2986, Val Loss: 0.2984, Val Accuracy: 0.9160


100%|██████████| 750/750 [00:03<00:00, 221.04it/s]


Epoch 12/64, Train Loss: 0.2853, Val Loss: 0.2879, Val Accuracy: 0.9198


100%|██████████| 750/750 [00:03<00:00, 202.53it/s]


Epoch 13/64, Train Loss: 0.2753, Val Loss: 0.2797, Val Accuracy: 0.9214


100%|██████████| 750/750 [00:03<00:00, 203.51it/s]


Epoch 14/64, Train Loss: 0.2668, Val Loss: 0.2727, Val Accuracy: 0.9228


100%|██████████| 750/750 [00:03<00:00, 235.40it/s]


Epoch 15/64, Train Loss: 0.2594, Val Loss: 0.2664, Val Accuracy: 0.9250


100%|██████████| 750/750 [00:03<00:00, 241.98it/s]


Epoch 16/64, Train Loss: 0.2527, Val Loss: 0.2611, Val Accuracy: 0.9257


100%|██████████| 750/750 [00:03<00:00, 240.46it/s]


Epoch 17/64, Train Loss: 0.2465, Val Loss: 0.2556, Val Accuracy: 0.9270


100%|██████████| 750/750 [00:03<00:00, 247.19it/s]


Epoch 18/64, Train Loss: 0.2405, Val Loss: 0.2517, Val Accuracy: 0.9286


100%|██████████| 750/750 [00:02<00:00, 257.32it/s]


Epoch 19/64, Train Loss: 0.2348, Val Loss: 0.2464, Val Accuracy: 0.9290
shapes torch.Size([547]) torch.Size([2, 31])
Chosen edges: tensor([[ 5,  0,  0,  0,  0,  1,  1,  1,  1,  2,  2,  2,  4,  4,  4,  4,  4,  5,
          5,  6,  6,  6,  6,  6,  7,  7,  7,  8,  9,  9,  9],
        [ 9, 16, 18, 20, 21, 23, 24, 25, 26, 27, 28, 29, 33, 34, 37, 38, 39, 41,
         42, 43, 44, 45, 46, 47, 49, 50, 52, 53, 54, 57, 58]]) 31
31


100%|██████████| 750/750 [00:03<00:00, 240.09it/s]


Epoch 20/64, Train Loss: 0.2294, Val Loss: 0.2403, Val Accuracy: 0.9319


100%|██████████| 750/750 [00:03<00:00, 199.89it/s]


Epoch 21/64, Train Loss: 0.2226, Val Loss: 0.2349, Val Accuracy: 0.9332


100%|██████████| 750/750 [00:03<00:00, 217.15it/s]


Epoch 22/64, Train Loss: 0.2163, Val Loss: 0.2299, Val Accuracy: 0.9337


100%|██████████| 750/750 [00:03<00:00, 221.67it/s]


Epoch 23/64, Train Loss: 0.2106, Val Loss: 0.2257, Val Accuracy: 0.9351


100%|██████████| 750/750 [00:03<00:00, 218.13it/s]


Epoch 24/64, Train Loss: 0.2052, Val Loss: 0.2213, Val Accuracy: 0.9352


100%|██████████| 750/750 [00:03<00:00, 232.39it/s]


Epoch 25/64, Train Loss: 0.2003, Val Loss: 0.2170, Val Accuracy: 0.9364


100%|██████████| 750/750 [00:03<00:00, 214.68it/s]


Epoch 26/64, Train Loss: 0.1958, Val Loss: 0.2128, Val Accuracy: 0.9386


100%|██████████| 750/750 [00:03<00:00, 213.08it/s]


Epoch 27/64, Train Loss: 0.1916, Val Loss: 0.2080, Val Accuracy: 0.9395


100%|██████████| 750/750 [00:03<00:00, 198.56it/s]


Epoch 28/64, Train Loss: 0.1873, Val Loss: 0.2066, Val Accuracy: 0.9388
shapes torch.Size([826]) torch.Size([2, 28])
Chosen edges: tensor([[ 5,  0,  0,  0,  1,  1,  1,  2,  2,  2,  4,  4,  4,  4,  4,  5,  5,  6,
          6,  6,  6,  6,  7,  7,  7,  8,  9,  9],
        [59, 61, 62, 63, 64, 65, 66, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78,
         79, 80, 81, 82, 83, 84, 85, 86, 87, 88]]) 28
28


100%|██████████| 750/750 [00:03<00:00, 211.92it/s]


Epoch 29/64, Train Loss: 0.1835, Val Loss: 0.2022, Val Accuracy: 0.9413


100%|██████████| 750/750 [00:03<00:00, 206.17it/s]


Epoch 30/64, Train Loss: 0.1794, Val Loss: 0.1989, Val Accuracy: 0.9433


100%|██████████| 750/750 [00:03<00:00, 196.44it/s]


Epoch 31/64, Train Loss: 0.1752, Val Loss: 0.1950, Val Accuracy: 0.9425


100%|██████████| 750/750 [00:03<00:00, 218.05it/s]


Epoch 32/64, Train Loss: 0.1715, Val Loss: 0.1939, Val Accuracy: 0.9434


100%|██████████| 750/750 [00:03<00:00, 198.57it/s]


Epoch 33/64, Train Loss: 0.1680, Val Loss: 0.1900, Val Accuracy: 0.9458


100%|██████████| 750/750 [00:03<00:00, 193.29it/s]


Epoch 34/64, Train Loss: 0.1647, Val Loss: 0.1891, Val Accuracy: 0.9469


100%|██████████| 750/750 [00:03<00:00, 223.29it/s]


Epoch 35/64, Train Loss: 0.1613, Val Loss: 0.1842, Val Accuracy: 0.9475


100%|██████████| 750/750 [00:03<00:00, 219.60it/s]


Epoch 36/64, Train Loss: 0.1581, Val Loss: 0.1816, Val Accuracy: 0.9487


100%|██████████| 750/750 [00:03<00:00, 212.73it/s]


Epoch 37/64, Train Loss: 0.1557, Val Loss: 0.1816, Val Accuracy: 0.9493
shapes torch.Size([1078]) torch.Size([2, 24])
Chosen edges: tensor([[  5,   0,   0,   1,   1,   2,   2,   4,   4,   4,   4,   5,   5,   6,
           6,   6,   6,   6,   7,   7,   7,   8,   9,   9],
        [ 90,  92,  93,  95,  96,  98,  99, 100, 101, 103, 104, 105, 106, 107,
         108, 109, 110, 111, 112, 113, 114, 115, 116, 117]]) 24
24


100%|██████████| 750/750 [00:03<00:00, 195.27it/s]


Epoch 38/64, Train Loss: 0.1526, Val Loss: 0.1790, Val Accuracy: 0.9485


100%|██████████| 750/750 [00:04<00:00, 169.94it/s]


Epoch 39/64, Train Loss: 0.1500, Val Loss: 0.1757, Val Accuracy: 0.9502


100%|██████████| 750/750 [00:03<00:00, 200.35it/s]


Epoch 40/64, Train Loss: 0.1467, Val Loss: 0.1760, Val Accuracy: 0.9507


100%|██████████| 750/750 [00:03<00:00, 196.62it/s]


Epoch 41/64, Train Loss: 0.1442, Val Loss: 0.1709, Val Accuracy: 0.9528


100%|██████████| 750/750 [00:04<00:00, 184.93it/s]


Epoch 42/64, Train Loss: 0.1413, Val Loss: 0.1719, Val Accuracy: 0.9518


100%|██████████| 750/750 [00:03<00:00, 190.27it/s]


Epoch 43/64, Train Loss: 0.1384, Val Loss: 0.1670, Val Accuracy: 0.9530


100%|██████████| 750/750 [00:03<00:00, 196.16it/s]


Epoch 44/64, Train Loss: 0.1362, Val Loss: 0.1664, Val Accuracy: 0.9543


100%|██████████| 750/750 [00:04<00:00, 174.23it/s]


Epoch 45/64, Train Loss: 0.1339, Val Loss: 0.1642, Val Accuracy: 0.9534


100%|██████████| 750/750 [00:03<00:00, 210.59it/s]


Epoch 46/64, Train Loss: 0.1312, Val Loss: 0.1660, Val Accuracy: 0.9527
shapes torch.Size([1294]) torch.Size([2, 25])
Chosen edges: tensor([[  4,   5,   0,   0,   1,   1,   2,   2,   4,   4,   4,   4,   5,   5,
           6,   6,   6,   6,   6,   7,   7,   7,   8,   9,   9],
        [102, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
         131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141]]) 25
25


100%|██████████| 750/750 [00:04<00:00, 180.39it/s]


Epoch 47/64, Train Loss: 0.1296, Val Loss: 0.1633, Val Accuracy: 0.9537


100%|██████████| 750/750 [00:04<00:00, 180.25it/s]


Epoch 48/64, Train Loss: 0.1274, Val Loss: 0.1607, Val Accuracy: 0.9542


100%|██████████| 750/750 [00:03<00:00, 205.22it/s]


Epoch 49/64, Train Loss: 0.1249, Val Loss: 0.1603, Val Accuracy: 0.9537


100%|██████████| 750/750 [00:04<00:00, 161.49it/s]


Epoch 50/64, Train Loss: 0.1223, Val Loss: 0.1577, Val Accuracy: 0.9546


100%|██████████| 750/750 [00:04<00:00, 179.77it/s]


Epoch 51/64, Train Loss: 0.1209, Val Loss: 0.1555, Val Accuracy: 0.9553


100%|██████████| 750/750 [00:03<00:00, 197.33it/s]


Epoch 52/64, Train Loss: 0.1179, Val Loss: 0.1549, Val Accuracy: 0.9563


100%|██████████| 750/750 [00:03<00:00, 197.29it/s]


Epoch 53/64, Train Loss: 0.1158, Val Loss: 0.1556, Val Accuracy: 0.9559


100%|██████████| 750/750 [00:03<00:00, 198.47it/s]


Epoch 54/64, Train Loss: 0.1139, Val Loss: 0.1532, Val Accuracy: 0.9564


100%|██████████| 750/750 [00:03<00:00, 192.11it/s]


Epoch 55/64, Train Loss: 0.1120, Val Loss: 0.1527, Val Accuracy: 0.9577
shapes torch.Size([1519]) torch.Size([2, 26])
Chosen edges: tensor([[  3,   4,   5,   0,   0,   1,   1,   2,   2,   4,   4,   4,   4,   5,
           5,   6,   6,   6,   6,   6,   7,   7,   7,   8,   9,   9],
        [ 31, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154,
         155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166]]) 26
26


100%|██████████| 750/750 [00:04<00:00, 159.16it/s]


Epoch 56/64, Train Loss: 0.1107, Val Loss: 0.1533, Val Accuracy: 0.9568


100%|██████████| 750/750 [00:04<00:00, 167.54it/s]


Epoch 57/64, Train Loss: 0.1087, Val Loss: 0.1520, Val Accuracy: 0.9574


100%|██████████| 750/750 [00:04<00:00, 157.37it/s]


Epoch 58/64, Train Loss: 0.1068, Val Loss: 0.1522, Val Accuracy: 0.9573


100%|██████████| 750/750 [00:04<00:00, 176.64it/s]


Epoch 59/64, Train Loss: 0.1051, Val Loss: 0.1493, Val Accuracy: 0.9584


100%|██████████| 750/750 [00:04<00:00, 178.58it/s]


Epoch 60/64, Train Loss: 0.1033, Val Loss: 0.1493, Val Accuracy: 0.9571


100%|██████████| 750/750 [00:04<00:00, 163.56it/s]


Epoch 61/64, Train Loss: 0.1018, Val Loss: 0.1471, Val Accuracy: 0.9587


100%|██████████| 750/750 [00:03<00:00, 195.33it/s]


Epoch 62/64, Train Loss: 0.1002, Val Loss: 0.1460, Val Accuracy: 0.9583


100%|██████████| 750/750 [00:04<00:00, 176.09it/s]


Epoch 63/64, Train Loss: 0.0982, Val Loss: 0.1476, Val Accuracy: 0.9594


100%|██████████| 750/750 [00:04<00:00, 171.34it/s]


Epoch 64/64, Train Loss: 0.0973, Val Loss: 0.1463, Val Accuracy: 0.9587
shapes torch.Size([1753]) torch.Size([2, 26])
Chosen edges: tensor([[  3,   4,   5,   0,   0,   1,   1,   2,   2,   4,   4,   4,   4,   5,
           5,   6,   6,   6,   6,   6,   7,   7,   7,   8,   9,   9],
        [167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180,
         181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192]]) 26
26


- прунинг по метрике на следующей эпохе после реплейса
