In [1]:
import os
import random
import re

import numpy as np
import torch
import torchvision
from torch.utils.data import Subset
from torchsummary import summary

from models.cifar100.cifar100resnet import CifarResNet
from models.cifar100.cifar100expert import CifarExpert
from models.cifar100.gating_network import GatingNetwork
from models.cifar100.moe import MoE
from utils.cifar100_dataset import CIFAR100Dataset, create_subset

In [2]:
# Training-time preprocessing pipeline: random flip/crop augmentation first,
# then conversion to a tensor and per-channel normalisation.
augmentation_steps = [
    torchvision.transforms.RandomHorizontalFlip(p=0.5),
    torchvision.transforms.RandomCrop(size=32, padding=4),
]
tensor_steps = [
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(
        mean=[0.507, 0.487, 0.441],
        std=[0.267, 0.256, 0.276],
    ),
]
transformations_training = torchvision.transforms.Compose(augmentation_steps + tensor_steps)

In [3]:
# Create the feature extractor that is later handed to every expert and to the
# MoE, then restore its parameters from the baseline checkpoint.
feature_extractor = CifarResNet(name='CIFAR100 Feature Extractor')
baseline_state = torch.load('./trained_models/baseline_model.pth')
# Left as the last expression so the key-matching report is displayed.
feature_extractor.load_state_dict(baseline_state)

<All keys matched successfully>

In [18]:
# Re-create one expert per "expert" file found in the shuffled-MoE folder and
# attach the resulting list to the existing MoE.
#
# The class indices an expert is responsible for are recovered from the runs of
# digits in its file name and decoded via CIFAR100Dataset.CIFAR100_DECODING.
#
# NOTE(review): only the file *names* are used here; no weights are read from
#   the files in this cell. Confirm that is intended.
# NOTE(review): os.listdir() returns entries in arbitrary order, so the order
#   of `experts` is filesystem-dependent. Confirm nothing downstream relies on
#   a specific expert order.
# NOTE(review): `moe` must already exist in the kernel (it is created in the
#   MoE(...) cell); this cell fails on a fresh top-to-bottom run.
experts = []
for file in os.listdir('./trained_models/shuffled_moe/'):
    if 'expert' not in file:
        continue
    # Raw string: '\d' inside a normal string literal is an invalid escape
    # sequence and triggers a Deprecation/SyntaxWarning on modern Python.
    class_indices = [int(digits) for digits in re.findall(r'\d+', file)]
    classes = [CIFAR100Dataset.CIFAR100_DECODING[cls] for cls in class_indices]
    expert = CifarExpert(classes=classes, name=file, feature_extractor=feature_extractor)
    experts.append(expert)
moe.experts = experts

In [19]:
# Sanity check: how many experts are currently attached to the MoE.
len(moe.experts)

60

In [20]:
# Create new experts
#
# Draw the class sets for the experts: 3 rounds, each splitting the 100 encoded
# labels into 10 disjoint groups of 10 (30 class sets in total). Every entry of
# `expert_classes` is a tuple (decoded classes, sorted encoded labels).
#
# NOTE(review): the global `random` state is not seeded in this cell, so the
#   drawn class sets differ from run to run unless a seed is set elsewhere.

expert_classes = []
for round_idx in range(3):
    # Each round starts again from the full label set, so a label can appear in
    # several experts across rounds but only once within a round.
    encoded_class_labels = set(range(100))
    for expert_idx in range(10):
        # random.sample() needs a sequence: passing a set is deprecated since
        # Python 3.9 and raises TypeError from 3.11 on, so sort the pool first.
        label_pool = sorted(encoded_class_labels)
        classes_encoded = sorted(random.sample(label_pool, 10))
        classes_decoded = [CIFAR100Dataset.CIFAR100_DECODING[cls] for cls in classes_encoded]
        expert_classes.append((classes_decoded, classes_encoded))
        # Remove the labels just used so the next group in this round is disjoint.
        encoded_class_labels = encoded_class_labels - set(classes_encoded)

In [21]:
# Instantiate one CifarExpert per sampled class set.
#
# Each entry of `expert_classes` is (decoded classes, encoded labels). The
# encoded labels are joined with '_' to build the expert name, e.g. labels
# [3, 17, 42] give the name 'expert_3_17_42'.
experts = []
for classes_decoded, classes_encoded in expert_classes:
    # str.join replaces the manual "append then special-case the last element"
    # loop and also copes with an empty label list.
    label_string = '_'.join(map(str, classes_encoded))
    expert = CifarExpert(
        classes=classes_decoded,
        name='expert_' + label_string,
        feature_extractor=feature_extractor,
    )
    experts.append(expert)

In [7]:
# Load the training folder with the training-time transforms applied and
# divide it into two parts using the fractions 0.8 / 0.2.
split_fractions = [0.8, 0.2]
cifar_data = CIFAR100Dataset(
    transform=transformations_training,
    data_folder='../data/cifar100/training',
)
training_data, validation_data = cifar_data.train_test_split(split_fractions)

In [8]:
# Assemble the mixture of experts from the expert list, the gating-network
# class and the feature extractor.
#
# The data folder is given relative to the notebook, matching the path used
# when constructing CIFAR100Dataset, instead of a machine-specific absolute
# home-directory path.
# NOTE(review): assumes '../data/cifar100/training' resolves to the same
#   directory the absolute path pointed at. Verify against the repo layout.
moe = MoE(
    experts=experts,
    Gate=GatingNetwork,  # the class itself is passed, not an instance
    name='MoE_shuffled_',
    feature_extractor=feature_extractor,
    data_folder='../data/cifar100/training',
    transform=transformations_training
)

In [22]:
# Train the experts attached to the MoE for 10 epochs each, passing the
# shuffled-MoE folder as the save location.
# NOTE(review): long-running cell; the recorded log below is truncated.
moe.train_experts(save_state_path='./trained_models/shuffled_moe/', num_epochs=10)

--
Training of expert_44_14_94_87_19_6_85_63_9_12.pth
Training on device: cuda:0
Training on 4,023 samples
Validation on 977 samples
Number of parameters: 2,173,710

Epoch 1/10
----------
training Loss: 0.9734  Top1 Accuracy: 0.6622
validation Loss: 0.6515  Top1 Accuracy: 0.7840

Epoch 2/10
----------
training Loss: 0.6079  Top1 Accuracy: 0.7852
validation Loss: 0.6331  Top1 Accuracy: 0.7922

Epoch 3/10
----------
training Loss: 0.5203  Top1 Accuracy: 0.8101
validation Loss: 0.5920  Top1 Accuracy: 0.8045

Epoch 4/10
----------
training Loss: 0.4667  Top1 Accuracy: 0.8342
validation Loss: 0.6351  Top1 Accuracy: 0.8045

Epoch 5/10
----------
training Loss: 0.4634  Top1 Accuracy: 0.8417
validation Loss: 0.6767  Top1 Accuracy: 0.7892

Epoch 6/10
----------
training Loss: 0.3895  Top1 Accuracy: 0.8566
validation Loss: 0.6331  Top1 Accuracy: 0.7953

Epoch 7/10
----------
training Loss: 0.3719  Top1 Accuracy: 0.8732
validation Loss: 0.6089  Top1 Accuracy: 0.8066

Epoch 8/10
----------
trainin

In [23]:
# Train the MoE's gate for 10 epochs, passing gate.pth as the save location.
# NOTE(review): long-running cell; the recorded log below is truncated.
moe.train_gate(save_state_path='./trained_models/shuffled_moe/gate.pth', num_epochs=10)

Training of MoE_shuffled__gate
Training on device: cuda:0
Training on 40,000 samples
Validation on 10,000 samples
Number of parameters: 1,555,490

Epoch 1/10
----------
training Loss: 3.1253  Top1 Accuracy: 0.3848  Top5 Accuracy: 0.6867
validation Loss: 2.8644  Top1 Accuracy: 0.4304  Top5 Accuracy: 0.7130

Epoch 2/10
----------
training Loss: 2.6221  Top1 Accuracy: 0.4898  Top5 Accuracy: 0.7815
validation Loss: 2.7197  Top1 Accuracy: 0.4541  Top5 Accuracy: 0.7424

Epoch 3/10
----------
training Loss: 2.4680  Top1 Accuracy: 0.5231  Top5 Accuracy: 0.8067
validation Loss: 2.6460  Top1 Accuracy: 0.4773  Top5 Accuracy: 0.7513

Epoch 4/10
----------
training Loss: 2.3812  Top1 Accuracy: 0.5400  Top5 Accuracy: 0.8150
validation Loss: 2.6330  Top1 Accuracy: 0.4804  Top5 Accuracy: 0.7539

Epoch 5/10
----------
training Loss: 2.3192  Top1 Accuracy: 0.5531  Top5 Accuracy: 0.8206
validation Loss: 2.6305  Top1 Accuracy: 0.4780  Top5 Accuracy: 0.7585

Epoch 6/10
----------
training Loss: 2.2664  Top

In [15]:
# Restore the gate's parameters from the gate.pth checkpoint.
# NOTE(review): torch.load unpickles the file, so only load checkpoints you trust.
moe.gate.load_state_dict(torch.load('./trained_models/shuffled_moe/gate.pth'))

<All keys matched successfully>

In [24]:
# Evaluate the decisions of the assembled MoE.
# NOTE(review): the recorded run of this cell ended in a RuntimeError about
#   mismatched tensor sizes (60 vs 30). Presumably the number of attached
#   experts no longer matched what the gate was built for; confirm that
#   len(moe.experts) is the expected count before running this.
moe.evaluate_decisions()

RuntimeError: The size of tensor a (60) must match the size of tensor b (30) at non-singleton dimension 0