# Notebook to investigate checkpoint loading bug

Train a model, then check whether the same results can be obtained after loading the model from its checkpoint.

In [37]:
from birdset.datamodule.esc50_datamodule import ESC50DataModule
from birdset.configs.datamodule_configs import DatasetConfig
from birdset.datamodule.components.transforms import BirdSetTransformsWrapper

# Dataset configuration with the task set to single-label ("multiclass").
dataset_config = DatasetConfig(
    task="multiclass",
)
# Build the transforms with the same task as the dataset config so both agree.
transforms = BirdSetTransformsWrapper(
    task=dataset_config.task,
)
# Hand the transforms to the datamodule. Previously `transforms` was created but
# never used; the batch-inspection cell's recorded output printed
# "transform task multilabel", so presumably a default wrapper was used instead.
# NOTE(review): assumes ESC50DataModule accepts a `transforms` keyword — confirm.
datamodule = ESC50DataModule(dataset=dataset_config, transforms=transforms)

In [38]:
# Run the datamodule's data-preparation step before any dataloader is requested.
datamodule.prepare_data()

Repo card metadata block was not found. Setting CardData to empty.


In [39]:
# Set the datamodule up for the "fit" stage.
datamodule.setup("fit")

In [40]:
# Pull a single batch from the training dataloader for manual inspection.
train_loader = datamodule.train_dataloader()
batch = next(iter(train_loader))

# Report the tensor shapes of the model inputs and of the targets.
for batch_key in ("input_values", "labels"):
    print(batch[batch_key].shape)

transform task multilabel
torch.Size([32, 1, 128, 1024])
torch.Size([32])


transform task multilabel
transform task multilabel


In [21]:
# Display the target labels of the sampled batch.
batch["labels"]

tensor([28,  9, 38, 21, 47, 22, 32, 43, 34, 43, 47, 29, 33, 37, 36, 41,  7, 45,
        47, 42,  7, 41, 32, 33, 41, 27, 38, 39, 10,  5, 18, 20])

In [33]:
from functools import partial
from birdset.modules.base_module import BaseModule
from birdset.configs import NetworkConfig, MulticlassMetricsConfig
from birdset.modules.models.efficientnet import EfficientNetClassifier
from torch.optim import AdamW

# Wrap an EfficientNet classifier in the project's BaseModule.
# The class count is read from the datamodule in both places so the metric
# configuration cannot drift from the classifier head (it was hard-coded to 50).
model = BaseModule(
    network=NetworkConfig(
        model=EfficientNetClassifier(
            num_classes=datamodule.num_classes,
        ),
    ),
    metrics=MulticlassMetricsConfig(num_labels=datamodule.num_classes),
    # NOTE(review): presumably this should match the Trainer's `max_epochs` — confirm.
    num_epochs=5,
    len_trainset=datamodule.len_trainset,
    # The optimizer is passed as a partial; the remaining arguments are
    # presumably supplied inside the module — verify against BaseModule.
    optimizer=partial(AdamW, lr=5e-4, weight_decay=5e-4),
)
# Show the module structure as the cell output.
model

BaseModule(
  (loss): CrossEntropyLoss()
  (model): EfficientNetClassifier(
    (model): EfficientNetForImageClassification(
      (efficientnet): EfficientNetModel(
        (embeddings): EfficientNetEmbeddings(
          (padding): ZeroPad2d((0, 1, 0, 1))
          (convolution): Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=valid, bias=False)
          (batchnorm): BatchNorm2d(32, eps=0.001, momentum=0.99, affine=True, track_running_stats=True)
          (activation): SiLU()
        )
        (encoder): EfficientNetEncoder(
          (blocks): ModuleList(
            (0): EfficientNetBlock(
              (depthwise_conv): EfficientNetDepthwiseLayer(
                (depthwise_conv_pad): ZeroPad2d((0, 1, 0, 1))
                (depthwise_conv): EfficientNetDepthwiseConv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=same, groups=32, bias=False)
                (depthwise_norm): BatchNorm2d(32, eps=0.001, momentum=0.99, affine=True, track_running_stats=True)
          

In [23]:
# Forward pass on the sampled batch. The module is called directly rather than
# through `.forward(...)` so that any registered hooks are run as well.
logits = model(batch["input_values"])
# Loss of the model on this batch, using the module's own loss function.
loss = model.loss(logits, batch["labels"])
loss

tensor(3.8957, grad_fn=<NllLossBackward0>)

In [24]:
# Turn the raw logits into scores with the module's output activation
# (the recorded output shows a softmax: grad_fn=<SoftmaxBackward0>).
preds = model.output_activation(logits)
preds

tensor([[0.0192, 0.0206, 0.0209,  ..., 0.0206, 0.0207, 0.0212],
        [0.0178, 0.0193, 0.0194,  ..., 0.0202, 0.0199, 0.0204],
        [0.0188, 0.0192, 0.0201,  ..., 0.0198, 0.0199, 0.0207],
        ...,
        [0.0192, 0.0190, 0.0322,  ..., 0.0138, 0.0279, 0.0262],
        [0.0199, 0.0192, 0.0193,  ..., 0.0206, 0.0195, 0.0204],
        [0.0159, 0.0256, 0.0290,  ..., 0.0199, 0.0279, 0.0254]],
       grad_fn=<SoftmaxBackward0>)

In [25]:
# Index of the highest-scoring entry along dim 1 for each row of `preds`.
max_preds = preds.argmax(dim=1)
max_preds

tensor([32, 44, 22, 32, 28, 19, 44, 43, 19, 37, 37, 22, 36, 39, 39, 41, 37, 22,
        32, 39, 44, 13, 26, 19, 32, 43, 22, 44, 22, 39, 22, 21])

In [26]:
# Evaluate the module's validation metric on this single batch.
# NOTE(review): calling the metric object directly may also accumulate into its
# internal state — confirm this does not affect later validation runs.
model.valid_metric(preds, batch["labels"])

tensor(0.0625)

In [27]:
from lightning import Trainer

# Train on a single GPU for five epochs.
trainer = Trainer(
    accelerator="gpu",
    devices=1,
    max_epochs=5,
)
trainer.fit(model=model, datamodule=datamodule)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name                  | Type                   | Params | Mode 
-------------------------------------------------------------------------
0 | loss                  | CrossEntropyLoss       | 0      | train
1 | model                 | EfficientNetClassifier | 6.6 M  | train
2 | train_metric          | MulticlassAccuracy     | 0      | train
3 | valid_metric          | MulticlassAccuracy     | 0      | train
4 | test_metric           | MulticlassAccuracy     | 0      | train
5 | valid_metric_best     | MaxMetric              | 0      | train
6 | valid_add_metrics     | MetricCollection       | 0      | train
7 | test_add_metrics      | MetricCollection       | 0      | train
8 | test_complete_metrics | MetricCollection       | 0      | train
-------------------------------------------------------------------------
6.6 M     Trainable params
0         Non-trainable params
6.6 M     Total params
26.307    Total estimated model params siz

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


In [28]:
# Baseline: evaluate the in-memory model on the test split.
trainer.test(model=model, datamodule=datamodule)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test/CrossEntropyLoss': 2.503328561782837,
  'test/MulticlassAccuracy': 0.2775000035762787,
  'test/F1': 0.2775000035762787}]

In [31]:
# Reload the module from the checkpoint written by the training run above.
# The path is taken from the trainer's checkpoint callback instead of a
# hard-coded absolute `lightning_logs/version_13/...` path, which only matched
# one particular run on one machine and would load a stale checkpoint (or fail)
# after a fresh re-run of the notebook.
# NOTE(review): relies on `trainer` being the Trainer that ran `fit`, with
# checkpointing left enabled.
model = BaseModule.load_from_checkpoint(
    trainer.checkpoint_callback.best_model_path
)

In [32]:
# Evaluate the currently bound `model` on the test split for comparison with the other runs.
trainer.test(model=model, datamodule=datamodule)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test/CrossEntropyLoss': 2.503328561782837,
  'test/MulticlassAccuracy': 0.2775000035762787,
  'test/F1': 0.2775000035762787}]

In [35]:
# Evaluate once more with a fresh Trainer, this time letting Lightning restore
# the weights itself from the checkpoint file passed as `ckpt_path`.
checkpoint_file = "/workspace/notebooks/development/lightning_logs/version_13/checkpoints/epoch=4-step=200.ckpt"
trainer = Trainer(accelerator="gpu", devices=1, max_epochs=5)
trainer.test(model=model, datamodule=datamodule, ckpt_path=checkpoint_file)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Restoring states from the checkpoint path at /workspace/notebooks/development/lightning_logs/version_13/checkpoints/epoch=4-step=200.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
Loaded model weights from the checkpoint at /workspace/notebooks/development/lightning_logs/version_13/checkpoints/epoch=4-step=200.ckpt


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test/CrossEntropyLoss': 2.503328561782837,
  'test/MulticlassAccuracy': 0.2775000035762787,
  'test/F1': 0.2775000035762787}]