In [3]:
import lib
import yaml
import os, sys
import wandb

import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

In [5]:
import pickle

with open('opts.pkl', 'rb') as f:
    opts = pickle.load(f)
opts

{'saliency': True,
 'name': 'check main.py #2',
 'device': 0,
 'wandb': False,
 'visualize_saliency': False,
 'clive': {'images': 'iqa/CLIVE/images',
  'saliency_maps': 'iqa/CLIVE/saliency_maps',
  'data': 'iqa/CLIVE/data.csv'},
 'koniq10k': {'images': 'iqa/koniq10k/images',
  'saliency_maps': 'iqa/koniq10k/saliency_maps',
  'data': 'iqa/koniq10k/data.csv'},
 'image_shape': [384, 512],
 'saliency_shape': [12, 16],
 'batch_size': 16,
 'num_workers': 16,
 'learning_rate_head': 0.0003,
 'learning_rate_backbone': 3e-05,
 'max_epochs': 2,
 'model_checkpoint_best': {'monitor': 'val_srocc',
  'mode': 'max',
  'save_top_k': 1,
  'save_weights_only': True,
  'verbose': True},
 'model_checkpoint_last': {'save_weights_only': True, 'verbose': True},
 'early_stopping': {'monitor': 'val_srocc',
  'mode': 'max',
  'patience': 8,
  'verbose': True},
 'lr_scheduler': {'step_size': 7, 'gamma': 0.1, 'verbose': True},
 'alpha_sal': 0.2,
 'lr_dict': {'interval': 'epoch', 'frequency': 1}}

In [6]:
opts['name'] = 'debug_checkpoint'

In [10]:
loaders = lib.get_loaders(opts, fast=True)
validation_batch = next(iter(loaders['valid']))

model = lib.Model(opts, validation_batch=validation_batch)

if opts['wandb']:
    wandb.init(
        entity='alexkkir',
        project='IQA', 
        name=opts['name'],
        config=opts
    )
    logger = WandbLogger()
else:
    logger = None

CheckpointLast = ModelCheckpoint(
    dirpath=f"checkpoints/{opts['name']}/",
    filename=f'last_date={lib.today()}_' + '{val_srocc:.3f}_{epoch}',
    **opts['model_checkpoint_last'])

CheckpointBest = ModelCheckpoint(
    dirpath=f"checkpoints/{opts['name']}/",
    filename=f'best_date={lib.today()}_' + '{val_srocc:.3f}_{epoch}',
    **opts['model_checkpoint_best'])

MyEarlyStopping = EarlyStopping(**opts['early_stopping'])

trainer = pl.Trainer(
    logger=logger,
    max_epochs=7,
    accelerator='gpu',
    devices=[opts['device']],
    callbacks=[MyEarlyStopping, CheckpointLast, CheckpointBest],
    log_every_n_steps=1,
)

trainer.fit(model, loaders['train'], loaders['valid'])

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name     | Type       | Params
----------------------------------------
0 | backbone | Sequential | 7.7 M 
1 | mlp      | Sequential | 1.7 M 
2 | sal_conv | Conv2d     | 1.4 K 
3 | mse_loss | MSELoss    | 0     
----------------------------------------
9.4 M     Trainable params
0         Non-trainable params
9.4 M     Total params
37.642    Total estimated model params size (MB)


Adjusting learning rate of group 0 to 3.0000e-05.
Adjusting learning rate of group 1 to 3.0000e-04.


Sanity Checking: 0it [00:00, ?it/s]

[Epoch   0] VALID plcc: -0.49, srocc: -0.48, loss: 3016.69 

Training: 0it [00:00, ?it/s]

Adjusting learning rate of group 0 to 3.0000e-05.
Adjusting learning rate of group 1 to 3.0000e-04.


Validation: 0it [00:00, ?it/s]

Metric val_srocc improved. New best score: 0.088
Epoch 0, global step 8: 'val_srocc' reached 0.08833 (best 0.08833), saving model to '/home/alexkkir/experiments-with-saliency/checkpoints/debug_checkpoint/best_date=2023-01-31_19:12_val_srocc=0.088_epoch=0.ckpt' as top 1


[Epoch   0] VALID plcc: 0.10, srocc: 0.09, loss: 3220.67 | TRAIN plcc: 0.08, srocc: 0.10, loss: 2654.96
Adjusting learning rate of group 0 to 3.0000e-05.
Adjusting learning rate of group 1 to 3.0000e-04.


Validation: 0it [00:00, ?it/s]

Metric val_srocc improved by 0.074 >= min_delta = 0.0. New best score: 0.162
Epoch 1, global step 16: 'val_srocc' reached 0.16195 (best 0.16195), saving model to '/home/alexkkir/experiments-with-saliency/checkpoints/debug_checkpoint/best_date=2023-01-31_19:12_val_srocc=0.162_epoch=1.ckpt' as top 1


[Epoch   1] VALID plcc: 0.17, srocc: 0.16, loss: 3197.66 | TRAIN plcc: 0.41, srocc: 0.36, loss: 2689.56
Adjusting learning rate of group 0 to 3.0000e-05.
Adjusting learning rate of group 1 to 3.0000e-04.


Validation: 0it [00:00, ?it/s]

Metric val_srocc improved by 0.059 >= min_delta = 0.0. New best score: 0.221
Epoch 2, global step 24: 'val_srocc' reached 0.22116 (best 0.22116), saving model to '/home/alexkkir/experiments-with-saliency/checkpoints/debug_checkpoint/best_date=2023-01-31_19:12_val_srocc=0.221_epoch=2.ckpt' as top 1


[Epoch   2] VALID plcc: 0.24, srocc: 0.22, loss: 3169.11 | TRAIN plcc: 0.50, srocc: 0.43, loss: 2703.99
Adjusting learning rate of group 0 to 3.0000e-05.
Adjusting learning rate of group 1 to 3.0000e-04.


Validation: 0it [00:00, ?it/s]

Metric val_srocc improved by 0.032 >= min_delta = 0.0. New best score: 0.253
Epoch 3, global step 32: 'val_srocc' reached 0.25331 (best 0.25331), saving model to '/home/alexkkir/experiments-with-saliency/checkpoints/debug_checkpoint/best_date=2023-01-31_19:12_val_srocc=0.253_epoch=3.ckpt' as top 1


[Epoch   3] VALID plcc: 0.27, srocc: 0.25, loss: 3135.11 | TRAIN plcc: 0.63, srocc: 0.60, loss: 2652.73
Adjusting learning rate of group 0 to 3.0000e-05.
Adjusting learning rate of group 1 to 3.0000e-04.


Validation: 0it [00:00, ?it/s]

Metric val_srocc improved by 0.038 >= min_delta = 0.0. New best score: 0.291
Epoch 4, global step 40: 'val_srocc' reached 0.29126 (best 0.29126), saving model to '/home/alexkkir/experiments-with-saliency/checkpoints/debug_checkpoint/best_date=2023-01-31_19:12_val_srocc=0.291_epoch=4.ckpt' as top 1


[Epoch   4] VALID plcc: 0.31, srocc: 0.29, loss: 3099.80 | TRAIN plcc: 0.64, srocc: 0.62, loss: 2602.95
Adjusting learning rate of group 0 to 3.0000e-05.
Adjusting learning rate of group 1 to 3.0000e-04.


Validation: 0it [00:00, ?it/s]

Epoch 5, global step 48: 'val_srocc' was not in top 1


[Epoch   5] VALID plcc: 0.31, srocc: 0.29, loss: 3049.18 | TRAIN plcc: 0.66, srocc: 0.61, loss: 2485.68
Adjusting learning rate of group 0 to 3.0000e-06.
Adjusting learning rate of group 1 to 3.0000e-05.


Validation: 0it [00:00, ?it/s]

Epoch 6, global step 56: 'val_srocc' was not in top 1
`Trainer.fit` stopped: `max_epochs=7` reached.


[Epoch   6] VALID plcc: 0.28, srocc: 0.26, loss: 3007.66 | TRAIN plcc: 0.76, srocc: 0.74, loss: 2623.53


In [12]:
CheckpointBest.best_model_path

'/home/alexkkir/experiments-with-saliency/checkpoints/debug_checkpoint/best_date=2023-01-31_19:12_val_srocc=0.291_epoch=4.ckpt'

In [15]:
CheckpointLast.best_model_path

'/home/alexkkir/experiments-with-saliency/checkpoints/debug_checkpoint/last_date=2023-01-31_19:12_val_srocc=0.262_epoch=6.ckpt'

In [17]:
model.test_dashboard = 'test_koniq_best'
trainer.test(ckpt_path=CheckpointBest.best_model_path, dataloaders=loaders['test_koniq'])

model.test_dashboard = 'test_koniq_last'
trainer.test(ckpt_path=CheckpointLast.best_model_path, dataloaders=loaders['test_koniq'])

Restoring states from the checkpoint path at /home/alexkkir/experiments-with-saliency/checkpoints/debug_checkpoint/best_date=2023-01-31_19:12_val_srocc=0.291_epoch=4.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
Loaded model weights from checkpoint at /home/alexkkir/experiments-with-saliency/checkpoints/debug_checkpoint/best_date=2023-01-31_19:12_val_srocc=0.291_epoch=4.ckpt


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
  test_koniq_best/loss        3099.802734375
  test_koniq_best/plcc      0.3071355956043797
  test_koniq_best/srocc     0.2912630017114256
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


Restoring states from the checkpoint path at /home/alexkkir/experiments-with-saliency/checkpoints/debug_checkpoint/last_date=2023-01-31_19:12_val_srocc=0.262_epoch=6.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
Loaded model weights from checkpoint at /home/alexkkir/experiments-with-saliency/checkpoints/debug_checkpoint/last_date=2023-01-31_19:12_val_srocc=0.262_epoch=6.ckpt


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
  test_koniq_last/loss       3007.66455078125
  test_koniq_last/plcc      0.27980686079265105
  test_koniq_last/srocc     0.26202551579937994
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_koniq_last/loss': 3007.66455078125,
  'test_koniq_last/plcc': 0.27980686079265105,
  'test_koniq_last/srocc': 0.26202551579937994}]