In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. the local `lib` package) are re-imported before each cell runs and
# edits to them take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os 
import numpy as np
import matplotlib.pyplot as plt
import torch
import pickle
import pytorch_lightning as pl

from pathlib import Path
from pytorch_lightning.callbacks import TQDMProgressBar, ModelCheckpoint, EarlyStopping
from tqdm.notebook import tqdm

from lib.datamodule import ImageNetModule
from lib.model import ImageNetModel

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# set dirs
# NOTE(review): machine-specific absolute path -- adjust when running elsewhere.
data_dir = '/old_home/ammeling/projects/ImageNet/Cats'


def load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened inside a ``with`` block so the handle is closed as soon
    as loading finishes or fails, rather than staying open until the garbage
    collector happens to reclaim it.

    SECURITY: unpickling can execute arbitrary code. Only call this on files
    you created yourself or otherwise trust.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# load helper
# Both files are read relative to the current working directory.
# Presumably: WordNet id -> class name, and class name -> integer label
# (inferred from the variable names; verify against lib.datamodule).
wnid_to_class = load_pickle('wnid_to_class.pkl')
class_to_label = load_pickle('class_to_label.pkl')

In [4]:
# Build the data module from the image directory and the two lookup tables
# loaded earlier; `pilot_data=True` is passed through unchanged
# (presumably selects a reduced pilot subset -- confirm in lib.datamodule).
dm = ImageNetModule(
    pilot_data=True,
    class_to_label=class_to_label,
    wnid_to_class=wnid_to_class,
    img_dir=data_dir,
)

In [5]:
# Seed the random number generators *before* the model is constructed, so
# that anything random from this point on starts from a known state and the
# run can be repeated. `workers=True` additionally asks Lightning to derive
# seeds for dataloader worker processes.
seed = 42
pl.seed_everything(seed, workers=True)

# load model
# Hyperparameters are forwarded as-is to the project model class:
# LogitNorm loss enabled, temperature 2, AdamW optimizer.
model = ImageNetModel(logit_norm=True, temperature=2, optimizer='AdamW')



In [6]:
# Callbacks handed to the trainer: progress display, best-model
# checkpointing, and early stopping.

# Redraw the progress bar every 10 batches.
progress_bar = TQDMProgressBar(refresh_rate=10)

# Keep only the single best checkpoint according to validation accuracy.
# `auto_insert_metric_name=False` lets the filename template reference the
# slash-containing metric key `val/acc` directly.
checkpoint_callback = ModelCheckpoint(
    monitor='val/acc',
    mode='max',
    save_top_k=1,
    dirpath='checkpoints/',
    filename='ImageNetModel_Pilot_epoch{epoch:02d}_val_acc{val/acc:.2f}',
    auto_insert_metric_name=False,
)

# Stop once validation accuracy reaches 0.8, or after 20 validation checks
# without improvement.
earlystopping = EarlyStopping(
    monitor='val/acc',
    mode='max',
    patience=20,
    stopping_threshold=0.8,
    verbose=True,
)

In [7]:
# Upper bound on training length; early stopping may end the run sooner.
max_epochs = 50

# Single-GPU trainer with logging disabled, wired to the callbacks
# defined in the previous cell.
trainer = pl.Trainer(
    accelerator='gpu',
    devices=1,
    max_epochs=max_epochs,
    fast_dev_run=False,
    logger=False,
    callbacks=[
        progress_bar,
        checkpoint_callback,
        earlystopping,
    ],
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [8]:
# Run training: the trainer draws its data from the data module
# and optimizes the model built above.
trainer.fit(model=model, datamodule=dm)

You are using a CUDA device ('NVIDIA RTX A6000') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4]

  | Name              | Type               | Params
---------------------------------------------------------
0 | feature_extractor | Sequential         | 11.2 M
1 | fc                | Linear             | 1.5 K 
2 | loss_func         | LogitNormLoss      | 0     
3 | acc               | MulticlassAccuracy | 0     
---------------------------------------------------------
11.2 M    Trainable params
0         Non-trainable params
11.2 M    Total params
44.712    Total estimated model params size (MB)


Epoch 0: 100%|██████████| 294/294 [00:39<00:00,  7.48it/s, loss=0.614, train/acc_step=0.625, val/acc=0.267, train/acc_epoch=0.640]

Metric val/acc improved. New best score: 0.267


Epoch 1: 100%|██████████| 294/294 [00:39<00:00,  7.46it/s, loss=0.613, train/acc_step=0.688, val/acc=0.567, train/acc_epoch=0.684]

Metric val/acc improved by 0.300 >= min_delta = 0.0. New best score: 0.567


Epoch 5: 100%|██████████| 294/294 [00:39<00:00,  7.47it/s, loss=0.603, train/acc_step=0.625, val/acc=0.633, train/acc_epoch=0.739]

Metric val/acc improved by 0.067 >= min_delta = 0.0. New best score: 0.633


Epoch 9: 100%|██████████| 294/294 [00:39<00:00,  7.48it/s, loss=0.599, train/acc_step=0.562, val/acc=0.667, train/acc_epoch=0.767]

Metric val/acc improved by 0.033 >= min_delta = 0.0. New best score: 0.667


Epoch 12: 100%|██████████| 294/294 [00:39<00:00,  7.37it/s, loss=0.597, train/acc_step=0.688, val/acc=0.700, train/acc_epoch=0.785]

Metric val/acc improved by 0.033 >= min_delta = 0.0. New best score: 0.700


Epoch 14: 100%|██████████| 294/294 [00:38<00:00,  7.57it/s, loss=0.591, train/acc_step=0.938, val/acc=0.733, train/acc_epoch=0.796]

Metric val/acc improved by 0.033 >= min_delta = 0.0. New best score: 0.733


Epoch 34: 100%|██████████| 294/294 [00:38<00:00,  7.68it/s, loss=0.583, train/acc_step=0.875, val/acc=0.667, train/acc_epoch=0.840]

Monitored metric val/acc did not improve in the last 20 records. Best score: 0.733. Signaling Trainer to stop.


Epoch 34: 100%|██████████| 294/294 [00:38<00:00,  7.67it/s, loss=0.583, train/acc_step=0.875, val/acc=0.667, train/acc_epoch=0.840]
