### Training code for the Pavia P24 dataset

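This notebook trains a `denoisplit` model on the Pavia P24 dataset using the CAREamics `VAEModule` and PyTorch Lightning. TODO: add a description of the dataset and the experiment.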

#### General imports

In [26]:
import os

import pooch
from careamics.lightning import VAEModule
from pytorch_lightning import Trainer
from torch.utils.data import DataLoader

from microsplit_reproducibility.configs.factory import (
    get_algorithm_config,
    get_likelihood_config,
    get_loss_config,
    get_model_config,
    get_optimizer_config,
    get_training_config,
    get_lr_scheduler_config,
)
from microsplit_reproducibility.datasets import create_train_val_datasets
from microsplit_reproducibility.utils.callbacks import get_callbacks
from microsplit_reproducibility.utils.io import load_checkpoint
from microsplit_reproducibility.utils.utils import plot_training_metrics

#### Experiment-specific imports

In [2]:
from microsplit_reproducibility.configs.parameters.pavia_p24 import get_denoisplit_parameters
from microsplit_reproducibility.configs.data.pavia_p24 import get_data_configs
from microsplit_reproducibility.datasets.pavia_p24 import get_train_val_data

### Get data and experiment parameters

Example training setup. TODO: support switching between full training, a short run (e.g. 5 epochs), and fine-tuning.

In [45]:
# Unpack the three data configs returned by `get_data_configs()` and fetch the
# experiment parameters (both helpers come from the `pavia_p24` config modules).
# NOTE(review): `test_data_configs` is not referenced by any later cell shown
# in this notebook — confirm whether dataset creation should receive it.
train_data_config, val_data_config, test_data_configs = get_data_configs()
experiment_params = get_denoisplit_parameters()

ValidationError: 2 validation errors for PaviaDataConfig
data_type
  Input should be 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 or 14 [type=enum, input_value=<DataType.Pavia3SeqData: 2>, input_type=DataType]
    For further information visit https://errors.pydantic.dev/2.8/v/enum
datasplit_type
  Input should be 0, 1, 2 or 3 [type=enum, input_value=<DataSplitType.Train: 1>, input_type=DataSplitType]
    For further information visit https://errors.pydantic.dev/2.8/v/enum

In [16]:
# Inspect the training data config: iterating over it yields
# (field name, value) pairs, which are printed one per line.
for config_field in train_data_config:
    print(config_field)

('data_type', <DataType.Pavia3SeqData: 2>)
('depth3D', 1)
('datasplit_type', <DataSplitType.Train: 1>)
('num_channels', 2)
('ch1_fname', None)
('ch2_fname', None)
('ch_input_fname', None)
('input_is_sum', False)
('input_idx', None)
('target_idx_list', None)
('start_alpha', None)
('end_alpha', None)
('image_size', (64, 64))
('grid_size', 32)
('empty_patch_replacement_enabled', False)
('empty_patch_replacement_channel_idx', None)
('empty_patch_replacement_probab', None)
('empty_patch_max_val_threshold', None)
('uncorrelated_channels', False)
('uncorrelated_channel_probab', 0.5)
('poisson_noise_factor', -1.0)
('synthetic_gaussian_scale', None)
('input_has_dependant_noise', True)
('enable_gaussian_noise', False)
('allow_generation', False)
('training_validtarget_fraction', None)
('deterministic_grid', None)
('enable_rotation_aug', False)
('max_val', None)
('overlapping_padding_kwargs', {'mode': 'reflect'})
('print_vars', False)
('normalized_input', True)
('use_one_mu_std', True)
('train_au

In [17]:
# Display the experiment parameters as the cell output.
experiment_params

{'algorithm': 'denoisplit',
 'loss_type': 'denoisplit_musplit',
 'img_size': (64, 64),
 'target_channels': 2,
 'multiscale_count': 3,
 'predict_logvar': 'pixelwise',
 'nm_paths': ['noise_models/noise_model_pavia_p24_channel_0.npz',
  'noise_models/noise_model_pavia_p24_channel_1.npz'],
 'kl_type': 'kl_restricted',
 'batch_size': 32,
 'lr': 0.001,
 'lr_scheduler_patience': 30,
 'earlystop_patience': 200,
 'num_epochs': 400,
 'num_workers': 0,
 'mmse_count': 10,
 'grid_size': 32}

### Create dataset

In [6]:
# Folder that holds the Pavia P24 data on the local machine.
# The location can be overridden without editing the notebook by setting the
# MICROSPLIT_DATA_DIR environment variable; when it is unset, the original
# default path is used.
tmp_local_path = os.environ.get(
    "MICROSPLIT_DATA_DIR", "/localscratch/data/pavia3_sequential_cropped"
)

In [7]:
# Create a pooch data manager rooted at the local data folder.
# `base_url` and `registry` are empty placeholders here.
# NOTE(review): `DATA` is not referenced by any later cell shown in this
# notebook; the datasets below are loaded directly from `tmp_local_path`.
DATA = pooch.create(
    # path=pooch.os_cache("microsplit_reproducibility_pavia_p24"), # TODO should be downloaded and stored locally
    path=tmp_local_path,
    base_url="",
    registry={"":""},
)

In [8]:
# Build the training and validation datasets; the call also returns the data
# statistics (`data_stats`). The third returned value is discarded.
# NOTE(review): `test_config` receives the validation config rather than
# `test_data_configs` — confirm this is intended.
# TODO: creating a dataloader currently requires a config object; find a
# cleaner interface.
dataset_kwargs = dict(
    datapath=tmp_local_path,
    train_config=train_data_config,
    val_config=val_data_config,
    test_config=val_data_config,
    load_data_func=get_train_val_data,
)
train_dset, val_dset, _, data_stats = create_train_val_datasets(**dataset_kwargs)


Explicit datasplit Train [1, 3, 4, 5, 6, 7, 8, 9, 11, 13, 14, 15, 16]
Loaded from SubDsetType.MultiChannel /localscratch/data/pavia3_sequential_cropped 13/17 frames

Padding is not used with this alignement style

Padding is not used with this alignement style

Padding is not used with this alignement style

Padding is not used with this alignement style

Padding is not used with this alignement style

Padding is not used with this alignement style

Padding is not used with this alignement style

Padding is not used with this alignement style

Padding is not used with this alignement style

Padding is not used with this alignement style

Padding is not used with this alignement style

Padding is not used with this alignement style

Padding is not used with this alignement style
MultiFileDset avg height: 1288, avg width: 1092, count: 13

Explicit datasplit Val [2, 12]
Loaded from SubDsetType.MultiChannel /localscratch/data/pavia3_sequential_cropped 2/17 frames

Padding is not used with

### Create dataloaders

In [9]:
# Both loaders share the batch size and worker count from the experiment
# parameters; only the training loader shuffles its samples.
loader_kwargs = {
    "batch_size": experiment_params["batch_size"],
    "num_workers": experiment_params["num_workers"],
}
train_dloader = DataLoader(train_dset, shuffle=True, **loader_kwargs)
val_dloader = DataLoader(val_dset, shuffle=False, **loader_kwargs)

### Get experiment configs

In [18]:
# Add the dataset statistics to the parameter dict, which is then passed to
# every config factory call below.  TODO: rethink this coupling.
experiment_params.update(data_stats=data_stats)

# Individual configs, each built from the same parameter dict.
loss_config = get_loss_config(**experiment_params)
model_config = get_model_config(**experiment_params)
likelihood_configs = get_likelihood_config(**experiment_params)
gaussian_lik_config, noise_model_config, nm_lik_config = likelihood_configs
training_config = get_training_config(**experiment_params)
lr_scheduler_config = get_lr_scheduler_config(**experiment_params)
optimizer_config = get_optimizer_config(**experiment_params)

# Combine the individual configs into the algorithm config used to build the
# model.  TODO: rename the factory (get_... -> create_...).
algorithm_kwargs = {
    "loss_config": loss_config,
    "model_config": model_config,
    "gaussian_lik_config": gaussian_lik_config,
    "nm_config": noise_model_config,
    "nm_lik_config": nm_lik_config,
    "lr_scheduler_config": lr_scheduler_config,
    "optimizer_config": optimizer_config,
}
experiment_config = get_algorithm_config(
    algorithm=experiment_params["algorithm"],
    **algorithm_kwargs,
)

### Visualize configs

In [None]:
# TODO: add code to visualize the configs (to be discussed)

### Initialize the model

In [40]:
# Instantiate the Lightning module from the assembled algorithm config.
lightning_model = VAEModule(algorithm_config=experiment_config)

[GaussianMixtureNoiseModel] min_sigma: 200.0
[GaussianMixtureNoiseModel] min_sigma: 200.0
[MultiChannelNoiseModel] Nmodels count:2
[GaussianLikelihood] PredLVar:pixelwise LowBLVar:-5.0


### Load checkpoint (optional)

It's possible to load a checkpoint to continue training

In [None]:
# Optional step: replace the freshly initialised model with one restored from
# a saved checkpoint, to continue training.
# The step is guarded by a flag so that "Restart Kernel -> Run All" does not
# unconditionally overwrite `lightning_model` (or depend on a checkpoint being
# present). Set the flag to True to resume from a checkpoint.
RESUME_FROM_CHECKPOINT = False

if RESUME_FROM_CHECKPOINT:
    # NOTE(review): the training cell passes "." to `get_callbacks`, while this
    # call looks in "checkpoints" — confirm the two locations agree.
    ckpt = load_checkpoint("checkpoints", best=False)
    lightning_model = VAEModule.load_from_checkpoint(
        ckpt, algorithm_config=experiment_config
    )

### Train the model

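The example is meant to run for only a few epochs (e.g. 5). Note that the cell below trains for `training_config.num_epochs` epochs, so lower `num_epochs` in the experiment parameters for a short run.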

In [41]:
# Callbacks are created by the project helper, which is given the current
# working directory (".").
training_callbacks = get_callbacks(".")

# Epoch budget, numeric precision and gradient clipping all come from
# `training_config`; the accelerator is hard-coded to GPU.
trainer = Trainer(
    accelerator="gpu",
    max_epochs=training_config.num_epochs,
    precision=training_config.precision,
    gradient_clip_val=training_config.gradient_clip_val,
    gradient_clip_algorithm=training_config.gradient_clip_algorithm,
    callbacks=training_callbacks,
    enable_progress_bar=True,
)

# Run the training loop with the train/validation dataloaders built above.
trainer.fit(
    model=lightning_model,
    train_dataloaders=train_dloader,
    val_dataloaders=val_dloader,
)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/localscratch/mamba/envs/splits/lib/python3.9/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/igor.zubarev/projects/microSplit-reproducibility/examples/2D/Pavia_P24 exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name                   | Type                   | Params | Mode 
--------------------------------------------------------------------------
0 | model                  | LadderVAE              | 3.6 M  | train
1 | noise_model            | MultiChannelNoiseModel | 0      | train
2 | noise_model_likelihood | NoiseModelLikelihood   | 0      | train
3 | gaussian_likelihood    | GaussianLikelihood     | 0      | train
--------------------------------------------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M   

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/localscratch/mamba/envs/splits/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


                                                                           

/localscratch/mamba/envs/splits/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Epoch 0: 100%|██████████| 592/592 [02:52<00:00,  3.43it/s, v_num=2]

  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


Epoch 0: 100%|██████████| 592/592 [03:13<00:00,  3.06it/s, v_num=2, val_loss=3.530, val_reconstruction_loss=3.520, val_kl_loss=0.00702, val_psnr=24.90]

Metric val_loss improved. New best score: 3.529


Epoch 0:  11%|█▏        | 68/592 [1:06:30<8:32:34,  0.02it/s, v_num=1]l_loss=3.530, val_reconstruction_loss=3.530, val_kl_loss=0.00517, val_psnr=24.90]
Epoch 3: 100%|██████████| 592/592 [03:06<00:00,  3.18it/s, v_num=2, val_loss=3.520, val_reconstruction_loss=3.520, val_kl_loss=0.00563, val_psnr=25.10]

Metric val_loss improved by 0.008 >= min_delta = 1e-06. New best score: 3.521


Epoch 4: 100%|██████████| 592/592 [03:06<00:00,  3.18it/s, v_num=2, val_loss=3.520, val_reconstruction_loss=3.510, val_kl_loss=0.00476, val_psnr=25.10]

Metric val_loss improved by 0.003 >= min_delta = 1e-06. New best score: 3.518


Epoch 5: 100%|██████████| 592/592 [03:06<00:00,  3.18it/s, v_num=2, val_loss=3.520, val_reconstruction_loss=3.510, val_kl_loss=0.00465, val_psnr=25.10]

Metric val_loss improved by 0.000 >= min_delta = 1e-06. New best score: 3.518


Epoch 6: 100%|██████████| 592/592 [03:06<00:00,  3.18it/s, v_num=2, val_loss=3.520, val_reconstruction_loss=3.510, val_kl_loss=0.00389, val_psnr=25.10]

Metric val_loss improved by 0.001 >= min_delta = 1e-06. New best score: 3.516


Epoch 7: 100%|██████████| 592/592 [03:05<00:00,  3.19it/s, v_num=2, val_loss=3.520, val_reconstruction_loss=3.510, val_kl_loss=0.00357, val_psnr=25.10]

Metric val_loss improved by 0.001 >= min_delta = 1e-06. New best score: 3.515


Epoch 8: 100%|██████████| 592/592 [03:07<00:00,  3.16it/s, v_num=2, val_loss=3.510, val_reconstruction_loss=3.510, val_kl_loss=0.00401, val_psnr=25.10]

Metric val_loss improved by 0.000 >= min_delta = 1e-06. New best score: 3.515


Epoch 9: 100%|██████████| 592/592 [03:12<00:00,  3.08it/s, v_num=2, val_loss=3.510, val_reconstruction_loss=3.510, val_kl_loss=0.00353, val_psnr=25.20]

Metric val_loss improved by 0.004 >= min_delta = 1e-06. New best score: 3.511


Epoch 10: 100%|██████████| 592/592 [03:13<00:00,  3.07it/s, v_num=2, val_loss=3.510, val_reconstruction_loss=3.510, val_kl_loss=0.00358, val_psnr=25.20]

Metric val_loss improved by 0.000 >= min_delta = 1e-06. New best score: 3.510


Epoch 12: 100%|██████████| 592/592 [03:10<00:00,  3.11it/s, v_num=2, val_loss=3.510, val_reconstruction_loss=3.510, val_kl_loss=0.00326, val_psnr=25.20]

Metric val_loss improved by 0.001 >= min_delta = 1e-06. New best score: 3.509


Epoch 14:  22%|██▏       | 129/592 [00:38<02:18,  3.33it/s, v_num=2, val_loss=3.510, val_reconstruction_loss=3.510, val_kl_loss=0.00351, val_psnr=25.20]


Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined

### Training logs

In [44]:
# Display the experiment parameters again; this needs the parameter cell near
# the top of the notebook to have been run in the current kernel.
experiment_params

NameError: name 'experiment_params' is not defined

In [43]:
# Plot the training metrics found under the CSV log folder of this experiment.
# NOTE(review): the `experiment_params` dict displayed earlier in the notebook
# has no 'experiment_name' key — confirm where that key is expected to be set.
# NOTE(review): `version_0` is hard-coded, while the training progress bar
# above reports `v_num=2` — confirm which log version should be plotted.
plot_training_metrics(f"csv_logs/{experiment_params['experiment_name']}/version_0/")

IsADirectoryError: [Errno 21] Is a directory: 'csv_logs'