In [1]:
import os
# Set before torch is imported so the variable is already in the environment
# when CUDA gets initialised. CUDA_LAUNCH_BLOCKING=1 makes kernel launches
# synchronous, so a CUDA error surfaces at the call that caused it.
# NOTE(review): debugging aid; it typically slows training, so consider
# removing it for real runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

import torch
from utils import (
    load_config,
    set_seed
)
from train_continual import continual_training_pipeline


def main():
    """Run the continual-training pipeline configured by ``config.yaml``.

    Steps:
      1. Load the configuration with ``load_config("config.yaml")``.
      2. If ``experiment.reproducibility`` is truthy, call ``set_seed`` with
         ``experiment.seed`` (falling back to 42 when the key is absent).
      3. Select the CUDA device when available, otherwise the CPU.
      4. Resolve the model factory named by ``model.type``.
      5. Call ``continual_training_pipeline(config, model_factory, device)``.

    Raises:
        ValueError: if ``model.type`` is not one of the supported model names.
            (Previously this case fell through and failed later with an
            ``UnboundLocalError`` on ``model_factory``.)
    """
    config = load_config("config.yaml")
    experiment_cfg = config["experiment"]
    if experiment_cfg.get("reproducibility", False):
        set_seed(experiment_cfg.get("seed", 42))

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"[INFO] Using device: {device}")

    model_type = config["model"]["type"]
    # The model modules are imported lazily so that only the selected
    # architecture's module is loaded.
    if model_type == "resnet50":
        from Models.Wafer_resnet_model import create_resnet_model
        model_factory = create_resnet_model
    elif model_type == "simplenn":
        from Models.Wafer_simple_model import create_simple_model
        model_factory = create_simple_model
    else:
        # Fail with an actionable message instead of leaving model_factory unbound.
        raise ValueError(
            f"Unsupported model type {model_type!r} in config['model']['type']; "
            "expected 'resnet50' or 'simplenn'."
        )

    continual_training_pipeline(config, model_factory, device)


# Entry-point guard: runs main() when this code is executed directly.
# NOTE(review): inside a notebook cell __name__ is normally '__main__', so the
# pipeline starts as soon as the cell is run.
if __name__ == '__main__':
    main()

  from .autonotebook import tqdm as notebook_tqdm


[INFO] Using device: cuda

[INFO] Training on Task 0 with classes: [0, 1]


[I 2025-03-15 18:18:47,318] A new study created in memory with name: no-name-245418dd-015d-444a-8402-87133d63f77f


[DEBUG] Creating ResNet50 with lr=3.088271973902033e-05, weight_decay=0.007074911583202907
[DEBUG] Creating ResNet50 with lr=3.088271973902033e-05, weight_decay=0.007074911583202907
[DEBUG] Creating ResNet50 with lr=3.088271973902033e-05, weight_decay=0.007074911583202907


[I 2025-03-15 18:27:31,393] Trial 0 finished with value: 0.026745706890887094 and parameters: {'lr': 3.088271973902033e-05, 'weight_decay': 0.007074911583202907}. Best is trial 0 with value: 0.026745706890887094.


[DEBUG] Creating ResNet50 with lr=0.00013112763125612312, weight_decay=0.004022552330099805
[DEBUG] Creating ResNet50 with lr=0.00013112763125612312, weight_decay=0.004022552330099805
[DEBUG] Creating ResNet50 with lr=0.00013112763125612312, weight_decay=0.004022552330099805


[I 2025-03-15 18:37:21,883] Trial 1 finished with value: 0.0127835709450116 and parameters: {'lr': 0.00013112763125612312, 'weight_decay': 0.004022552330099805}. Best is trial 1 with value: 0.0127835709450116.


[INFO] Best hyperparameters: lr=0.00013112763125612312, weight_decay=0.004022552330099805
[DEBUG] Creating ResNet50 with lr=0.00013112763125612312, weight_decay=0.004022552330099805
Epoch 0: Train Loss=0.0751, Val Loss=0.0979
Epoch 1: Train Loss=0.0063, Val Loss=0.1629
Epoch 2: Train Loss=0.0141, Val Loss=0.1795
Epoch 3: Train Loss=0.0146, Val Loss=0.0950
Epoch 4: Train Loss=0.0137, Val Loss=0.2481
Epoch 5: Train Loss=0.0043, Val Loss=0.1009
Epoch 6: Train Loss=0.0053, Val Loss=0.1116
Epoch 7: Train Loss=0.0208, Val Loss=0.1048
Epoch 8: Train Loss=0.0208, Val Loss=0.1500
Epoch 9: Train Loss=0.0067, Val Loss=0.3039
[INFO] Model checkpoint saved to model_checkpoints\baseline\default\task0\model_default.pth
[Evaluation] Task 0 test set accuracy: 0.8978
[Evaluation] Classification Report for Task 0:
              precision    recall  f1-score   support

           0      0.894     0.999     0.943       832
           1      0.979     0.322     0.485       146

    accuracy                 

[I 2025-03-15 18:43:52,959] A new study created in memory with name: no-name-057d7e7b-8012-4358-8111-44280efc6e6f


[DEBUG] Creating ResNet50 with lr=0.0002955238435626053, weight_decay=6.107669733918394e-05
[DEBUG] Creating ResNet50 with lr=0.0002955238435626053, weight_decay=6.107669733918394e-05
[DEBUG] Creating ResNet50 with lr=0.0002955238435626053, weight_decay=6.107669733918394e-05


[I 2025-03-15 19:15:45,210] Trial 0 finished with value: 0.02761032228502078 and parameters: {'lr': 0.0002955238435626053, 'weight_decay': 6.107669733918394e-05}. Best is trial 0 with value: 0.02761032228502078.


[DEBUG] Creating ResNet50 with lr=2.399676281099289e-05, weight_decay=2.5787867718315216e-05
[DEBUG] Creating ResNet50 with lr=2.399676281099289e-05, weight_decay=2.5787867718315216e-05
[DEBUG] Creating ResNet50 with lr=2.399676281099289e-05, weight_decay=2.5787867718315216e-05


[I 2025-03-15 19:52:21,201] Trial 1 finished with value: 0.030530131620692152 and parameters: {'lr': 2.399676281099289e-05, 'weight_decay': 2.5787867718315216e-05}. Best is trial 0 with value: 0.02761032228502078.


[INFO] Best hyperparameters: lr=0.0002955238435626053, weight_decay=6.107669733918394e-05
Epoch 0: Train Loss=1.1696, Val Loss=1.1882
Epoch 1: Train Loss=0.3609, Val Loss=0.7372
Epoch 2: Train Loss=0.2489, Val Loss=0.5412
Epoch 3: Train Loss=0.1992, Val Loss=0.5082
Epoch 4: Train Loss=0.1628, Val Loss=0.5180
Epoch 5: Train Loss=0.1294, Val Loss=0.5003
Epoch 6: Train Loss=0.1103, Val Loss=0.4122
Epoch 7: Train Loss=0.0981, Val Loss=0.4043
Epoch 8: Train Loss=0.0906, Val Loss=0.4413
Epoch 9: Train Loss=0.0850, Val Loss=0.3705
[INFO] Model checkpoint saved to model_checkpoints\baseline\default\task1\model_default.pth
[Evaluation] Task 1 test set accuracy: 0.8625
[Evaluation] Classification Report for Task 1:
              precision    recall  f1-score   support

           0      0.873     0.944     0.907      2772
           1      0.827     0.663     0.736      1126

    accuracy                          0.862      3898
   macro avg      0.850     0.803     0.821      3898
weighted avg 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [1]:
import sys

# Per the original note, the training script reads its options through
# parse_args(), so give it an explicit command line instead of the notebook
# kernel's own arguments. argv[0] names the script actually being run
# (it previously said "train_continual.py", a copy-paste slip).
saved_argv = sys.argv
sys.argv = ["train_normal.py", "--config", "config.yaml"]
try:
    # train_normal.py must be importable from the working directory.
    import train_normal

    # Run the script's entry point with the simulated command line.
    train_normal.main()
finally:
    # Restore the kernel's original argv so later cells see an untouched sys.argv,
    # even when training is interrupted or fails.
    sys.argv = saved_argv



  from .autonotebook import tqdm as notebook_tqdm


Loaded configuration:
{'experiment': {'reproducibility': True, 'seed': 42, 'num_trials': 2, 'num_epochs': 10, 'final_epochs': 10, 'save_model': True, 'final_model_filename': 'final_model.pth', 'checkpoint_base_dir': 'model_checkpoints', 'tensorboard_log_dir': 'Logs', 'continual_learning': True, 'continual_method': 'baseline', 'ewc_lambda': 100.0, 'task_list': [[0, 1], [2, 3]], 'suggest': {'lr': {'low': '1e-5', 'high': '1e-3', 'log': True}, 'weight_decay': {'low': '1e-6', 'high': '1e-2', 'log': True}}}, 'logging': {'base_log_dir': './Logs'}, 'model': {'type': 'resnet50', 'lr': '1e-4', 'weight_decay': '1e-4'}, 'dataset': {'path': 'D:/Waffer Data/WM811K.pkl'}}
[INFO] Reproducibility enabled. Seed set to 42
[INFO] Using device: cuda
[INFO] Number of classes: 8
[INFO] Classes: ['Center' 'Donut' 'Edge-Loc' 'Edge-Ring' 'Loc' 'Near-full' 'Random'
 'Scratch']
[INFO] Creating Optuna study 'resnet_wafer_1742165514' with DB file: sqlite:///resnet_wafer_v2.db


[I 2025-03-16 23:51:54,983] A new study created in RDB with name: resnet_wafer_1742165514


[INFO] Starting study.optimize with 2 trials...


[OPTUNA] Starting Trial #0 with lr=0.000059, weight_decay=0.000518
[OPTUNA] Trial #0: Starting fold 1/5
[DEBUG] Creating ResNet50 with lr=5.850406800780064e-05, weight_decay=0.0005180819007082417
[INFO] >>> Starting fold 1 training (Trial #0) for 10 epochs...
[Fold 1 Epoch 1/10] Train Loss: 0.2988, Val Loss: 0.1079
[Fold 1 Epoch 2/10] Train Loss: 0.0725, Val Loss: 0.0958
[Fold 1 Epoch 3/10] Train Loss: 0.0366, Val Loss: 0.1276
[Fold 1 Epoch 4/10] Train Loss: 0.0255, Val Loss: 0.1117
[INFO] Early stopping on fold 1 at epoch 4
[INFO] <<< Finished fold 1, best val loss = 0.0958

[OPTUNA] Trial #0: Starting fold 2/5
[DEBUG] Creating ResNet50 with lr=5.850406800780064e-05, weight_decay=0.0005180819007082417
[INFO] >>> Starting fold 2 training (Trial #0) for 10 epochs...
[Fold 2 Epoch 1/10] Train Loss: 0.3066, Val Loss: 0.1060
[Fold 2 Epoch 2/10] Train Loss: 0.0756, Val Loss: 0.1031
[Fold 2 Epoch 3/10] Train Loss: 0.0330, Val Loss: 0.1047
[Fo

[I 2025-03-17 00:29:51,585] Trial 0 finished with value: 0.9098772154754051 and parameters: {'lr': 5.850406800780064e-05, 'weight_decay': 0.0005180819007082417}. Best is trial 0 with value: 0.9098772154754051.


[OPTUNA] Trial #0 done. Fold Accuracies: [0.904204009510847, 0.9232247300491768, 0.9105543547962692, 0.9012749006433296, 0.9101280823774037]. Avg Acc=0.9099

[OPTUNA] Starting Trial #1 with lr=0.000317, weight_decay=0.000003
[OPTUNA] Trial #1: Starting fold 1/5
[DEBUG] Creating ResNet50 with lr=0.00031673411752139316, weight_decay=2.7922343178785083e-06
[INFO] >>> Starting fold 1 training (Trial #1) for 10 epochs...
[Fold 1 Epoch 1/10] Train Loss: 0.2771, Val Loss: 0.2996
[Fold 1 Epoch 2/10] Train Loss: 0.1352, Val Loss: 0.1462
[Fold 1 Epoch 3/10] Train Loss: 0.0919, Val Loss: 0.1382
[Fold 1 Epoch 4/10] Train Loss: 0.0725, Val Loss: 0.1397
[Fold 1 Epoch 5/10] Train Loss: 0.0600, Val Loss: 0.1775
[INFO] Early stopping on fold 1 at epoch 5
[INFO] <<< Finished fold 1, best val loss = 0.1382

[OPTUNA] Trial #1: Starting fold 2/5
[DEBUG] Creating ResNet50 with lr=0.00031673411752139316, weight_decay=2.7922343178785083e-06
[INFO] >>> Starting fold 2 training (Trial #1) for 10 epochs...
[Fold

[I 2025-03-17 01:08:43,639] Trial 1 finished with value: 0.8679034621403648 and parameters: {'lr': 0.00031673411752139316, 'weight_decay': 2.7922343178785083e-06}. Best is trial 0 with value: 0.9098772154754051.


[OPTUNA] Trial #1 done. Fold Accuracies: [0.8617526181795776, 0.8386558398050121, 0.8674732660866928, 0.880325255598856, 0.8913103310316858]. Avg Acc=0.8679
[OPTUNA] Best trial found:
  Trial number: 0
  Avg k-fold acc: 0.9099
    lr: 5.850406800780064e-05
    weight_decay: 0.0005180819007082417

[INFO] Retraining final model on entire training set using best hyperparameters...
[DEBUG] Creating ResNet50 with lr=5.850406800780064e-05, weight_decay=0.0005180819007082417
[Final Train] Epoch 1/10 - Loss: 0.2684, Acc: 0.9092
[Final Train] Epoch 2/10 - Loss: 0.0705, Acc: 0.9774
[Final Train] Epoch 3/10 - Loss: 0.0350, Acc: 0.9886
[Final Train] Epoch 4/10 - Loss: 0.0285, Acc: 0.9912
[Final Train] Epoch 5/10 - Loss: 0.0222, Acc: 0.9934
[Final Train] Epoch 6/10 - Loss: 0.0186, Acc: 0.9942
[Final Train] Epoch 7/10 - Loss: 0.0114, Acc: 0.9969
[Final Train] Epoch 8/10 - Loss: 0.0119, Acc: 0.9970
[Final Train] Epoch 9/10 - Loss: 0.0113, Acc: 0.9965
[Final Train] Epoch 10/10 - Loss: 0.0184, Acc: 0.9

In [None]:
# Debugging toggle: uncomment to make CUDA kernel launches synchronous so that
# CUDA errors are reported at the failing call.
# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

import sys

# Per the original note, the training script reads its options through
# parse_args(), so give it an explicit command line instead of the notebook
# kernel's own arguments.
saved_argv = sys.argv
sys.argv = ["train_continual.py", "--config", "config.yaml"]
try:
    # train_continual.py must be importable from the working directory.
    import train_continual

    # Run the script's entry point with the simulated command line.
    train_continual.main()
finally:
    # Restore the kernel's original argv so later cells see an untouched sys.argv,
    # even when training is interrupted (e.g. KeyboardInterrupt) or fails.
    sys.argv = saved_argv



  from .autonotebook import tqdm as notebook_tqdm


[INFO] Loaded configuration:
dataset:
  path: D:/Waffer Data/WM811K.pkl
experiment:
  checkpoint_base_dir: model_checkpoints
  continual_learning: true
  continual_method: baseline
  ewc_lambda: 100.0
  final_epochs: 5
  final_model_filename: final_model.pth
  num_epochs: 5
  num_trials: 1
  reproducibility: true
  save_model: true
  seed: 42
  suggest:
    lr:
      high: 1e-3
      log: true
      low: 1e-5
    weight_decay:
      high: 1e-2
      log: true
      low: 1e-6
  task_list:
  - - 0
    - 1
  - - 2
    - 3
  - - 4
    - 5
  - - 6
    - 7
  tensorboard_log_dir: Logs
logging:
  base_log_dir: ./Logs
model:
  lr: 1e-4
  type: resnet50
  weight_decay: 1e-4

[INFO] Reproducibility enabled. Seed set to 42
[INFO] Using device: cuda
[INFO] Starting continual learning training pipeline with method: baseline
[INFO] Configuration being used:
dataset:
  path: D:/Waffer Data/WM811K.pkl
experiment:
  checkpoint_base_dir: model_checkpoints
  continual_learning: true
  continual_method: ba

[I 2025-03-19 18:18:58,259] A new study created in memory with name: no-name-e72b7848-3d7a-4a13-806d-44c887d9270e


[DEBUG] Creating ResNet50 with lr=1.8000102294982517e-05, weight_decay=0.004201805379484738
[DEBUG] Creating ResNet50 with lr=1.8000102294982517e-05, weight_decay=0.004201805379484738
[DEBUG] Creating ResNet50 with lr=1.8000102294982517e-05, weight_decay=0.004201805379484738


[I 2025-03-19 18:22:29,523] Trial 0 finished with value: 0.038732780675802915 and parameters: {'lr': 1.8000102294982517e-05, 'weight_decay': 0.004201805379484738}. Best is trial 0 with value: 0.038732780675802915.


[INFO] Best hyperparameters for Task 0: lr=1.8000102294982517e-05, weight_decay=0.004201805379484738
[DEBUG] Creating ResNet50 with lr=1.8000102294982517e-05, weight_decay=0.004201805379484738
[MODEL INFO] Task 0: ResNet has total 23512130 parameters (23512130 trainable).
[MODEL INFO] Task 0: Output layer has 2 units.
[INFO] TensorBoard log directory for Task 0: ./Logs\default\task_0
[INFO] Task 0 Epoch 0: Train Loss=0.2850, Val Loss=0.2544
[INFO] Task 0 Epoch 0: Train Acc=0.8943, Val Acc=0.8947
[INFO] Task 0 Epoch 1: Train Loss=0.0372, Val Loss=0.2281
[INFO] Task 0 Epoch 1: Train Acc=0.9928, Val Acc=0.9018
[INFO] Task 0 Epoch 2: Train Loss=0.0153, Val Loss=0.2656
[INFO] Task 0 Epoch 2: Train Acc=0.9974, Val Acc=0.8978
[INFO] Task 0 Epoch 3: Train Loss=0.0074, Val Loss=0.2735
[INFO] Task 0 Epoch 3: Train Acc=0.9992, Val Acc=0.8967
[INFO] Task 0 Epoch 4: Train Loss=0.0051, Val Loss=0.3708
[INFO] Task 0 Epoch 4: Train Acc=1.0000, Val Acc=0.8865
[INFO] Finished Task 0
[INFO] Model checkpo

[I 2025-03-19 20:20:04,105] A new study created in memory with name: no-name-a3c112a6-3aff-4d2b-9479-9f9a9883587b


[DEBUG] Creating ResNet50 with lr=0.00014438522538815255, weight_decay=0.009409816333024557


[W 2025-03-19 20:20:33,514] Trial 0 failed with parameters: {'lr': 0.00014438522538815255, 'weight_decay': 0.009409816333024557} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "d:\OneDrive\Documents\GitHub\DLProject_1\.venv\Lib\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "d:\OneDrive\Documents\GitHub\DLProject_1\ModelTraining\Wafer-map\train_continual.py", line 218, in objective
    t_loss, _ = train_one_epoch(model, train_loader, criterion, optimizer, device)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\OneDrive\Documents\GitHub\DLProject_1\ModelTraining\Wafer-map\utils.py", line 77, in train_one_epoch
    outputs = model(inputs)
              ^^^^^^^^^^^^^
  File "d:\OneDrive\Documents\GitHub\DLProject_1\.venv\Lib\site-packages\torch\nn\modules\module.py", line 1736, in _wrapped_call_impl
    

KeyboardInterrupt: 