In [None]:
import torch
import cv2
import os
os.chdir('/content/drive/MyDrive/Colab Notebooks/Simple_DE') # this path is the path of the current .ipynb
import numpy as np
import shutil
from google.colab.patches import cv2_imshow
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import math
from PIL import Image
import torch.nn as nn
import yaml
import random
from google.colab import files
import sys
import time
from torch.utils.data import random_split
import matplotlib.pyplot as plt
# Pick the compute device: use CUDA (GPU) when PyTorch reports one is available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Load the precomputed depth statistics (path is relative to the notebook directory).
# Shared-drive alternative:
# path = '/content/drive/MyDrive/Colab Notebooks/共用區/Simple_DE/Model/config/depth_analysis.pth'
path = 'Model/config/depth_analysis.pth'

depth_stats = torch.load(path)
# Pull the four stored entries out in one go; DEPTH_MEAN / DEPTH_STD are later
# handed to depth_loader_to_tensor in the training cells.
total_sum, DEPTH_NONZERO, DEPTH_MEAN, DEPTH_STD = (
    depth_stats[key]
    for key in ('total_sum', 'total_nonzero', 'total_mean', 'total_std')
)
del depth_stats  # the loaded dict itself is not needed afterwards

In [None]:
from Model.functions.functions import load_config
# Read the YAML configuration through the project's load_config helper.
# The path is relative to the working directory set by os.chdir at the top of the notebook.
file_path = 'Model/config/config.yml'
# Shared-drive alternative:
# file_path = '/content/drive/MyDrive/Colab Notebooks/共用區/Simple_DE/Model/config/config.yml'
config = load_config(file_path)

In [None]:
# Two-element size tuple: the single configured image_size is used for both entries.
target_size = (config['data']['image_size'],) * 2

In [None]:
from Model.Model1.model1_script import Model

# Build the network from the loaded configuration and move it to the selected device.
model = Model(config).to(device)

In [None]:
# ---- Training hyper-parameters ----
LR = 0.001              # learning rate handed to Adam below
batch_size = 32         # mini-batch size used by every DataLoader in this notebook
train_val_rate = 0.99   # fraction of each loaded dataset used for training; the rest is validation
NO_LARGE_EPOCHS = 10    # number of outer "large epoch" iterations
save_frequency = 5      # weights are checkpointed whenever epoch % save_frequency == 0
VERBOSE = False

# ---- Data locations (shuffled archives on Google Drive) ----
# Un-shuffled counterparts live next to them as .../Data/data_zip and .../Data/name_zip.
name_path = '/content/drive/MyDrive/Colab Notebooks/Simple_DE/Data/name_zip_shuffle'
data_path = '/content/drive/MyDrive/Colab Notebooks/Simple_DE/Data/data_zip_shuffle'

# ---- Optimizer ----
optimizer = torch.optim.Adam(params=model.parameters(), lr=LR)

In [None]:
from Model.Data_Process.data_processing import create_dataset_large_epoch, CustomDataset, image_loader_to_tensor, depth_loader_to_tensor

In [None]:

# Debug run: build ONE dataset up front and train on it over and over, to exercise
# the training / validation / checkpointing pipeline without reloading data.

epoch = 0
# NOTE: these two assignments overwrite the notebook-level data_path / name_path
# (the *_shuffle folders from the hyper-parameter cell). Cells run afterwards that
# read these globals will see the un-shuffled folders unless that cell is re-run.
data_path = '/content/drive/MyDrive/Colab Notebooks/Simple_DE/Data/data_zip'
name_path = '/content/drive/MyDrive/Colab Notebooks/Simple_DE/Data/name_zip'
check_length = len(os.listdir(data_path))
random_list = list(range(check_length))  # identity order, no shuffling in debug mode
now = 15  # fixed index handed to create_dataset_large_epoch for the whole debug run
output_image_path, output_depth_path, output_depth, output_image = create_dataset_large_epoch(random_list, now, data_path, name_path)
custom_dataset = CustomDataset(output_image, output_depth)
train_size = int(train_val_rate * len(custom_dataset))
val_size = len(custom_dataset) - train_size
train_dataset, val_dataset = random_split(custom_dataset, [train_size, val_size])
# NOTE: drop_last=True means a split smaller than batch_size yields no batches at all;
# for the validation split that makes the mean validation loss below nan.
trainloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
validloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
num_timesteps = config['diffusion']['num_diffusion_timesteps']

for large_epoch in range(1, NO_LARGE_EPOCHS + 1):

    # One "small epoch" per iteration. The loop variable `now` is only recorded in the
    # checkpoint here: the dataset built above is reused for every iteration.
    for now in range(check_length):

        epoch += 1
        start_time = time.time()
        mean_epoch_loss = []
        mean_epoch_loss_val = []
        epoch_gradient = {}

        # ---- training pass ----
        for batch in trainloader:
            # One random diffusion timestep per sample (every batch has exactly
            # batch_size samples because of drop_last=True).
            t = torch.randint(0, num_timesteps, (batch_size,)).long().to(device)

            input_img = batch['img'].to(torch.float32).to(device)
            input_img = image_loader_to_tensor(input_img)
            target_depth = batch['depth'].to(torch.float32).to(device)
            target_depth = depth_loader_to_tensor(target_depth, DEPTH_MEAN, DEPTH_STD)

            pred_depth = model(input_img, target_depth, t)

            optimizer.zero_grad()
            loss = torch.nn.functional.mse_loss(target_depth, pred_depth)
            mean_epoch_loss.append(loss.item())
            loss.backward()
            optimizer.step()

            # Accumulate per-parameter gradients over the epoch for later inspection.
            for name, param in model.named_parameters():
                if param.grad is None:
                    # No gradient reached this parameter; flag it under "<name>zero".
                    epoch_gradient[name + 'zero'] = 1
                elif name not in epoch_gradient:
                    epoch_gradient[name] = param.grad.clone()
                else:
                    epoch_gradient[name] += param.grad

        # ---- validation pass ----
        # NOTE(review): the model is never switched with model.eval()/model.train();
        # if it contains dropout or batch-norm layers, confirm this is intended.
        with torch.inference_mode():
            for batch in validloader:
                t = torch.randint(0, num_timesteps, (batch_size,)).long().to(device)
                input_img = batch['img'].to(torch.float32).to(device)
                input_img = image_loader_to_tensor(input_img)
                target_depth = batch['depth'].to(torch.float32).to(device)
                target_depth = depth_loader_to_tensor(target_depth, DEPTH_MEAN, DEPTH_STD)
                pred_depth = model(input_img, target_depth, t)

                val_loss = torch.nn.functional.mse_loss(target_depth, pred_depth)
                mean_epoch_loss_val.append(val_loss.item())

        # Epoch averages, computed once; stored as plain numbers rather than tensors.
        train_loss = np.mean(mean_epoch_loss)
        valid_loss = np.mean(mean_epoch_loss_val)

        # ---- weight checkpoint: every save_frequency epochs and at the very last epoch ----
        if epoch % save_frequency == 0 or epoch == check_length * NO_LARGE_EPOCHS:
            checkpoint = {
                'large_epoch': large_epoch,
                'epoch': epoch,
                'model_state_dict': model.state_dict(),  # parameter values and shapes
                'optimizer_state_dict': optimizer.state_dict(),  # optimizer state (e.g. momentum), not the current gradients
                'valid_loss': valid_loss,
                'loss': train_loss,
                'now': now,
                'random_list': random_list,
                'gradients': epoch_gradient
            }

            source_path = 'weight_{}_{}.pth'.format(large_epoch, epoch)
            torch.save(checkpoint, source_path)
            destination_path = '/content/drive/MyDrive/Colab Notebooks/Simple_DE/Checkpoint/model0_weight'
            # Shared-drive alternative:
            # destination_path = '/content/drive/MyDrive/Colab Notebooks/共用區/Simple_DE/Checkpoint/model1_weight'

            # Copy the local file to Google Drive.
            shutil.copy(source_path, destination_path)

        # ---- elapsed time of this small epoch ----
        end_time = time.time()
        exe_time = end_time - start_time
        hours, remainder = divmod(exe_time, 3600)
        minutes, seconds = divmod(remainder, 60)

        # ---- loss record: written for every small epoch ----
        checkpoint = {
            'large_epoch': large_epoch,
            'epoch': epoch,
            'valid_loss': valid_loss,
            'loss': train_loss,
            'time': exe_time
        }

        source_path = 'loss_{}_{}.pth'.format(large_epoch, epoch)
        torch.save(checkpoint, source_path)
        # Shared-drive alternative:
        # destination_path = '/content/drive/MyDrive/Colab Notebooks/共用區/Simple_DE/Checkpoint/model1_loss'
        destination_path = '/content/drive/MyDrive/Colab Notebooks/Simple_DE/Checkpoint/model0_loss'

        # Copy the local file to Google Drive.
        shutil.copy(source_path, destination_path)

        print('---')
        print(f"Large Epoch: {large_epoch}, Epoch: {epoch} | Train Loss {train_loss} | Val Loss {valid_loss}")
        print("time = {}:{}:{}".format(int(hours), int(minutes), int(seconds)))


In [None]:
# Disabled cell: the triple-quoted string below holds an older "continue training (debug)"
# script. It is only a string literal, so none of it executes; as the last expression of
# the cell it is merely echoed as the cell output.
# NOTE(review): the quoted code is not valid Python as written (the gradient loop after
# optimizer.step() is over-indented) and would need fixing before being re-enabled.
'''
# continue training debug
large_epoch = 1
epoch = 10
# load_path = '/content/drive/MyDrive/Colab Notebooks/共用區/Simple_DE/Checkpoint/model1_weight/weight_{}_{}.pth'.format(large_epoch, epoch)
# data_path = '/content/drive/MyDrive/Colab Notebooks/共用區/Simple_DE/Data/data_zip_shuffle'
load_path = '/content/drive/MyDrive/Colab Notebooks/Simple_DE/Checkpoint/model0_weight/weight_{}_{}.pth'.format(large_epoch, epoch)
data_path = '/content/drive/MyDrive/Colab Notebooks/Simple_DE/Data/data_zip_shuffle'
check_length = len(sorted(os.listdir(data_path)))
checkpoint = torch.load(load_path, map_location=torch.device(device))

model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])



data_path = '/content/drive/MyDrive/Colab Notebooks/Simple_DE/Data/data_zip'
name_path = '/content/drive/MyDrive/Colab Notebooks/Simple_DE/Data/name_zip'
check_length = len(sorted(os.listdir(data_path)))
random_list = []
for idx in range(check_length):
    random_list.append(idx)
now = 15
output_image_path, output_depth_path, output_depth, output_image = create_dataset_large_epoch(random_list, now, data_path, name_path)
custom_dataset = CustomDataset(output_image, output_depth)
train_size = int(train_val_rate * len(custom_dataset))
val_size = len(custom_dataset) - train_size
train_dataset, val_dataset = random_split(custom_dataset, [train_size, val_size])
trainloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last = True)
validloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, drop_last = True)


for now in range(epoch, 51): # 一個小epoch是一個checkpoint檔，紀錄一次

    epoch += 1
    start_time = time.time()
    mean_epoch_loss = []
    mean_epoch_loss_val = []
    epoch_gradient = {}
    for batch in trainloader:
        t = torch.randint(0, config['diffusion']['num_diffusion_timesteps'], (batch_size,)).long().to(device)

        input_img = batch['img'].to(torch.float32).to(device)
        target_depth = batch['depth'].to(torch.float32).to(device)

        input_img = image_loader_to_tensor(input_img)
        target_depth = depth_loader_to_tensor(target_depth, DEPTH_MEAN, DEPTH_STD)

        pred_depth = model(input_img, target_depth, t)

        optimizer.zero_grad()
        loss = torch.nn.functional.mse_loss(target_depth, pred_depth)
        mean_epoch_loss.append(loss.item())
        loss.backward()
        optimizer.step()
        #---gradient---vvv
            for name, param in model.named_parameters():
                if param.grad == None:
                    epoch_gradient[name + 'zero'] = 1
                elif name not in epoch_gradient:
                    epoch_gradient[name] = param.grad.clone()
                else:
                    epoch_gradient[name] += param.grad
            #---gradient---^^^
    with torch.inference_mode():
        for batch in validloader:
            t = torch.randint(0, config['diffusion']['num_diffusion_timesteps'], (batch_size,)).long().to(device)
            input_img = batch['img'].to(torch.float32).to(device)
            target_depth = batch['depth'].to(torch.float32).to(device)
            input_img = image_loader_to_tensor(input_img)
            target_depth = depth_loader_to_tensor(target_depth, DEPTH_MEAN, DEPTH_STD)
            pred_depth = model(input_img, target_depth, t)

            val_loss = torch.nn.functional.mse_loss(target_depth, pred_depth)
            mean_epoch_loss_val.append(val_loss.item())

    if epoch % save_frequency == 0 or epoch == check_length * NO_LARGE_EPOCHS:
        checkpoint = {
            'large_epoch' : large_epoch,
            'epoch': epoch,
            'model_state_dict': model.state_dict(), # model.state_dict()是存下param的的值和形狀
            'optimizer_state_dict': optimizer.state_dict(), # optimizer.state_dict()則是存下優化器的param如momentum等等 不包含當下梯度
            'valid_loss' : np.mean(mean_epoch_loss_val),
            'loss' : np.mean(mean_epoch_loss), # 記得不能存tensor
            'now' : now,
            'random_list' : random_list,
            'gradients' : epoch_gradient
        }

        torch.save(checkpoint, 'weight_{}_{}.pth'.format(large_epoch, epoch))
        source_path = 'weight_{}_{}.pth'.format(large_epoch, epoch)
        destination_path = '/content/drive/MyDrive/Colab Notebooks/Simple_DE/Checkpoint/model1_weight'


        # save them to the google drive
        shutil.copy(source_path, destination_path)

        #---計算時間---vvv
        end_time = time.time()
        exe_time = end_time - start_time
        hours, remainder = divmod(exe_time, 3600)
        minutes, seconds = divmod(remainder, 60)
        #---計算時間---^^^

        #-----以下是存loss的---vvv
        checkpoint = {
        'large_epoch' : large_epoch,
        'epoch': epoch,
        'valid_loss' : np.mean(mean_epoch_loss_val),
        'loss' : np.mean(mean_epoch_loss), # 記得不能存tensor
        'time' : exe_time
        }

        torch.save(checkpoint, 'loss_{}_{}.pth'.format(large_epoch, epoch))
        source_path = 'loss_{}_{}.pth'.format(large_epoch, epoch)
        destination_path = '/content/drive/MyDrive/Colab Notebooks/Simple_DE/Checkpoint/model1_loss'


        # save them to the google drive
        shutil.copy(source_path, destination_path)
        #-----以下是存loss的---^^^

    print('---')
    print(f"Large Epoch: {large_epoch}, Epoch: {epoch} | Train Loss {np.mean(mean_epoch_loss)} | Val Loss {np.mean(mean_epoch_loss_val)}")
    print("time = {}:{}:{}".format(hours, minutes, seconds))

'''


In [None]:

# Train from scratch. Each "large epoch" visits every data chunk once in a freshly
# shuffled order; each chunk is one "small epoch" with its own train/validation split.
# data_path / name_path are taken from the notebook globals (whichever cell assigned
# them last). The un-shuffled alternatives are:
# data_path = '/content/drive/MyDrive/Colab Notebooks/Simple_DE/Data/data_zip'
# name_path = '/content/drive/MyDrive/Colab Notebooks/Simple_DE/Data/name_zip'
epoch = 0
check_length = len(os.listdir(data_path))
num_timesteps = config['diffusion']['num_diffusion_timesteps']

for large_epoch in range(1, NO_LARGE_EPOCHS + 1):

    # New random visiting order of the chunk indices for this large epoch.
    random_list = list(range(check_length))
    random.shuffle(random_list)

    for now in range(check_length):  # one small epoch per chunk; one loss record each

        epoch += 1
        output_image_path, output_depth_path, output_depth, output_image = create_dataset_large_epoch(random_list, now, data_path, name_path)
        custom_dataset = CustomDataset(output_image, output_depth)
        train_size = int(train_val_rate * len(custom_dataset))
        val_size = len(custom_dataset) - train_size
        train_dataset, val_dataset = random_split(custom_dataset, [train_size, val_size])
        # NOTE: drop_last=True means a split smaller than batch_size yields no batches;
        # for the validation split that makes the mean validation loss below nan.
        trainloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
        validloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
        start_time = time.time()
        mean_epoch_loss = []
        mean_epoch_loss_val = []
        epoch_gradient = {}

        # ---- training pass ----
        for batch in trainloader:
            # One random diffusion timestep per sample (every batch has exactly
            # batch_size samples because of drop_last=True).
            t = torch.randint(0, num_timesteps, (batch_size,)).long().to(device)

            input_img = batch['img'].to(torch.float32).to(device)
            input_img = image_loader_to_tensor(input_img)
            target_depth = batch['depth'].to(torch.float32).to(device)
            target_depth = depth_loader_to_tensor(target_depth, DEPTH_MEAN, DEPTH_STD)

            pred_depth = model(input_img, target_depth, t)

            optimizer.zero_grad()
            loss = torch.nn.functional.mse_loss(target_depth, pred_depth)
            mean_epoch_loss.append(loss.item())
            loss.backward()
            optimizer.step()

            # Accumulate per-parameter gradients over the epoch for later inspection.
            for name, param in model.named_parameters():
                if param.grad is None:
                    # No gradient reached this parameter; flag it under "<name>zero".
                    epoch_gradient[name + 'zero'] = 1
                elif name not in epoch_gradient:
                    epoch_gradient[name] = param.grad.clone()
                else:
                    epoch_gradient[name] += param.grad

        # ---- validation pass ----
        # NOTE(review): the model is never switched with model.eval()/model.train();
        # if it contains dropout or batch-norm layers, confirm this is intended.
        with torch.inference_mode():
            for batch in validloader:
                t = torch.randint(0, num_timesteps, (batch_size,)).long().to(device)
                input_img = batch['img'].to(torch.float32).to(device)
                input_img = image_loader_to_tensor(input_img)
                target_depth = batch['depth'].to(torch.float32).to(device)
                target_depth = depth_loader_to_tensor(target_depth, DEPTH_MEAN, DEPTH_STD)
                pred_depth = model(input_img, target_depth, t)

                val_loss = torch.nn.functional.mse_loss(target_depth, pred_depth)
                mean_epoch_loss_val.append(val_loss.item())

        # Epoch averages, computed once; stored as plain numbers rather than tensors.
        train_loss = np.mean(mean_epoch_loss)
        valid_loss = np.mean(mean_epoch_loss_val)

        # ---- weight checkpoint: every save_frequency epochs and at the very last epoch ----
        if epoch % save_frequency == 0 or epoch == check_length * NO_LARGE_EPOCHS:
            checkpoint = {
                'large_epoch': large_epoch,
                'epoch': epoch,
                'model_state_dict': model.state_dict(),  # parameter values and shapes
                'optimizer_state_dict': optimizer.state_dict(),  # optimizer state (e.g. momentum), not the current gradients
                'valid_loss': valid_loss,
                'loss': train_loss,
                'now': now,                  # needed to resume in the middle of a large epoch
                'random_list': random_list,  # chunk order of the current large epoch
                'gradients': epoch_gradient
            }

            source_path = 'weight_{}_{}.pth'.format(large_epoch, epoch)
            torch.save(checkpoint, source_path)
            destination_path = '/content/drive/MyDrive/Colab Notebooks/Simple_DE/Checkpoint/model1_weight'
            # Shared-drive alternative:
            # destination_path = '/content/drive/MyDrive/Colab Notebooks/共用區/Simple_DE/Checkpoint/model1_weight'

            # Copy the local file to Google Drive.
            shutil.copy(source_path, destination_path)

        # ---- elapsed time of this small epoch ----
        end_time = time.time()
        exe_time = end_time - start_time
        hours, remainder = divmod(exe_time, 3600)
        minutes, seconds = divmod(remainder, 60)

        # ---- loss record: written for every small epoch ----
        checkpoint = {
            'large_epoch': large_epoch,
            'epoch': epoch,
            'valid_loss': valid_loss,
            'loss': train_loss,
            'time': exe_time
        }

        source_path = 'loss_{}_{}.pth'.format(large_epoch, epoch)
        torch.save(checkpoint, source_path)
        # Shared-drive alternative:
        # destination_path = '/content/drive/MyDrive/Colab Notebooks/共用區/Simple_DE/Checkpoint/model1_loss'
        destination_path = '/content/drive/MyDrive/Colab Notebooks/Simple_DE/Checkpoint/model1_loss'

        # Copy the local file to Google Drive.
        shutil.copy(source_path, destination_path)

        print('---')
        print(f"Large Epoch: {large_epoch}, Epoch: {epoch} | Train Loss {train_loss} | Val Loss {valid_loss}")
        print("time = {}:{}:{}".format(int(hours), int(minutes), int(seconds)))


In [None]:
# Continue training from a saved weight checkpoint.
# large_epoch / epoch identify the checkpoint file to resume from.
large_epoch = 2
epoch = 23
# Shared-drive alternatives:
# load_path = '/content/drive/MyDrive/Colab Notebooks/共用區/Simple_DE/Checkpoint/model1_weight/weight_{}_{}.pth'.format(large_epoch, epoch)
# data_path = '/content/drive/MyDrive/Colab Notebooks/共用區/Simple_DE/Data/data_zip_shuffle'
load_path = '/content/drive/MyDrive/Colab Notebooks/Simple_DE/Checkpoint/model1_weight/weight_{}_{}.pth'.format(large_epoch, epoch)
data_path = '/content/drive/MyDrive/Colab Notebooks/Simple_DE/Data/data_zip_shuffle'
# Set name_path explicitly so it always matches data_path, instead of relying on
# whichever earlier cell assigned it last (the debug cell points it at name_zip).
name_path = '/content/drive/MyDrive/Colab Notebooks/Simple_DE/Data/name_zip_shuffle'
check_length = len(os.listdir(data_path))
checkpoint = torch.load(load_path, map_location=torch.device(device))

model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# Read the resume position now: `checkpoint` is reused for the dicts saved in the loop.
resume_now = checkpoint['now']
resume_random_list = checkpoint['random_list']
# True when the checkpoint was written after the last chunk of its large epoch,
# in which case training restarts at the beginning of the next large epoch.
large_epoch_finished = (resume_now == check_length - 1)
large_epoch_start = large_epoch + 1 if large_epoch_finished else large_epoch
num_timesteps = config['diffusion']['num_diffusion_timesteps']

for large_epoch in range(large_epoch_start, NO_LARGE_EPOCHS + 1):
    if large_epoch == large_epoch_start and not large_epoch_finished:
        # Resume in the middle of a large epoch: keep its chunk order and
        # continue with the chunk after the one stored in the checkpoint.
        now_start = resume_now + 1
        random_list = resume_random_list
    else:
        # Fresh large epoch: start at the first chunk with a new random order.
        now_start = 0
        random_list = list(range(check_length))
        random.shuffle(random_list)

    for now in range(now_start, check_length):  # one small epoch per chunk; one loss record each
        epoch += 1
        output_image_path, output_depth_path, output_depth, output_image = create_dataset_large_epoch(random_list, now, data_path, name_path)
        custom_dataset = CustomDataset(output_image, output_depth)
        train_size = int(train_val_rate * len(custom_dataset))
        val_size = len(custom_dataset) - train_size
        train_dataset, val_dataset = random_split(custom_dataset, [train_size, val_size])
        # NOTE: drop_last=True means a split smaller than batch_size yields no batches;
        # for the validation split that makes the mean validation loss below nan.
        trainloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
        validloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
        start_time = time.time()
        mean_epoch_loss = []
        mean_epoch_loss_val = []
        epoch_gradient = {}

        # ---- training pass ----
        for batch in trainloader:
            # One random diffusion timestep per sample (every batch has exactly
            # batch_size samples because of drop_last=True).
            t = torch.randint(0, num_timesteps, (batch_size,)).long().to(device)

            input_img = batch['img'].to(torch.float32).to(device)
            target_depth = batch['depth'].to(torch.float32).to(device)
            input_img = image_loader_to_tensor(input_img)
            target_depth = depth_loader_to_tensor(target_depth, DEPTH_MEAN, DEPTH_STD)

            pred_depth = model(input_img, target_depth, t)

            optimizer.zero_grad()
            loss = torch.nn.functional.mse_loss(target_depth, pred_depth)
            mean_epoch_loss.append(loss.item())
            loss.backward()
            optimizer.step()

            # Accumulate per-parameter gradients over the epoch for later inspection.
            for name, param in model.named_parameters():
                if param.grad is None:
                    # No gradient reached this parameter; flag it under "<name>zero".
                    epoch_gradient[name + 'zero'] = 1
                elif name not in epoch_gradient:
                    epoch_gradient[name] = param.grad.clone()
                else:
                    epoch_gradient[name] += param.grad

        # ---- validation pass ----
        # NOTE(review): the model is never switched with model.eval()/model.train();
        # if it contains dropout or batch-norm layers, confirm this is intended.
        with torch.inference_mode():
            for batch in validloader:
                t = torch.randint(0, num_timesteps, (batch_size,)).long().to(device)
                input_img = batch['img'].to(torch.float32).to(device)
                target_depth = batch['depth'].to(torch.float32).to(device)
                input_img = image_loader_to_tensor(input_img)
                target_depth = depth_loader_to_tensor(target_depth, DEPTH_MEAN, DEPTH_STD)
                pred_depth = model(input_img, target_depth, t)

                val_loss = torch.nn.functional.mse_loss(target_depth, pred_depth)
                mean_epoch_loss_val.append(val_loss.item())

        # Epoch averages, computed once; stored as plain numbers rather than tensors.
        train_loss = np.mean(mean_epoch_loss)
        valid_loss = np.mean(mean_epoch_loss_val)

        # ---- weight checkpoint: every save_frequency epochs and at the very last epoch ----
        if epoch % save_frequency == 0 or epoch == check_length * NO_LARGE_EPOCHS:
            checkpoint = {
                'large_epoch': large_epoch,
                'epoch': epoch,
                'model_state_dict': model.state_dict(),  # parameter values and shapes
                'optimizer_state_dict': optimizer.state_dict(),  # optimizer state (e.g. momentum), not the current gradients
                'valid_loss': valid_loss,
                'loss': train_loss,
                'now': now,                  # needed to resume in the middle of a large epoch
                'random_list': random_list,  # chunk order of the current large epoch
                'gradients': epoch_gradient
            }

            source_path = 'weight_{}_{}.pth'.format(large_epoch, epoch)
            torch.save(checkpoint, source_path)
            destination_path = '/content/drive/MyDrive/Colab Notebooks/Simple_DE/Checkpoint/model1_weight'

            # Copy the local file to Google Drive.
            shutil.copy(source_path, destination_path)

        # Timing and the loss record are produced for EVERY small epoch, not only when
        # weights are saved, so the progress print below always has fresh values.
        end_time = time.time()
        exe_time = end_time - start_time
        hours, remainder = divmod(exe_time, 3600)
        minutes, seconds = divmod(remainder, 60)

        # ---- loss record ----
        checkpoint = {
            'large_epoch': large_epoch,
            'epoch': epoch,
            'valid_loss': valid_loss,
            'loss': train_loss,
            'time': exe_time
        }

        source_path = 'loss_{}_{}.pth'.format(large_epoch, epoch)
        torch.save(checkpoint, source_path)
        destination_path = '/content/drive/MyDrive/Colab Notebooks/Simple_DE/Checkpoint/model1_loss'

        # Copy the local file to Google Drive.
        shutil.copy(source_path, destination_path)

        print('---')
        print(f"Large Epoch: {large_epoch}, Epoch: {epoch} | Train Loss {train_loss} | Val Loss {valid_loss}")
        print("time = {}:{}:{}".format(int(hours), int(minutes), int(seconds)))





In [None]:
from google.colab import runtime
# Last cell of the notebook, reached after the training cells above have finished.
# NOTE(review): runtime.unassign() presumably disconnects and releases the Colab VM so it
# stops consuming compute; anything not yet copied to Drive would be lost — confirm
# against the Colab documentation.
runtime.unassign()