In [6]:
import os
# os.environ['CUDA_VISIBLE_DEVICES']='0'

import glob
import json
import datetime
import cv2
import numpy as np 
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
from torchsummary import summary 
from torch.optim.lr_scheduler import ReduceLROnPlateau

from preprocess import DrivableDataset
from models import Unet
from metric import compute_miou

def load_checkpoint(checkpoint_path, device, model, optimizer):
    """Restore model and optimizer state from a checkpoint file.

    The file is read with ``torch.load`` (tensors mapped onto ``device``) and
    must contain the keys ``'model'``, ``'optimizer'`` and ``'epoch'``.

    Args:
        checkpoint_path: Path of the checkpoint file to read.
        device: Passed to ``torch.load`` as ``map_location``.
        model: Object whose ``load_state_dict`` receives ``checkpoint['model']``.
        optimizer: Object whose ``load_state_dict`` receives
            ``checkpoint['optimizer']``.

    Returns:
        Tuple ``(model, optimizer, epoch)``: the same two objects that were
        passed in, updated in place, plus the stored ``'epoch'`` value.
    """
    saved = torch.load(checkpoint_path, map_location=device)
    # Model first, then optimizer — same order as the keys are consumed below.
    for target, key in ((model, 'model'), (optimizer, 'optimizer')):
        target.load_state_dict(saved[key])
    return model, optimizer, saved['epoch']

# --- Run setup -------------------------------------------------------------
# Every run gets its own output directory named after the start timestamp;
# the training cell writes its checkpoint into it.
date_time = datetime.datetime.now()
formatted_datetime = date_time.strftime("%Y-%m-%d_%H-%M-%S")
os.makedirs(formatted_datetime, exist_ok=True)

# Flipped to True by the training cell once a checkpoint has been saved.
is_pretrained_available = False
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(device)

# --- Data ------------------------------------------------------------------
data_dir = "bdd100k/images/100k/"
# glob returns files in arbitrary (filesystem) order; sort so the dataset
# ordering is reproducible between runs and machines.
train_images = sorted(glob.glob(data_dir + "train/*.jpg"))
val_images = sorted(glob.glob(data_dir + "val/*.jpg"))

# Fail early with a clear message. Otherwise an empty train split only errors
# inside the DataLoader's random sampler, and an empty val split only surfaces
# as a ZeroDivisionError after a full training epoch.
if not train_images:
    raise FileNotFoundError(f"No .jpg images found under {data_dir}train/")
if not val_images:
    raise FileNotFoundError(f"No .jpg images found under {data_dir}val/")

trainset = DrivableDataset(train_images)
valset = DrivableDataset(val_images)
trainloader = DataLoader(trainset, batch_size=32, shuffle=True, num_workers=4)
valloader = DataLoader(valset, batch_size=64, shuffle=False, num_workers=4)

# --- Model -----------------------------------------------------------------
model = Unet()

# Replicate the model across all visible GPUs when more than one is present.
if torch.cuda.device_count() > 1:
    print("Using", torch.cuda.device_count(), "GPUs!")
    model = nn.DataParallel(model)

model.to(device)
summary(model, (3, 512, 512))

# --- Optimisation ----------------------------------------------------------
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
# 'min' mode: the LR is multiplied by `factor` once the monitored value has
# not improved for `patience` scheduler steps. The scheduler only acts when
# scheduler.step(metric) is called by the training code.
# `verbose=True` is intentionally not passed: the argument is deprecated in
# current PyTorch; read the LR from optimizer.param_groups instead.
scheduler = ReduceLROnPlateau(optimizer, 'min', patience=3, factor=0.1)

cpu
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 512, 512]           1,792
       BatchNorm2d-2         [-1, 64, 512, 512]             128
              ReLU-3         [-1, 64, 512, 512]               0
            Conv2d-4         [-1, 64, 512, 512]          36,928
       BatchNorm2d-5         [-1, 64, 512, 512]             128
              ReLU-6         [-1, 64, 512, 512]               0
        conv_block-7         [-1, 64, 512, 512]               0
         MaxPool2d-8         [-1, 64, 256, 256]               0
     encoder_block-9  [[-1, 64, 512, 512], [-1, 64, 256, 256]]               0
           Conv2d-10        [-1, 128, 256, 256]          73,856
      BatchNorm2d-11        [-1, 128, 256, 256]             256
             ReLU-12        [-1, 128, 256, 256]               0
           Conv2d-13        [-1, 128, 256, 256]         147,584
      BatchNorm2d-14

In [7]:
# Train/validate for a fixed number of epochs, keeping the checkpoint with the
# lowest mean validation loss and writing a per-epoch JSON log into the run
# directory.
checkpoint_path = os.path.join(formatted_datetime, "best_model.pth")
log_path = os.path.join(formatted_datetime, "log.json")

if is_pretrained_available:
    # NOTE(review): curr_epoch is restored but the loop below still starts at
    # epoch 0 and min_val_loss is reset — confirm whether resuming should skip
    # already-completed epochs.
    model, optimizer, curr_epoch = load_checkpoint(checkpoint_path, device, model, optimizer)

num_epochs = 10
train_losses, val_losses = [], []
epoch_logs = []  # one record per finished epoch; rewritten to log_path every epoch
min_val_loss = np.inf  # np.Inf (capitalised alias) no longer exists in NumPy 2.0
for epoch in range(num_epochs):
    # training loop
    model.train()
    running_loss = 0.0
    train_iou = 0.0

    for images, labels in tqdm(trainloader, total=len(trainloader), desc='Training'):
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # The metric is for monitoring only: detach so that accumulating it can
        # never keep the autograd graph of a batch alive.
        iou = compute_miou(outputs.detach(), labels)
        train_iou += iou

    train_loss = running_loss / len(trainloader)
    mean_train_iou = train_iou / len(trainloader)
    train_losses.append(train_loss)

    # validation loop
    model.eval()
    val_running_loss = 0.0
    val_iou = 0.0

    with torch.no_grad():
        for images, labels in tqdm(valloader, total=len(valloader), desc='Validation'):
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            val_running_loss += loss.item()
            iou = compute_miou(outputs, labels)
            val_iou += iou

    val_loss = val_running_loss / len(valloader)
    mean_val_iou = val_iou / len(valloader)
    val_losses.append(val_loss)

    # ReduceLROnPlateau only lowers the LR when it is fed the monitored metric;
    # without this call the scheduler is a no-op.
    previous_lr = optimizer.param_groups[0]['lr']
    scheduler.step(val_loss)
    current_lr = optimizer.param_groups[0]['lr']
    if current_lr != previous_lr:
        print(f"Learning rate reduced from {previous_lr} to {current_lr}")

    if val_loss < min_val_loss:
        min_val_loss = val_loss
        checkpoint = {"model": model.state_dict(), "optimizer": optimizer.state_dict(), "epoch": epoch}
        torch.save(checkpoint, checkpoint_path)
        print("Saving Checkpoint to", checkpoint_path)
        is_pretrained_available = True

    # Log the same per-batch means that are printed below. float() is applied
    # to the IoU values so a numeric scalar that is not a plain Python float
    # can still be serialised — assumes compute_miou returns a scalar; TODO confirm.
    epoch_logs.append({
        'train_iou': float(mean_train_iou),
        'val_iou': float(mean_val_iou),
        'epoch': epoch,
        'train_loss': train_loss,
        'val_loss': val_loss,
    })
    # Rewrite the whole history each epoch so the file is always valid JSON and
    # earlier epochs are not lost.
    with open(log_path, 'w') as f:
        json.dump(epoch_logs, f)

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss}, Val Loss: {val_loss}")
    print(f"Epoch {epoch+1}/{num_epochs}, Train IoU: {mean_train_iou}, Val IoU: {mean_val_iou}")



Training:   0%|                                                                                                                                                                                                                           | 1/2188 [03:26<125:09:16, 206.02s/it]Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x1656337e0>
Traceback (most recent call last):
  File "/opt/homebrew/lib/python3.11/site-packages/torch/utils/data/dataloader.py", line 1478, in __del__
    self._shutdown_workers()
  File "/opt/homebrew/lib/python3.11/site-packages/torch/utils/data/dataloader.py", line 1442, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/opt/homebrew/Cellar/python@3.11/3.11.5/Frameworks/Python.framework/Versions/3.11/lib/python3.11/multiprocessing/process.py", line 149, in join
    res = self._popen.wait(timeout)
          ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Cellar/python@3.11/3.11.5/Frameworks/Python.framework/Ver

KeyboardInterrupt: 