In [None]:
import os
import gc
import cv2
import time
import tqdm
import random
import collections
import numpy as np
import pandas as pd
from PIL import Image
from functools import partial
import matplotlib.pyplot as plt
import tqdm.notebook as tq
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score


import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import lr_scheduler
import torchvision.transforms as transforms
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import TensorDataset, DataLoader, Dataset
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor

from torch.nn import ConvTranspose2d
from torch.nn import Conv2d
from torch.nn import MaxPool2d
from torch.nn import Module
from torch.nn import ModuleList
from torch.nn import ReLU
from torchvision.transforms import CenterCrop
from torch.nn import BCEWithLogitsLoss
from torch.optim import Adam

import threading

import json

from semseg import show_models
from semseg.models import *

In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

In [None]:
# 4x4 matrix laid out like a homogeneous transform (3x4 upper block plus a
# [0, 0, 0, 1] bottom row).
# NOTE(review): the name suggests camera<->IMU extrinsics; the direction of the
# transform and the calibration it came from are not shown here — confirm before use.
# Raw rows, identical to the array literal below:
# [0.008151865817142985, 0.9996605296342409, -0.024746162835626976, 0.022009642363225456]
# [0.9999404077919916, -0.007969450652357784, 0.007461147437525375, -0.0018523410930785084]
# [0.00726140127552204, -0.0248055104298965, -0.9996659235483758, -0.023070561867408388]
# [0.0, 0.0, 0.0, 1.0]
T_cam_imu = np.array([
    [0.008151865817142985, 0.9996605296342409, -0.024746162835626976, 0.022009642363225456],
    [0.9999404077919916, -0.007969450652357784, 0.007461147437525375, -0.0018523410930785084],
    [0.00726140127552204, -0.0248055104298965, -0.9996659235483758, -0.023070561867408388],
    [0.0, 0.0, 0.0, 1.0]
])

In [None]:
from whuvid_dataset import WhuvidDataset
from kitti_dataset import KittiDataset

In [None]:
# Output resolution (rows, cols) of the random crop used for training samples.
height = 512
width = 512

# Training pipeline: fixed 512x512 resize, random horizontal flip, then a random
# crop covering 20-100 % of the area (aspect ratio 0.5-2) resized to (height, width).
with_augs = transforms.Compose([
    transforms.Resize((512, 512)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomResizedCrop(size=(height, width), scale=(0.2, 1.0), ratio=(0.5, 2)),
    transforms.ToTensor(),
])

# Evaluation pipeline: tensor conversion only, with no resizing or augmentation.
transform = transforms.Compose([transforms.ToTensor()])

In [None]:
# Root directory of the WHUVID dataset. The default is the original author's local
# checkout; set the WHUVID_BASE_PATH environment variable to run on another machine.
whuvid_base_path = os.environ.get(
    "WHUVID_BASE_PATH",
    "/home/thiago/Workspace/motion-segmentation/datasets/WHUVID",
)

In [None]:
# WHUVID sequences used to build both the training and the validation dataset.
# NOTE(review): an earlier note here said sequences 33 and 25 were removed because of
# duplicated timestamps in the pose ground truth, yet "25" is still listed — confirm.
# NOTE(review): validation reuses exactly the training sequences (only the transform
# differs), so validation metrics are not measured on held-out data — confirm intent.
whuvid_sequences = ["01", "02", "17", "19", "20", "23", "24", "25", "30", "31", "32"]

# Each dataset receives its own copy of the list so neither can affect the other.
whuvid_train_dataset = WhuvidDataset(whuvid_base_path, list(whuvid_sequences), with_augs, segmentation=True, flow=True)
val_dataset = WhuvidDataset(whuvid_base_path, list(whuvid_sequences), transform, segmentation=True, flow=True)

# Root directory of KITTI-Motion. The default is the original author's local checkout;
# set the KITTI_MOTION_PATH environment variable to run on another machine.
kitti_path = os.environ.get(
    "KITTI_MOTION_PATH",
    "/home/thiago/Workspace/motion-segmentation/datasets/KITTI-Motion",
)
kitti = KittiDataset(kitti_path, with_augs, is_train=True, segmentation=True, flow=True)

# Training draws from the augmented WHUVID sequences plus KITTI-Motion.
train_dataset = torch.utils.data.ConcatDataset([whuvid_train_dataset, kitti])

print(f"Train size: {len(train_dataset)}, Val size: {len(val_dataset)}")

In [None]:

num_classes = 3


def get_binary_masks(pred):
    """Split a per-class score map into one binary mask per class.

    Args:
        pred: Array whose axis 0 indexes the classes.

    Returns:
        A list with ``num_classes`` uint8 arrays; entry ``k`` holds 1 wherever
        class ``k`` has the highest score along axis 0, and 0 elsewhere.
    """
    winners = np.argmax(pred, axis=0)
    return [(winners == class_id).astype(np.uint8) for class_id in range(num_classes)]

def to_painted_image(image, label):
    """Return a copy of ``image`` with the first two classes of ``label`` tinted.

    Pixels whose winning class (argmax over axis 0 of ``label``) is 0 receive a
    blue tint and pixels whose winning class is 1 receive a green tint; all other
    pixels are left untouched.

    Args:
        image: H x W x 3 array. The blend result is clipped to [0, 1], so values
            are presumably floats in that range — TODO confirm against callers.
        label: Per-class score/mask array with the class axis first.

    Returns:
        A new array shaped like ``image``; the input is not modified.
    """
    class_masks = get_binary_masks(label)

    # cv2.addWeighted weights: the image keeps 99.8 % and the overlay adds 0.2 %.
    alpha = 0.998
    beta = 1.0 - alpha

    def _tinted(color):
        # Blend a solid-colour overlay into the image and clamp the result to [0, 1].
        overlay = np.zeros_like(image)
        overlay[:, :] = color
        return np.clip(cv2.addWeighted(image, alpha, overlay, beta, 0), 0, 1)

    # [0, 255, 0] is green in both RGB and BGR order; [0, 0, 255] is blue only when
    # the channels are in RGB order (it would be red in BGR).
    green_tint = _tinted([0, 255, 0])
    blue_tint = _tinted([0, 0, 255])

    painted = image.copy()
    class0_pixels = class_masks[0] == 1
    class1_pixels = class_masks[1] == 1
    painted[class0_pixels] = blue_tint[class0_pixels]
    painted[class1_pixels] = green_tint[class1_pixels]
    return painted

def print_dataset(dataset, i):
    """Plot sample ``i`` of ``dataset``: the annotated image next to its flow.

    The sample must unpack into exactly four items ``(img, flow, imu, mask)``.

    Args:
        dataset: Indexable dataset returning channel-first tensors.
        i: Index of the sample to display.
    """
    img, flow, _imu, mask = dataset[i]

    # Undo a per-channel normalisation using the usual ImageNet statistics.
    # NOTE(review): assumes the dataset normalises its images this way — not visible here.
    std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
    mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
    rgb = (img * std + mean).permute(1, 2, 0).numpy()

    # Channel-last flow image, scaled by 255 for display.
    flow_vis = flow.permute(1, 2, 0).numpy() * 255

    annotated = to_painted_image(rgb, mask.numpy())

    fig, (ax_image, ax_flow) = plt.subplots(1, 2, figsize=(20, 10))
    ax_image.imshow(annotated)
    ax_image.set_title("Image Annotated")
    ax_flow.imshow(flow_vis)
    ax_flow.set_title("Flow")
    plt.show()

def view_masks(dataset, i):
    """Plot sample ``i`` with its bounding boxes next to its first two mask channels.

    The sample must unpack into exactly five items ``(img, flow, imu, mask, bbox)``.

    Args:
        dataset: Indexable dataset that also returns bounding boxes.
        i: Index of the sample to display.
    """
    img, _flow, _imu, mask, bbox = dataset[i]

    # Undo a per-channel normalisation using the usual ImageNet statistics.
    # NOTE(review): assumes the dataset normalises its images this way — not visible here.
    std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
    mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
    rgb = (img * std + mean).permute(1, 2, 0).numpy()

    # Draw every box, as integer (x1, y1, x2, y2) corners, on a copy of the image.
    boxed = rgb.copy()
    for box in bbox:
        x1, y1, x2, y2 = map(int, box)
        cv2.rectangle(boxed, (x1, y1), (x2, y2), (0, 255, 0), 2)

    mask_np = mask.numpy()

    # Three panels: the image with boxes, then mask channels 0 and 1 in grayscale.
    fig, (ax_image, ax_first, ax_second) = plt.subplots(1, 3, figsize=(10, 5))
    ax_image.imshow(boxed)
    ax_image.set_title("Imagem")
    ax_first.imshow(mask_np[0], cmap='gray')
    ax_first.set_title("Em movimento")
    ax_second.imshow(mask_np[1], cmap='gray')
    ax_second.set_title("Parado")
    plt.show()

In [None]:
# Show one randomly chosen sample from the combined training set.
random_index = random.randint(0, len(train_dataset)-1)
print_dataset(train_dataset, random_index)
# Overwrite the module-level height/width with the spatial size of the first
# training image (dims 1 and 2 of the first item of the sample); the Resize in
# view_transform below uses these values.
height, width = train_dataset[0][0].shape[1:]

# Show one randomly chosen sample from the validation set.
random_index = random.randint(0, len(val_dataset)-1)
print_dataset(val_dataset, random_index)

# Show one randomly chosen sample from the KITTI dataset on its own.
random_index = random.randint(0, len(kitti)-1)
print_dataset(kitti, random_index)

# Inspect the masks and bounding boxes of a single WHUVID sequence.
# NOTE(review): view_seq first holds the sequence id string and is then rebound
# to the dataset built from it.
view_seq = "22"
view_transform = transforms.Compose([
    transforms.Resize((height, width)),
    transforms.ToTensor()
])
view_seq = WhuvidDataset(whuvid_base_path, [view_seq], view_transform, segmentation=True, flow=True, use_gdino=False, use_bbox=True)
# Display sample 40 of that sequence.
view_masks(view_seq, 40)

In [None]:
# Samples per batch, shared by both loaders below.
batch_size =  10
# Training loader: shuffled, incomplete final batch dropped.
data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=12, drop_last=True)
# Validation loader.
# NOTE(review): it also shuffles and drops the incomplete final batch, so a few
# validation samples are skipped on every pass — confirm this is intended.
test_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=24, drop_last=True)

In [None]:
from semseg.models.backbones import ResNet, PoolFormer, ConvNeXt
from semseg.models.heads import UPerHead, LawinHead
from torch.nn import functional as F

class SemSegModel(nn.Module):
    """Two-stream segmentation network fed with an RGB image and an optical-flow image.

    A ResNet-50 backbone processes the image and a ResNet-18 backbone processes the
    flow; the features of both streams are handed together to a ``LawinHead`` that
    predicts ``num_classes`` channels.

    Args:
        flow_weights: Path of the checkpoint loaded into the flow backbone.
        img_weights: Path of the checkpoint loaded into the image backbone.
    """

    def __init__(self,
                 flow_weights='../models/resnet18_a1.pth',
                 img_weights='/home/thiago/Workspace/motion-segmentation/src/resnet50_a1.pth'):
        super().__init__()
        # When True, the first forward pass prints the feature/output shapes once.
        self.print_shape = True

        # Both checkpoints are mapped to the CPU so construction also works on hosts
        # without CUDA; the caller moves the finished model to its target device.
        # strict=False tolerates keys that do not match the backbone.
        self.backbone_flow = ResNet('18')
        self.backbone_flow.load_state_dict(torch.load(flow_weights, map_location='cpu'), strict=False)
        self.backbone_img = ResNet('50')
        self.backbone_img.load_state_dict(torch.load(img_weights, map_location='cpu'), strict=False)

        # Channel descriptions of both streams, image stream first.
        # NOTE(review): assumes `.channels` is a sequence so `+` concatenates — confirm.
        backbone_channels = self.backbone_img.channels + self.backbone_flow.channels

        print(backbone_channels)
        self.head = LawinHead(backbone_channels, 128, num_classes=num_classes)

    def forward(self, img, flow, imu):
        """Predict per-class logits at the spatial resolution of ``img``.

        Args:
            img: Image batch; its last two dimensions define the output size.
            flow: Flow batch for the flow backbone.
            imu: Accepted for interface compatibility; not used by the network.

        Returns:
            The head output, bilinearly resized to the height and width of ``img``.
        """
        img_x = self.backbone_img(img)
        flow_x = self.backbone_flow(flow)

        if self.print_shape:
            for i in img_x:
                print(f"Img shape: {i.shape}")
            for i in flow_x:
                print(f"Flow shape: {i.shape}")

        # Combine the feature maps of both streams, image stream first.
        # NOTE(review): assumes both backbones return sequences of the same type — confirm.
        x = img_x + flow_x

        x = self.head(x)

        if self.print_shape:
            print(f"Output shape: {x.shape}")
            self.print_shape = False

        # Resize to the input resolution instead of relying on notebook-level
        # height/width globals, so the module also works outside this notebook.
        x = F.interpolate(x, size=img.shape[-2:], mode='bilinear', align_corners=False)
        return x

# Build the network (this loads both backbone checkpoints) and move it to the selected device.
model = SemSegModel().to(device)

In [None]:
# Visual sanity check of one fixed training sample (index 200).
print_dataset(train_dataset, 200)

In [None]:
num_epochs = 10

# Per-class weights shaped (3, 1, 1), intended for a BCEWithLogitsLoss(pos_weight=...) setup.
# NOTE(review): the focal-loss configuration below does not consume this tensor.
pos_weight = torch.tensor([4, 4, 1]).reshape(3, 1, 1).to(device)

# Focal-loss hyper-parameters; the training loop forwards them to lossFunc.
alpha = 0.75
gamma = 2
lossFunc = torchvision.ops.focal_loss.sigmoid_focal_loss

# AdamW with its default settings; the one-cycle schedule (peak 5e-5) is stepped
# once per training batch, hence steps_per_epoch=len(data_loader).
opt = torch.optim.AdamW(model.parameters())
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    opt,
    max_lr=0.00005,
    steps_per_epoch=len(data_loader),
    epochs=num_epochs,
)

# Number of full batches per epoch, used to average the accumulated losses.
trainSteps = len(train_dataset) // batch_size
testSteps = len(val_dataset) // batch_size

# Per-epoch training history.
H = {key: [] for key in ("train_loss", "test_loss", "train_iou", "test_iou")}

In [None]:
def iou_score(output, target):
    """Intersection-over-union between a thresholded prediction and a binary target.

    Args:
        output: Tensor of raw logits (pre-sigmoid), as produced by the model.
        target: Tensor of ground-truth values; entries above 0.5 count as foreground.

    Returns:
        A 0-dim float tensor. When the target has no foreground, the score is 1.0 if
        the prediction is empty as well and 0.0 otherwise.
    """
    smooth = 1e-6

    # Convert logits to probabilities *before* thresholding. Thresholding the raw
    # logits at 0.5 would correspond to a probability cutoff of about 0.62.
    probs = torch.sigmoid(output.float())
    pred_mask = (probs > 0.5).float()
    target_mask = (target > 0.5).float()

    if target_mask.sum() == 0:
        return (pred_mask.sum() == 0).float()

    intersection = (pred_mask * target_mask).sum()
    union = pred_mask.sum() + target_mask.sum() - intersection

    # The smoothing term keeps the ratio finite when the union is empty.
    return (intersection + smooth) / (union + smooth)

In [None]:
# Train for num_epochs epochs, validating after each one and checkpointing the model.
print("[INFO] training the network...")
startTime = time.time()
# Lowest average validation loss seen so far; decides when best-loss.pth is rewritten.
bestTestLoss = 100000000
train_loss = []
test_loss = []
# Make sure the checkpoint directory exists before the first save.
os.makedirs("models", exist_ok=True)
for e in tq.tqdm(range(0, num_epochs)):
	# set the model in training mode
	model.train()
	# running loss totals and per-class IoU accumulators for this epoch
	totalTrainLoss = 0
	totalTestLoss = 0
	iou_train_acm = list(0 for i in range(num_classes))
	iou_train_count = 0
	iou_test_acm = list(0 for i in range(num_classes))
	iou_test_count = 0

	train_bar = tq.tqdm(data_loader)
	for batch in train_bar:
		opt.zero_grad()
		img, flow, imu, y = batch[0].to(device), batch[1].to(device), batch[2].to(device), batch[3].to(device)

		pred = model(img, flow, imu)

		loss = lossFunc(pred, y, reduction='mean', alpha=alpha, gamma=gamma)

		loss.backward()
		opt.step()
		# the scheduler is stepped once per batch
		scheduler.step()

		totalTrainLoss += loss.item()
		# per-class IoU of this batch, folded into the running mean shown on the bar
		iou_train = [iou_score(pred[:,i], y[:,i]).item() for i in range(num_classes)]
		iou_train_acm = [iou_train_acm[i] + iou_train[i] for i in range(num_classes)]
		iou_train_count += 1
		iou_train_avg = [iou_train_acm[i] / iou_train_count for i in range(num_classes)]

		metrics = {f"iou[{i}]": iou_train_avg[i] for i in range(num_classes)}
		train_bar.set_postfix(metrics)
	# switch off autograd
	with torch.no_grad():
		# set the model in evaluation mode
		model.eval()
		# loop over the validation set
		test_bar = tq.tqdm(test_loader)
		for batch in test_bar:
			img, flow, imu, y = batch[0].to(device), batch[1].to(device), batch[2].to(device), batch[3].to(device)

			pred = model(img, flow, imu)
			totalTestLoss += lossFunc(pred, y, reduction='mean', alpha=alpha, gamma=gamma).item()

			iou_test = [iou_score(pred[:,i], y[:,i]).item() for i in range(num_classes)]
			iou_test_acm = [iou_test_acm[i] + iou_test[i] for i in range(num_classes)]
			iou_test_count += 1
			iou_test_avg = [iou_test_acm[i] / iou_test_count for i in range(num_classes)]
			metrics = {f"iou[{i}]": iou_test_avg[i] for i in range(num_classes)}
			test_bar.set_postfix(metrics)
	avgTrainLoss = totalTrainLoss / trainSteps
	avgTestLoss = totalTestLoss / testSteps

	# Epoch-level IoU: divide by the number of batches actually seen. (Dividing by
	# count + 1 would under-report the score.) max(..., 1) guards an empty loader.
	iou_train_avg = [iou_train_acm[i] / max(iou_train_count, 1) for i in range(num_classes)]
	iou_test_avg = [iou_test_acm[i] / max(iou_test_count, 1) for i in range(num_classes)]
	# update our training history (the IoU history tracks class 0 only)
	H["train_loss"].append(avgTrainLoss)
	H["test_loss"].append(avgTestLoss)
	H["train_iou"].append(iou_train_avg[0])
	H["test_iou"].append(iou_test_avg[0])
	# print the model training and validation information
	print("Train loss: {:.6f}, Test loss: {:.4f}".format(avgTrainLoss, avgTestLoss))

	for i in range(num_classes):
		print(f"IoU[{i}] Train: {iou_train_avg[i]}, IoU[{i}] Test: {iou_test_avg[i]}")
	# Checkpoints: one per epoch, the best by validation loss, and the latest.
	torch.save(model, f"models/motion-seg-{e}.pth")
	if avgTestLoss < bestTestLoss:
		bestTestLoss = avgTestLoss
		print(f"Saving epoch {e} with loss {avgTestLoss}")
		torch.save(model, "models/best-loss.pth")
	torch.save(model, "models/latest.pth")

endTime = time.time()
print("[INFO] total time taken to train the model: {:.2f}s".format(
	endTime - startTime))

In [None]:
# Save the final model object (the whole module, not only its state_dict).
# Assumes the "models" directory already exists; the training cell creates it.
torch.save(model, f"models/last.pth")

In [None]:
# Plot the per-epoch training and validation loss recorded in H.
loss_fig, loss_ax = plt.subplots()
for series in ("train_loss", "test_loss"):
    loss_ax.plot(H[series], label=series)
loss_ax.set_title("Training Loss")
loss_ax.set_xlabel("Epoch #")
loss_ax.set_ylabel("Loss")
loss_ax.legend()
plt.show()