In [None]:
import os
import gc
import cv2
import time
import tqdm
import random
import collections
import numpy as np
import pandas as pd
from PIL import Image
from functools import partial
import matplotlib.pyplot as plt
import tqdm.notebook as tq
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score


import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import lr_scheduler
import torchvision.transforms as transforms
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import TensorDataset, DataLoader, Dataset
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor

from torch.nn import ConvTranspose2d
from torch.nn import Conv2d
from torch.nn import MaxPool2d
from torch.nn import Module
from torch.nn import ModuleList
from torch.nn import ReLU
from torchvision.transforms import CenterCrop
from torch.nn import BCEWithLogitsLoss
from torch.optim import Adam

import threading

import json

from semseg import show_models
from semseg.models import *

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

In [None]:
from whuvid_dataset import WhuvidDataset
from kitti_dataset import KittiDataset

In [None]:
# Working resolution as (rows, cols); 256, 256 is the lower-resolution alternative.
height, width = 512, 512

# Training pipeline: fixed 512x512 resize, random horizontal flip, then a random
# crop (scale 0.2-1.0, aspect ratio 0.5-2) resized to the working resolution.
_train_steps = [
    transforms.Resize((512, 512)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomResizedCrop(size=(height, width), scale=(0.2, 1.0), ratio=(0.5, 2)),
    transforms.ToTensor(),
]
with_augs = transforms.Compose(_train_steps)

# Evaluation pipeline: deterministic resize to the working resolution only.
_eval_steps = [
    transforms.Resize((height, width)),
    transforms.ToTensor(),
]
transform = transforms.Compose(_eval_steps)

In [None]:
# Root directory of the WHUVID dataset. This is an absolute, machine-specific
# path: adjust it when running the notebook elsewhere.
whuvid_base_path = "/home/thiago/Workspace/motion-segmentation/datasets/WHUVID"

In [None]:
# Sequence identifiers used for training and for validation.
_train_sequences = ["01", "02", "17", "19", "20", "23", "24", "25", "30", "31", "32"]
_val_sequences = ["03", "18", "22"]

# Training data goes through the augmenting pipeline.
whuvid_train_dataset = WhuvidDataset(
    whuvid_base_path,
    _train_sequences,
    with_augs,
    segmentation=True,
    flow=True,
)
# Validation data only gets the deterministic resize.
val_dataset = WhuvidDataset(
    whuvid_base_path,
    _val_sequences,
    transform,
    segmentation=True,
    flow=True,
)
# Generic alias used by the rest of the notebook.
train_dataset = whuvid_train_dataset

print(f"Train size: {len(train_dataset)}, Val size: {len(val_dataset)}")

In [None]:

# Number of segmentation classes; also the number of masks get_binary_masks returns.
num_classes = 3


def get_binary_masks(pred):
    """Split a per-class score array into one binary mask per class.

    Args:
        pred: Array whose axis 0 indexes the classes (e.g. ``(C, H, W)``).

    Returns:
        A list of ``num_classes`` ``uint8`` arrays. Entry ``k`` is 1 where
        class ``k`` has the highest score along axis 0 (ties go to the lowest
        class index, as with ``np.argmax``) and 0 elsewhere.
    """
    winning_class = np.argmax(pred, axis=0)
    return [
        (winning_class == class_id).astype(np.uint8)
        for class_id in range(num_classes)
    ]

def to_painted_image(image, label):
    """Tint the pixels of the first two classes of ``label`` onto ``image``.

    Pixels whose arg-max class is 0 receive the ``[0, 0, 255]`` tint and pixels
    of class 1 receive the ``[0, 255, 0]`` tint; all other pixels are copied
    unchanged. With RGB channel order (how the callers in this notebook treat
    the image: matplotlib display and an RGB->BGR conversion before video
    writing) these tints are blue and green respectively.

    Args:
        image: ``(H, W, 3)`` array. The blended result is clipped to [0, 1],
            so the image is presumably on a 0-1 float scale -- confirm.
        label: Per-class array passed to ``get_binary_masks`` (classes on axis 0).

    Returns:
        A copy of ``image`` with the two tints applied.
    """
    class_masks = get_binary_masks(label)

    # The tint colours are on a 0-255 scale, so the tiny weight 0.002 adds
    # 0.002 * 255 = 0.51 to the tinted channel before clipping.
    image_weight = 0.998
    tint_weight = 1.0 - image_weight

    blue_tint = np.zeros_like(image)
    blue_tint[:, :] = [0, 0, 255]
    green_tint = np.zeros_like(image)
    green_tint[:, :] = [0, 255, 0]

    tinted_blue = np.clip(cv2.addWeighted(image, image_weight, blue_tint, tint_weight, 0), 0, 1)
    tinted_green = np.clip(cv2.addWeighted(image, image_weight, green_tint, tint_weight, 0), 0, 1)

    painted = image.copy()
    for class_mask, tinted in ((class_masks[0], tinted_blue), (class_masks[1], tinted_green)):
        selected = class_mask == 1
        painted[selected] = tinted[selected]

    return painted

def print_dataset(dataset, i):
    """Show sample ``i`` of ``dataset``: the annotated image next to its flow.

    Args:
        dataset: Indexable dataset whose items unpack as
            ``(img, flow, imu, mask)`` tensors.
        i: Index of the sample to display.
    """
    img, flow, imu, mask = dataset[i]

    # Undo per-channel normalisation (assumes the dataset normalised the image
    # with these mean/std values -- TODO confirm) and move channels last.
    channel_std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
    channel_mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
    rgb = (img * channel_std + channel_mean).permute(1, 2, 0).numpy()

    annotated = to_painted_image(rgb, mask.numpy())
    flow_vis = flow.permute(1, 2, 0).numpy() * 255

    # Left panel: image with class tints; right panel: flow.
    fig, (ax_image, ax_flow) = plt.subplots(1, 2, figsize=(7.5, 5))
    ax_image.imshow(annotated)
    ax_image.set_title("Image Annotated")
    ax_flow.imshow(flow_vis)
    ax_flow.set_title("Flow")
    plt.show()

def view_masks(dataset, i):
    """Show sample ``i`` with its bounding boxes and its first two mask channels.

    Args:
        dataset: Indexable dataset whose items unpack as
            ``(img, flow, imu, mask, bbox)``; each entry of ``bbox`` holds four
            values read as ``x1, y1, x2, y2``.
        i: Index of the sample to display.
    """
    img, flow, imu, mask, bbox = dataset[i]

    # Undo per-channel normalisation (assumes the dataset normalised the image
    # with these mean/std values -- TODO confirm) and move channels last.
    channel_std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
    channel_mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
    img = (img * channel_std + channel_mean).permute(1, 2, 0).numpy()

    # Draw the boxes on a copy so the de-normalised image itself stays untouched.
    boxed = img.copy()
    for box in bbox:
        x1, y1, x2, y2 = map(int, box)
        cv2.rectangle(boxed, (x1, y1), (x2, y2), (0, 255, 0), 2)

    mask = mask.numpy()

    # Panels: image with boxes, then mask channel 0 ("Em movimento" = moving)
    # and mask channel 1 ("Parado" = stationary) in grayscale.
    panels = (
        (boxed, "Imagem", None),
        (mask[0], "Em movimento", 'gray'),
        (mask[1], "Parado", 'gray'),
    )
    fig, axes = plt.subplots(1, 3, figsize=(10, 5))
    for axis, (picture, title, colormap) in zip(axes, panels):
        axis.imshow(picture, cmap=colormap)
        axis.set_title(title)
    plt.show()

In [None]:
# Preview one randomly chosen training sample.
random_index = random.randrange(len(train_dataset))
print_dataset(train_dataset, random_index)

# Re-derive the working resolution from an actual training image
# (image tensors are channel-first, so shape[1:] is (rows, cols)).
height, width = train_dataset[0][0].shape[1:]

# Preview one randomly chosen validation sample.
random_index = random.randrange(len(val_dataset))
print_dataset(val_dataset, random_index)

In [None]:
batch_size = 10

# Training batches: shuffled, last incomplete batch dropped.
data_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=12,
    drop_last=True,
)

# Validation batches.
# NOTE(review): this loader also shuffles and drops the last incomplete batch,
# so some validation samples are skipped on each pass -- confirm this is intended.
test_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=24,
    drop_last=True,
)

In [None]:
from semseg.models.backbones import ResNet, PoolFormer, ConvNeXt
from semseg.models.heads import UPerHead, LawinHead
from torch.nn import functional as F

class SemSegModel(nn.Module):
    """Two-stream segmentation model: an image backbone and a flow backbone feeding one head.

    The image stream is a ResNet-50 and the flow stream a ResNet-18, both
    initialised from local checkpoint files (``strict=False``, so missing or
    unexpected keys are ignored). Their outputs are combined with ``+`` and
    passed to a ``LawinHead`` that predicts ``num_classes`` channels.
    """

    def __init__(self):
        super().__init__()

        # Checkpoints are read onto the CPU so that building the model also
        # works on machines without CUDA; callers move the whole model to the
        # target device afterwards with ``.to(device)``.
        self.backbone_flow = ResNet('18')
        self.backbone_flow.load_state_dict(
            torch.load('../models/resnet18_a1.pth', map_location='cpu'),
            strict=False,
        )
        self.backbone_img = ResNet('50')
        self.backbone_img.load_state_dict(
            torch.load('/home/thiago/Workspace/motion-segmentation/src/resnet50_a1.pth', map_location='cpu'),
            strict=False,
        )

        # NOTE(review): whether ``+`` concatenates per-stage channel lists or adds
        # numbers depends on the type of ``ResNet.channels`` -- confirm against semseg.
        backbone_channels = self.backbone_img.channels + self.backbone_flow.channels

        print(backbone_channels)
        self.head = LawinHead(backbone_channels, 128, num_classes=num_classes)

    def forward(self, img, flow, imu):
        """Predict per-class scores at the resolution of ``img``.

        Args:
            img: Batched image tensor; its last two dimensions set the output size.
            flow: Batched flow tensor fed to the flow backbone.
            imu: Accepted for interface compatibility; not used by this model.

        Returns:
            The head output, bilinearly resized to the spatial size of ``img``.
        """
        img_x = self.backbone_img(img)
        flow_x = self.backbone_flow(flow)

        # NOTE(review): same ``+`` caveat as in __init__ -- this either concatenates
        # the two streams' feature collections or adds tensors, depending on what
        # the backbones return.
        x = img_x + flow_x

        x = self.head(x)

        # Upsample to the input's own spatial size rather than to notebook-level
        # globals, which other cells reassign.
        x = F.interpolate(x, size=tuple(img.shape[-2:]), mode='bilinear', align_corners=False)
        return x

# Build the architecture once (this also reads the backbone checkpoints) ...
model = SemSegModel()
model = model.to(device)
# ... then replace it with the object stored in the training checkpoint, which is
# expected to be a complete model since ``.to(device)`` is called on it.
model = torch.load("/home/thiago/Workspace/motion-segmentation/src/models/last-512-2.pth")
model = model.to(device)

In [None]:
# Visual sanity check: show training sample 200 (annotated image and flow).
print_dataset(train_dataset, 200)

In [None]:
def iou_score(output, target, threshold=0.5):
    """Intersection-over-union between a binarised prediction and a binarised target.

    The prediction is binarised by comparing its raw values with ``threshold``;
    no sigmoid is applied, so for logits the cut-off is on the logit scale.
    The target is binarised at 0.5.

    Args:
        output: Tensor of prediction scores.
        target: Tensor of ground-truth values, broadcastable with ``output``.
        threshold: Values of ``output`` strictly above this count as positive.

    Returns:
        A 0-dim float tensor. If the target has no positive element the result
        is 1.0 when the prediction is empty too and 0.0 otherwise; in every
        other case it is ``(intersection + eps) / (union + eps)``.
    """
    smooth = 1e-6

    pred_mask = (output > threshold).float()
    target_mask = (target > 0.5).float()

    # With an empty target the ratio is undefined: score by whether the
    # prediction is empty as well.
    if target_mask.sum() == 0:
        return (pred_mask.sum() == 0).float()

    intersection = (pred_mask * target_mask).sum()
    union = pred_mask.sum() + target_mask.sum() - intersection

    return (intersection + smooth) / (union + smooth)

In [None]:
# Single-sequence dataset (sequence "22") for inspecting individual frames,
# using the deterministic (non-augmenting) transform.
seq_3 = "22"
seq_3_dataset = WhuvidDataset(
    whuvid_base_path,
    [seq_3],
    transform,
    segmentation=True,
    flow=True,
    use_gdino=False,
)
print(len(seq_3_dataset))

In [None]:
# Put the model in evaluation mode before running inference.
model.eval()


def get_results(video_dataset, i):
    """Run the model on sample ``i`` and build visualisations of the result.

    Args:
        video_dataset: Indexable dataset whose items unpack as
            ``(image, flow, imu, label)`` tensors.
        i: Index of the sample to process.

    Returns:
        A tuple ``(output_image_pred, output_image, flow, pred)``:
        the image tinted with the predicted classes, the image tinted with the
        annotated classes, the flow as a channel-last array scaled by 255, and
        the raw model output for the sample as a NumPy array (batch
        dimension removed).
    """
    image, flow, imu, label = video_dataset[i]

    # Add the batch dimension and move everything to the inference device.
    image = image.unsqueeze(0).to(device)
    flow = flow.unsqueeze(0).to(device)
    imu = imu.unsqueeze(0).to(device)
    label = label.unsqueeze(0).to(device)

    # Inference only: skip autograd bookkeeping so no graph is kept per frame.
    with torch.no_grad():
        pred = model(image, flow, imu)

    # Undo per-channel normalisation (assumes the dataset normalised the image
    # with these mean/std values -- TODO confirm) and move channels last.
    channel_std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
    channel_mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
    image = image.squeeze(0).cpu().detach()
    image = (image * channel_std + channel_mean).numpy()
    image = np.transpose(image, (1, 2, 0))

    flow = flow.squeeze(0).cpu().detach().numpy()
    flow = np.transpose(flow, (1, 2, 0))
    flow = flow * 255

    label = label.squeeze(0).cpu().detach().numpy()
    pred = pred.squeeze(0).cpu().detach().numpy()

    output_image = to_painted_image(image, label)
    output_image_pred = to_painted_image(image, pred)

    return output_image_pred, output_image, flow, pred
    

def plot_results(dataset, i):
    """Plot the prediction overlay next to the annotation overlay for sample ``i``.

    Both overlays come from ``get_results`` and are resized to 1280x720 with
    nearest-neighbour interpolation before display. The flow and raw scores
    that ``get_results`` also returns are not shown.

    Args:
        dataset: Dataset forwarded to ``get_results``.
        i: Index of the sample to plot.
    """
    output_image_pred, output_image, flow, pred = get_results(dataset, i)

    # Titles are Portuguese for "Prediction" and "Annotation".
    panels = ((output_image_pred, "Predição"), (output_image, "Anotação"))

    fig, axes = plt.subplots(1, 2, figsize=(30, 10))
    for axis, (picture, title) in zip(axes, panels):
        axis.set_axis_off()
        axis.imshow(cv2.resize(picture, (1280, 720), interpolation=cv2.INTER_NEAREST))
        axis.set_title(title)


# Index of the frame of seq_3_dataset to visualise. The commented lines below
# are alternative selections / debug prints kept for quick switching.
i = 765
# i = 440
# print(val_dataset.images[i])
# i = index_by_loss[1400]
# print(f"iou[0]: {results[i][0]}, iou[1]: {results[i][1]}, loss: {losses[i]}")
plot_results(seq_3_dataset, i)

In [None]:
# Export, for one sequence, (a) a video of the prediction overlays and
# (b) one binary mask image per frame.
video_sequence = "18" # 03, 18, 22
video_dataset = WhuvidDataset(whuvid_base_path, [video_sequence], transform, segmentation=True, flow=True, use_gdino=False)
video_path = f"{whuvid_base_path}/{video_sequence}/other_files/results.avi"
pred_path = f"{whuvid_base_path}/{video_sequence}/pred"
os.makedirs(pred_path, exist_ok=True)

img0, _, _, _ = video_dataset[0]
# Image tensors are channel-first, so shape[1:] is (rows, cols), i.e.
# (frame height, frame width). Local names are used so the notebook-level
# ``height``/``width`` are not overwritten.
frame_height, frame_width = img0.shape[1:]
# cv2.VideoWriter expects the frame size as (width, height).
video = cv2.VideoWriter(video_path, cv2.VideoWriter_fourcc(*'XVID'), 30, (frame_width, frame_height))
model.eval()
try:
    for i in tq.tqdm(range(len(video_dataset))):
        img_path = video_dataset.images[i]
        basename = os.path.basename(img_path)
        output_path = os.path.join(pred_path, basename)
        output_image_pred, _, _, pred_output = get_results(video_dataset, i)

        # Overlay: 0-1 float RGB -> 0-255 uint8 BGR for OpenCV. Clip first so
        # values slightly outside [0, 1] cannot wrap around in the uint8 cast.
        output_image_pred = np.clip(output_image_pred * 255, 0, 255).astype(np.uint8)
        output_image_pred = cv2.cvtColor(output_image_pred, cv2.COLOR_RGB2BGR)
        video.write(output_image_pred)

        # Binary mask: 255 where class 0 wins the arg-max, 0 for every other class.
        class_map = np.argmax(pred_output, axis=0)
        pred_output = np.where(class_map == 0, 255, 0).astype(np.uint8)
        pred_output = cv2.dilate(pred_output, np.ones((2, 2), np.uint8), iterations=1)
        pred_output = cv2.resize(pred_output, (1280, 720), interpolation=cv2.INTER_NEAREST)
        # Save the mask under the same file name as the source frame.
        cv2.imwrite(output_path, pred_output)
finally:
    # Always finalise the video file, even if a frame fails.
    video.release()