In [1]:
from jaad_data import JAAD
import torch
from PIL import Image
from torchvision import transforms
from torchvision import models
import matplotlib.pyplot as plt
import network
import pickle
from torch.utils.data import Dataset, DataLoader


In [2]:
JAAD_PATH = '../JAAD'
DEEPLAB_PATH = '../best_deeplabv3plus_resnet101_cityscapes_os16.pth'
SUBSET_PATH = '../subset'
FILENAME = 'masks_results.pkl'

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [4]:
# Load the JAAD dataset
jaad_dt = JAAD(data_path=JAAD_PATH)
#jaad.generate_database()
#jaad_dt.get_data_stats()

data_opts = {
    'fstride': 10,
    'sample_type': 'beh'
}

seq_train = jaad_dt.generate_data_trajectory_sequence('train', **data_opts)  
seq_test = jaad_dt.generate_data_trajectory_sequence('test', **data_opts)  

---------------------------------------------------------
Generating action sequence data
fstride: 10
sample_type: beh
subset: default
height_rng: [0, inf]
squarify_ratio: 0
data_split_type: default
seq_type: intention
min_track_size: 15
random_params: {'ratios': None, 'val_data': True, 'regen_data': False}
kfold_params: {'num_folds': 5, 'fold': 1}
---------------------------------------------------------
Generating database for jaad
jaad database loaded from c:\Users\jacop\Documents\ComputerVision\JAAD\data_cache\jaad_database.pkl
---------------------------------------------------------
Generating intention data
Split: train
Number of pedestrians: 324 
Total number of samples: 83 
---------------------------------------------------------
Generating action sequence data
fstride: 10
sample_type: beh
subset: default
height_rng: [0, inf]
squarify_ratio: 0
data_split_type: default
seq_type: intention
min_track_size: 15
random_params: {'ratios': None, 'val_data': True, 'regen_data': False}

In [5]:
train_transforms = transforms.Compose([
    transforms.Resize((512, 512)),  # Ridimensiona le immagini a 256x256
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [6]:
""" deeplab_model = models.segmentation.deeplabv3_resnet101(pretrained=True).to(device)
deeplab_model.eval()  # Imposta il modello in modalità di valutazione """
#deeplab_model = models.segmentation.deeplabv3_resnet101(pretrained=True)
deeplab_model = network.modeling.__dict__['deeplabv3plus_resnet101'](num_classes=19)
deeplab_model.load_state_dict(torch.load(DEEPLAB_PATH)['model_state'])
deeplab_model.to(device)
deeplab_model.eval()

DeepLabV3(
  (backbone): IntermediateLayerGetter(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (downsample): Se

In [6]:
def get_segmentation_mask(image_path, model, preprocess):
    # Load the image
    input_image = Image.open(image_path).convert("RGB")
    input_tensor = preprocess(input_image).to(device)
    input_batch = input_tensor.unsqueeze(0)  # Create a batch with a single image
    
    # Pass the image through the model
    with torch.no_grad():
        output = model(input_batch)
        
    # Check if output is a tensor or a dictionary
    if isinstance(output, dict):
        output = output['out'][0]
    elif isinstance(output, torch.Tensor):
        output = output[0]
    else:
        raise ValueError(f"Unexpected output type: {type(output)}")
    
    # Convert the output to a mask
    output_predictions = output.argmax(0).cpu()
    return output_predictions

In [7]:
def process_video_frames(seq_train, model, preprocess):
    all_masks = []
    for video_frames in seq_train['image']:
        video_masks = []

        for frame_path in video_frames:
            mask = get_segmentation_mask(frame_path, model, preprocess)
            #visualize_mask(frame_path, mask)
            video_masks.append(mask)
        all_masks.append(video_masks)
    return all_masks

In [9]:
# Visualizza la maschera semantica
def visualize_mask(image_path, mask):
    image = Image.open(image_path).convert("RGB")
    image = image.resize((256, 256))  # Ridimensiona per la visualizzazione
    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.imshow(image)
    plt.title("Original Image")
    plt.subplot(1, 2, 2)
    plt.imshow(mask, cmap='jet')
    plt.title("Semantic Mask")
    plt.show()


In [10]:
all_video_masks = process_video_frames(seq_train, deeplab_model, train_transforms)
seq_train['masks'] = all_video_masks


In [11]:
# Apri il file in modalità scrittura binaria e salva il dizionario
with open(FILENAME, 'wb') as f:
    pickle.dump(seq_train['masks'], f)

In [6]:
#recover data:
with open(FILENAME, 'rb') as f:
    seq_train['masks'] = pickle.load(f)

# Verifica che i risultati siano stati caricati correttamente
print(seq_train['masks'])

[[tensor([[10, 10, 10,  ..., 10, 10, 10],
        [10, 10, 10,  ..., 10, 10, 10],
        [10, 10, 10,  ..., 10, 10, 10],
        ...,
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  0]]), tensor([[10, 10, 10,  ..., 10,  2,  2],
        [10, 10, 10,  ..., 10,  2,  2],
        [10, 10, 10,  ..., 10,  2,  2],
        ...,
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  0]]), tensor([[10, 10, 10,  ..., 10, 10, 10],
        [10, 10, 10,  ..., 10, 10, 10],
        [10, 10, 10,  ..., 10, 10, 10],
        ...,
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  0]]), tensor([[10, 10, 10,  ..., 10, 10, 10],
        [10, 10, 10,  ..., 10, 10, 10],
        [10, 10, 10,  ..., 10, 10, 10],
        ...,
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,

In [12]:
""" for seq in seq_train['image']:
    for image_path in seq:
        if os.path.exists(image_path):
            mask = get_semantic_mask(image_path, model, preprocess)
            mask_image_path = image_path.replace('.jpg', '_mask.png')
            plt.imsave(mask_image_path, mask, cmap='jet') """

" for seq in seq_train['image']:\n    for image_path in seq:\n        if os.path.exists(image_path):\n            mask = get_semantic_mask(image_path, model, preprocess)\n            mask_image_path = image_path.replace('.jpg', '_mask.png')\n            plt.imsave(mask_image_path, mask, cmap='jet') "

In [7]:
class PedestrianIntentModel(torch.nn.Module):
    def __init__(self, vgg19):
        super(PedestrianIntentModel, self).__init__()
        self.vgg19 = vgg19
        self.gru = torch.nn.GRU(input_size=512 + 3, hidden_size=256, num_layers=2, batch_first=True)
        self.fc = torch.nn.Linear(256, 2)  # Output: crossing or not crossing
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, images, bboxes, centers, occlusions):
        batch_size, seq_len, c, h, w = images.size()
        
        # Estrai feature dalle immagini con VGG19
        vgg_features = []
        for i in range(seq_len):
            img = images[:, i]
            vgg_feat_img = self.vgg19.features(img)
            vgg_feat_img = vgg_feat_img.view(batch_size, -1)
            vgg_features.append(vgg_feat_img)
        
        vgg_features = torch.stack(vgg_features, dim=1)
        
        # Combina con i metadati
        metadatas = torch.cat((bboxes, centers, occlusions), dim=-1)
        features = torch.cat((vgg_features, metadatas), dim=-1)
        
        # Passa attraverso la GRU
        gru_out, _ = self.gru(features)
        out = self.sigmoid(self.fc(gru_out[:, -1, :]))
        
        return out


In [9]:
# Carica il modello VGG19 pre-addestrato
vgg19 = models.vgg19(pretrained=True)

# Congela i pesi dei primi strati
for param in vgg19.parameters():
    param.requires_grad = False

# Sostituisci l'ultimo strato della VGG19 per il tuo task specifico
num_features = vgg19.classifier[6].in_features
vgg19.classifier[6] = torch.nn.Linear(num_features, 2)  # Assuming 2 classes for intent

# Abilita il training solo per i nuovi strati aggiunti
for param in vgg19.classifier[6].parameters():
    param.requires_grad = True
# Inizializza il modello con VGG19
model = PedestrianIntentModel(vgg19).to(device)

# Definisci la loss e l'ottimizzatore
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.001)

In [10]:
class JAADDataset(Dataset):
    def __init__(self, seq_data, transform=None):
        self.seq_data = seq_data
        self.transform = transform

    def __len__(self):
        return len(self.seq_data['image'])

    def __getitem__(self, idx):
        # Carica le immagini e le maschere
        img_paths = self.seq_data['image'][idx]
        images = [Image.open(img_path).convert("RGB") for img_path in img_paths]
        mask_paths = self.seq_data['masks'][idx]
        masks = [Image.fromarray(mask.numpy()) for mask in mask_paths]

        if self.transform:
            images = [self.transform(img) for img in images]
            masks = [self.transform(mask) for mask in masks]

        # Carica i metadati
        bboxes = torch.tensor(self.seq_data['bbox'][idx], dtype=torch.float32)
        centers = torch.tensor(self.seq_data['center'][idx], dtype=torch.float32)
        occlusions = torch.tensor(self.seq_data['occlusion'][idx], dtype=torch.float32)
        intents = torch.tensor(self.seq_data['intent'][idx], dtype=torch.long)

        return images, masks, bboxes, centers, occlusions, intents

In [11]:
train_dataset = JAADDataset(seq_train, transform=train_transforms)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True,num_workers=4)

In [12]:
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    for images, masks, bboxes, centers, occlusions, intents in train_loader:
        images = torch.stack(images, dim=1).squeeze(2).to(device)  # Converte la lista di immagini in un tensor
        masks = torch.stack(masks, dim=1).squeeze(2).to(device)  # Converte la lista di maschere in un tensor
        bboxes = bboxes.to(device)
        centers = centers.to(device)
        occlusions = occlusions.to(device)
        intents = intents.to(device)
        
        optimizer.zero_grad()
        outputs = model(images, bboxes, centers, occlusions)
        loss = loss(outputs, intents)
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')