In [21]:
from jaad_data import JAAD
import torch
from PIL import Image
from torchvision import transforms
from torchvision import models
import matplotlib.pyplot as plt
import network
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models, transforms
from torch.utils.data import DataLoader, Dataset

In [None]:
JAAD_PATH = '../JAAD'
DEEPLAB_PATH = '../best_deeplabv3plus_resnet101_cityscapes_os16.pth'
SUBSET_PATH = '../subset'

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

In [None]:
# Load the JAAD dataset
jaad_dt = JAAD(data_path=JAAD_PATH)
#jaad.generate_database()
#jaad_dt.get_data_stats()

data_opts = {
    'fstride': 10,
    'sample_type': 'beh',
    #'height_rng': [10, 50],
}

seq_train = jaad_dt.generate_data_trajectory_sequence('train', **data_opts)  
seq_test = jaad_dt.generate_data_trajectory_sequence('test', **data_opts) 

In [None]:
train_transforms = transforms.Compose([
    transforms.Resize((256, 256)),  # Ridimensiona le immagini a 256x256
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [None]:
""" deeplab_model = models.segmentation.deeplabv3_resnet101(pretrained=True).to(device)
deeplab_model.eval()  # Imposta il modello in modalità di valutazione """
#deeplab_model = models.segmentation.deeplabv3_resnet101(pretrained=True)
deeplab_model = network.modeling.__dict__['deeplabv3plus_resnet101'](num_classes=19)
deeplab_model.load_state_dict(torch.load(DEEPLAB_PATH)['model_state'])
deeplab_model.to(device)
deeplab_model.eval()

In [None]:
# LABEL_NAMES = np.asarray([
#             'road', 'sidewalk', 'building', 'wall', 'fence', 'pole', 'traffic light',
#             'traffic sign', 'vegetation', 'terrain', 'sky', 'person', 'rider', 'car', 'truck',
#             'bus', 'train', 'motorcycle', 'bycycle'])

# FULL_LABEL_MAP = np.arange(len(LABEL_NAMES)).reshape(len(LABEL_NAMES), 1)
# FULL_COLOR_MAP = label_to_color_image(FULL_LABEL_MAP)

In [None]:
def get_segmentation_mask(image_path, model, preprocess):
    # Load the image
    input_image = Image.open(image_path).convert("RGB")
    input_tensor = preprocess(input_image).to(device)
    input_batch = input_tensor.unsqueeze(0)  # Create a batch with a single image
    
    # Pass the image through the model
    with torch.no_grad():
        output = model(input_batch)
        
    # Check if output is a tensor or a dictionary
    if isinstance(output, dict):
        output = output['out'][0]
    elif isinstance(output, torch.Tensor):
        output = output[0]
    else:
        raise ValueError(f"Unexpected output type: {type(output)}")
    
    # Convert the output to a mask
    output_predictions = output.argmax(0).cpu()
    return output_predictions

In [None]:
def process_video_frames(seq_train, model, preprocess):
    all_masks = []
    for video_frames in seq_train['image']:
        video_masks = []
        #sampled_frame_paths = video_frames[::3]

        for frame_path in video_frames:
            mask = get_segmentation_mask(frame_path, model, preprocess)
            #visualize_mask(frame_path, mask)
            video_masks.append(mask)
        all_masks.append(video_masks)
    return all_masks

In [None]:
# Visualizza la maschera semantica
def visualize_mask(image_path, mask):
    image = Image.open(image_path).convert("RGB")
    #image = image.resize((512, 512))  # Ridimensiona per la visualizzazione
    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.imshow(image)
    plt.title("Original Image")
    plt.subplot(1, 2, 2)
    plt.imshow(mask, cmap='jet')
    plt.title("Semantic Mask")
    plt.show()

In [None]:
all_video_masks = process_video_frames(seq_train, deeplab_model, train_transforms)
seq_train['masks'] = all_video_masks


In [None]:
# Nome del file in cui vuoi salvare i risultati
filename = 'masks_results.pkl'

# Apri il file in modalità scrittura binaria e salva il dizionario
with open(filename, 'wb') as f:
    pickle.dump(seq_train['masks'], f)

In [23]:
class PedestrianIntentionModel(torch.nn.Module):
    def __init__(self, num_hidden_units=256, cell_type='gru'):
        super(PedestrianIntentionModel, self).__init__()
        
        # Load the pretrained VGG16 model
        vgg = models.vgg16(pretrained=True)
        self.feature_extractor = torch.nn.Sequential(*list(vgg.features.children()))
        
        # GRU layer
        self.gru = torch.nn.GRU(input_size=2*512*7*7, hidden_size=num_hidden_units, batch_first=True)  # Adjust input size
        
        # Fully connected layer
        self.fc = torch.nn.Linear(num_hidden_units, 1)
        
        # Sigmoid activation
        self.sigmoid = torch.nn.Sigmoid()
    
    def forward(self, images, masks):
        batch_size, seq_len, c, h, w = images.size()
        
        # Extract features using VGG16 for images
        images = images.view(batch_size * seq_len, c, h, w)
        img_features = self.feature_extractor(images)
        
        # Extract features using VGG16 for masks
        masks = masks.view(batch_size * seq_len, c, h, w)
        mask_features = self.feature_extractor(masks)
        
        # Concatenate image features and mask features
        features = torch.cat((img_features, mask_features), dim=1)
        
        # Reshape features for GRU
        features = features.view(batch_size, seq_len, -1)
        
        # Pass through GRU
        gru_out, _ = self.gru(features)
        
        # Use the output of the last time step
        last_hidden_state = gru_out[:, -1, :]
        
        # Fully connected layer
        out = self.fc(last_hidden_state)
        
        # Sigmoid activation
        out = self.sigmoid(out)
        
        return out


In [25]:

class JAADDataset(Dataset):
    def __init__(self,seq_data, transform=None):
        self.seq_data = seq_data
        self.transform = transform

    def __len__(self):
        return len(self.seq_data['image'])

    def __getitem__(self, idx):
        image = Image.open(self.images[idx]).convert('RGB')
        mask = Image.fromarray(self.masks[idx])

        if self.transform:
            image = self.transform(image)
            mask = self.transform(mask)

        label = torch.tensor(self.labels[idx], dtype=torch.float32)
        return image, mask, label

# Example usage:
train_transforms = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.blur(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Assume seq_train contains lists of image paths, mask arrays, and labels
train_dataset = JAADDataset(seq_train['image'], seq_train['masks'], seq_train['intent'], transform=train_transforms)
dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)


In [29]:
# Assuming seq_train['image'] and seq_train['labels'] are lists of file paths and corresponding labels
def process_images_and_masks(seq_train, deeplab_model, transform):
    all_images = []
    all_masks = []
    all_labels = seq_train['intent']

    for video_frames in seq_train['image']:
        video_images = []
        video_masks = []

        for frame_path in video_frames:
            # Load and preprocess image
            image = Image.open(frame_path).convert("RGB")
            image = transform(image)
            video_images.append(image)

            # Get segmentation mask
            mask = get_segmentation_mask(frame_path, deeplab_model, transform)
            video_masks.append(mask)

        all_images.append(video_images)
        all_masks.append(video_masks)
    
    return all_images, all_masks, all_labels

In [42]:
all_images, all_masks, all_labels = process_images_and_masks(seq_train, deeplab_model, train_transforms)

train_dataset = JAADDataset(seq_train['image'], transform=train_transforms)
dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)


KeyboardInterrupt: 

In [37]:
model = PedestrianIntentionModel(num_hidden_units=256).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 1  # Set number of epochs as needed

for epoch in range(num_epochs):
    model.train()
    for images, masks, labels in dataloader:
        images, masks, labels = images.to(device), masks.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(images, masks)
        loss = criterion(outputs.squeeze(), labels)
        loss.backward()
        optimizer.step()
     
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

AttributeError: 'Tensor' object has no attribute '__array_interface__'