In [136]:
import os
import pandas as pd
import torch
from torchvision.io import decode_image
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
import kagglehub
from pathlib import Path
from torchvision import tv_tensors
from torchvision.tv_tensors import BoundingBoxFormat




try:
    from lxml import etree
    print("running with lxml.etree")
except ImportError:
    import xml.etree.ElementTree as etree
    print("running with Python's xml.etree.ElementTree")

# Dataset
# https://www.kaggle.com/datasets/karthika95/pedestrian-detection

# Download latest version
#path = kagglehub.dataset_download("karthika95/pedestrian-detection")

print("Path to dataset files:", path)



running with lxml.etree
Path to dataset files: /Users/Dylan/.cache/kagglehub/datasets/karthika95/pedestrian-detection/versions/1


In [128]:
## Dataset

class Label:
    def __init__(self, filename, width, height, x1, y1, x2, y2, label_tag):
        self.filename = filename
        self.width = width
        self.height = height
        self.points = [(x1, y1), (x2, y2)]
        self.label = label_tag

    def __repr__(self):
        p1, p2 = self.points
        return f"Name: {self.filename}\nLabel:{self.label}\nWidth: {self.width}\nHeight: {self.height}\nBnd Box: {p1},{p2}"
        

'''
image: torchvision.tv_tensors.Image

target: a dict containing the following fields
    - boxes, torchvision.tv_tensors.BoundingBoxes of shape [N, 4]: the coordinates of the N bounding boxes in [x0, y0, x1, y1] format, ranging from 0 to W and 0 to H
    - labels, integer torch.Tensor of shape [N]: the label for each bounding box. 0 represents always the background class.
    - image_id, int: an image identifier. It should be unique between all the images in the dataset, and is used during evaluation
    - area, float torch.Tensor of shape [N]: the area of the bounding box. This is used during evaluation with the COCO metric, to separate the metric scores between small, medium and large boxes.
    - iscrowd, uint8 torch.Tensor of shape [N]: instances with iscrowd=True will be ignored during evaluation.

'''





class MyDataset(Dataset):
    # a dataset has to implement these 3 methods
    def __init__(self, annotations_file, img_dir, transform=None, target_transform=None):
        
        self.img_labels = self._get_labels(annotations_file)
        self.img_dir = img_dir
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.img_labels)

    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir, self.img_labels[idx].filename)
        image = decode_image(img_path)
        label = self.img_labels[idx]
        image = image.numpy()
        #label = label.numpy()
        if self.transform:
            image = self.transform(image)
        #if self.target_transform:
            #label = self.target_transform(label)
        return image, label

    def _get_labels(self, annotations_file):
        labels = []
        # iterate over the directory
        annotations_dir = Path(annotations_file)
        # every image can have multiple bounding boxes
        '''
        label format dict(): {
            boxes: torchvision.tv_tensors.BoundingBoxes of shape [N, 4],
            labels: integer torch.Tensor of shape [N],
            image_id: unique image id,
            area: float torch.Tensor of shape [N],
            iscrowd: uint8 torch.Tensor of shape [N] (set to False)
        }
        '''
        for item in annotations_dir.iterdir():
            
            file_path = f"{annotations_dir}/{item.name}"
    
            if Path(file_path).is_file():  
                try:
                    # Parse the XML from the file
                    tree = etree.parse(file_path)
                    # Get the root element
                    root = tree.getroot()
        
                    # get width x height
                    size   = root.find("size")
                    width  = size.find("width").text
                    height = size.find("height").text
        
                    objects = root.findall("object")

                    for obj in objects:
                        # get the bounding box for each object
                        bnd_box = obj.find("bndbox")
                        x_min, y_min, x_max, y_max = float(bnd_box.find("xmin").text), float(bnd_box.find("ymin").text), float(bnd_box.find("xmax").text), float(bnd_box.find("ymax").text)
                        label_tag = obj.find("name").text  
                    # create a label                
                    #label = Label(item.name.strip(".xml") + ".jpg", width, height, x_min, y_min, x_max, y_max, label_tag)
        
                    #labels.append(label)
            
                except etree.XMLSyntaxError as e:
                    print(f"XML parsing error: {e}")
                except IOError as e:
                    print(f"File error: {e}")
            
        return labels


In [145]:
def get_labels(annotations_file):
        labels = []
        label_map = {"person": 0, "person-like": 1}
        # iterate over the directory
        annotations_dir = Path(annotations_file)
        # every image can have multiple bounding boxes
        '''
        label format dict(): {
            boxes: torchvision.tv_tensors.BoundingBoxes of shape [N, 4], 
            labels: integer torch.Tensor of shape [N],
            image_id: unique image id,
            area: float torch.Tensor of shape [N],
            iscrowd: uint8 torch.Tensor of shape [N] (set to False)
        }


        # bbox example
        canvas_size = (512, 512)

        # Bounding box data in XYXY format: [[x1, y1, x2, y2], ...]
        boxes_data = [[17, 16, 344, 495], [0, 10, 0, 10]]
        
        bboxes = tv_tensors.BoundingBoxes(
            boxes_data,
            format=BoundingBoxFormat.XYXY,
            canvas_size=canvas_size
        )
        
        '''
        for item in annotations_dir.iterdir():
            
            file_path = f"{annotations_dir}/{item.name}"
    
            if Path(file_path).is_file():  
                try:
                    # Parse the XML from the file
                    tree = etree.parse(file_path)
                    # Get the root element
                    root = tree.getroot()
        
                    # get width x height
                    size   = root.find("size")
                    width  = size.find("width").text
                    height = size.find("height").text
        
                    objects = root.findall("object")

                    bboxes = []
                    lbls   = []
                    areas  = []

                    for obj in objects:
                        # get the bounding box for each object
                        bnd_box_xml = obj.find("bndbox")
                        x1, y1, x2, y2 = float(bnd_box_xml.find("xmin").text), float(bnd_box_xml.find("ymin").text), float(bnd_box_xml.find("xmax").text), float(bnd_box_xml.find("ymax").text)
                        area = (x2 - x1) * (y2 - y1)
                        areas.append(area)
                        bboxes.append([x1, y1, x2, y2])
                        lbls.append(label_map[obj.find("name").text])
                
                    bboxes = tv_tensors.BoundingBoxes(
                        bboxes,
                        format=BoundingBoxFormat.XYXY,
                        canvas_size=(height, width)
                    )

                    image_label = {
                        "boxes": bboxes,
                        "labels": torch.tensor(lbls),
                        "image_id": int.from_bytes(file_path.encode('utf-8'), 'big'),
                        "area": torch.tensor(areas),
                        "iscrowd": torch.tensor([False for i in range(len(lbls))])
                    }

                    
                    labels.append(image_label)
            
                except etree.XMLSyntaxError as e:
                    print(f"XML parsing error: {e}")
                except IOError as e:
                    print(f"File error: {e}")
            
        return labels

# instantiate val dataset
val_annotations_dir = "/Users/Dylan/Documents/ml/pedestrian_tracking/test"
get_labels(val_annotations_dir)

ValueError: too many dimensions 'str'

In [129]:
# instantiate training dataset
train_annotations_dir = "/Users/Dylan/Documents/ml/pedestrian_tracking/dataset/Train/Annotations"
train_img_dir = "/Users/Dylan/Documents/ml/pedestrian_tracking/dataset/Train/JPEGImages"
training_dataset = MyDataset(train_annotations_dir, train_img_dir, transform=ToTensor(), target_transform=ToTensor())

In [130]:
# instantiate val dataset
val_annotations_dir = "/Users/Dylan/Documents/ml/pedestrian_tracking/dataset/Val/Annotations"
val_img_dir = "/Users/Dylan/Documents/ml/pedestrian_tracking/dataset/Val/JPEGImages"
val_dataset = MyDataset(val_annotations_dir, val_img_dir, transform=ToTensor(), target_transform=ToTensor())

In [135]:
print(training_dataset[0][0].shape)

torch.Size([653, 3, 436])


In [113]:
len(training_dataset)

944

In [114]:
len(val_dataset)

160

In [95]:
from torch import nn
from torch.utils.data import DataLoader
from torchvision import transforms


device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
print(f"Using {device} device")



Using mps device


In [96]:
# Going to start with a basic NN (no Convolutional layers)

class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

In [None]:
model = NeuralNetwork.to(device)

In [150]:
# get all unique labels

def get_all_unique_labels(annotations_file):
    annotations_dir = Path(annotations_file)
    labels = dict()
    for item in annotations_dir.iterdir():
        file_path = f"{annotations_dir}/{item.name}"
        if Path(file_path).is_file():  
            try:
                # Parse the XML from the file
                tree = etree.parse(file_path)
                # Get the root element
                root = tree.getroot()
    
                objects = root.findall("object")
                for obj in objects:
                    lbl = obj.find("name").text
                    lbl_count = labels.get(lbl, 0)
                    labels[lbl] = lbl_count + 1
    
               
    
            except etree.XMLSyntaxError as e:
                print(f"XML parsing error: {e}")
            except IOError as e:
                print(f"File error: {e}")
    
    return labels


train_annotations_dir = "/Users/Dylan/Documents/ml/pedestrian_tracking/dataset/Train/Annotations"

get_all_unique_labels(train_annotations_dir)


{'person-like': 960, 'person': 1106}