In [121]:
import os
import cv2
import xml.etree.ElementTree as ET
from collections import defaultdict
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms

class SimpleVideoDataset(Dataset):
    def __init__(self, video_frames_path, action_txt_path, xml_labels_path):
        """
        Args:
            video_frames_path (str): Path to the folder containing video frames.
            action_txt_path (str): Path to the .txt file containing action labels.
            xml_labels_path (str): Path to the .xml file containing annotations.
        """
        self.video_frames_path = video_frames_path
        self.action_txt_path = action_txt_path
        self.xml_labels_path = xml_labels_path

        # Action mapping
        self.mapping = {
            'take': 0, 'open': 1, 'pour': 2, 'close': 3, 'shake': 4,
            'scoop': 5, 'stir': 6, 'put': 7, 'fold': 8, 'spread': 9, 'background': 10
        }

        # Transform for resizing and normalizing frames
        self.transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((224, 224)),  # Resize to ResNet input size
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # ResNet normalization
        ])

        # Load data
        self.frames = self._load_frames()
        self.actions = self._load_actions()
        self.annotations = self._load_annotations()

    def _load_frames(self):
        """Load all frames from the video frames directory."""
        frame_files = sorted(os.listdir(self.video_frames_path))  # Sort by frame order
        frames = [
            cv2.imread(os.path.join(self.video_frames_path, frame))
            for frame in frame_files
        ]
        return frames

    def _load_actions(self):
        """Load actions from the action .txt file."""
        with open(self.action_txt_path, "r") as f:
            actions = [line.strip() for line in f.readlines()]
        # Map actions to numerical values
        return [self.mapping[action] for action in actions]

    def _load_annotations(self):
        """Parse the XML file and load annotations."""
        tree = ET.parse(self.xml_labels_path)
        root = tree.getroot()

        # Dictionary to combine annotations by frame_id
        combined_annotations = defaultdict(lambda: {
            "frame_id": None,
            "width": None,
            "height": None,
            "boxes": [],
            "keypoints": []
        })

        for image in root.findall("image"):
            frame_id = image.get("id")
            width = int(image.get("width"))
            height = int(image.get("height"))

            # Initialize combined data for this frame_id
            combined_annotations[frame_id]["frame_id"] = frame_id
            combined_annotations[frame_id]["width"] = width
            combined_annotations[frame_id]["height"] = height

            # Extract bounding boxes
            for box in image.findall("box"):
                xtl = float(box.get("xtl"))
                ytl = float(box.get("ytl"))
                xbr = float(box.get("xbr"))
                ybr = float(box.get("ybr"))
                label = box.get("label")
                hand_type = box.find("attribute").text
                combined_annotations[frame_id]["boxes"].append({
                    "xtl": xtl,
                    "ytl": ytl,
                    "xbr": xbr,
                    "ybr": ybr,
                    "label": label,
                    "hand_type": hand_type,
                })

            # Extract keypoints
            for polyline in image.findall("polyline"):
                label = polyline.get("label")
                points = polyline.get("points")
                points = [
                    tuple(map(float, point.split(",")))
                    for point in points.split(";") if point.strip()
                ]
                combined_annotations[frame_id]["keypoints"].append({
                    "label": label,
                    "points": points,
                })

        # Return a list of combined annotations sorted by frame_id
        return [combined_annotations[frame_id] for frame_id in sorted(combined_annotations)]

    def __len__(self):
        """Return the number of frames."""
        return len(self.frames)

    def __getitem__(self, idx):
        """Return data for a specific frame."""
        frame = self.frames[idx]
        action = self.actions[idx] if idx < len(self.actions) else self.mapping['background']
        annotation = self.annotations[idx] if idx < len(self.annotations) else None

        # Transform frame (resize and normalize)
        frame = self.transform(frame)

        # Extract bounding box coordinates
        coordinates = [
            [box['xtl'], box['ytl'], box['xbr'], box['ybr']]
            for box in annotation['boxes']
        ]
        coordinates_array = torch.tensor(coordinates, dtype=torch.float32)

        # Separate keypoints by hand type
        left_hand_keypoints = []
        right_hand_keypoints = []

        for kp in annotation['keypoints']:
            if kp['label'] == 'thumb' or kp['label'].startswith('index') or kp['label'].startswith('middle') or kp['label'].startswith('ring') or kp['label'].startswith('pinkie'):
                if 'left' in kp['label']:
                    left_hand_keypoints.extend(kp['points'])
                elif 'right' in kp['label']:
                    right_hand_keypoints.extend(kp['points'])

        # Convert to NumPy arrays and pad if necessary
        left_hand_array = np.array(left_hand_keypoints) if left_hand_keypoints else np.zeros((21, 2))
        right_hand_array = np.array(right_hand_keypoints) if right_hand_keypoints else np.zeros((21, 2))

        # Ensure both arrays are of size (21, 2)
        if left_hand_array.shape[0] != 21:
            left_hand_array = np.vstack([left_hand_array, np.zeros((21 - left_hand_array.shape[0], 2))])
        if right_hand_array.shape[0] != 21:
            right_hand_array = np.vstack([right_hand_array, np.zeros((21 - right_hand_array.shape[0], 2))])

        # Combine left and right hand keypoints into a single array of size (42, 2)
        keypoints_array = np.vstack([left_hand_array, right_hand_array])
        keypoints_array = torch.tensor(keypoints_array, dtype=torch.float32)

        # Pad bounding boxes if one is missing
        if len(annotation['boxes']) < 2:
            if any(box['hand_type'] == 'left' for box in annotation['boxes']):
                # Pad a zero bounding box for the right hand
                coordinates_array = torch.cat([coordinates_array, torch.zeros((1, 4))])
            elif any(box['hand_type'] == 'right' for box in annotation['boxes']):
                # Pad a zero bounding box for the left hand
                coordinates_array = torch.cat([torch.zeros((1, 4)), coordinates_array])

        return frame, coordinates_array, keypoints_array, action, annotation['frame_id']

# Example Usage
video_frames_path = r"C:\Users\abdul\Desktop\visionrd\AI-Hackathon24\data\gtea_png\gtea_png\png\S1_Cheese_C1"
action_txt_path = r"C:\Users\abdul\Desktop\visionrd\AI-Hackathon24\data\actions-20241207T081235Z-001\actions\S1_Cheese_C1.txt"
xml_labels_path = r"C:\Users\abdul\Desktop\visionrd\AI-Hackathon24\data\xml_labels\S1_Cheese_C1.xml"

# Initialize dataset
dataset = SimpleVideoDataset(video_frames_path, action_txt_path, xml_labels_path)

# Create DataLoader
batch_size = 5
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)



In [None]:
import os
import cv2
import xml.etree.ElementTree as ET
from collections import defaultdict
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

class SimpleVideoDataset(Dataset):
    def __init__(self, video_frames_path, action_txt_path, xml_labels_path, transform=None):
        """
        Args:
            video_frames_path (str): Path to the folder containing video frames.
            action_txt_path (str): Path to the .txt file containing action labels.
            xml_labels_path (str): Path to the .xml file containing annotations.
            transform (callable, optional): Optional transform to be applied
                on a sample (frame).
        """
        self.video_frames_path = video_frames_path
        self.action_txt_path = action_txt_path
        self.xml_labels_path = xml_labels_path
        self.transform = transform

        # Action mapping
        self.mapping = {
            'take': 0, 'open': 1, 'pour': 2, 'close': 3, 'shake': 4,
            'scoop': 5, 'stir': 6, 'put': 7, 'fold': 8, 'spread': 9, 'background': 10
        }

        # Load data
        self.frames = self._load_frames()
        self.actions = self._load_actions()
        self.annotations = self._load_annotations()

    def _load_frames(self):
        """Load all frames from the video frames directory."""
        frame_files = sorted(os.listdir(self.video_frames_path))  # Sort by frame order
        frames = [
            cv2.imread(os.path.join(self.video_frames_path, frame))
            for frame in frame_files
        ]
        return frames

    def _load_actions(self):
        """Load actions from the action .txt file."""
        with open(self.action_txt_path, "r") as f:
            actions = [line.strip() for line in f.readlines()]
        # Map actions to numerical values
        return [self.mapping[action] for action in actions]

    def _load_annotations(self):
        """Parse the XML file and load annotations."""
        tree = ET.parse(self.xml_labels_path)
        root = tree.getroot()

        # Dictionary to combine annotations by frame_id
        combined_annotations = defaultdict(lambda: {
            "frame_id": None,
            "width": None,
            "height": None,
            "boxes": [],
            "keypoints": []
        })

        for image in root.findall("image"):
            frame_id = image.get("id")
            width = int(image.get("width"))
            height = int(image.get("height"))

            # Initialize combined data for this frame_id
            combined_annotations[frame_id]["frame_id"] = frame_id
            combined_annotations[frame_id]["width"] = width
            combined_annotations[frame_id]["height"] = height

            # Extract bounding boxes
            for box in image.findall("box"):
                xtl = float(box.get("xtl"))
                ytl = float(box.get("ytl"))
                xbr = float(box.get("xbr"))
                ybr = float(box.get("ybr"))
                label = box.get("label")
                hand_type = box.find("attribute").text
                combined_annotations[frame_id]["boxes"].append({
                    "xtl": xtl,
                    "ytl": ytl,
                    "xbr": xbr,
                    "ybr": ybr,
                    "label": label,
                    "hand_type": hand_type,
                })

            # Extract keypoints
            for polyline in image.findall("polyline"):
                label = polyline.get("label")
                points = polyline.get("points")
                points = [
                    tuple(map(float, point.split(",")))
                    for point in points.split(";") if point.strip()
                ]
                combined_annotations[frame_id]["keypoints"].append({
                    "label": label,
                    "points": points,
                })

        # Return a list of combined annotations sorted by frame_id
        return [combined_annotations[frame_id] for frame_id in sorted(combined_annotations)]

    def __len__(self):
        """Return the number of frames."""
        return len(self.frames)

    def __getitem__(self, idx):
        """Return data for a specific frame."""
        frame = self.frames[idx]
        action = self.actions[idx] if idx < len(self.actions) else self.mapping['background']
        annotation = self.annotations[idx] if idx < len(self.annotations) else None

        # Apply transforms to frame
        if self.transform:
            frame = self.transform(frame)

        # Extract bounding box coordinates
        coordinates = [
            [box['xtl'], box['ytl'], box['xbr'], box['ybr']]
            for box in annotation['boxes']
        ]
        coordinates_array = torch.tensor(coordinates, dtype=torch.float32)

        # Separate keypoints by hand type
        left_hand_keypoints = []
        right_hand_keypoints = []

        for kp in annotation['keypoints']:
            if kp['label'] == 'thumb' or kp['label'].startswith('index') or kp['label'].startswith('middle') or kp['label'].startswith('ring') or kp['label'].startswith('pinkie'):
                if 'left' in kp['label']:
                    left_hand_keypoints.extend(kp['points'])
                elif 'right' in kp['label']:
                    right_hand_keypoints.extend(kp['points'])

        # Convert to NumPy arrays and pad if necessary
        left_hand_array = np.array(left_hand_keypoints) if left_hand_keypoints else np.zeros((21, 2))
        right_hand_array = np.array(right_hand_keypoints) if right_hand_keypoints else np.zeros((21, 2))

        # Ensure both arrays are of size (21, 2)
        if left_hand_array.shape[0] != 21:
            left_hand_array = np.vstack([left_hand_array, np.zeros((21 - left_hand_array.shape[0], 2))])
        if right_hand_array.shape[0] != 21:
            right_hand_array = np.vstack([right_hand_array, np.zeros((21 - right_hand_array.shape[0], 2))])

        # Combine left and right hand keypoints into a single array of size (42, 2)
        keypoints_array = np.vstack([left_hand_array, right_hand_array])
        keypoints_array = torch.tensor(keypoints_array, dtype=torch.float32)

        # Pad bounding boxes if one is missing
        if len(annotation['boxes']) < 2:
            if any(box['hand_type'] == 'left' for box in annotation['boxes']):
                # Pad a zero bounding box for the right hand
                coordinates_array = torch.cat([coordinates_array, torch.zeros((1, 4))])
            elif any(box['hand_type'] == 'right' for box in annotation['boxes']):
                # Pad a zero bounding box for the left hand
                coordinates_array = torch.cat([torch.zeros((1, 4)), coordinates_array])

        return frame, coordinates_array, keypoints_array, action, annotation['frame_id']


# Define Transforms
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((404, 720)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Initialize Dataset and DataLoader
dataset = SimpleVideoDataset(video_frames_path, action_txt_path, xml_labels_path, transform=transform)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)


In [144]:
# Iterate through DataLoader
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

for batch in dataloader:
    frames, coordinates, keypoints, actions, frame_ids = batch
    print("Frames Shape:", frames.shape)
    print("Coordinates Shape:", coordinates.shape)
    print("Keypoints Shape:", keypoints.shape)
    print("Actions:", actions)
    print("Frame IDs:", frame_ids)
    break
keypoints

Frames Shape: torch.Size([1, 3, 404, 720])
Coordinates Shape: torch.Size([1, 2, 4])
Keypoints Shape: torch.Size([1, 42, 2])
Actions: tensor([7])
Frame IDs: ('frame_000291',)


tensor([[[0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.]]])

tensor([[[0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.]]])

In [48]:
# Iterate through the DataLoader
for batch in dataloader:
    frames, coordinates, keypoints, actions, frame_ids = batch
    print("Frames:", frames.shape)
    print("Coordinates:", coordinates.shape)
    print("Keypoints:", keypoints.shape)
    print("Actions:", actions)
    print("Frame IDs:", frame_ids)
    break


Frames: torch.Size([5, 3, 404, 720])
Coordinates: torch.Size([5, 2, 4])
Keypoints: torch.Size([5, 42, 2])
Actions: tensor([ 0,  7,  7, 10, 10])
Frame IDs: ('frame_000164', 'frame_000749', 'frame_000502', 'frame_000083', 'frame_000891')


In [None]:
mapping= {0: 'take',
1: 'open',
2: 'pour',
3: 'close',
4: 'shake',
5: 'scoop',
6: 'stir',
7: 'put',
8: 'fold',
9: 'spread',
10: 'background'
}

In [None]:
import os
import cv2
import xml.etree.ElementTree as ET
from collections import defaultdict
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader

class CustomNetwork(nn.Module):
    def __init__(self, num_classes=11, image_height=404, image_width=720):
        super(CustomNetwork, self).__init__()
        self.image_height = image_height
        self.image_width = image_width

        # Transform bounding boxes and keypoints
        self.bbox_fc = nn.Linear(4 * 2, image_width)  # Assume 2 boxes
        self.kp_fc = nn.Linear(42 * 2, image_width)   # 42 keypoints

        # Combine features into 2D
        self.combine_fc = nn.Linear(image_width * 2, image_height * image_width)

        # ResNet backbone
        self.resnet = models.resnet18(pretrained=True)
        self.resnet.conv1 = nn.Conv2d(4, 64, kernel_size=7, stride=2, padding=3, bias=False)  # Adjust input channels to 4
        self.resnet.fc = nn.Linear(512, num_classes)

    def forward(self, frame, boxes, keypoints):
        # Transform bounding boxes and keypoints
        bbox_feat = self.bbox_fc(boxes).unsqueeze(1)
        kp_feat = self.kp_fc(keypoints).unsqueeze(1)

        # Combine and reshape into 2D
        combined_feat = torch.cat([bbox_feat, kp_feat], dim=1)
        combined_feat = self.combine_fc(combined_feat).view(-1, 1, self.image_height, self.image_width)

        # Concatenate with image
        frame = torch.cat([frame, combined_feat], dim=1)

        # ResNet classification
        return self.resnet(frame)


KeyError: 'background'

In [56]:
frame.shape

(404, 720, 3)

In [54]:
coordinates.view(batch_size,  8)

tensor([[  0.,   0.,   0.,   0., 349., 198., 418., 297.],
        [  0.,   0.,   0.,   0., 379.,  97., 570., 187.],
        [148., 191., 304., 325.,   0.,   0.,   0.,   0.],
        [204., 311., 300., 384., 340., 260., 459., 344.],
        [239., 200., 334., 315., 358., 189., 453., 317.]])

In [52]:
len(coordinates)

5

In [55]:
coordinates

tensor([[[  0.,   0.,   0.,   0.],
         [349., 198., 418., 297.]],

        [[  0.,   0.,   0.,   0.],
         [379.,  97., 570., 187.]],

        [[148., 191., 304., 325.],
         [  0.,   0.,   0.,   0.]],

        [[204., 311., 300., 384.],
         [340., 260., 459., 344.]],

        [[239., 200., 334., 315.],
         [358., 189., 453., 317.]]])

In [None]:
import os
import cv2
import xml.etree.ElementTree as ET
from collections import defaultdict
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms

# Dataset Class
class SimpleVideoDataset(Dataset):
    def __init__(self, video_frames_path, action_txt_path, xml_labels_path):
        """
        Args:
            video_frames_path (str): Path to the folder containing video frames.
            action_txt_path (str): Path to the .txt file containing action labels.
            xml_labels_path (str): Path to the .xml file containing annotations.
        """
        self.video_frames_path = video_frames_path
        self.action_txt_path = action_txt_path
        self.xml_labels_path = xml_labels_path

        # Action mapping
        self.mapping = {
            'take': 0, 'open': 1, 'pour': 2, 'close': 3, 'shake': 4,
            'scoop': 5, 'stir': 6, 'put': 7, 'fold': 8, 'spread': 9, 'background': 10
        }

        # Transform for resizing and normalizing frames
        self.transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((224, 224)),  # Resize to ResNet input size
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # ResNet normalization
        ])

        # Load data
        self.frames = self._load_frames()
        self.actions = self._load_actions()
        self.annotations = self._load_annotations()

    def _load_frames(self):
        """Load all frames from the video frames directory."""
        frame_files = sorted(os.listdir(self.video_frames_path))  # Sort by frame order
        frames = [
            cv2.imread(os.path.join(self.video_frames_path, frame))
            for frame in frame_files
        ]
        return frames

    def _load_actions(self):
        """Load actions from the action .txt file."""
        with open(self.action_txt_path, "r") as f:
            actions = [line.strip() for line in f.readlines()]
        # Map actions to numerical values
        return [self.mapping[action] for action in actions]

    def _load_annotations(self):
        """Parse the XML file and load annotations."""
        tree = ET.parse(self.xml_labels_path)
        root = tree.getroot()

        # Dictionary to combine annotations by frame_id
        combined_annotations = defaultdict(lambda: {
            "frame_id": None,
            "width": None,
            "height": None,
            "boxes": [],
            "keypoints": []
        })

        for image in root.findall("image"):
            frame_id = image.get("id")
            width = int(image.get("width"))
            height = int(image.get("height"))

            # Initialize combined data for this frame_id
            combined_annotations[frame_id]["frame_id"] = frame_id
            combined_annotations[frame_id]["width"] = width
            combined_annotations[frame_id]["height"] = height

            # Extract bounding boxes
            for box in image.findall("box"):
                xtl = float(box.get("xtl"))
                ytl = float(box.get("ytl"))
                xbr = float(box.get("xbr"))
                ybr = float(box.get("ybr"))
                label = box.get("label")
                hand_type = box.find("attribute").text
                combined_annotations[frame_id]["boxes"].append({
                    "xtl": xtl,
                    "ytl": ytl,
                    "xbr": xbr,
                    "ybr": ybr,
                    "label": label,
                    "hand_type": hand_type,
                })

            # Extract keypoints
            for polyline in image.findall("polyline"):
                label = polyline.get("label")
                points = polyline.get("points")
                points = [
                    tuple(map(float, point.split(",")))
                    for point in points.split(";") if point.strip()
                ]
                combined_annotations[frame_id]["keypoints"].append({
                    "label": label,
                    "points": points,
                })

        # Return a list of combined annotations sorted by frame_id
        return [combined_annotations[frame_id] for frame_id in sorted(combined_annotations)]

    def __len__(self):
        """Return the number of frames."""
        return len(self.frames)

    def __getitem__(self, idx):
        """Return data for a specific frame."""
        frame = self.frames[idx]
        action = self.actions[idx] if idx < len(self.actions) else self.mapping['background']
        annotation = self.annotations[idx] if idx < len(self.annotations) else None

        # Transform frame (resize and normalize)
        frame = self.transform(frame)

        # Extract bounding box coordinates
        coordinates = [
            [box['xtl'], box['ytl'], box['xbr'], box['ybr']]
            for box in annotation['boxes']
        ]
        coordinates_array = torch.tensor(coordinates, dtype=torch.float32)

        # Separate keypoints by hand type
        left_hand_keypoints = []
        right_hand_keypoints = []

        for kp in annotation['keypoints']:
            if kp['label'] == 'thumb' or kp['label'].startswith('index') or kp['label'].startswith('middle') or kp['label'].startswith('ring') or kp['label'].startswith('pinkie'):
                if 'left' in kp['label']:
                    left_hand_keypoints.extend(kp['points'])
                elif 'right' in kp['label']:
                    right_hand_keypoints.extend(kp['points'])

        # Convert to NumPy arrays and pad if necessary
        left_hand_array = np.array(left_hand_keypoints) if left_hand_keypoints else np.zeros((21, 2))
        right_hand_array = np.array(right_hand_keypoints) if right_hand_keypoints else np.zeros((21, 2))

        # Ensure both arrays are of size (21, 2)
        if left_hand_array.shape[0] != 21:
            left_hand_array = np.vstack([left_hand_array, np.zeros((21 - left_hand_array.shape[0], 2))])
        if right_hand_array.shape[0] != 21:
            right_hand_array = np.vstack([right_hand_array, np.zeros((21 - right_hand_array.shape[0], 2))])

        # Combine left and right hand keypoints into a single array of size (42, 2)
        keypoints_array = np.vstack([left_hand_array, right_hand_array])
        keypoints_array = torch.tensor(keypoints_array, dtype=torch.float32)

        # Pad bounding boxes if one is missing
        if len(annotation['boxes']) < 2:
            if any(box['hand_type'] == 'left' for box in annotation['boxes']):
                # Pad a zero bounding box for the right hand
                coordinates_array = torch.cat([coordinates_array, torch.zeros((1, 4))])
            elif any(box['hand_type'] == 'right' for box in annotation['boxes']):
                # Pad a zero bounding box for the left hand
                coordinates_array = torch.cat([torch.zeros((1, 4)), coordinates_array])

        return frame, coordinates_array, keypoints_array, action, annotation['frame_id']



video_frames_path = r"C:\Users\abdul\Desktop\visionrd\AI-Hackathon24\data\gtea_png\gtea_png\png\S1_Cheese_C1"
action_txt_path = r"C:\Users\abdul\Desktop\visionrd\AI-Hackathon24\data\actions-20241207T081235Z-001\actions\S1_Cheese_C1.txt"
xml_labels_path = r"C:\Users\abdul\Desktop\visionrd\AI-Hackathon24\data\xml_labels\S1_Cheese_C1.xml"


dataset = SimpleVideoDataset(video_frames_path, action_txt_path, xml_labels_path)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# Model with Custom Depth Handling
class CustomNetwork(nn.Module):
    def __init__(self, num_classes=11, image_size=224):
        super(CustomNetwork, self).__init__()
        self.image_size = image_size

        # Transform bounding boxes and keypoints
        self.bbox_fc = nn.Linear(4 * 2, image_size)  # Assume 2 boxes, adjust if needed
        self.kp_fc = nn.Linear(42 * 2, image_size)  # 42 keypoints * 2 coordinates (x, y)

        # Combine transformed features into a depth map
        self.combine_fc = nn.Linear(image_size * 2, image_size * image_size)

        # Pretrained ResNet modified for 4-channel input
        self.resnet = models.resnet18(pretrained=True)
        self.resnet.conv1 = nn.Conv2d(4, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.resnet.fc = nn.Linear(512, num_classes)

    def forward(self, frame, boxes, keypoints):
        # Flatten and transform bounding boxes and keypoints
        boxes_flat = boxes.view(boxes.size(0), -1)  # Flatten boxes
        keypoints_flat = keypoints.view(keypoints.size(0), -1)  # Flatten keypoints

        bbox_feat = self.bbox_fc(boxes_flat)  # Transform bounding boxes
        kp_feat = self.kp_fc(keypoints_flat)  # Transform keypoints

        # Combine transformed features and reshape to (B, 1, H, W)
        combined_feat = torch.cat([bbox_feat, kp_feat], dim=1)
        depth_map = self.combine_fc(combined_feat).view(-1, 1, self.image_size, self.image_size)

        # Concatenate depth map with the image
        frame = torch.cat([frame, depth_map], dim=1)

        # Forward pass through ResNet
        return self.resnet(frame)
    




# Training Loop with Updated Data Handling
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize model
model = CustomNetwork().to(device)

# Optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Training Loop
for epoch in range(5):  # Number of epochs
    model.train()
    for batch in dataloader:
        frame, boxes, keypoints, action, _ = batch

        # Send data to GPU if available
        frame, boxes, keypoints, action = frame.to(device), boxes.to(device), keypoints.to(device), action.to(device)

        # Forward pass
        optimizer.zero_grad()
        output = model(frame, boxes, keypoints)

        # Compute loss and update weights
        loss = criterion(output, action)
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item()}")


Epoch 1, Loss: 1.3539022207260132
Epoch 2, Loss: 1.2201240062713623
Epoch 3, Loss: 1.2203675508499146
Epoch 4, Loss: 0.8758986592292786
Epoch 5, Loss: 1.2039836645126343


In [158]:
class SimpleVideoDataset(Dataset):
    def __init__(self, video_frames_path, action_txt_path, xml_labels_path):
        """
        Args:
            video_frames_path (str): Path to the folder containing video frames.
            action_txt_path (str): Path to the .txt file containing action labels.
            xml_labels_path (str): Path to the .xml file containing annotations.
        """
        self.video_frames_path = video_frames_path
        self.action_txt_path = action_txt_path
        self.xml_labels_path = xml_labels_path

        # Action mapping
        self.mapping = {
            'take': 0, 'open': 1, 'pour': 2, 'close': 3, 'shake': 4,
            'scoop': 5, 'stir': 6, 'put': 7, 'fold': 8, 'spread': 9, 'background': 10
        }

        # Transform for resizing and normalizing frames
        self.transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((224, 224)),  # Resize to ResNet input size
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # ResNet normalization
        ])

        # Load data
        self.frames = self._load_frames()
        self.actions = self._load_actions()
        self.annotations = self._load_annotations()

    def _load_frames(self):
        """Load all frames from the video frames directory."""
        frame_files = sorted(os.listdir(self.video_frames_path))  # Sort by frame order
        frames = [
            cv2.imread(os.path.join(self.video_frames_path, frame))
            for frame in frame_files
        ]
        return frames

    def _load_actions(self):
        """Load actions from the action .txt file."""
        with open(self.action_txt_path, "r") as f:
            actions = [line.strip() for line in f.readlines()]
        # Map actions to numerical values
        return [self.mapping[action] for action in actions]

    def _load_annotations(self):
        """Parse the XML file and load annotations."""
        tree = ET.parse(self.xml_labels_path)
        root = tree.getroot()

        annotations = defaultdict(lambda: {"boxes": [], "keypoints": []})

        for image in root.findall("image"):
            # Extract the numeric part of the frame ID
            frame_id_str = image.get("id")
            frame_id = int(frame_id_str.split("_")[-1])  # Extract the numeric ID

            # Extract bounding boxes
            for box in image.findall("box"):
                xtl = float(box.get("xtl"))
                ytl = float(box.get("ytl"))
                xbr = float(box.get("xbr"))
                ybr = float(box.get("ybr"))
                annotations[frame_id]["boxes"].append([xtl, ytl, xbr, ybr])

            # Extract keypoints
            for polyline in image.findall("polyline"):
                points = polyline.get("points")
                points = [
                    tuple(map(float, point.split(",")))
                    for point in points.split(";") if point.strip()
                ]
                annotations[frame_id]["keypoints"].append(points)

        return annotations


    def __len__(self):
        """Return the number of frames."""
        return len(self.frames)

    def __getitem__(self, idx):
        # Collect 5-frame sequence (including padded start/end)
        sequence_frames = []
        sequence_boxes = []
        sequence_keypoints = []

        for offset in range(-2, 3):
            frame_idx = max(0, min(len(self.frames) - 1, idx + offset))
            frame = self.frames[frame_idx]
            annotation = self.annotations[frame_idx]

            # Transform frame
            frame = self.transform(frame)

            # Extract bounding boxes and keypoints
            boxes = torch.tensor(
                [[box[0], box[1], box[2], box[3]] for box in annotation["boxes"]]
            ).float()
            if len(boxes) < 2:
                boxes = torch.cat([boxes, torch.zeros((2 - len(boxes), 4))])

            keypoints = torch.tensor(
                [(kp[0], kp[1]) for kps in annotation["keypoints"] for kp in kps]
            ).float()
            if len(keypoints) < 42:
                keypoints = torch.cat([keypoints, torch.zeros((42 - len(keypoints), 2))])

            sequence_frames.append(frame)
            sequence_boxes.append(boxes)
            sequence_keypoints.append(keypoints)

        return (
            torch.stack(sequence_frames),
            torch.stack(sequence_boxes),
            torch.stack(sequence_keypoints),
            self.actions[idx],
        )



In [160]:
# for batch_idx, (frames, boxes, keypoints, actions) in enumerate(dataloader):
#     # Debugging: Check the shape of the frames tensor
#     print(f"Original Frames Shape: {frames.shape}")  # Should be (B, T, C, H, W)
    
#     # Permute the tensor to match VideoMAE's expected shape
#     frames = frames.permute(0, 2, 1, 3, 4)  # From (B, T, C, H, W) to (B, C, T, H, W)
    
#     # Debugging: Check the shape after permutation
#     print(f"Permuted Frames Shape: {frames.shape}")  # Should be (B, C, T, H, W)


In [None]:
video_frames_path = r"C:\Users\abdul\Desktop\visionrd\AI-Hackathon24\data\gtea_png\gtea_png\png\S1_Cheese_C1"
action_txt_path = r"C:\Users\abdul\Desktop\visionrd\AI-Hackathon24\data\actions-20241207T081235Z-001\actions\S1_Cheese_C1.txt"
xml_labels_path = r"C:\Users\abdul\Desktop\visionrd\AI-Hackathon24\data\xml_labels\S1_Cheese_C1.xml"

dataset = SimpleVideoDataset(video_frames_path, action_txt_path, xml_labels_path)
dataloader = DataLoader(dataset, batch_size=2, shuffle=False)

# # Test
# for batch_idx, (frames, boxes, keypoints, actions) in enumerate(dataloader):
#     print(f"Batch {batch_idx+1}")
#     print(f"Frames Shape: {frames.shape}")  # Shape: (batch_size, 5, 3, 224, 224)
#     print(f"Boxes Shape: {boxes.shape}")    # Shape: (batch_size, 5, 2, 4)
#     print(f"Keypoints Shape: {keypoints.shape}")  # Shape: (batch_size, 5, 42, 2)
#     print(f"Actions Shape: {actions.shape}")  # Shape: (batch_size, 5)
#     break
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import VideoMAEForVideoClassification, VideoMAEConfig
# Define configuration
config = VideoMAEConfig.from_pretrained("MCG-NJU/videomae-base")
config.num_frames = 5
config.image_size = 224
config.output_attentions = True

# Load model with custom configuration
model = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base", config=config).to(device)

# Iterate over dataloader
for batch_idx, (frames, _, _, _) in enumerate(dataloader):
    # Normalize frames and permute dimensions
    frames = frames.permute(0, 2, 1, 3, 4).to(device)  # (B, C, T, H, W)
    frames = frames / 255.0  # Normalize to [0, 1]

    import pdb
    pdb.set_trace()

ModuleNotFoundError: No module named 'your_dataset_module'

In [173]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import VideoMAEForVideoClassification, VideoMAEConfig

# Paths to your data
video_frames_path = r"C:\Users\abdul\Desktop\visionrd\AI-Hackathon24\data\gtea_png\gtea_png\png\S1_Cheese_C1"
action_txt_path = r"C:\Users\abdul\Desktop\visionrd\AI-Hackathon24\data\actions-20241207T081235Z-001\actions\S1_Cheese_C1.txt"
xml_labels_path = r"C:\Users\abdul\Desktop\visionrd\AI-Hackathon24\data\xml_labels\S1_Cheese_C1.xml"

# Define dataset and dataloader
dataset = SimpleVideoDataset(video_frames_path, action_txt_path, xml_labels_path)
dataloader = DataLoader(dataset, batch_size=2, shuffle=False)

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define VideoMAE configuration
config = VideoMAEConfig.from_pretrained("MCG-NJU/videomae-base")
config.num_frames = 5
config.image_size = 224
config.output_attentions = True

# Load model with custom configuration
model = VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base", config=config).to(device)

# Iterate over dataloader
for batch_idx, (frames, _, _, _) in enumerate(dataloader):
    # Normalize frames and permute dimensions
    frames = frames / 255.0  # Normalize to [0, 1]
    
    # Forward pass through the model to get attention scores
    outputs = model(frames.to(device))
    attention_scores = outputs.attentions

    # Print attention scores
    print(f"Batch {batch_idx + 1}")
    for i, attention in enumerate(attention_scores):
        print(f"Attention Layer {i + 1}: {attention.shape}")

    # Debugging if needed
    import pdb
    pdb.set_trace()

    # Process only the first batch for testing
    break


Some weights of VideoMAEForVideoClassification were not initialized from the model checkpoint at MCG-NJU/videomae-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch 1
Attention Layer 1: torch.Size([2, 12, 392, 392])
Attention Layer 2: torch.Size([2, 12, 392, 392])
Attention Layer 3: torch.Size([2, 12, 392, 392])
Attention Layer 4: torch.Size([2, 12, 392, 392])
Attention Layer 5: torch.Size([2, 12, 392, 392])
Attention Layer 6: torch.Size([2, 12, 392, 392])
Attention Layer 7: torch.Size([2, 12, 392, 392])
Attention Layer 8: torch.Size([2, 12, 392, 392])
Attention Layer 9: torch.Size([2, 12, 392, 392])
Attention Layer 10: torch.Size([2, 12, 392, 392])
Attention Layer 11: torch.Size([2, 12, 392, 392])
Attention Layer 12: torch.Size([2, 12, 392, 392])
> [1;32mc:\users\abdul\appdata\local\temp\ipykernel_25836\2842341824.py[0m(46)[0;36m<module>[1;34m()[0m



In [152]:
dataset = SimpleVideoDataset(video_frames_path, action_txt_path, xml_labels_path)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

for batch_idx, (_, boxes, keypoints, actions) in enumerate(dataloader):
    frames = frames.to(device)
    print(boxes, keypoints)
    break

tensor([[[[338., 209., 408., 287.],
          [  0.,   0.,   0.,   0.]],

         [[333., 209., 410., 286.],
          [  0.,   0.,   0.,   0.]],

         [[337., 204., 405., 273.],
          [  0.,   0.,   0.,   0.]],

         [[339., 202., 409., 282.],
          [  0.,   0.,   0.,   0.]],

         [[341., 199., 410., 289.],
          [  0.,   0.,   0.,   0.]]]]) tensor([[[[402., 279.],
          [375., 281.],
          [358., 268.],
          [349., 253.],
          [346., 244.],
          [361., 246.],
          [344., 227.],
          [346., 234.],
          [352., 242.],
          [375., 236.],
          [358., 220.],
          [358., 228.],
          [364., 235.],
          [388., 231.],
          [373., 215.],
          [371., 222.],
          [375., 229.],
          [399., 230.],
          [388., 217.],
          [385., 221.],
          [388., 227.],
          [  0.,   0.],
          [  0.,   0.],
          [  0.,   0.],
          [  0.,   0.],
          [  0.,   0.],
     

In [153]:
keypoints.shape

torch.Size([1, 5, 42, 2])

In [None]:
keypoints[0][0] 

tensor([[402., 279.],
        [375., 281.],
        [358., 268.],
        [349., 253.],
        [346., 244.],
        [361., 246.],
        [344., 227.],
        [346., 234.],
        [352., 242.],
        [375., 236.],
        [358., 220.],
        [358., 228.],
        [364., 235.],
        [388., 231.],
        [373., 215.],
        [371., 222.],
        [375., 229.],
        [399., 230.],
        [388., 217.],
        [385., 221.],
        [388., 227.],
        [  0.,   0.],
        [  0.,   0.],
        [  0.,   0.],
        [  0.,   0.],
        [  0.,   0.],
        [  0.,   0.],
        [  0.,   0.],
        [  0.,   0.],
        [  0.,   0.],
        [  0.,   0.],
        [  0.,   0.],
        [  0.,   0.],
        [  0.,   0.],
        [  0.,   0.],
        [  0.,   0.],
        [  0.,   0.],
        [  0.,   0.],
        [  0.,   0.],
        [  0.,   0.],
        [  0.,   0.],
        [  0.,   0.]])

In [146]:
class SimpleVideoDataset(Dataset):
    def __init__(self, video_frames_path):
        """
        Args:
            video_frames_path (str): Path to the folder containing video frames.
        """
        self.video_frames_path = video_frames_path

        # Transform for resizing and normalizing frames
        self.transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((224, 224)),  # Resize to ResNet input size
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # ResNet normalization
        ])

        # Load frames
        self.frames = self._load_frames()

    def _load_frames(self):
        """Load all frames from the video frames directory."""
        frame_files = sorted(os.listdir(self.video_frames_path))  # Sort by frame order
        frames = [
            cv2.imread(os.path.join(self.video_frames_path, frame))
            for frame in frame_files
        ]
        return frames

    def __len__(self):
        """Return the number of frames."""
        return len(self.frames)

    def __getitem__(self, idx):
        """Return 5-frame sequences."""
        sequence_frames = []

        for offset in range(-2, 3):  # Generate a sequence of 5 frames
            target_idx = idx + offset
            if target_idx < 0 or target_idx >= len(self.frames):
                # Zero padding for out-of-bounds indices
                empty_frame = torch.zeros((3, 224, 224))
                sequence_frames.append(empty_frame)
            else:
                frame = self.transform(self.frames[target_idx])
                sequence_frames.append(frame)

        frames = torch.stack(sequence_frames)  # Shape: (5, 3, 224, 224)
        return frames


In [147]:
video_frames_path = r"C:\Users\abdul\Desktop\visionrd\AI-Hackathon24\data\gtea_png\gtea_png\png\S1_Cheese_C1"

dataset = SimpleVideoDataset(video_frames_path)
dataloader = DataLoader(dataset, batch_size=2, shuffle=False)  # Batch size 2 for demonstration

# Iterate over the dataset
for batch_idx, frames in enumerate(dataloader):
    print(f"Batch {batch_idx+1}")
    print(f"Frames Shape: {frames.shape}")  # Shape: (batch_size, 5, 3, 224, 224)
    break  # Only one batch for demonstration


Batch 1
Frames Shape: torch.Size([2, 5, 3, 224, 224])


In [84]:
frames[0].shape

torch.Size([5, 3, 224, 224])

In [None]:
k