In [4]:
    """
    Args:
        df (pd.DataFrame): dataframe containing 'video_name' and integer 'label' columns already encoded outside.
        root_dir (str): directory path where videos are located.
        feature_extractor (callable): PyTorch model/function that takes a batch of frames and outputs features.
        max_seq_length (int): max number of frames to use per video.
        num_features (int): number of features extracted per frame.

    Returns:
        frame_features (torch.Tensor): tensor of shape (num_samples, max_seq_length, num_features), float32
        frame_masks (torch.BoolTensor): tensor of shape (num_samples, max_seq_length), True if frame present else False
        labels (torch.LongTensor): tensor of shape (num_samples,)
    """


    '''  
#Creates an array of False values of shape (1, MAX_SEQ_LENGTH)
#Example: if MAX_SEQ_LENGTH = 20, shape = (1, 20)
#Purpose: later we'll mark which frames are real (1) vs padding (0)
#dtype="bool"  values are either True/False
#This is a mask: it tells the model which timesteps to pay attention to.


frame_masks: Boolean array marking which timesteps are valid (not padding).

Example: For a video with 13 frames, first 13 cells True, remaining 7 False.


frame_features: Placeholder for extracted features from each frame of each video.

Example: For 145 videos, shape is (145, 20, 2048).


for idx, path in enumerate(video_paths):
    frames = load_video(os.path.join(root_dir, path))
    frames = frames[None, ...]
load_video: Reads all frames from a video file.

Example: video1.mp4 → array of shape (30, 224, 224, 3).

frames[None, ...]: Adds a batch dimension, makes the shape (1, video_length, IMG_SIZE, IMG_SIZE, 3).



feature model:
Pass your frames as (batch, 3, 224, 224) after preprocessing.

The model will always return (batch, 1280).

'''

'  \n#Creates an array of False values of shape (1, MAX_SEQ_LENGTH)\n#Example: if MAX_SEQ_LENGTH = 20, shape = (1, 20)\n#Purpose: later we\'ll mark which frames are real (1) vs padding (0)\n#dtype="bool"  values are either True/False\n#This is a mask: it tells the model which timesteps to pay attention to.\n\n\nframe_masks: Boolean array marking which timesteps are valid (not padding).\n\nExample: For a video with 13 frames, first 13 cells True, remaining 7 False.\n\n\nframe_features: Placeholder for extracted features from each frame of each video.\n\nExample: For 145 videos, shape is (145, 20, 2048).\n\n\nfor idx, path in enumerate(video_paths):\nframes = load_video(os.path.join(root_dir, path))\nframes = frames[None, ...]\nload_video: Reads all frames from a video file.\n\nExample: video1.mp4 → array of shape (30, 224, 224, 3).\n\nframes[None, ...]: Adds a batch dimension, makes the shape (1, video_length, IMG_SIZE, IMG_SIZE, 3).\n\n\n\nfeature model:\nPass your frames as (batch, 3,

In [5]:
#If you prefer more control or need to preprocess labels differently.
#If you want to reuse the same processed labels in multiple places without re-encoding each time.
#When you want to explicitly check or debug label encoding separately.

In [6]:
import torch
import torch.nn as nn
import torchvision.models as models

def build_feature_extractor():
    """
    Returns a MobileNetV2 feature extractor (pretrained on ImageNet) that
    outputs a 1280-dim feature vector for each image.
    """
    mobilenet = models.mobilenet_v2(pretrained=True)
    mobilenet.eval()  # Set to eval mode

    # Remove the classifier head, so output is (batch, 1280, 1, 1)
    feature_extractor = nn.Sequential(*list(mobilenet.children())[:-1])
    
    # Wrap to flatten features from (batch, 1280, 1, 1) to (batch, 1280)
    class FlattenExtract(nn.Module):
        def __init__(self, base):
            super().__init__()
            self.base = base
        def forward(self, x):
            x = self.base(x)
            return x.view(x.size(0), -1)  # Flatten

    return FlattenExtract(feature_extractor)

# Usage:
feature_extractor = build_feature_extractor()




In [7]:
import torch.nn as nn
import torchvision.models as models

def build_feature_extractor():
    mobilenet = models.mobilenet_v2(weights="IMAGENET1K_V1")
    mobilenet.eval()
    # Remove classifier head, add adaptive pool and flatten
    feature_extractor = nn.Sequential(
        *list(mobilenet.children())[:-1],      # all but classifier
        nn.AdaptiveAvgPool2d((1, 1)),         # global avg pool
        nn.Flatten()
    )
    return feature_extractor

# Usage:
feature_extractor = build_feature_extractor()


In [None]:
#test
import pandas as pd
import numpy as np
import torch
df=pd.read_csv(r'C:\Users\aliza\Desktop\og_behaviour\dog_dataset_full_shuffled.csv')
 #convert classlabels to label encoding
df = df.dropna(subset=['label']).reset_index(drop=True)
df['label'] = df['label'].map({'aggressive': 1, 'not_aggressive': 0})
labels= df['label'].astype(int) 
labels=torch.tensor(labels)
print(labels,type(labels))



tensor([1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
        1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
        0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
        0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
        0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        1, 0, 0, 0, 0, 0]) <class 'torch

In [5]:
import os
import pandas as pd

df = pd.read_csv(r"C:\Users\aliza\Desktop\og_behaviour\dog_dataset_full_shuffled.csv")

# Check if files actually exist
df['exists'] = df['video_path'].apply(lambda x: os.path.exists(x))

print(df['exists'].value_counts())
print(df[df['exists'] == False].head())  # show any missing files




exists
True    294
Name: count, dtype: int64
Empty DataFrame
Columns: [video_path, action, label, exists]
Index: []


In [None]:
from IPython.display import HTML

video_path = df['video_path'][0] 

HTML(f"""
    <video alt="test" width="520" height="440" controls>
        <source src="{video_path}" type="video/mp4">
    </video>
""")


In [2]:
from sklearn.model_selection import train_test_split
# Split the DataFrame into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
print(df.head())


                          video_path    action  label
0     dataset_2/barking/barks_39.mp4   barking      1
1     dataset_2/barking/barks_17.mp4   barking      1
2   dataset_2/panting/preview_50.mp4   panting      0
3  dataset_2/sleeping/preview_47.mp4  sleeping      0
4  dataset_2/sleeping/preview_26.mp4  sleeping      0


In [10]:
IMG_SIZE = 224
BATCH_SIZE = 16
EPOCHS = 10
MAX_SEQ_LENGTH = 50 # Maximum number of frames to use per video
NUM_FEATURES = 1280 # Number of features extracted by MobileNetV2

In [16]:
import torch
import os
import frame_constructor as fc  # Assuming this is your feature extraction module

def prepare_all_videos(df, root_dir):
    num_samples = len(df)
    video_paths = df["video_path"].values.tolist()
    labels = torch.tensor(df["label"].values, dtype=torch.long)
 
    # Preallocate tensors for whole dataset
    frame_masks = torch.zeros((num_samples, MAX_SEQ_LENGTH), dtype=torch.bool)
    frame_features = torch.zeros((num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype=torch.float32)

    for idx, path in enumerate(video_paths): #loop for each video
        # Use absolute path if present, else combine with root_dir
        if os.path.isabs(path):
            video_path = path
        else:
            video_path = os.path.join(root_dir, path)

        frames = fc.frame_cons(video_path)  
        # frames shape: (video_length, H, W, C), numpy array # eg:300, 224,224,3

        video_length = frames.shape[0]
        length = min(MAX_SEQ_LENGTH, video_length) #change to min length if video is shorter/larger than MAX_SEQ_LENGTH

        # Process frames one by one (or batch if possible:not doing here) 
        #per video
        temp_features = torch.zeros((MAX_SEQ_LENGTH, NUM_FEATURES), dtype=torch.float32) #20,2048

        for i in range(length): #for each frame in the video
            # Convert frame to tensor and permute channels if needed for model
            #print("Feature shape:", temp_features.shape)
            frame = frames[i]  # shape (H, W, C), numpy
            frame_tensor = torch.from_numpy(frame).permute(2, 0, 1).unsqueeze(0).float()  # (1, C, H, W)
            
            # Normalize as needed for MobileNetV2
            frame_tensor = frame_tensor / 255.0
            mean = torch.tensor([0.485, 0.456, 0.406], device=frame_tensor.device).view(1, 3, 1, 1)
            std = torch.tensor([0.229, 0.224, 0.225], device=frame_tensor.device).view(1, 3, 1, 1)
            frame_tensor = (frame_tensor - mean) / std

            # Extract features with model in eval mode, no grad
            with torch.no_grad():
                #mobile_net_v2 expects input shape (1, 3, IMG_SIZE, IMG_SIZE) 
                #output shape of mobile_net_v2 is (batch_size, 1280) after flattning
                feat = feature_extractor(frame_tensor)
                #print("Feat shape:", feat.shape)  # expect output: (1, NUM_FEATURES)
            temp_features[i] = feat.squeeze(0)  # feat.squeeze(0) removes the batch dimension, making it (NUM_FEATURES,)

        # Assign features and mask
        frame_features[idx] = temp_features # Store features for this video
        frame_masks[idx, :length] = True    # Mark valid frames as True in the mask

    return (frame_features, frame_masks), labels


In [17]:
prepare_all_videos(train_df, ' ')

((tensor([[[2.2593e-02, 5.4808e-01, 1.0946e-01,  ..., 1.6816e-01,
            1.4282e-01, 0.0000e+00],
           [2.2218e-02, 5.7474e-01, 1.6505e-01,  ..., 1.2732e-01,
            1.4164e-01, 0.0000e+00],
           [2.4365e-02, 5.0617e-01, 1.3601e-01,  ..., 1.2516e-01,
            1.2625e-01, 0.0000e+00],
           ...,
           [7.2541e-01, 9.2804e-01, 1.9170e-01,  ..., 2.2702e-01,
            4.1084e-01, 7.8962e-03],
           [8.1539e-01, 9.3658e-01, 1.5577e-01,  ..., 2.1304e-01,
            6.1594e-01, 0.0000e+00],
           [6.8362e-01, 9.3220e-01, 2.7625e-01,  ..., 2.7091e-01,
            5.9455e-01, 0.0000e+00]],
  
          [[1.7328e-01, 1.0755e+00, 4.9774e-01,  ..., 9.0506e-03,
            0.0000e+00, 1.3351e+00],
           [1.4263e-01, 1.3234e+00, 5.3215e-01,  ..., 6.9599e-03,
            0.0000e+00, 1.1690e+00],
           [1.0735e-01, 1.4003e+00, 5.9207e-01,  ..., 5.2507e-03,
            3.7992e-02, 1.1076e+00],
           ...,
           [2.7725e-02, 7.6252e-01, 3

In [15]:
#the sequence model
import torch
import torch.nn as nn
import torch.nn.functional as F

class SequenceModel(nn.Module):
    def __init__(self, num_features, max_seq_length, num_classes):
        super().__init__()
        self.gru1 = nn.GRU(input_size=num_features, hidden_size=16, num_layers=1,
                           batch_first=True)
        self.gru2 = nn.GRU(input_size=16, hidden_size=8, num_layers=1, batch_first=True)
        self.dropout = nn.Dropout(0.4)
        self.fc1 = nn.Linear(8, 8)
        self.fc2 = nn.Linear(8, num_classes)

    def forward(self, x, mask=None):
        # x: (batch_size, seq_len, num_features)
        lengths = mask.sum(dim=1) if mask is not None else torch.full((x.size(0),), x.size(1), dtype=torch.long).to(x.device)
        # Pack the sequence for variable-length batch processing
        packed = nn.utils.rnn.pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_output, _ = self.gru1(packed)
        output, _ = self.gru2(packed_output)
        output, _ = nn.utils.rnn.pad_packed_sequence(output, batch_first=True, total_length=x.size(1))

        # Take the last valid output in each sequence according to lengths
        idx = (lengths - 1).unsqueeze(1).unsqueeze(2).expand(-1, 1, output.shape[2])
        last_outputs = output.gather(1, idx).squeeze(1)
        x = self.dropout(last_outputs)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x  # logits
