In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os


In [None]:
# Colab-only step: mount Google Drive at /content/drive so the dataset
# directories referenced by the following cells are reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:

import os

# Root of the training split on the mounted Drive.
dataset_path = "/content/drive/MyDrive/dog_dataset_split/train"

# List the entries directly under the split root; each one is treated as a class label.
# NOTE(review): the label-encoding cell maps 'aggressive' / 'not_aggressive' —
# confirm the folder names on Drive are spelled exactly that way.
label_types = os.listdir(dataset_path)
print(label_types)


In [None]:
import os

# Training split root (same directory as `dataset_path` in the previous cell).
# `class_folders` is the list the indexing cell below iterates over.
train_dir = '/content/drive/MyDrive/dog_dataset_split/train'
class_folders = os.listdir(train_dir)
print(class_folders)


In [None]:
# Index every file of the training split as a (class label, file path) record.
# `rooms` is a legacy variable name; it simply holds those tuples.
rooms = []
for item in class_folders:
    class_folder = os.path.join(train_dir, item)
    if not os.path.isdir(class_folder):
        continue  # stray file at the split root, not a class folder
    for fname in os.listdir(class_folder):
        file_path = os.path.join(class_folder, fname)
        rooms.append((item, file_path))

# Build a dataframe
import pandas as pd
train_df = pd.DataFrame(rooms, columns=['tag', 'video_path'])
print(train_df.head())
print(train_df.tail())

# Persist with the path column first. index=False keeps the positional row index
# out of the file, so reading it back does not produce a spurious "Unnamed: 0"
# column and the layout matches the test.csv written by the test-split cell.
df = train_df.loc[:, ['video_path', 'tag']]
df.to_csv('train.csv', index=False)

In [None]:
import os
import pandas as pd

test_dir = '/content/drive/MyDrive/dog_dataset_split/test'
activity_types = os.listdir(test_dir)
print("Types of activities found:", activity_types)

# Collect one (label, full path) record per file found inside each class folder.
rooms = []
for activity in activity_types:
    activity_folder = os.path.join(test_dir, activity)
    if not os.path.isdir(activity_folder):
        continue  # entries that are not directories are ignored
    for fname in os.listdir(activity_folder):
        full_path = os.path.join(activity_folder, fname)
        rooms.append((activity, full_path))

# Tabulate the records and preview both ends of the table.
test_df = pd.DataFrame(data=rooms, columns=['tag', 'video_path'])
print(test_df.head())
print(test_df.tail())

# Write the index to disk with the path column first and no row-index column.
df = test_df[['video_path', 'tag']]
df.to_csv('test.csv', index=False)
print("CSV saved as test.csv")


In [None]:
# Reload both split indexes from disk.
# NOTE(review): assumes the notebook's working directory is /content, which is
# where the relative 'train.csv' / 'test.csv' writes above would land — confirm.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/train.csv", "/content/test.csv")
)

print(f"Total videos for training: {len(train_df)}")
print(f"Total videos for testing: {len(test_df)}")

# Random handful of training rows as a quick sanity check.
train_df.sample(10)

In [1]:
#train label encoding
import pandas as pd
import numpy as np
import torch

# Folder name -> integer class index, shared by the train and test splits.
LABEL_TO_INDEX = {'aggressive': 1, 'not_aggressive': 0}


def _load_encoded_split(csv_path):
    """Read a split CSV and return it with the ``tag`` column integer-encoded.

    Rows whose ``tag`` is missing in the file are dropped. Every remaining tag
    must be a key of ``LABEL_TO_INDEX``: ``Series.map`` turns unknown values
    into NaN, which previously surfaced only as an opaque integer-cast error,
    so unknown tags are reported explicitly instead.

    Args:
        csv_path: Path of a CSV containing at least a ``tag`` column.

    Returns:
        pandas.DataFrame with a fresh 0..n-1 index and an integer ``tag`` column.

    Raises:
        ValueError: If the file contains a tag that is not in ``LABEL_TO_INDEX``.
    """
    split_df = pd.read_csv(csv_path)
    split_df = split_df.dropna(subset=['tag']).reset_index(drop=True)

    encoded = split_df['tag'].map(LABEL_TO_INDEX)
    unknown = sorted(split_df.loc[encoded.isna(), 'tag'].astype(str).unique())
    if unknown:
        raise ValueError(
            f"{csv_path}: unrecognised tag value(s) {unknown}; "
            f"expected one of {sorted(LABEL_TO_INDEX)}"
        )

    split_df['tag'] = encoded.astype(int)
    return split_df


train_df = _load_encoded_split('train.csv')

#for test
test_df = _load_encoded_split('test.csv')

# `tag` ends the cell holding the test-split labels, as it did before.
# Converting through to_numpy() avoids building a tensor from a pandas Series.
tag = torch.tensor(test_df['tag'].to_numpy())
print(tag,type(tag))
test_df.head()



KeyboardInterrupt: 

In [None]:
# Global configuration shared by feature extraction and training.
IMG_SIZE = 224  # frame side length in pixels; not referenced in the visible cells — presumably what frame_constructor resizes to, TODO confirm
BATCH_SIZE = 16  # mini-batch size passed to the DataLoaders
EPOCHS = 10  # number of passes over the training set in the training loop
MAX_SEQ_LENGTH = 300 # Maximum number of frames to use per video
NUM_FEATURES = 1280 # Number of features extracted by MobileNetV2

In [None]:
import torch.nn as nn
import torchvision.models as models

def build_feature_extractor():
    """Build a frozen MobileNetV2 backbone that turns images into flat feature vectors.

    The ImageNet-pretrained torchvision MobileNetV2 is loaded, its last child
    module (the classifier head) is dropped, and global average pooling plus a
    flatten are appended. Other cells in this notebook expect the result to be
    a (batch, 1280) tensor for a (batch, 3, H, W) input.

    Returns:
        nn.Sequential: the extractor, switched to eval mode as a whole and with
        gradients disabled on every parameter, since it is only ever used for
        inference.
    """
    mobilenet = models.mobilenet_v2(weights="IMAGENET1K_V1")
    feature_extractor = nn.Sequential(
        *list(mobilenet.children())[:-1],      # all but classifier
        nn.AdaptiveAvgPool2d((1, 1)),          # global avg pool
        nn.Flatten(),
    )
    # Put the wrapper itself (not just the wrapped backbone) into eval mode so
    # `feature_extractor.training` reflects how the module is actually used.
    feature_extractor.eval()
    # The backbone is never fine-tuned here; freezing avoids accidental
    # gradient tracking if it is called outside a no_grad block.
    for param in feature_extractor.parameters():
        param.requires_grad_(False)
    return feature_extractor

# Usage:
feature_extractor = build_feature_extractor()

In [None]:
import torch
import os
import frame_constructor as fc  # Assuming this is your feature extraction module

def prepare_all_videos(df, root_dir, frame_batch_size=32):
    """Extract zero-padded per-frame features for every video listed in ``df``.

    Args:
        df: DataFrame with a ``video_path`` column and an integer ``tag`` column.
        root_dir: Directory that relative ``video_path`` entries are joined to;
            absolute paths are used unchanged.
        frame_batch_size: Number of frames sent through the feature extractor
            per forward pass (must be >= 1). Frames are processed in chunks
            rather than one at a time purely for speed.

    Returns:
        ``((frame_features, frame_masks), labels)`` where
        ``frame_features`` is float32 of shape
        (num_videos, MAX_SEQ_LENGTH, NUM_FEATURES), zero beyond each video's length;
        ``frame_masks`` is bool of shape (num_videos, MAX_SEQ_LENGTH), True for
        positions that hold a real frame; ``labels`` is int64 of shape (num_videos,).

    Notes:
        Assumes ``fc.frame_cons`` returns a numpy array laid out as
        (num_frames, H, W, C) with 0-255 pixel values — TODO confirm against
        the frame_constructor module.
    """
    num_samples = len(df)
    video_paths = df["video_path"].values.tolist()
    labels = torch.tensor(df["tag"].values, dtype=torch.long)

    # Preallocate tensors for whole dataset
    frame_masks = torch.zeros((num_samples, MAX_SEQ_LENGTH), dtype=torch.bool)
    frame_features = torch.zeros((num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype=torch.float32)

    # Feed the extractor on whatever device it currently lives on. The inference
    # cell moves it to the GPU, after which CPU inputs would raise a device
    # mismatch if this function were re-run.
    extractor_device = next(feature_extractor.parameters()).device

    # ImageNet channel statistics; loop-invariant, so built once.
    mean = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)
    std = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)

    for idx, path in enumerate(video_paths):  # one iteration per video
        # Use absolute path if present, else combine with root_dir
        video_path = path if os.path.isabs(path) else os.path.join(root_dir, path)

        frames = fc.frame_cons(video_path)

        # Videos longer than MAX_SEQ_LENGTH are truncated; shorter ones stay zero-padded.
        length = min(MAX_SEQ_LENGTH, frames.shape[0])

        for start in range(0, length, frame_batch_size):
            stop = min(start + frame_batch_size, length)
            # (n, H, W, C) -> (n, C, H, W), scaled to [0, 1] and ImageNet-normalised.
            chunk = torch.from_numpy(frames[start:stop]).permute(0, 3, 1, 2).float()
            chunk = (chunk / 255.0 - mean) / std

            with torch.no_grad():
                feats = feature_extractor(chunk.to(extractor_device))  # (n, NUM_FEATURES)
            frame_features[idx, start:stop] = feats.cpu()

        frame_masks[idx, :length] = True    # Mark valid frames as True in the mask
        print('done for ', idx)

    return (frame_features, frame_masks), labels

In [None]:
# Extract padded frame features, validity masks and labels for both splits.
# The CSVs built above store absolute Drive paths, so prepare_all_videos never
# joins them with root_dir; the " " argument looks like a placeholder — TODO confirm.
(train_features, train_masks), train_labels = prepare_all_videos(train_df, " ")
(test_features, test_masks), test_labels = prepare_all_videos(test_df, " ")


In [None]:
# Count training videos whose mask row contains no True entry at all.
empty_train_videos = (~train_masks.any(dim=1)).sum().item()
print(empty_train_videos, "samples have zero valid frames in training data")


In [None]:
#lstm sequence model
import torch
import torch.nn as nn
import torch.nn.functional as F

class LSTMSequenceModel(nn.Module):
    """Two stacked LSTMs followed by a small MLP head over padded frame features.

    Args:
        num_features: Size of each per-frame feature vector.
        max_seq_length: Accepted for interface compatibility; not used by the layers.
        num_classes: Number of output logits.
    """

    def __init__(self, num_features, max_seq_length, num_classes):
        super().__init__()
        # Layer creation order and attribute names are kept so saved
        # state_dicts remain loadable.
        self.lstm1 = nn.LSTM(input_size=num_features, hidden_size=16, num_layers=1, batch_first=True)
        self.lstm2 = nn.LSTM(input_size=16, hidden_size=8, num_layers=1, batch_first=True)
        self.dropout = nn.Dropout(0.4)
        self.fc1 = nn.Linear(8, 8)
        self.fc2 = nn.Linear(8, num_classes)

    def forward(self, x, mask=None):
        """Return class logits of shape (batch, num_classes).

        Args:
            x: Padded features, (batch, seq_len, num_features).
            mask: Optional bool tensor (batch, seq_len); the number of True
                entries per row is taken as that sequence's length. Without a
                mask every sequence is treated as full length.
        """
        seq_len = x.size(1)
        if mask is None:
            lengths = torch.full((x.size(0),), seq_len, dtype=torch.long).to(x.device)
        else:
            lengths = mask.sum(dim=1)
        # pack_padded_sequence rejects zero-length entries, so treat them as length 1.
        lengths = lengths.clamp(min=1)

        rnn_utils = nn.utils.rnn
        packed_in = rnn_utils.pack_padded_sequence(
            x, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        packed_mid, _ = self.lstm1(packed_in)
        packed_out, _ = self.lstm2(packed_mid)
        padded_out, _ = rnn_utils.pad_packed_sequence(
            packed_out, batch_first=True, total_length=seq_len
        )

        # Select, per sequence, the output at its final valid time step.
        hidden_dim = padded_out.shape[2]
        last_step = (lengths - 1).view(-1, 1, 1).expand(-1, 1, hidden_dim)
        final_hidden = padded_out.gather(1, last_step).squeeze(1)

        hidden = self.dropout(final_hidden)
        hidden = F.relu(self.fc1(hidden))
        return self.fc2(hidden)  # logits



In [None]:
from torch.utils.data import TensorDataset, DataLoader


# Wrap the extracted tensors as (features, mask, label) datasets.
train_dataset = TensorDataset(train_features, train_masks, train_labels)
val_dataset = TensorDataset(test_features, test_masks, test_labels)  # the test split doubles as the validation set

# Shuffle only the training batches; keep evaluation order fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [None]:
# Derive the model dimensions from the extracted training tensors so they can
# never disagree with the data actually fed to the network.
NUM_FEATURES = train_features.shape[2]
MAX_SEQ_LENGTH = train_features.shape[1]
# Size the output layer from the largest label index rather than the number of
# distinct labels: if a class happens to be absent from the training split,
# counting unique values would make the head too small and CrossEntropyLoss
# would reject the out-of-range targets.
NUM_CLASSES = int(train_labels.max().item()) + 1

model = LSTMSequenceModel(num_features=NUM_FEATURES, max_seq_length=MAX_SEQ_LENGTH, num_classes=NUM_CLASSES)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)


In [None]:
import torch.nn as nn

LEARNING_RATE = 1e-3
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Track the weights of the epoch with the highest validation accuracy.
best_val_acc = 0.0
best_model_state = None

for epoch in range(EPOCHS):
    model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0

    for batch in train_loader:
        features, masks, labels = [x.to(device) for x in batch]  # Unpack batch
        optimizer.zero_grad()
        logits = model(features, masks)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; re-weight by batch size so the
        # epoch average below is a true per-sample mean.
        train_loss += loss.item() * features.size(0)
        preds = logits.argmax(1)
        train_correct += (preds == labels).sum().item()
        train_total += labels.size(0)

    train_acc = train_correct / train_total
    avg_train_loss = train_loss / train_total

    # Validation
    model.eval()
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for batch in val_loader:
            features, masks, labels = [x.to(device) for x in batch]
            logits = model(features, masks)
            preds = logits.argmax(1)
            val_correct += (preds == labels).sum().item()
            val_total += labels.size(0)
    val_acc = val_correct / val_total

    print(f"Epoch {epoch+1}/{EPOCHS} - Train loss: {avg_train_loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}")

    # Save best model
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        # state_dict() hands back references to the live parameter tensors and
        # dict.copy() is shallow, so each tensor must be cloned; otherwise the
        # "best" snapshot keeps changing as training continues and the restore
        # below would just reload the last epoch's weights.
        best_model_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }

# Restore best model
if best_model_state is not None:
    model.load_state_dict(best_model_state)


In [None]:
# Save
# Writes only the model's state_dict (parameter tensors) to model.pth in the
# current working directory; the architecture must be rebuilt before loading.
torch.save(model.state_dict(), "model.pth")


In [None]:
# Save a resumable checkpoint: model weights plus optimizer state.
torch.save({
    'epoch': EPOCHS,  # value of the EPOCHS constant (configured epoch count), not a 0-based epoch index
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}, 'checkpoint.pth')


In [None]:
# Accuracy of the current model over val_loader.
# NOTE(review): val_loader wraps the test split and the training cell also uses
# it to choose the best epoch, so this figure is not an independent estimate.
model.eval()
test_correct, test_total = 0, 0
with torch.no_grad():
    for batch in val_loader:
        features, masks, labels = (tensor.to(device) for tensor in batch)
        logits = model(features, masks)
        preds = logits.argmax(1)
        test_total += labels.size(0)
        test_correct += (preds == labels).sum().item()
test_acc = test_correct / test_total
print(f"Test accuracy: {test_acc*100:.2f}%")


In [None]:
import torch
import numpy as np
import os
import frame_constructor as fc  # Your feature extraction module

# Define constants (set these as per your setup)
# Define constants (set these as per your setup)
IMG_SIZE = 224
BATCH_SIZE = 16
EPOCHS = 10
# Keep the inference-time frame cap equal to the value used when the training
# features were extracted (300). A different cap here would feed the LSTM
# longer sequences than it was trained on and would silently change the shape
# produced by the dataset-preparation function if that cell were re-run.
MAX_SEQ_LENGTH = 300 # Maximum number of frames to use per video
NUM_FEATURES = 1280 # Number of features extracted by MobileNetV2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hardcoded class vocabulary corresponding to your label encoding
class_vocab = ["not_aggressive", "aggressive"]  # index 0 = not_aggressive, 1 = aggressive

# Your loaded PyTorch models
# feature_extractor: feature extraction model
# model: your LSTMSequenceModel or equivalent for classification
# Both are moved to the selected device and switched to eval mode for inference.
feature_extractor.to(device).eval()
model.to(device).eval()

def prepare_single_video(video_path):
    """Turn one video into padded frame features and a validity mask.

    Args:
        video_path: Path handed to ``fc.frame_cons`` to obtain the frames.

    Returns:
        ``(frame_features, frame_mask)`` as CPU tensors: float32 of shape
        (1, MAX_SEQ_LENGTH, NUM_FEATURES) and bool of shape (1, MAX_SEQ_LENGTH).
        Positions past the number of frames used are zero / False.
    """
    # assumes frames is a numpy array laid out (num_frames, H, W, C) — TODO confirm
    frames = fc.frame_cons(video_path)
    length = min(MAX_SEQ_LENGTH, frames.shape[0])

    frame_features = torch.zeros((1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype=torch.float32)
    frame_mask = torch.zeros((1, MAX_SEQ_LENGTH), dtype=torch.bool)

    # Per-channel normalisation constants, shaped to broadcast over (1, C, H, W).
    channel_mean = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)
    channel_std = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)

    for j in range(length):
        # HWC numpy frame -> (1, C, H, W) float tensor scaled to [0, 1]
        pixels = torch.from_numpy(frames[j]).permute(2, 0, 1).unsqueeze(0).float() / 255.0
        normalised = ((pixels - channel_mean) / channel_std).to(device)

        with torch.no_grad():
            feat = feature_extractor(normalised)
        frame_features[0, j, :] = feat.squeeze(0).cpu()

    frame_mask[0, :length] = True
    return frame_features, frame_mask

def sequence_prediction(video_path):
    """Classify one video and print the class probabilities, highest first.

    Args:
        video_path: Path of the video to classify.

    Returns:
        numpy array of softmax probabilities, indexed like ``class_vocab``.
    """
    features, mask = prepare_single_video(video_path)

    with torch.no_grad():
        logits = model(features.to(device), mask.to(device))
    # Batch of one: keep only the first row of the softmax output.
    probabilities = torch.softmax(logits, dim=-1).cpu().numpy()[0]

    print(f"Prediction for video: {video_path}")
    ranked_classes = np.argsort(probabilities)[::-1]
    for i in ranked_classes:
        print(f"  {class_vocab[i]}: {probabilities[i]*100:5.2f}%")

    return probabilities

# Example usage:
video_path = "/content/sample_video.mp4"
sequence_prediction(video_path)
