In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import torchvision.transforms as transforms
from torchvision.io import read_video

import open_clip

import os

In [12]:
class DeadliftVideoDataset(Dataset):
    def __init__(self, video_paths, labels, transform=None, frames_per_video=16):
        self.video_paths = video_paths
        self.labels = labels
        self.transform = transform
        self.frames_per_video = frames_per_video
        self.label_to_idx = {
            'good movement' : 0,
            'bad movement' : 1
        }
        
    def __len__(self):
        return len(self.video_paths)
    
    def __getitem__(self, idx):
        video_path = self.video_paths[idx]
        label = self.label_to_idx[self.labels[idx]]
        
        # Read video frames
        video_frames, _, info = read_video(video_path, pts_unit='sec')
        total_frames = video_frames.shape[0]

        # Sample frames evenly throughout the video
        indices = torch.linspace(0, total_frames - 1, self.frames_per_video).long()
        sampled_frames = video_frames[indices]
        
        # Apply transforms to frames
        if self.transform:
            frames = [self.transform(frame.permute(2, 0, 1)) for frame in sampled_frames]
        else:
            frames = [frame.permute(2, 0, 1) for frame in sampled_frames]


        # Stack frames into a tensor
        frames_tensor = torch.stack(frames)  # Shape: [frames_per_video, C, H, W]

        return frames_tensor, label

In [13]:
transform = transforms.Compose([
    transforms.ConvertImageDtype(torch.float32),
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.Normalize(
        mean=(0.48145466, 0.4578275, 0.40821073),
        std=(0.26862954, 0.26130258, 0.27577711)
    ),
])


In [14]:
model_name = 'ViT-B-32'  # You can choose other architectures
pretrained = 'openai'

model, _, _ = open_clip.create_model_and_transforms(
    model_name=model_name,
    pretrained=pretrained
)
model.eval()  # Set to evaluation mode

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-11): 12 x ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine

In [15]:
class VideoCLIPClassifier(nn.Module):
    def __init__(self, clip_model, num_classes):
        super(VideoCLIPClassifier, self).__init__()
        self.clip_model = clip_model
        self.num_classes = num_classes
        self.fc = nn.Linear(clip_model.visual.output_dim, num_classes)
        
    def forward(self, frames):
        # frames: [batch_size, frames_per_video, C, H, W]
        batch_size, frames_per_video, C, H, W = frames.shape
        frames = frames.view(-1, C, H, W)  # Flatten frames
        with torch.no_grad():
            frame_features = self.clip_model.encode_image(frames)  # [batch_size * frames_per_video, output_dim]
        frame_features = frame_features.view(batch_size, frames_per_video, -1)
        video_features = frame_features.mean(dim=1)  # Average over frames
        logits = self.fc(video_features)  # [batch_size, num_classes]
        return logits

In [16]:
num_classes = 2  # Number of movement labels
classifier = VideoCLIPClassifier(model, num_classes)

In [17]:
import os
from torch.utils.data import DataLoader

# Path to the dataset folder containing the "Adam" subfolder with "bad" and "good" subfolders
dataset_root = './dataset/Adam'

# Initialize empty lists for video paths and labels
video_paths = []
labels = []

# Traverse through "bad" and "good" folders in the "Adam" folder
for label_folder in ['bad', 'good']:
    label_path = os.path.join(dataset_root, label_folder)
    
    # Check if the folder exists
    if not os.path.isdir(label_path):
        print(f"Folder '{label_folder}' does not exist in '{dataset_root}'")
        continue

    # Assign label based on the folder name
    label = 'bad movement' if label_folder == 'bad' else 'good movement'
    
    # Collect video paths and labels
    for video_file in os.listdir(label_path):
        if video_file.endswith('.mp4'):  # Only consider .mp4 files
            video_paths.append(os.path.join(label_path, video_file))
            labels.append(label)

# Print the results to verify
print("Video Paths:", video_paths)
print("Labels:", labels)

# Example dataset and dataloader initialization with the collected paths and labels
dataset = DeadliftVideoDataset(video_paths, labels, transform=transform)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True, num_workers=0)


Video Paths: ['./dataset/Adam/bad/bad007.mp4', './dataset/Adam/bad/bad013.mp4', './dataset/Adam/bad/bad012.mp4', './dataset/Adam/bad/bad006.mp4', './dataset/Adam/bad/bad010.mp4', './dataset/Adam/bad/bad004.mp4', './dataset/Adam/bad/bad005.mp4', './dataset/Adam/bad/bad011.mp4', './dataset/Adam/bad/bad015.mp4', './dataset/Adam/bad/bad001.mp4', './dataset/Adam/bad/bad014.mp4', './dataset/Adam/bad/bad002.mp4', './dataset/Adam/bad/bad016.mp4', './dataset/Adam/bad/bad017.mp4', './dataset/Adam/bad/bad003.mp4', './dataset/Adam/bad/bad019.mp4', './dataset/Adam/bad/bad018.mp4', './dataset/Adam/bad/bad008.mp4', './dataset/Adam/bad/bad009.mp4', './dataset/Adam/good/good014.mp4', './dataset/Adam/good/good015.mp4', './dataset/Adam/good/good001.mp4', './dataset/Adam/good/good017.mp4', './dataset/Adam/good/good003.mp4', './dataset/Adam/good/good002.mp4', './dataset/Adam/good/good016.mp4', './dataset/Adam/good/good012.mp4', './dataset/Adam/good/good006.mp4', './dataset/Adam/good/good007.mp4', './datase

In [18]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
classifier = classifier.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier.fc.parameters(), lr=1e-4)  # Only train the classification head

In [19]:
num_epochs = 10

for epoch in range(num_epochs):
    classifier.train()
    running_loss = 0.0
    for i, (frames, labels) in enumerate(dataloader):
        frames = frames.to(device)  # [batch_size, frames_per_video, C, H, W]
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = classifier(frames)  # [batch_size, num_classes]
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        if i % 5 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i}/{len(dataloader)}], Loss: {loss.item():.4f}')

    epoch_loss = running_loss / len(dataloader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')


Epoch [1/10], Step [0/20], Loss: 0.6482


KeyboardInterrupt: 