In [16]:
import os
import pandas as pd
import logging
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
import cv2
from tqdm import tqdm

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(message)s')
logger = logging.getLogger(__name__)

In [17]:
# Define the directory where the videos are stored
video_dir = '/kaggle/input/realistic-action-recognition-ucf50/UCF50'  # Replace with your actual video directory

# Load the new CSV file
new_csv_path = '/kaggle/input/har-csv/newcsv.csv'
df_ucf50 = pd.read_csv(new_csv_path)

# Update the filename column to include the video directory
df_ucf50['filename'] = df_ucf50['filename'].apply(lambda x: os.path.join(video_dir, x))

# Display the first few rows of the DataFrame to verify
print(df_ucf50.head())

# Split the dataset
train_df, temp_df = train_test_split(df_ucf50, test_size=0.2, random_state=42, stratify=df_ucf50['label'])
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['label'])

                                            filename          label
0  /kaggle/input/realistic-action-recognition-ucf...  BaseballPitch
1  /kaggle/input/realistic-action-recognition-ucf...  BaseballPitch
2  /kaggle/input/realistic-action-recognition-ucf...  BaseballPitch
3  /kaggle/input/realistic-action-recognition-ucf...  BaseballPitch
4  /kaggle/input/realistic-action-recognition-ucf...  BaseballPitch


In [18]:
class UCF50Dataset(Dataset):
    VIDEO_EXTENSION = '.avi'

    def __init__(self, df_ucf50, transform=None, num_samples=16, frames_per_video=32):
        # Check that required columns exist
        if 'filename' not in df_ucf50.columns or 'label' not in df_ucf50.columns:
            raise ValueError("DataFrame must contain 'filename' and 'label' columns.")
        
        valid_videos = []
        for _, row in df_ucf50.iterrows():
            video_path = row['filename'].replace('\\', '/')  # Ensure the path uses the correct separators
            if self._is_valid_video(video_path):
                valid_videos.append({'filename': video_path, 'label': row['label']})

        # Ensure we have valid videos
        if len(valid_videos) == 0:
            raise ValueError("No valid videos found. Please check your dataset.")

        self.dataset = pd.DataFrame(valid_videos)
        self.video_paths = self.dataset['filename'].tolist()
        self.labels = self.dataset['label'].tolist()
        self.transform = transform
        self.label_to_index = {label: idx for idx, label in enumerate(sorted(set(self.labels)))}
        self.num_samples = num_samples
        self.frames_per_video = frames_per_video

    def _is_valid_video(self, video_path):
        if not os.path.isfile(video_path):
            logger.warning(f"File does not exist: {video_path}")
            return False
        if not video_path.endswith(self.VIDEO_EXTENSION):
            logger.warning(f"Skipping unsupported video format: {video_path}")
            return False
        return True

    def __len__(self):
        return len(self.video_paths)

    def __getitem__(self, idx):
        video_path = self.video_paths[idx]
        label = self.labels[idx]
        cap = cv2.VideoCapture(video_path)

        if not cap.isOpened():
            raise ValueError(f"Error opening video file: {video_path}")

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        frame_indices = self._get_frame_indices(total_frames)
        frames = []
        
        for i in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, i)
            ret, frame = cap.read()
            if not ret:
                logger.warning(f"Failed to read frame {i} from video: {video_path}")
                continue  # Skip this frame if reading failed
            
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # Convert frame to RGB
            if self.transform:
                frame = self.transform(frame)
            frames.append(frame)

        cap.release()
        
        if len(frames) == 0:
            raise ValueError(f"No frames extracted from video: {video_path}")
        
        frames = torch.stack(frames)  # Stack frames into a tensor
        label_index = self.label_to_index[label]  # Convert label to index
        return frames, label_index

    def _get_frame_indices(self, total_frames):
        if total_frames < self.frames_per_video:
            return list(range(total_frames))  # Return all frames if fewer than expected
        step = total_frames // self.frames_per_video
        return [i * step for i in range(self.frames_per_video)]  # Sample evenly spaced frames

In [19]:
# Step 4: Define Transforms
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [20]:
# Step 5: Initialize Datasets and DataLoaders
train_dataset = UCF50Dataset(train_df, transform=transform)
val_dataset = UCF50Dataset(val_df, transform=transform)
test_dataset = UCF50Dataset(test_df, transform=transform)

def collate_fn(batch):
    # Assuming each item in batch is (frames, label)
    frames = [item[0] for item in batch]  # Get the list of frames
    labels = [item[1] for item in batch]  # Get the list of labels
    
    # Check if each frame set has the correct shape
    for i in range(len(frames)):
        if frames[i].ndim == 3:  # If the frames are in shape (C, H, W)
            frames[i] = frames[i].unsqueeze(0)  # Add a sequence dimension
        elif frames[i].ndim == 4:  # If already in (T, C, H, W)
            frames[i] = frames[i]  # Do nothing
        else:
            raise ValueError(f"Unexpected frame shape: {frames[i].shape}")

    # Stack frames: assuming each video has the same number of frames and shape
    frames = torch.stack(frames)

    # Now frames should be (B, T, C, H, W)
    return frames, torch.tensor(labels)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4, collate_fn=collate_fn, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, num_workers=4, collate_fn=collate_fn, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, num_workers=4, collate_fn=collate_fn, pin_memory=True)

In [21]:
# Step 6: Define Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

class MobileNet(nn.Module):
    def __init__(self, num_classes):
        super(MobileNet, self).__init__()
        # Load MobileNetV2 or MobileNetV3 model
        self.model = models.mobilenet_v2(pretrained=True)
        self.model.classifier[1] = nn.Linear(self.model.classifier[1].in_features, num_classes)  # Change output layer

    def forward(self, x):
        B, T, C, H, W = x.shape
        x = x.view(-1, C, H, W)  # Reshape to (B*T, C, H, W)
        x = self.model(x)  # Forward pass through MobileNet
        x = x.view(B, T, -1)  # Reshape back to (B, T, num_classes)
        x = x.mean(dim=1)  # Average across time dimension
        return x

# Setup for training
num_classes = len(train_dataset.label_to_index)
model = MobileNet(num_classes=num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

Using device: cuda




In [22]:
# Step 7: Define Training and Evaluation Functions
def calculate_metrics(labels, preds):
    labels_np = labels.cpu().numpy()
    preds_np = preds.cpu().numpy()
    precision = precision_score(labels_np, preds_np, average='weighted') * 100
    recall = recall_score(labels_np, preds_np, average='weighted') * 100
    f1 = f1_score(labels_np, preds_np, average='weighted') * 100
    return precision, recall, f1

def train(model, dataloader, optimizer, criterion, device):
    model.train()  # Set the model to training mode
    total_loss = 0
    correct = 0
    total = 0

    for inputs, labels in tqdm(dataloader):
        inputs, labels = inputs.to(device), labels.to(device)  # Move to the correct device
        
        # Ensure labels are in the right format (if needed)
        if len(labels.shape) != 1:  # Check if labels are not 1D
            raise ValueError("Labels must be a 1D tensor.")
        
        optimizer.zero_grad()  # Zero the gradients
        outputs = model(inputs)  # Forward pass
        loss = criterion(outputs, labels)  # Compute loss
        loss.backward()  # Backward pass
        optimizer.step()  # Update parameters

        total_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

    avg_loss = total_loss / len(dataloader)
    accuracy = correct / total
    return avg_loss, accuracy

def evaluate(model, dataloader, device):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    correct = 0
    total = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in tqdm(dataloader):
            inputs, labels = inputs.to(device), labels.to(device)  # Move to the correct device
            outputs = model(inputs)  # Forward pass
            loss = criterion(outputs, labels)  # Compute loss

            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
            all_preds.extend(predicted.cpu().numpy())  # Store predictions
            all_labels.extend(labels.cpu().numpy())  # Store true labels

    avg_loss = total_loss / len(dataloader)
    accuracy = correct / total
    precision, recall, f1 = calculate_metrics(torch.tensor(all_labels), torch.tensor(all_preds))
    return avg_loss, accuracy, precision, recall, f1

In [23]:
# Step 8: Train the Model
num_epochs = 10
best_val_acc = 0.0

for epoch in range(num_epochs):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
    val_loss, val_acc, val_precision, val_recall, val_f1 = evaluate(model, val_loader, device)

    logger.info(f'Epoch [{epoch + 1}/{num_epochs}], '
                f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, '
                f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}, '
                f'Val Precision: {val_precision:.4f}, Val Recall: {val_recall:.4f}, Val F1: {val_f1:.4f}')

    # Save the model if validation accuracy improves
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), 'best_mobilenet_model.pth')


  0%|          | 0/668 [00:11<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.15 GiB. GPU 0 has a total capacity of 15.89 GiB of which 301.12 MiB is free. Process 5537 has 15.59 GiB memory in use. Of the allocated memory 15.29 GiB is allocated by PyTorch, and 13.19 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Step 9: Test the Model
model.load_state_dict(torch.load('best_mobilenet_model.pth'))
test_loss, test_acc, test_precision, test_recall, test_f1 = evaluate(model, test_loader, device)

logger.info(f'Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}, '
            f'Test Precision: {test_precision:.4f}, Test Recall: {test_recall:.4f}, Test F1: {test_f1:.4f}')