# COS 470 - Image Processing and Computer Vision

#  Sports Image Classification -- Video Prediction
### Authors: Ethan Gilles, Josh Thyng, Sam Fickett

---

## Setup

In [36]:
import os
import sys
import math
import torch
import torchvision
from PIL import Image
import numpy as np 
import matplotlib.pyplot as plt

import torch.nn as nn
import torch.nn.functional as F

from torchvision.utils import make_grid
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
import torchvision.transforms as transforms
import torch.optim as optim
from torchinfo import summary

# Pick the compute device once; every later `.to(device)` call uses this string.
# (The previous bare `torch.device(device)` expression built a device object and
# discarded it, so it had no effect and has been dropped.)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

class SportsNet(nn.Module):
    """AlexNet-style CNN with batch normalization for sports image classification.

    The whole network lives in one ``nn.Sequential`` stored as ``self.model``:
    five convolutional stages (each Conv2d -> BatchNorm2d -> ReLU, with max
    pooling after stages 1, 2 and 5) followed by three fully connected layers.
    The layer order is fixed so that ``state_dict`` keys (``model.0.weight``,
    ...) stay compatible with previously saved weights.

    Args:
        num_classes: Number of output logits produced by the final linear
            layer. Defaults to 100, the value that was previously hard-coded.

    Note:
        The first linear layer expects 6400 flattened features, i.e.
        256 channels x 5 x 5, which is what the convolutional stack produces
        for 3 x 224 x 224 inputs. Other input sizes will not match.
    """

    def __init__(self, num_classes=100):
        super(SportsNet, self).__init__()
        self.model = nn.Sequential(
            # First Convolutional Layer
            nn.Conv2d(in_channels=3, out_channels=96, kernel_size=11, stride=4, padding=0),
            nn.BatchNorm2d(96),  # Batch normalization for 96 channels
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2),

            # Second Convolutional Layer
            nn.Conv2d(in_channels=96, out_channels=256, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(256),  # Batch normalization for 256 channels
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2),

            # Third Convolutional Layer
            nn.Conv2d(in_channels=256, out_channels=384, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(384),  # Batch normalization for 384 channels
            nn.ReLU(),

            # Fourth Convolutional Layer
            nn.Conv2d(in_channels=384, out_channels=384, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(384),  # Batch normalization for 384 channels
            nn.ReLU(),

            # Fifth Convolutional Layer
            nn.Conv2d(in_channels=384, out_channels=256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),  # Batch normalization for 256 channels
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2),

            # Flatten and Fully Connected Layers
            nn.Flatten(),
            nn.Linear(in_features=6400, out_features=4096),  # 6400 = 256 * 5 * 5
            nn.BatchNorm1d(4096),  # Batch normalization for fully connected layer
            nn.ReLU(),
            nn.Dropout(),

            nn.Linear(in_features=4096, out_features=2048),
            nn.BatchNorm1d(2048),  # Batch normalization for fully connected layer
            nn.ReLU(),
            nn.Dropout(),

            nn.Linear(in_features=2048, out_features=num_classes),  # Output layer
        )

    def forward(self, x):
        """Run a batch of images through the network and return raw logits.

        Args:
            x: Image batch; the fixed 6400-feature flatten size implies
                3 x 224 x 224 inputs.

        Returns:
            Tensor of unnormalized class scores, one row per input image.
        """
        return self.model(x)

# Create model instance
model = SportsNet().to(device)

# Augmenting pipeline: resize, random flips/rotation, tensor conversion, normalization.
img_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(p=0.25),  # 25% chance of a horizontal flip
    transforms.RandomVerticalFlip(p=0.25),    # 25% chance of a vertical flip
    transforms.RandomRotation(degrees=45),    # Rotates to a random angle between -45 and 45 degrees
    transforms.ToTensor(),                    # Makes it a PyTorch Tensor

    # The normalization values in use are the ones listed as "ImageNet" below.
    # Other norm values could be used depending on accuracy:
    # Other Source (Kaggle) -> mean=[-0.0932, -0.0971, -0.1260], std=[0.5091, 0.4912, 0.4931]
    # ImageNet -> mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Deterministic pipeline for held-out data: same resize and normalization as
# above but without the random flips/rotation, so results on the test set are
# repeatable from run to run.
eval_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Use image folder to load the test images with the deterministic transforms.
# ImageFolder also exposes the class names (its `.classes` attribute).
test_data = ImageFolder(root='test', transform=eval_transforms)
test_dataloader = DataLoader(dataset=test_data, batch_size=64, shuffle=False, num_workers=2)

## Test on video file

In [45]:
import cv2
from collections import Counter

# Class labels, in the order ImageFolder assigned them to the test dataset.
class_names = test_dataloader.dataset.classes

# Rebuild the network and restore the trained weights. `map_location` lets a
# checkpoint that was saved on a GPU load on a CPU-only machine as well.
model = SportsNet()
model.load_state_dict(
    torch.load('SportsNet_weights.pth', map_location=device, weights_only=True)
)
model.eval()  # inference mode for the BatchNorm and Dropout layers
model = model.to(device)


# Deterministic per-frame preprocessing (no random augmentation).
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

video_path = "Golf.mp4"
cap = cv2.VideoCapture(video_path)
frame_predictions = []  # Stores probabilities for all frames
frame_class_indices = []  # Stores predicted class indices for all frames
frame_predictions_with_indices = []  # Stores (frame_index, predicted_class_name)

try:
    # Fail early with a clear message instead of silently reading zero frames.
    if not cap.isOpened():
        raise OSError(f"Could not open video file: {video_path}")

    with torch.no_grad():
        frame_count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break  # end of stream (or a read failure)

            frame_count += 1

            # Process the frame (convert to tensor and normalize).
            # OpenCV returns BGR arrays; the transforms expect an RGB PIL image.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame_pil = Image.fromarray(frame_rgb)
            input_tensor = transform(frame_pil)
            input_batch = input_tensor.unsqueeze(0).to(device)  # add batch dimension

            # Get predictions from the model
            outputs = model(input_batch)
            probs = F.softmax(outputs, dim=1)

            # Append predictions and class index
            frame_predictions.append(probs.cpu().numpy()[0])
            predicted_class_idx = probs.argmax(dim=1).item()
            frame_class_indices.append(predicted_class_idx)

            predicted_class_name = class_names[predicted_class_idx]
            frame_predictions_with_indices.append((frame_count, predicted_class_name))
finally:
    # Always free the capture handle, even if inference raised.
    cap.release()

if not frame_class_indices:
    raise ValueError(f"No frames could be read from video file: {video_path}")

# Most common class prediction (majority vote over per-frame argmax labels)
most_common_class_idx, count_most_common = Counter(frame_class_indices).most_common(1)[0]
predicted_class_name = class_names[most_common_class_idx]

print(f"The video is predicted to be: {predicted_class_name} chosen {count_most_common} times out of {len(frame_class_indices)} frames.")

# Individual frame predictions
print("\nIndividual Frame Predictions:")
for frame_idx, class_name in frame_predictions_with_indices:
    print(f"Frame {frame_idx}: Predicted class = {class_name}")

The video is predicted to be: golf chosen 343 times out of 417 frames.

Individual Frame Predictions:
Frame 1: Predicted class = frisbee
Frame 2: Predicted class = frisbee
Frame 3: Predicted class = frisbee
Frame 4: Predicted class = frisbee
Frame 5: Predicted class = frisbee
Frame 6: Predicted class = frisbee
Frame 7: Predicted class = frisbee
Frame 8: Predicted class = frisbee
Frame 9: Predicted class = frisbee
Frame 10: Predicted class = frisbee
Frame 11: Predicted class = frisbee
Frame 12: Predicted class = frisbee
Frame 13: Predicted class = frisbee
Frame 14: Predicted class = frisbee
Frame 15: Predicted class = frisbee
Frame 16: Predicted class = frisbee
Frame 17: Predicted class = frisbee
Frame 18: Predicted class = frisbee
Frame 19: Predicted class = frisbee
Frame 20: Predicted class = frisbee
Frame 21: Predicted class = frisbee
Frame 22: Predicted class = frisbee
Frame 23: Predicted class = frisbee
Frame 24: Predicted class = ultimate
Frame 25: Predicted class = ultimate
Frame