In [1]:
import os
import cv2
import torch
import gc
from deep_sort_realtime.deepsort_tracker import DeepSort

# Run on the GPU when torch can see one, otherwise stay on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# YOLOv5-small detector fetched through torch.hub, then moved to the chosen device
model = torch.hub.load('ultralytics/yolov5', 'yolov5s')
model = model.to(device)
model.conf = 0.5   # confidence threshold
model.iou = 0.45   # NMS IoU threshold

# Only detections carrying one of these class names are handed to the tracker
classes_of_interest = ['person', 'sports ball']

# DeepSORT tracker with the mobilenet embedder; FP16 is requested only on GPU
deepsort = DeepSort(
    max_age=30, n_init=3, nn_budget=100,
    embedder='mobilenet',
    half=device.type == 'cuda',
)

def process_video(input_path, output_path):
    """Detect, track and annotate people and sports balls in one video.

    Every frame of ``input_path`` is run through the module-level YOLOv5
    ``model``. Detections whose class name is in ``classes_of_interest`` are
    passed to the module-level ``deepsort`` tracker, and each confirmed track
    is drawn (box plus ``ID: <track_id>``) on the frame before the frame is
    written to ``output_path``.

    Args:
        input_path: Path of the video to read.
        output_path: Path of the annotated video to write (``mp4v`` fourcc,
            same frame size as the input).

    Returns:
        None. If the input or the output cannot be opened, a message is
        printed and the function returns without processing.
    """
    cap = cv2.VideoCapture(input_path)
    if not cap.isOpened():
        print(f"Error opening video: {input_path}")
        return

    out = None
    frame_count = 0
    # Mixed precision is only wanted on the GPU (the tracker's `half` flag
    # follows the same rule); on CPU the autocast context is disabled.
    use_amp = device.type == 'cuda'
    try:
        # Get video properties
        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:
            # Also catches NaN; a writer needs a positive frame rate.
            fps = 30.0

        # Initialize VideoWriter with same properties as input
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))
        if not out.isOpened():
            print(f"Error opening video writer: {output_path}")
            return

        # The tracker object is shared by every call, so drop the tracks left
        # over from the previous video before starting this one.
        deepsort.delete_all_tracks()

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # The detector is fed RGB; OpenCV decodes frames as BGR
            img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            with torch.no_grad(), torch.amp.autocast(device_type=device.type, enabled=use_amp):
                results = model([img_rgb])

            # Format detections for DeepSORT: ([left, top, w, h], conf, class)
            detections = []
            for *xyxy, conf, cls in results.xyxy[0].cpu().numpy():
                class_name = model.names[int(cls)]
                if class_name in classes_of_interest:
                    detections.append([
                        [xyxy[0], xyxy[1], xyxy[2] - xyxy[0], xyxy[3] - xyxy[1]],
                        float(conf),
                        class_name
                    ])

            # Update tracker (it receives the original BGR frame)
            tracks = deepsort.update_tracks(detections, frame=frame)

            # Visualization: green for people, red for everything else
            for track in tracks:
                if not track.is_confirmed():
                    continue

                track_id = track.track_id
                ltrb = track.to_ltrb()
                color = (0, 255, 0) if track.get_det_class() == 'person' else (0, 0, 255)

                cv2.rectangle(frame,
                              (int(ltrb[0]), int(ltrb[1])),
                              (int(ltrb[2]), int(ltrb[3])),
                              color, 2)
                cv2.putText(frame, f"ID: {track_id}",
                            (int(ltrb[0]), int(ltrb[1]) - 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.9, color, 2)

            out.write(frame)
            frame_count += 1

            # Periodic cleanup every 100 frames
            if frame_count % 100 == 0:
                gc.collect()
    finally:
        # Always release the handles, including on KeyboardInterrupt, so the
        # output container is finalised instead of being left half-written.
        cap.release()
        if out is not None:
            out.release()

    print(f"Processed {frame_count} frames: {output_path}")

# Main execution: track every supported video found in the dataset folder
if __name__ == "__main__":
    input_dir = "SCAI_Dataset"
    output_dir = "Processed_videos"
    os.makedirs(output_dir, exist_ok=True)

    # Files are visited in sorted name order; anything that is not a known
    # video extension is skipped.
    for video_file in sorted(os.listdir(input_dir)):
        if not video_file.lower().endswith(('.mp4', '.avi', '.mov')):
            continue

        print(f"\nProcessing: {video_file}")
        input_path = os.path.join(input_dir, video_file)
        output_path = os.path.join(output_dir, f"tracked_{video_file}")
        process_video(input_path, output_path)

Using cache found in C:\Users\matab/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2025-3-26 Python-3.13.2 torch-2.6.0+cpu CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 



Processing: A1606b0e6_0 (5).mp4


  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with a

KeyboardInterrupt: 

In [2]:
import os
import cv2
import torch
import numpy as np
from collections import defaultdict
from torchvision import transforms, models
from deep_sort_realtime.deepsort_tracker import DeepSort

class ActionRecognizer:
    """Per-track action classifier built on torchvision's R3D-18 video model.

    For each confirmed track a padded crop is cut from the frame, pushed
    through the model as a one-frame clip, and the predicted action is
    smoothed with a majority vote over the track's most recent predictions.

    Note: ``_load_action_model`` replaces the network's final layer with a
    newly created ``torch.nn.Linear`` and nothing in this class loads trained
    weights into it.
    """

    # Number of most recent per-track predictions used for the majority vote
    HISTORY_WINDOW = 5
    # Pixels added on every side of a track's box before cropping
    CROP_PADDING = 20

    def __init__(self, device='cuda'):
        """Build the model and preprocessing pipeline.

        Args:
            device: Requested device (string or ``torch.device``). It is only
                honoured when CUDA is available; otherwise the CPU is used.
        """
        # Must be set before the model is loaded: the size of the replaced
        # output layer is taken from this list.
        self.action_classes = ['running', 'walking', 'passing', 'shooting', 'dribbling', 'standing']

        self.device = torch.device(device if torch.cuda.is_available() else 'cpu')
        self.model = self._load_action_model()
        self.transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        # track_id -> action -> list of confidences (complete, unbounded log)
        self.action_history = defaultdict(lambda: defaultdict(list))
        # track_id -> last HISTORY_WINDOW predicted actions, oldest first
        self._recent_actions = defaultdict(list)

    def _load_action_model(self):
        """Load R3D-18, swap its head for ``len(action_classes)`` outputs.

        Returns:
            The model, moved to ``self.device`` and switched to eval mode.
        """
        # NOTE(review): newer torchvision releases prefer the `weights=`
        # argument over `pretrained=True` - confirm against the installed version.
        model = models.video.r3d_18(pretrained=True)

        # Replace the final fully connected layer
        num_features = model.fc.in_features
        model.fc = torch.nn.Linear(num_features, len(self.action_classes))

        model = model.to(self.device)
        model.eval()
        return model

    def recognize_actions(self, frame, tracks, bgr=True):
        """Recognize an action for each confirmed track in ``tracks``.

        Args:
            frame: Image array indexed as ``[row, column, channel]``.
            tracks: Iterable of track objects exposing ``is_confirmed()``,
                ``track_id`` and ``to_ltrb()``.
            bgr: True (default) when ``frame`` uses OpenCV's BGR channel
                order; the crop is then flipped to RGB before preprocessing.
                Pass False for frames that are already RGB.

        Returns:
            A list with one dict per processed track, holding ``track_id``,
            ``bbox`` (the value returned by ``to_ltrb()``), ``action`` (the
            majority vote over the track's last ``HISTORY_WINDOW``
            predictions, most recent wins ties) and ``confidence`` (softmax
            probability of the current frame's top prediction). Unconfirmed
            tracks and tracks whose crop is empty are skipped.
        """
        results = []
        frame_h, frame_w = frame.shape[:2]
        pad = self.CROP_PADDING

        for track in tracks:
            if not track.is_confirmed():
                continue

            track_id = track.track_id
            ltrb = track.to_ltrb()

            # Crop player region with padding. Both corners are clamped into
            # [0, size]: a negative end index would otherwise be read by
            # slicing as "count from the end" and crop the wrong region.
            x1, y1, x2, y2 = map(int, ltrb)
            x1 = max(0, x1 - pad)
            y1 = max(0, y1 - pad)
            x2 = max(0, min(frame_w, x2 + pad))
            y2 = max(0, min(frame_h, y2 + pad))

            player_crop = frame[y1:y2, x1:x2]
            if player_crop.size == 0:
                continue

            if bgr:
                # Reverse the channel axis so the crop matches the RGB order
                # of the mean/std values used in the transform.
                player_crop = np.ascontiguousarray(player_crop[:, :, ::-1])

            # Preprocess and predict (using single frame for simplicity)
            input_tensor = self.transform(player_crop).unsqueeze(0).to(self.device)

            with torch.no_grad():
                # Insert a length-1 temporal axis: [1, C, H, W] -> [1, C, 1, H, W]
                outputs = self.model(input_tensor.unsqueeze(2))
                probs = torch.nn.functional.softmax(outputs, dim=1)
                top_prob, top_class = torch.max(probs, 1)

            action = self.action_classes[top_class.item()]
            confidence = top_prob.item()

            # Update action history
            self.action_history[track_id][action].append(confidence)

            # Keep only the newest HISTORY_WINDOW predictions for the vote
            window = self._recent_actions[track_id]
            window.append(action)
            del window[:-self.HISTORY_WINDOW]

            # max() keeps the first maximum it meets; scanning newest-first
            # makes the most recent action win a tie.
            final_action = max(reversed(window), key=window.count)

            results.append({
                'track_id': track_id,
                'bbox': ltrb,
                'action': final_action,
                'confidence': confidence
            })

        return results

def process_video_with_actions(input_path, output_path):
    """Process one video with tracking and per-player action recognition.

    Frames are run through a YOLOv5-small detector; 'person' and
    'sports ball' detections feed a DeepSORT tracker; confirmed 'person'
    tracks are classified by an ``ActionRecognizer``. Each confirmed track is
    drawn with its ID (and action label when available) and the frame is
    written to ``output_path``.

    Args:
        input_path: Path of the video to read.
        output_path: Path of the annotated video to write (``mp4v`` fourcc,
            same frame size as the input).

    Returns:
        None. If the input or the output cannot be opened, a message is
        printed and the function returns without processing.
    """
    # Open the input first so an unreadable file does not cost a model load
    cap = cv2.VideoCapture(input_path)
    if not cap.isOpened():
        print(f"Error opening video: {input_path}")
        return

    out = None
    frame_count = 0
    try:
        # Initialize models
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Mixed precision only on the GPU, matching the tracker's `half` flag
        use_amp = device.type == 'cuda'
        yolov5 = torch.hub.load('ultralytics/yolov5', 'yolov5s').to(device)
        yolov5.conf = 0.5
        yolov5.iou = 0.45
        classes_of_interest = ['person', 'sports ball']

        deepsort = DeepSort(
            max_age=30,
            n_init=3,
            nn_budget=100,
            embedder='mobilenet',
            half=(device.type == 'cuda')
        )

        action_recognizer = ActionRecognizer(device)

        # Video setup
        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:
            # Also catches NaN; a writer needs a positive frame rate.
            fps = 30.0

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))
        if not out.isOpened():
            print(f"Error opening video writer: {output_path}")
            return

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # Tracking: detector gets RGB, tracker gets the original BGR frame
            img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            with torch.no_grad(), torch.amp.autocast(device_type=device.type, enabled=use_amp):
                results = yolov5([img_rgb])

            # Format detections for DeepSORT: ([left, top, w, h], conf, class)
            detections = []
            for *xyxy, conf, cls in results.xyxy[0].cpu().numpy():
                class_name = yolov5.names[int(cls)]
                if class_name in classes_of_interest:
                    detections.append([
                        [xyxy[0], xyxy[1], xyxy[2] - xyxy[0], xyxy[3] - xyxy[1]],
                        float(conf),
                        class_name
                    ])

            tracks = deepsort.update_tracks(detections, frame=frame)

            # Action Recognition (people only)
            player_tracks = [t for t in tracks if t.is_confirmed() and t.get_det_class() == 'person']
            action_results = action_recognizer.recognize_actions(frame, player_tracks)
            # Index once per frame instead of scanning the list for every track
            actions_by_id = {}
            for info in action_results:
                actions_by_id.setdefault(info['track_id'], info)

            # Visualization
            for track in tracks:
                if not track.is_confirmed():
                    continue

                track_id = track.track_id
                ltrb = track.to_ltrb()

                color = (255, 0, 0) if track.get_det_class() == 'sports ball' else (0, 255, 0)
                action_info = actions_by_id.get(track_id)

                if action_info:
                    action = action_info['action']
                    confidence = action_info['confidence']
                    color = (0, 165, 255)  # Orange for players with actions
                    cv2.putText(frame, f"{action} ({confidence:.2f})",
                                (int(ltrb[0]), int(ltrb[3]) + 20),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2)

                cv2.rectangle(frame,
                              (int(ltrb[0]), int(ltrb[1])),
                              (int(ltrb[2]), int(ltrb[3])),
                              color, 2)
                cv2.putText(frame, f"ID: {track_id}",
                            (int(ltrb[0]), int(ltrb[1]) - 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2)

            out.write(frame)
            frame_count += 1

            if frame_count % 100 == 0:
                print(f"Processed {frame_count} frames")
                if device.type == 'cuda':
                    torch.cuda.empty_cache()
    finally:
        # Always release the handles, including on KeyboardInterrupt, so the
        # output container is finalised instead of being left half-written.
        cap.release()
        if out is not None:
            out.release()

    print(f"Finished processing: {output_path}")

if __name__ == "__main__":
    import subprocess
    import sys

    # Install required packages if missing. pip is invoked through the running
    # interpreter so the package lands in this kernel's environment rather
    # than whichever `pip` happens to be first on PATH.
    try:
        import deep_sort_realtime
    except ImportError:
        print("Installing deep-sort-realtime...")
        install = subprocess.run(
            [sys.executable, "-m", "pip", "install", "deep-sort-realtime"]
        )
        if install.returncode != 0:
            print(f"pip install failed with exit code {install.returncode}")

    input_dir = "Processed_videos"
    output_dir = "Analyzed_videos"
    os.makedirs(output_dir, exist_ok=True)

    if not os.path.isdir(input_dir):
        # Report a missing input folder instead of raising from os.listdir
        print(f"Input directory not found: {input_dir}")
    else:
        for video_file in sorted(os.listdir(input_dir)):
            if video_file.lower().endswith(('.mp4', '.avi', '.mov')):
                input_path = os.path.join(input_dir, video_file)
                output_path = os.path.join(output_dir, f"action_{video_file}")
                print(f"\nProcessing actions for: {video_file}")
                process_video_with_actions(input_path, output_path)


Processing actions for: tracked_A1606b0e6_0 (1).mp4


Using cache found in C:\Users\matab/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2025-3-26 Python-3.13.2 torch-2.6.0+cpu CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 
Using cache found in C:\Users\matab/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2025-3-26 Python-3.13.2 torch-2.6.0+cpu CPU



Error opening video: Processed_videos\tracked_A1606b0e6_0 (1).mp4

Processing actions for: tracked_A1606b0e6_0 (11).mp4


Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast

Processed 100 frames


  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with a

KeyboardInterrupt: 

In [5]:
import os

# Build a recognizer and export its model object to disk.
action_recognizer = ActionRecognizer()

# torch.save does not create missing parent folders, so make sure the
# target directory exists before writing.
model_path = "Deploy/action_model_full.pth"
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# The whole module object is saved (not only a state_dict).
torch.save(action_recognizer.model, model_path)
print("✅ Pretrained model saved to Deploy/action_model_full.pth")


✅ Pretrained model saved to Deploy/action_model_full.pth
