## Demo Models

In [None]:
# Mount Google Drive into the Colab runtime; the weights, images and videos
# used by the cells in this notebook are read from paths under /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install ultralytics
!pip uninstall -y supervision && pip install -q supervision>=0.23.0
!pip install supervision
!pip install -q git+https://github.com/ifzhang/ByteTrack.git

Found existing installation: supervision 0.25.1
Uninstalling supervision-0.25.1:
  Successfully uninstalled supervision-0.25.1
  Preparing metadata (setup.py) ... [?25l[?25hdone


### SV Detection

In [None]:
import torch
from torchvision import models, transforms
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import cv2
import numpy as np

# Single-image demo: run the keypoint regressor on one picture and plot the
# predicted points on top of it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# ResNet-50 whose final layer is replaced by a 28-output linear head; the
# outputs are reshaped further down into 14 (x, y) pairs.
# `weights=None` is the non-deprecated spelling of `pretrained=False`.
model = models.resnet50(weights=None)
model.fc = torch.nn.Linear(model.fc.in_features, 14*2)
# map_location makes a checkpoint that was saved from a GPU loadable when the
# runtime only has a CPU.
model.load_state_dict(torch.load(
    '/content/drive/MyDrive/Davran_Colab/TennisPlayerDetection/keypoints_model.pth',
    map_location=device,
))
model = model.to(device)
model.eval()

# Preprocessing applied to the RGB image before it is fed to the network.
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

img_path = '/content/drive/MyDrive/Davran_Colab/TennisPlayerDetection/1.png'
img = cv2.imread(img_path)
if img is None:
    # cv2.imread signals a missing/unreadable file by returning None.
    raise FileNotFoundError(f"Image not found at path: {img_path}")

img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

# Add the batch dimension and move the tensor next to the model.
img_transformed = transform(img_rgb).unsqueeze(0).to(device)

with torch.no_grad():
    outputs = model(img_transformed)

keypoints = outputs.cpu().numpy().flatten()

original_height, original_width = img.shape[:2]

# The flat output alternates x, y; rescale from the 224x224 network input
# back to the pixel grid of the original image.
keypoints[::2] *= original_width / 224.0
keypoints[1::2] *= original_height / 224.0

keypoints = keypoints.reshape(-1, 2)

fig, ax = plt.subplots()
ax.imshow(img_rgb)

# One red ring plus a 1-based yellow index label per predicted point.
for i, (x, y) in enumerate(keypoints):
    ax.add_patch(patches.Circle((x, y), radius=5, edgecolor='r', facecolor='none', lw=2))
    ax.text(x + 5, y, f'{i+1}', color='yellow', fontsize=8)

ax.axis('off')
plt.show()



FileNotFoundError: Image not found at path: /content/drive/MyDrive/Davran_Colab/TennisPlayerDetection/1.png

## Final Model

In [None]:
import torch
from torchvision import models, transforms
import cv2
import numpy as np
import supervision as sv
from ultralytics import YOLO
import matplotlib.pyplot as plt


In [None]:
_KEYPOINT_WEIGHTS_PATH = '/content/drive/MyDrive/Davran_Colab/TennisPlayerDetection/keypoints_model.pth'

# Networks already loaded, keyed by device string, so that the checkpoint is
# read from Drive once instead of once per video frame.
_KEYPOINT_MODEL_CACHE = {}

# Preprocessing applied to every RGB frame before inference.
_KEYPOINT_TRANSFORM = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])


def _get_keypoint_model(device):
    """Return the keypoint network on `device`, building and loading it on first use."""
    cache_key = str(device)
    if cache_key not in _KEYPOINT_MODEL_CACHE:
        # `weights=None` is the non-deprecated spelling of `pretrained=False`.
        model = models.resnet50(weights=None)
        model.fc = torch.nn.Linear(model.fc.in_features, 14 * 2)
        # map_location lets a GPU-saved checkpoint load on a CPU-only runtime.
        model.load_state_dict(torch.load(_KEYPOINT_WEIGHTS_PATH, map_location=device))
        model = model.to(device)
        model.eval()
        _KEYPOINT_MODEL_CACHE[cache_key] = model
    return _KEYPOINT_MODEL_CACHE[cache_key]


def KeypointModelFromVideo(frame):
    """Predict keypoints for one video frame and draw them onto it.

    Args:
        frame: BGR image array (it is converted with ``cv2.COLOR_BGR2RGB``
            before inference). The array is modified in place.

    Returns:
        tuple: ``(frame, keypoints)`` where ``frame`` is the same array that
        was passed in, now carrying the drawn markers, and ``keypoints`` is a
        NumPy array of (x, y) rows (14 rows for the 28-output head) expressed
        in the pixel coordinates of ``frame``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = _get_keypoint_model(device)

    img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    img_transformed = _KEYPOINT_TRANSFORM(img_rgb).unsqueeze(0).to(device)

    with torch.no_grad():
        outputs = model(img_transformed)

    keypoints = outputs.cpu().numpy().flatten()

    original_height, original_width = frame.shape[:2]

    # The flat output alternates x, y; rescale from the 224x224 network input
    # back to the frame's own resolution.
    keypoints[::2] *= original_width / 224.0
    keypoints[1::2] *= original_height / 224.0

    keypoints = keypoints.reshape(-1, 2)

    # Draw every marker first and the labels afterwards, so a label is never
    # painted over by a neighbouring filled circle.
    for x, y in keypoints:
        cv2.circle(frame, (int(x), int(y)), 8, (0, 255, 0), -1)

    for i, (x, y) in enumerate(keypoints):
        cv2.putText(frame, f'{i + 1}', (int(x) + 5, int(y) + 5), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 1)

    # Return both the annotated frame and the keypoints
    return frame, keypoints

In [None]:
def create_small_pitch(pitch_size=(160, 300)):
  """Load the court picture and return it rotated and scaled for the overlay.

  Args:
    pitch_size: target size passed to ``cv2.resize`` as ``(width, height)``.

  Returns:
    The court image rotated 90 degrees clockwise and resized to ``pitch_size``.

  Raises:
    FileNotFoundError: if the image cannot be read from ``image_path``.
  """
  image_path = '/content/drive/MyDrive/Davran_Colab/TennisPlayerDetection/tennis_court.jpeg'
  pitch = cv2.imread(image_path)

  if pitch is None:
    # cv2.imread returns None instead of raising; fail here with a clear
    # message rather than handing None to the caller.
    raise FileNotFoundError(f"Could not load image: {image_path}")

  pitch_rotated = cv2.rotate(pitch, cv2.ROTATE_90_CLOCKWISE)
  # Return the processed image (the unprocessed one used to be returned,
  # which discarded the rotation and the resize).
  return cv2.resize(pitch_rotated, pitch_size)

# Mini-court overlay: its size (unpacked elsewhere as width, height) and the
# image that gets blended into each output frame.
pitch_size = (160, 300)
pitch = create_small_pitch(pitch_size=pitch_size)

def map_to_pitch(bbox, pitch_size, frame_size):
    """Project the centre of a frame bounding box onto the mini-pitch.

    The mapping is a plain proportional rescale of the box centre from frame
    coordinates to pitch coordinates; results are truncated with ``int``.

    Args:
        bbox: ``(x1, y1, x2, y2)`` box in frame pixels.
        pitch_size: ``(width, height)`` of the mini-pitch.
        frame_size: ``(width, height)`` of the video frame.

    Returns:
        tuple: ``(x_pitch, y_pitch)`` integer position inside the mini-pitch.
    """
    left, top, right, bottom = bbox
    src_w, src_h = frame_size
    dst_w, dst_h = pitch_size

    centre_x = (left + right) / 2
    centre_y = (top + bottom) / 2

    return int(centre_x * dst_w / src_w), int(centre_y * dst_h / src_h)

In [None]:
# Annotate the match video: detect players and the ball on every frame, plot
# them on a mini-court overlay, draw the keypoints and write the result out.
player_model = YOLO('yolo11x.pt')
ball_model = YOLO('/content/drive/MyDrive/Davran_Colab/TennisPlayerDetection/ball_detection_model.pt')

video_path = '/content/drive/MyDrive/Davran_Colab/TennisPlayerDetection/Tennis.mp4'
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    # Without this check a wrong path silently produces an empty output file.
    raise FileNotFoundError(f"Could not open video: {video_path}")

width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
# Keep the frame rate as a float: truncating e.g. 29.97 to 29 changes the
# playback speed of the written video.
fps = cap.get(cv2.CAP_PROP_FPS)
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter('/content/drive/MyDrive/Davran_Colab/TennisPlayerDetection/Tennis_detected_2_11_demo.mp4', fourcc, fps, (width, height))

# ---- Loop-invariant settings, built once instead of once per frame ----

# Region kept visible for ball detection; everything outside is blacked out.
# NOTE(review): these pixel bounds look tuned for one specific resolution -
# confirm before using another video.
x_min, y_min, x_max, y_max = 160, 160, 1700, 900

# Top-left corner of the mini-court overlay inside the frame.
start_x, start_y = 100, 100

ellipse_annotator = sv.EllipseAnnotator(
    color=sv.ColorPalette.from_hex(['#FF0000', '#000000']),
    thickness=8
)

triangle_annotator = sv.TriangleAnnotator(
    color=sv.Color.from_hex('#FFD700'),
    base=25,
    height=21,
    outline_thickness=5
)
label_annotator = sv.LabelAnnotator(
    color=sv.ColorPalette.from_hex(['#00BFFF', '#FF1493']),
    text_color=sv.Color.from_hex('#0000FF'),
    text_position=sv.Position.BOTTOM_CENTER
)

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # ---- Players: keep class 0 detections, at most the two most confident ----
        # verbose=False stops ultralytics from printing a log line per frame.
        player_results = player_model.predict(frame, conf=0.3, verbose=False)
        player_boxes = []
        player_scores = []
        tracker_ids = []

        for i, det in enumerate(player_results[0].boxes.data.cpu().numpy()):
            x1, y1, x2, y2, score, cls = det
            if int(cls) == 0:
                player_boxes.append((x1, y1, x2, y2))
                player_scores.append(score)
                # This is the detection's index in the current frame, not a
                # persistent identity carried across frames.
                tracker_ids.append(i + 1)

        if len(player_scores) > 2:
            top_indices = np.argsort(player_scores)[-2:]
            player_boxes = [player_boxes[i] for i in top_indices]
            player_scores = [player_scores[i] for i in top_indices]
            tracker_ids = [tracker_ids[i] for i in top_indices]

        # ---- Ball: detect on a copy whose border regions are masked out ----
        frame_ball_court = frame.copy()
        cv2.rectangle(frame_ball_court, (0, 0), (frame_ball_court.shape[1], y_min), (0, 0, 0), -1)  # Top black area
        cv2.rectangle(frame_ball_court, (0, y_max), (frame_ball_court.shape[1], frame_ball_court.shape[0]), (0, 0, 0), -1)  # Bottom black area
        cv2.rectangle(frame_ball_court, (0, y_min), (x_min, y_max), (0, 0, 0), -1)  # Left black area
        cv2.rectangle(frame_ball_court, (x_max, y_min), (frame_ball_court.shape[1], y_max), (0, 0, 0), -1)  # Right black area

        ball_results = ball_model.predict(frame_ball_court, conf=0.1, verbose=False)
        ball_coords = []
        ball_scores = []

        # Only the single highest-scoring ball candidate is kept.
        best_score = -1
        best_coords = None

        for det in ball_results[0].boxes.data.cpu().numpy():
            x1, y1, x2, y2, score, cls = det

            if score > best_score:
                best_score = score
                best_coords = (x1, y1, x2, y2)

        if best_coords is not None:
            ball_coords.append(best_coords)
            ball_scores.append(best_score)

        annotated_frame = frame.copy()

        # ---- Mini-court overlay, alpha-blended into the frame ----
        if start_x + pitch_size[0] <= annotated_frame.shape[1] and start_y + pitch_size[1] <= annotated_frame.shape[0]:
            overlay = annotated_frame[start_y:start_y + pitch_size[1], start_x:start_x + pitch_size[0]]
            pitch_resized = cv2.resize(pitch, (overlay.shape[1], overlay.shape[0]))

            alpha = 0.5
            blended_region = cv2.addWeighted(overlay, 1 - alpha, pitch_resized, alpha, 0)
            annotated_frame[start_y:start_y + pitch_size[1], start_x:start_x + pitch_size[0]] = blended_region

        for bbox in player_boxes:
            x_pitch, y_pitch = map_to_pitch(bbox, pitch_size, (width, height))
            cv2.circle(annotated_frame, (start_x + x_pitch, start_y + y_pitch), 5, (255, 255, 0), -1)

        for bbox in ball_coords:
            x_pitch, y_pitch = map_to_pitch(bbox, pitch_size, (width, height))
            cv2.circle(annotated_frame, (start_x + x_pitch, start_y + y_pitch), 5, (0, 0, 255), -1)

        # ---- Frame annotations ----
        # reshape(-1, 4) keeps xyxy two-dimensional even when no player was
        # found; a bare np.array([]) has shape (0,) and sv.Detections rejects it.
        player_detections = sv.Detections(
            xyxy=np.asarray(player_boxes, dtype=np.float32).reshape(-1, 4),
            confidence=np.asarray(player_scores, dtype=np.float32),
            class_id=np.zeros(len(player_boxes), dtype=int)
        )

        labels = [
            f"#(Player {tracker_id})" for tracker_id in tracker_ids
        ]

        annotated_frame = ellipse_annotator.annotate(
            scene=annotated_frame,
            detections=player_detections
        )

        annotated_frame = label_annotator.annotate(
            scene=annotated_frame,
            detections=player_detections,
            labels=labels
        )

        if ball_coords:
            ball_detections = sv.Detections(
                xyxy=np.asarray(ball_coords, dtype=np.float32).reshape(-1, 4),
                confidence=np.asarray(ball_scores, dtype=np.float32),
                class_id=np.zeros(len(ball_scores), dtype=int)
            )

            annotated_frame = triangle_annotator.annotate(
                scene=annotated_frame,
                detections=ball_detections
            )

        # Keypoints are predicted on (and drawn onto) the already annotated frame.
        annotated_frame_2, keypoints = KeypointModelFromVideo(annotated_frame)

        out.write(annotated_frame_2)
finally:
    # Always release the capture and finalise the output file, even when a
    # frame fails part-way through the video.
    cap.release()
    out.release()


0: 384x640 5 persons, 2 sports balls, 1 couch, 22.1ms
Speed: 2.7ms preprocess, 22.1ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)

0: 736x1280 5 tennis balls, 38.5ms
Speed: 7.2ms preprocess, 38.5ms inference, 1.3ms postprocess per image at shape (1, 3, 736, 1280)

0: 384x640 5 persons, 1 sports ball, 20.6ms
Speed: 2.5ms preprocess, 20.6ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 736x1280 1 tennis ball, 33.5ms
Speed: 7.3ms preprocess, 33.5ms inference, 1.4ms postprocess per image at shape (1, 3, 736, 1280)

0: 384x640 5 persons, 1 sports ball, 20.7ms
Speed: 3.3ms preprocess, 20.7ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 736x1280 2 tennis balls, 33.4ms
Speed: 7.2ms preprocess, 33.4ms inference, 1.4ms postprocess per image at shape (1, 3, 736, 1280)

0: 384x640 5 persons, 2 sports balls, 1 tennis racket, 20.9ms
Speed: 2.7ms preprocess, 20.9ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: