In [1]:
import cv2
from ultralytics import YOLO
import numpy as np
import matplotlib.pyplot as plt

# Load YOLOv11 pose model
model = YOLO("yolo11n-pose.pt")

# Open two cameras (device indices 0 and 1)
cap1 = cv2.VideoCapture(0)
cap2 = cv2.VideoCapture(1)

# Grab exactly one frame from each camera.
# The try/finally guarantees both capture handles are released even when a
# camera cannot be opened or read. Failures raise instead of calling exit():
# in a notebook exit() ends the kernel session rather than just this cell.
try:
    if not cap1.isOpened() or not cap2.isOpened():
        raise RuntimeError("Error: Cannot open one or both webcams.")

    # Read one frame from each camera
    ret1, frame1 = cap1.read()
    ret2, frame2 = cap2.read()

    if not ret1 or not ret2:
        raise RuntimeError("Error: Cannot read frames from webcams.")
finally:
    # The frames are already in memory, so the devices are no longer needed.
    cap1.release()
    cap2.release()

# Run pose estimation on both frames
results1 = model(frame1)
results2 = model(frame2)

# Annotated frames for visualization (optional)
annotated1 = results1[0].plot()
annotated2 = results2[0].plot()

try:
    cv2.imshow("Camera 0 - YOLOv11 Pose", annotated1)
    cv2.imshow("Camera 1 - YOLOv11 Pose", annotated2)
    # Keep the windows up for at most 10 seconds (10000 ms); waitKey returns
    # earlier if a key is pressed.
    cv2.waitKey(10000)
finally:
    # Close the preview windows even if displaying them failed part-way.
    cv2.destroyAllWindows()



0: 480x640 1 person, 58.6ms
Speed: 5.9ms preprocess, 58.6ms inference, 76.6ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 10.3ms
Speed: 3.4ms preprocess, 10.3ms inference, 3.0ms postprocess per image at shape (1, 3, 480, 640)


In [2]:
def extract_valid_keypoints(results):
    """
    Return the detected 2D keypoints of the first person in a pose result.

    A keypoint is treated as missing when both its x and y are exactly 0
    (presumably how the pose model marks undetected joints -- confirm against
    the model documentation). Shape diagnostics are printed along the way.

    Parameters
    ----------
    results : sequence
        Pose results; only ``results[0].keypoints.xy`` is read. After
        ``.cpu().numpy()`` it is indexed as (person, keypoint, coordinate);
        the recorded run shows shape (1, 17, 2).

    Returns
    -------
    numpy.ndarray, shape (2, N_valid)
        Row 0 holds x, row 1 holds y; one column per kept keypoint, in the
        original keypoint order.
    """
    xy = results[0].keypoints.xy.cpu().numpy()
    print("Keypoints shape:", xy.shape)

    # A keypoint counts as detected if at least one coordinate is non-zero.
    is_detected = (xy != [0, 0]).any(axis=2)
    print("Mask shape:", is_detected.shape)
    print("Mask:", is_detected)

    # Only the first person (index 0) is used.
    first_person = xy[0]
    kept = first_person[is_detected[0]]
    print("Valid pts shape:", kept.shape)

    # Transpose (N_valid, 2) -> (2, N_valid).
    return kept.T

pts1 = extract_valid_keypoints(results1)
pts2 = extract_valid_keypoints(results2)

# To run SfM, column i of pts1 and column i of pts2 must be the SAME joint.
# Truncating both arrays to the shorter length only achieves that when the two
# views happen to detect an identical set of joints; otherwise it silently
# pairs different joints. Intersect the per-joint visibility masks instead.
# The visibility rule mirrors extract_valid_keypoints: a joint of the first
# detected person is visible when x or y is non-zero.
seen1 = np.any(results1[0].keypoints.xy.cpu().numpy()[0] != 0, axis=1)
seen2 = np.any(results2[0].keypoints.xy.cpu().numpy()[0] != 0, axis=1)
seen_both = seen1 & seen2

# pts1 has one column per True entry of seen1 (in joint order), so
# seen_both[seen1] marks which of those columns are also visible in view 2
# (and likewise for pts2).
pts1 = pts1[:, seen_both[seen1]]
pts2 = pts2[:, seen_both[seen2]]


Keypoints shape: (1, 17, 2)
Mask shape: (1, 17)
Mask: [[ True  True  True  True  True  True  True False False False False False False False False False False]]
Valid pts shape: (7, 2)
Keypoints shape: (1, 17, 2)
Mask shape: (1, 17)
Mask: [[ True  True  True  True  True  True  True False False False False False False False False False False]]
Valid pts shape: (7, 2)


In [3]:
def perform_rank3_svd(D):
    """
    Decompose the measurement matrix with an SVD and keep its three leading
    components.

    The returned W3 holds the SQUARE ROOTS of the singular values, so the
    rank-3 approximation of D is ``U3 @ W3 @ W3 @ V3``.

    Parameters
    ----------
    D : numpy.ndarray
        The measurement matrix of shape (m, n).

    Returns
    -------
    U3 : numpy.ndarray
        First 3 left singular vectors, shape (m, 3).
    W3 : numpy.ndarray
        Diagonal matrix of the square roots of the 3 largest singular values,
        shape (3, 3).
    V3 : numpy.ndarray
        First 3 rows of Vt, shape (3, n).
    """
    keep = 3
    left_vecs, singular_vals, right_vecs_t = np.linalg.svd(D, full_matrices=False)

    U3 = left_vecs[:, :keep]
    V3 = right_vecs_t[:keep, :]
    W3 = np.diag(np.sqrt(singular_vals[:keep]))

    return U3, W3, V3

def build_ortho_constraints(A):
    """
    Assemble the linear system that encodes the orthographic (metric)
    constraints for the uncorrected motion matrix A.

    For frame i, with ai = A[i] (x row) and aj = A[i + num_frames] (y row),
    three equations in the 9 entries of a 3x3 matrix L are emitted, in this
    order:

        vec(ai ai^T) . vec(L) = 1
        vec(aj aj^T) . vec(L) = 1
        vec(ai aj^T) . vec(L) = 0

    where vec() is the row-major flattening of the outer product.

    Parameters
    ----------
    A : numpy.ndarray, shape (num_frames*2, 3)
        Motion matrix from the SVD step; the top half holds the x rows and the
        bottom half the y rows.

    Returns
    -------
    ortho_constraints : numpy.ndarray, shape (num_frames*3, 9)
        Left-hand side of the system, one flattened outer product per row.
    ks : numpy.ndarray, shape (num_frames*3, 1)
        Right-hand side: [1, 1, 0] repeated for every frame.
    """
    half = A.shape[0] // 2
    lhs_rows = []
    rhs_rows = []

    # Pair each x row (top half) with its y row (bottom half).
    for x_row, y_row in zip(A[:half], A[half:2 * half]):
        equations = (
            (x_row, x_row, 1),  # unit norm of the x axis
            (y_row, y_row, 1),  # unit norm of the y axis
            (x_row, y_row, 0),  # x and y axes orthogonal
        )
        for left, right, target in equations:
            lhs_rows.append(np.outer(left, right).ravel())
            rhs_rows.append([target])

    return np.array(lhs_rows), np.array(rhs_rows)

def affineSFM(x, y):
  '''
  Affine Structure from Motion (SFM) with an orthographic metric upgrade.

  Recovers the camera motion and the 3D structure of a scene from tracked 2D
  points, assuming an orthographic camera. A rank-3 factorization of the
  centered measurement matrix gives motion/structure up to a 3x3 ambiguity,
  which is then fixed with the orthographic constraints.

  The algorithm proceeds as follows:
    1. Center the measurements: subtract the per-frame mean (mean over points)
       from x and y.
    2. Build the measurement matrix D of shape (num_frames*2, num_points): all
       x rows stacked on top of all y rows.
    3. Rank-3 SVD: D ~= U3 @ W3 @ W3 @ V3, where W3 holds the SQUARE ROOTS of
       the three largest singular values (see `perform_rank3_svd`).
    4. Split the factorization evenly: A = U3 @ W3, S = W3 @ V3, so A @ S is
       the rank-3 approximation of D.
    5. Solve (least squares) for the 3x3 matrix L such that, for every frame,
       ai L ai^T = 1, aj L aj^T = 1 and ai L aj^T = 0.
    6. Factor L = C @ C.T with a Cholesky decomposition and apply
       A_corrected = A @ C, S_corrected = C^-1 @ S. The product
       A_corrected @ S_corrected is unchanged (still ~= D).

  Parameters
  ----------
  x : numpy.ndarray, shape (num_points, num_frames)
      A 2D array of x-coordinate feature measurements across frames.
  y : numpy.ndarray, shape (num_points, num_frames)
      A 2D array of y-coordinate feature measurements across frames.

  Returns
  -------
  A_corrected : numpy.ndarray, shape (num_frames*2, 3)
      The refined affine camera motion matrix that maps the 3D structure to
      the observed (centered) 2D coordinates.
  S_corrected : numpy.ndarray, shape (3, num_points)
      The reconstructed 3D structure matrix.

  Raises
  ------
  numpy.linalg.LinAlgError
      If the estimated L is not positive definite, in which case no Cholesky
      factor exists.
  '''
  # Step 1: zero-mean each frame (column); shapes stay (num_points, num_frames).
  x_centered = x - np.mean(x, axis=0, keepdims=True)
  y_centered = y - np.mean(y, axis=0, keepdims=True)

  # Step 2: measurement matrix D, shape (num_frames*2, num_points).
  D = np.vstack((x_centered.T, y_centered.T))

  # Step 3: rank-3 SVD. W3 contains sqrt(singular values).
  U3, W3, V3 = perform_rank3_svd(D)

  # Step 4: motion and shape. Each factor takes one sqrt(singular value) so
  # that A @ S reproduces the rank-3 approximation of D. (Using S = V3 alone
  # would drop one factor of W3 and distort the structure.)
  A = U3 @ W3  # (2F, 3)
  S = W3 @ V3  # (3, P)

  # Step 5: linear system lhs @ vec(L) = rhs from the orthographic constraints.
  ortho_constraints, rhs = build_ortho_constraints(A)

  # Step 6: least-squares solve for L.
  L_vec, _, _, _ = np.linalg.lstsq(ortho_constraints, rhs, rcond=None)
  L = L_vec.reshape((3, 3))
  # L stands for C @ C.T and must be symmetric; the least-squares solution is
  # not guaranteed to be, so project it onto the symmetric matrices.
  L = 0.5 * (L + L.T)

  # Step 7: Cholesky. numpy returns the LOWER factor with C @ C.T == L, which
  # is exactly what A @ C needs: (ai C)(ai C)^T = ai L ai^T. Transposing it
  # would give C @ C.T != L and break the constraints.
  C = np.linalg.cholesky(L)

  # Step 8: apply the correction; solve() avoids forming C^-1 explicitly.
  A_corrected = A @ C
  S_corrected = np.linalg.solve(C, S)

  return A_corrected, S_corrected

##### YOUR CODE HERE #####

In [4]:
import numpy as np
import matplotlib.pyplot as plt

def affineSFM(x1, x2):
    """
    Two-view affine structure from motion via rank-3 factorization.

    NOTE: this definition rebinds the name ``affineSFM``. Once this cell has
    run it replaces the earlier ``affineSFM(x, y)`` (the variant that takes
    (num_points, num_frames) arrays and applies the orthographic correction).
    The result here is NOT metrically corrected: M and S are only defined up
    to an invertible 3x3 (affine) transformation.

    Parameters
    ----------
    x1, x2 : array-like, shape (2, N)
        Corresponding 2D points from the two views; column i of x1 and
        column i of x2 must be the same physical point. Row 0 is x, row 1 is y.

    Returns
    -------
    M : numpy.ndarray, shape (4, 3)
        Motion matrix (two stacked 2x3 affine cameras).
    S : numpy.ndarray, shape (3, N)
        Structure matrix (3D points); ``M @ S`` is the rank-3 approximation of
        the centered measurements.

    Raises
    ------
    ValueError
        If the inputs are not 2xN arrays of identical shape, or if fewer than
        3 correspondences are given (the rank-3 factors cannot be formed).
    """
    x1 = np.asarray(x1)
    x2 = np.asarray(x2)

    if x1.ndim != 2 or x1.shape[0] != 2 or x1.shape != x2.shape:
        raise ValueError(
            "x1 and x2 must both have shape (2, N); got shape "
            f"{x1.shape} and shape {x2.shape}"
        )
    if x1.shape[1] < 3:
        raise ValueError(
            f"need at least 3 point correspondences, got {x1.shape[1]}"
        )

    # Center each view on its own centroid (removes the translation).
    x1_centered = x1 - x1.mean(axis=1, keepdims=True)
    x2_centered = x2 - x2.mean(axis=1, keepdims=True)

    # Measurement matrix W (4 x N): both views stacked.
    W = np.vstack((x1_centered, x2_centered))

    # Thin SVD: only the leading singular vectors are used, so the full
    # N x N right factor never has to be built.
    U, D, VT = np.linalg.svd(W, full_matrices=False)

    # Split the three largest singular values evenly between motion and shape.
    sqrt_D3 = np.diag(np.sqrt(D[:3]))
    M = U[:, :3] @ sqrt_D3
    S = sqrt_D3 @ VT[:3, :]

    return M, S


# Run SFM on the matched keypoints from the two views
M, S = affineSFM(pts1, pts2)

# Render the reconstructed points once per (elevation, azimuth) camera angle
viewpoints = [(-110, 30), (90, 180), (-75, 90)]

for elev, azim in viewpoints:
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.view_init(elev=elev, azim=azim)

    # Rows of S are the X, Y and Z coordinates of the points.
    ax.scatter(S[0], S[1], S[2], c='r', marker='o')

    ax.set(xlabel='X', ylabel='Y', zlabel='Z')
    ax.set_title(f'3D Structure: elev={elev}, azim={azim}')
    plt.show()


: 