In [12]:
import cv2
import pandas as pd
import collections


# Ensure you're working with a copy of the DataFrame slice
def ETGaze2Video(video_path, eye_data, output_video="pilot2/processed/output_with_gaze.mp4"):
    # Now, you can safely modify gaze_data
    eye_data[['pupilLSensorPosR_X', 'pupilLSensorPosR_Y']] = eye_data[['pupilLSensorPosR_X', 'pupilLSensorPosR_Y']].interpolate()

    # Check for any remaining NaN values after interpolation
    eye_data[['pupilLSensorPosR_X', 'pupilLSensorPosR_Y']] = eye_data[['pupilLSensorPosR_X', 'pupilLSensorPosR_Y']].fillna(0.5)
    eye_data['normalized_Unitytime'] = eye_data.groupby('Scene')['Unitytime'].transform(lambda x: x - x.min())

    # Initialize video capture and writer
    cap = cv2.VideoCapture(video_path)
    fps = int(cap.get(cv2.CAP_PROP_FPS))  # Frames per second of the video
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Set target resolution for output video
    target_width, target_height = 1920, 1080  # Example resolution

    # Initialize VideoWriter with target resolution
    out = cv2.VideoWriter(output_video, cv2.VideoWriter_fourcc(*'mp4v'), fps, (target_width, target_height))

    # Scale factor for Vive Pro Eye resolution to target video resolution
    # vive_width, vive_height = 2880, 1600  # Original Vive resolution
    # scale_x = target_width / vive_width
    # scale_y = target_height / vive_height

    # Convert normalized Unitytime to frame indices
    eye_data['frame_idx'] = (eye_data['normalized_Unitytime'] * fps).astype(int)
    gaze_buffer_L = collections.deque(maxlen=30)
    gaze_buffer_R = collections.deque(maxlen=30)
    gaze_buffer_T = collections.deque(maxlen=30)
    
    # Iterate over each frame of the video
    frame_idx = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Resize the frame to target resolution
        # frame_resized = cv2.resize(frame, (target_width, target_height))

        # Find the corresponding eye-tracking data for the current video frame
        # Check for gaze data that matches the current frame index
        gaze_data = eye_data[eye_data['frame_idx'] == frame_idx]

        # Debug: Check if there is any matching gaze data
        if gaze_data.empty:
            print(f"No gaze data found for frame {frame_idx}")
        else:
            # Retrieve gaze coordinates
            x_L, y_L = gaze_data.mean()[['pupilLSensorPosL_X', 'pupilLSensorPosL_Y']]
            x_R, y_R = gaze_data.mean()[['pupilLSensorPosR_X', 'pupilLSensorPosR_Y']]
            

            # Scale gaze coordinates to match video resolution
            # x = int(x * scale_x)
            # y = int(y * scale_y)
            x_L, y_L = int(x_L* frame_width), int(y_L* frame_height)
            x_R, y_R = int(x_R* frame_width), int(y_R* frame_height)
            x_T, y_T = int((x_L+x_R)/2) , int((y_L+y_R)/2)
            # print(x,y)

            # Draw the gaze point on the frame
            # if 0 <= x_L < target_width and 0 <= y_L < target_height:#Red, Left eye
            #     cv2.circle(frame,(x_L, y_L), 5, (0, 0, 255), -1)
            # if 0 <= x_R < target_width and 0 <= y_R < target_height:#Green Right eye
            #     cv2.circle(frame,(x_R, y_R), 5, (0, 255, 0), -1)
            # if 0 <= x_T < target_width and 0 <= y_T < target_height:# Blue, both eye
            #     cv2.circle(frame,(x_T, y_T), 5, (255, 0, 0), -1)
            if 0 <= x_L < target_width and 0 <= y_L < target_height:#Red, Left eye
                gaze_buffer_L.append((x_L, y_L))
            if 0 <= x_R < target_width and 0 <= y_R < target_height:#Green Right eye
                gaze_buffer_R.append((x_R, y_R))
            if 0 <= x_T < target_width and 0 <= y_T < target_height:# Blue, both eye
                gaze_buffer_T.append((x_T, y_T))
                
                
            overlay = frame.copy()

            for i, (gx, gy) in enumerate(gaze_buffer_L):#Red, Left eye
                alpha = (i + 1) / 30 *0.5
                color = (int(255 * alpha), 0, 0)
                cv2.circle(overlay,(gx, gy), 5, color, -1)
            for i, (gx, gy) in enumerate(gaze_buffer_R):#Green Right eye
                alpha = (i + 1) / 30*0.5
                color = (0, int(255 * alpha), 0)
                cv2.circle(overlay,(gx, gy), 5, color, -1)
            for i, (gx, gy) in enumerate(gaze_buffer_T):# Blue, both eye
                alpha = (i + 1) / 30*0.5
                color = (0, 0, int(255 * alpha))
                cv2.circle(overlay,(gx, gy), 5, color, -1)
                
                
            frame = cv2.addWeighted(overlay, 0.5, frame, 0.5, 0)
            if gaze_buffer_L:
                cx, cy = gaze_buffer_L[-1]
                cv2.circle(frame, (cx, cy), 7, (0, 0, 255), -1)
            if gaze_buffer_R:
                cx, cy = gaze_buffer_R[-1]
                cv2.circle(frame, (cx, cy), 7, (0, 255, 0), -1)
            if gaze_buffer_T:
                cx, cy = gaze_buffer_T[-1]
                gaze_range_overlay = frame.copy()
                
                cv2.circle(gaze_range_overlay, (cx, cy), 52, (int(255 * alpha), 0, 0), -1)
                frame = cv2.addWeighted(gaze_range_overlay, 0.3, frame, 0.7, 0)
                cv2.circle(frame, (cx, cy), 7, (255, 0, 0), -1)
                

        # Write the frame to the output video
        out.write(frame)
        frame_idx += 1

    cap.release()
    out.release()

In [6]:
# Load video and eye-tracking data
video_path = "pilot2/sangsu/02_ElevatorTest.mp4"
data_path = "pilot2/processed/eyetracker.pkl"
data = pd.read_pickle(data_path)
eye_data = data[data['Scene']==2].copy()
output_video = "pilot2/processed/output_with_gaze.mp4"

In [13]:
scenes = ["02_ElevatorTest", "03_Elevator1", "04_Outside", "05_Hallway", "06_Elevator2", "07_Hall"]
data_path = "pilot2/processed/eyetracker.pkl"
data = pd.read_pickle(data_path)
eye_data = data[data['Scene']==2].copy()


In [16]:
scenes = ["02_ElevatorTest", "03_Elevator1", "04_Outside", "05_Hallway", "06_Elevator2", "07_Hall"]
# scenes = ["02_ElevatorTest"]

for item in scenes:
    video_path = f"pilot2/sangsu/{item}.mp4"
    eye_data = data[data["Scene"]==pd.to_numeric(item[1])]
    output_video = f"pilot2/processed/{item}_gaze.mp4"
    ETGaze2Video(video_path, eye_data, output_video)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eye_data[['pupilLSensorPosR_X', 'pupilLSensorPosR_Y']] = eye_data[['pupilLSensorPosR_X', 'pupilLSensorPosR_Y']].interpolate()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eye_data[['pupilLSensorPosR_X', 'pupilLSensorPosR_Y']] = eye_data[['pupilLSensorPosR_X', 'pupilLSensorPosR_Y']].fillna(0.5)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/in

In [21]:
!pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.3.49-py3-none-any.whl (898 kB)
     ------------------------------------- 898.7/898.7 kB 28.7 MB/s eta 0:00:00
Collecting ultralytics-thop>=2.0.0
  Downloading ultralytics_thop-2.0.13-py3-none-any.whl (26 kB)
Installing collected packages: ultralytics-thop, ultralytics
Successfully installed ultralytics-8.3.49 ultralytics-thop-2.0.13


In [24]:
import torch
print("GPU Available:", torch.cuda.is_available())

GPU Available: True


In [None]:
from segment_anything import SamAutomaticMaskGenerator, sam_model_registry
import cv2
import torch
from ultralytics import YOLO  # Example with YOLOv8
import numpy as np
# Load SAM
model_type = "vit_b"
sam_checkpoint = "sam_vit_b.pth"
device = "cuda" if torch.cuda.is_available() else "cpu"
sam = sam_model_registry[model_type](checkpoint=sam_checkpoint).to(device)
mask_generator = SamAutomaticMaskGenerator(sam)

# Load YOLO (Human detection model)
yolo = YOLO("yolov8n.pt")  # Pre-trained YOLOv8 model
yolo.to("cuda" if torch.cuda.is_available() else "cpu")  # Load model to GPU
# Video paths
video_path = "pilot2/sangsu/03_Elevator1.mp4"
# eye_data_path = "pilot2/processed/eyetracker.pkl"
# eye_data = pd.read_pickle(eye_data_path)
output_video = "pilot2/processed/output_with_humans.mp4"

# Open video
cap = cv2.VideoCapture(video_path)
fps = int(cap.get(cv2.CAP_PROP_FPS))
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
out = cv2.VideoWriter(output_video, cv2.VideoWriter_fourcc(*"mp4v"), fps, (frame_width, frame_height))

# Color for human masks
human_color = (0, 255, 255, 127)  # Yellow (semi-transparent)

def add_transparent_mask(image, mask, color):
    """Overlay a transparent mask on the image."""
    overlay = image.copy()
    overlay[mask] = color[:3]  # RGB values
    return cv2.addWeighted(overlay, color[3] / 255, image, 1 - color[3] / 255, 0)

# Process each frame
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Detect humans using YOLO
    results = yolo(frame)  # YOLO inference
    human_boxes = []

    # Extract detections from results
    for result in results:
        # Iterate through each detection
        for detection in result.boxes:  # Access bounding boxes
            cls = int(detection.cls)  # Class label
            conf = detection.conf  # Confidence score
            if cls == 0 and conf > 0.5:  # Class 0 corresponds to "person"
                x_min, y_min, x_max, y_max = map(int, detection.xyxy[0])  # Bounding box coordinates
                human_boxes.append([x_min, y_min, x_max, y_max])

    # Generate fine masks for humans using SAM
    for box in human_boxes:
        x_min, y_min, x_max, y_max = box
        sam_input = frame[y_min:y_max, x_min:x_max]  # Crop the human region
        masks = mask_generator.generate(sam_input)

        # Apply the mask (assume the largest mask is the human)
        if masks:
            largest_mask = max(masks, key=lambda m: m['area'])
            full_mask = np.zeros(frame.shape[:2], dtype=bool)
            full_mask[y_min:y_max, x_min:x_max] = largest_mask['segmentation']
            frame = add_transparent_mask(frame, full_mask, human_color)

    # Write the frame
    out.write(frame)

cap.release()
out.release()


0: 384x640 (no detections), 4.8ms
Speed: 3.0ms preprocess, 4.8ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 6.0ms
Speed: 1.0ms preprocess, 6.0ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 7.0ms
Speed: 1.0ms preprocess, 7.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 chair, 3.0ms
Speed: 1.5ms preprocess, 3.0ms inference, 8.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 3.0ms
Speed: 1.8ms preprocess, 3.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 4.0ms
Speed: 1.0ms preprocess, 4.0ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 3.0ms
Speed: 2.9ms preprocess, 3.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 4.0ms
Speed: 3.0ms preprocess, 4.0ms inference, 1.0ms postproc

In [None]:
import os
import cv2
import torch
from segment_anything import SamPredictor, sam_model_registry
import pandas as pd
import matplotlib.pyplot as plt

# 1. Download and Load SAM Model
sam_checkpoint = "./sam_vit_b.pth"  # Replace with appropriate model path
model_type = "vit_b"
device = "cuda" if torch.cuda.is_available() else "cpu"

sam = sam_model_registry[model_type](checkpoint=sam_checkpoint).to(device)
predictor = SamPredictor(sam)

# 2. Process Video Frames
video_path = "pilot2/sangsu/02_ElevatorTest.mp4"
eye_data_path = "pilot2/processed/eyetracker.pkl"
eye_data = pd.read_pickle(eye_data_path)
eye_data['frame_idx'] = eye_data['frame_idx'].astype(int)

cap = cv2.VideoCapture(video_path)
fps = int(cap.get(cv2.CAP_PROP_FPS))
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# 3. Segment and Match Gaze Points
gaze_summary = {"agent": 0, "others": 0}
frame_idx = 0
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    
    # Resize and preprocess for SAM
    input_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    predictor.set_image(input_frame)
    masks, _, _ = predictor.predict(box=None, multimask_output=True)  # Modify for AOI detection
    
    # Example: Define AOIs (simplify as agent/other for illustration)
    agent_mask = masks[0]  # Assume the first mask corresponds to the agent
    other_mask = masks[1]  # Adjust logic based on segmentation results
    
    # Match gaze points to AOIs
    gaze_data = eye_data[eye_data['frame_idx'] == frame_idx]
    for _, gaze_point in gaze_data.iterrows():
        x, y = int((gaze_point['pupilLSensorPosL_X']+gaze_point['pupilLSensorPosR_X'])/2), 
        int((gaze_point['pupilLSensorPosL_Y']+gaze_point['pupilLSensorPosR_Y'])/2)
        if agent_mask[y, x]:
            gaze_summary["agent"] += 1 / fps  # Increment time spent in AOI
        elif other_mask[y, x]:
            gaze_summary["others"] += 1 / fps
    
    frame_idx += 1

cap.release()

# 4. Summarize and Visualize Gaze Statistics
summary_df = pd.DataFrame([gaze_summary])
print(summary_df)

# Plotting
plt.bar(gaze_summary.keys(), gaze_summary.values(), color=['blue', 'green'])
plt.title("Gaze Time by AOI")
plt.ylabel("Time (seconds)")
plt.show()

ModuleNotFoundError: No module named 'segment_anything'

In [28]:
gaze_data[['pupilLSensorPosL_X','pupilLSensorPosL_Y']]

Unnamed: 0,pupilLSensorPosL_X,pupilLSensorPosL_Y
1970-01-01 00:00:59.194,0.390934,0.38266
1970-01-01 00:00:59.196,0.391134,0.382301
1970-01-01 00:00:59.198,0.391334,0.381942
1970-01-01 00:00:59.200,0.391534,0.381582
1970-01-01 00:00:59.202,0.391597,0.381212
1970-01-01 00:00:59.204,0.39136,0.380817
1970-01-01 00:00:59.206,0.391124,0.380422
1970-01-01 00:00:59.208,0.390887,0.380027
1970-01-01 00:00:59.210,0.39065,0.379632
1970-01-01 00:00:59.212,0.390762,0.379178


In [None]:
x, y =gaze_data[['pupilLSensorPosR_X', 'pupilLSensorPosR_Y']].mean()
print(x,y)


0.4167138636112213 0.4082764983177185


In [11]:
eye_data.columns

Index(['Scene', 'Unitytime', 'validL', 'validR', 'gazeoriginL_X',
       'gazeoriginL_Y', 'gazeoriginL_Z', 'gazeoriginR_X', 'gazeoriginR_Y',
       'gazeoriginR_Z', 'gazeL_X', 'gazeL_Y', 'gazeL_Z', 'gazeR_X', 'gazeR_Y',
       'gazeR_Z', 'pupilL', 'pupilR', 'eye_opennessL', 'eye_opennessR',
       'pupilLSensorPosL_X', 'pupilLSensorPosL_Y', 'pupilLSensorPosL_Z',
       'pupilLSensorPosR_X', 'pupilLSensorPosR_Y', 'pupilLSensorPosR_Z',
       'convergence_distance_mm', 'convergence_distance_validity',
       'normalized_Unitytime', 'frame_idx'],
      dtype='object')