In [None]:
import math
from matplotlib import pyplot as plt
import numpy as np
import cv2
import pandas as pd
import soundfile as sf
import os
from pathlib import Path

from Scripts.finalCode.getAudioSaliency import compute_audio_saliency_heatmap_vectorized, precompute_integrals
from Scripts.finalCode.getVideoLabels import filterDf, uv_to_tile_index
from Scripts.finalCode.getVideoSaliency import compute_video_saliency_heatmap_vectorized

def normalize_heatmaps(heatmaps):
    """
    Normalize each heatmap in a stack to the [0, 1] range independently.

    Parameters:
        heatmaps: array-like of shape (N, H, W); min/max are taken over the
                  last two axes, so every heatmap is scaled on its own.

    Returns:
        Array of the same shape where each heatmap is mapped to [0, 1].
        A heatmap whose values are all equal (max == min) has no range to
        scale by and is returned as all zeros instead of NaN/inf.
    """
    heatmaps = np.asarray(heatmaps)

    # Per-heatmap extrema, kept as (N, 1, 1) so they broadcast against (N, H, W)
    h_mins = np.min(heatmaps, axis=(1, 2), keepdims=True)
    h_maxs = np.max(heatmaps, axis=(1, 2), keepdims=True)

    # Avoid 0/0 on flat heatmaps: the numerator is already 0 there, so dividing
    # by 1 yields an all-zero heatmap.
    ranges = h_maxs - h_mins
    ranges[ranges == 0] = 1

    return (heatmaps - h_mins) / ranges


def getFrame(cap, output_width, output_height, frame_idx):
    """
    Seek to one frame of an opened video and return it resized.

    Parameters:
        cap: opened cv2.VideoCapture to read from (its read position is moved)
        output_width: width of the returned frame (pixels)
        output_height: height of the returned frame (pixels)
        frame_idx: index of the frame to fetch, as used by CAP_PROP_POS_FRAMES

    Returns:
        The frame resized to (output_width, output_height) using bilinear
        interpolation.

    Raises:
        ValueError: if the frame could not be read (for example when
                    frame_idx is past the end of the video).
    """
    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
    ret, frame = cap.read()

    # A failed read returns (False, None); fail with a clear message instead of
    # letting cv2.resize raise an opaque error on a None frame.
    if not ret or frame is None:
        raise ValueError(f"Could not read frame {frame_idx} from video")

    # cv2.resize takes the target size as (width, height)
    return cv2.resize(frame, (output_width, output_height), interpolation=cv2.INTER_LINEAR)

def process_360_video(video_name, video_path, audio_path, outputX_path, outputY_path,
                      csv_path, erp_height=1920, erp_width=3840, 
                      sample_every_n_frames=5, numHeatmaps=7, participant_id = 1,
                      cols = 16, rows = 9):
    """
    Main pipeline to process a 360 video: extract per-frame saliency heatmaps
    (audio + video) and the matching tile-index labels, and save both as .npy.

    Parameters:
        video_name: video identifier, passed to filterDf to select label rows
        video_path: path to ERP format 360 video
        audio_path: path to first-order ambisonic audio file (4 channels: W, X, Y, Z)
        outputX_path: where to save the heatmap array (.npy), shape
                      (num_sampled_frames, numHeatmaps, erp_height, erp_width), float16
        outputY_path: where to save the tile-index labels (.npy), one per sampled frame
        csv_path: path to the label CSV, passed to filterDf
        erp_height: height of ERP format (pixels)
        erp_width: width of ERP format (pixels)
        sample_every_n_frames: sample every N frames
        numHeatmaps: number of heatmaps stored per sampled frame
        participant_id: participant identifier, passed to filterDf
        cols: number of tile columns, passed to uv_to_tile_index
        rows: number of tile rows, passed to uv_to_tile_index

    Raises:
        ValueError: if the audio is not 4-channel, the video cannot be opened or
                    reports an invalid FPS, sample_every_n_frames is not
                    positive, or no label rows are available.
    """
    if sample_every_n_frames < 1:
        raise ValueError(f"sample_every_n_frames must be >= 1, got {sample_every_n_frames}")

    # Load audio
    print("Loading ambisonic audio...")
    audio_data, audio_samplerate = sf.read(audio_path)
    
    # Check for 4 channels
    if audio_data.ndim == 1:
        raise ValueError("Audio is mono. Expected 4-channel first-order ambisonics.")
    if audio_data.shape[1] != 4:
        raise ValueError(f"Audio has {audio_data.shape[1]} channels. Expected 4-channel first-order ambisonics (W, X, Y, Z).")
    
    # Split into channels
    W = audio_data[:, 0]
    X = audio_data[:, 1]
    Y = audio_data[:, 2]
    Z = audio_data[:, 3]
    
    print(f"Audio shape: {audio_data.shape}")
    print(f"Audio sample rate: {audio_samplerate} Hz")
    print("Successfully loaded 4-channel first-order ambisonics audio")
    
    # Open video to get metadata
    print("Opening video...")
    cap = cv2.VideoCapture(video_path)

    # try/finally so the capture handle is released even if processing fails
    try:
        if not cap.isOpened():
            raise ValueError(f"Could not open video: {video_path}")
        
        video_fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        video_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        video_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        print(f"Video FPS: {video_fps}")
        print(f"Total frames: {total_frames}")
        print(f"Video dimensions: {video_width}x{video_height}")

        # The FPS is used below to convert frame indices to timestamps
        if not video_fps > 0:
            raise ValueError(f"Video reports an invalid FPS ({video_fps}): {video_path}")
        
        # Check if resizing is needed
        need_resize = video_width != erp_width or video_height != erp_height
        if need_resize:
            print(f"Video will be resized from {video_width}x{video_height} to {erp_width}x{erp_height}")
        
        # Precompute integrals for coarse tiles (20x20 degrees)
        tile_cache = precompute_integrals(tile_size_deg=20)
        
        # Calculate number of sampled frames (clamped: very short videos or an
        # unknown frame count would otherwise give a negative array dimension)
        num_sampled_frames = max(
            0, (total_frames - math.ceil(sample_every_n_frames / 2)) // sample_every_n_frames
        )
        if num_sampled_frames == 0:
            print("Warning: video is too short to sample any frame; outputs will be empty")

        labelDf = filterDf(csv_path, participant_id, video_name)
        if labelDf.empty:
            raise ValueError(
                f"No label rows found for participant {participant_id}, video {video_name} in {csv_path}"
            )
        label_times = labelDf['t']
        tile_indices = []
        
        # Initialize output array
        output_array = np.zeros((num_sampled_frames, numHeatmaps, erp_height, erp_width), dtype=np.float16)
        
        print(f"Output array shape: {output_array.shape}")
        print(f"Processing {num_sampled_frames} frames...")
            
        # Frames are fetched (and resized) one pair at a time instead of keeping
        # the whole video in memory
        for sampled_frame_idx in range(num_sampled_frames):
            frame_idx = sample_every_n_frames * (sampled_frame_idx + 1)

            # getFrame expects (cap, output_width, output_height, frame_idx)
            prevFrame = getFrame(cap, erp_width, erp_height, frame_idx - 1)
            frame = getFrame(cap, erp_width, erp_height, frame_idx)
            
            print(f"Processing frame {frame_idx}/{total_frames} (sample {sampled_frame_idx}/{num_sampled_frames})")
            
            # Compute audio saliency heatmaps
            audio_heatmaps = compute_audio_saliency_heatmap_vectorized(W, X, Y, Z, audio_samplerate,
                                                                       frame_idx, video_fps,
                                                                       erp_height, erp_width,
                                                                       tile_cache, sample_every_n_frames,
                                                                       numHeatmaps-2, tile_size_deg=20)
            # Compute video saliency heatmaps
            # NOTE(review): numHeatmaps-7 is 0 with the default numHeatmaps=7, while
            # the audio call is given numHeatmaps-2 — confirm the intended value.
            video_heatmaps = compute_video_saliency_heatmap_vectorized(prevFrame, frame, frame_idx, video_fps,
                                                                       erp_height, erp_width,
                                                                       tile_cache, sample_every_n_frames,
                                                                       numHeatmaps-7, tile_size_deg=20)

            # Stack audio and video heatmaps along the heatmap axis
            # (assumes both helpers return (k, erp_height, erp_width) — TODO confirm)
            saliency_heatmaps = np.concatenate((audio_heatmaps, video_heatmaps), axis=0)
            
            # Normalize heatmap
            saliency_heatmaps = normalize_heatmaps(saliency_heatmaps)
            
            # Store in output array
            output_array[sampled_frame_idx] = saliency_heatmaps

            # Find the label row with the timestamp closest to this frame
            # (assumes labelDf['t'] is in seconds from the start of the video — TODO confirm)
            frame_time = frame_idx / video_fps
            idx = (label_times - frame_time).abs().idxmin()

            # Get u, v coordinates
            u = labelDf.loc[idx, 'u']
            v = labelDf.loc[idx, 'v']

            # Convert to tile index
            tile_idx = uv_to_tile_index(u, v, rows, cols)
            tile_indices.append(tile_idx)
    finally:
        cap.release()
    
    tile_indices_array = np.array(tile_indices)

    print(f"\nGenerated {len(tile_indices_array)} tile indices")
    if tile_indices_array.size:
        print(f"Tile index range: {tile_indices_array.min()} to {tile_indices_array.max()}")

    # Save labels
    print(f"Saving output to {outputY_path}...")
    np.save(outputY_path, tile_indices_array)
    print(f"Done! Output shape: {tile_indices_array.shape}")
    print(f"Saved to: {outputY_path}")

    # Save heatmaps
    print(f"Saving output to {outputX_path}...")
    np.save(outputX_path, output_array)
    print(f"Done! Output shape: {output_array.shape}")
    print(f"Saved to: {outputX_path}")
    
if __name__ == "__main__":
    # All relative paths below are resolved after moving two directories up
    os.chdir("./../..")

    # --- What to process -------------------------------------------------
    FILE_NAME = "5020"
    PARTICIPANT_ID = 1

    # --- Inputs ----------------------------------------------------------
    VIDEO_PATH = f"Data/Pre-Processed-Data/{FILE_NAME}.mp4"  # ERP format 360 video
    AUDIO_PATH = f"Data/Pre-Processed-Data/{FILE_NAME}.wav"
    INPUT_CSV_PATH = f"Data/Pre-Processed-Data/{FILE_NAME}.csv"

    # --- Outputs ---------------------------------------------------------
    OUTPUT_X_PATH = f"FinalTrainingData/{FILE_NAME}_heatmaps.npy"
    OUTPUT_Y_PATH = f"FinalTrainingData/{FILE_NAME}_labels.npy"

    # --- Processing settings ---------------------------------------------
    ERP_WIDTH = 1920   # output frame width (pixels)
    ERP_HEIGHT = 960   # output frame height (pixels)
    SAMPLE_RATE = 5    # sample every 5 frames
    NUM_HEATMAPS = 7
    TILE_COLS = 16
    TILE_ROWS = 9

    # Run the pipeline
    process_360_video(
        video_name=FILE_NAME,
        video_path=VIDEO_PATH,
        audio_path=AUDIO_PATH,
        outputX_path=OUTPUT_X_PATH,
        outputY_path=OUTPUT_Y_PATH,
        csv_path=INPUT_CSV_PATH,
        erp_height=ERP_HEIGHT,
        erp_width=ERP_WIDTH,
        sample_every_n_frames=SAMPLE_RATE,
        numHeatmaps=NUM_HEATMAPS,
        participant_id=PARTICIPANT_ID,
        cols=TILE_COLS,
        rows=TILE_ROWS,
    )

SyntaxError: invalid syntax (1212452014.py, line 33)