In [None]:
"""
Import all required libraries for video generation pipeline
"""
import glob
import json
import os
import subprocess

import librosa
import matplotlib.pyplot as plt
import numpy as np
import scipy.signal
import torch
import tqdm
import viser.transforms as tf

from nerfstudio.field_components.spatial_distortions import SceneContraction
from nerfstudio.viewer.viewer import VISER_NERFSTUDIO_SCALE_RATIO
from scipy.io import wavfile
from scipy.signal import fftconvolve
from scipy.spatial.transform import Rotation as R
from torchaudio.transforms import GriffinLim

# NeRAF Video Generation Pipeline

This notebook provides a complete workflow to generate multi-modal audio-visual videos from NeRAF models. It includes:
1. Trajectory creation 
2. Audio synthesis 
3. Image rendering 
4. Video assembly combining audio and synchronized visuals
5. Trajectory visualization mini-maps

**Note:** This example is for the EmptyRoom scene (RAF dataset).

⚠️ **Important:** The following examples are specific to EmptyRoom. Each room/environment may require different parameter adjustments (offsets, scaling factors, rotation angles, etc.). These should be tuned based on visual inspection in the nerfstudio viewer.

In [None]:
# ----------------------------------------------------------------------------
# NOTEBOOK CONFIGURATION - point these at your local dataset / output folders
# ----------------------------------------------------------------------------

# Root folder of the RAF dataset
path_dataset = "/path/to/raf_dataset"

# Folder where the trajectory .npy file is written and later read back
output_path_traj = "/path/to/output/trajectories_for_videos"

# Folder scanned for rendered frames / STFT files; audio.wav and video.mp4
# are written here as well
output_path_video = "/path/to/output/EmptyRoom_video"

# Scene processed by this notebook
room = os.path.join(path_dataset, "archived/EmptyRoom_processed")

## Step 1: Create Trajectory File 

Extract and process all poses (microphone positions, speaker positions, rotations) from the dataset split to create a camera trajectory for video rendering.

In [None]:
def _read_pose_values(path):
    """Parse the first line of a pose file as a list of comma-separated floats."""
    with open(path, 'r') as f:
        return [float(val) for val in f.readline().split(',')]


def _process_poses(files, room):
    """
    Extract microphone and speaker poses for a list of recordings.

    For each recording, ``<room>/data/<file_name>/rx_pos.txt`` and
    ``<room>/data/<file_name>/tx_pos.txt`` are read. Only the first line of
    each file is used.

    Args:
        files: List of recording file names to process
        room: Path to the room dataset directory

    Returns:
        Dictionary of NumPy arrays with one row/entry per recording:
            - 'rot': Speaker heading encoded as ``([cos(a), 0, sin(a)] + 1) / 2``
              (every component in [0, 1]), where ``a`` is 'rot_degree' in radians
            - 'mic_pose': Microphone positions (all values of rx_pos.txt)
            - 'source_pose': Speaker positions (tx_pos.txt values after the
              leading quaternion)
            - 'rot_degree': Rotation around the Y axis in degrees, rounded to
              the nearest whole degree

    Raises:
        OSError: If a pose file cannot be opened.
        ValueError: If a pose file's first line is not comma-separated floats.
    """
    mic_poses_list = []
    source_poses_list = []
    rots_list = []
    rot_degrees_list = []

    for file_name in files:
        recording_dir = os.path.join(room, 'data', file_name)

        # Receiver (microphone) and transmitter (speaker) records
        rx_coords = _read_pose_values(os.path.join(recording_dir, 'rx_pos.txt'))
        tx_coords = _read_pose_values(os.path.join(recording_dir, 'tx_pos.txt'))

        # Transmitter record layout: quaternion (x, y, z, w) followed by position
        quat_xyzw = tx_coords[:4]
        speaker_position = tx_coords[4:]

        # First angle of the 'yxz' Euler decomposition = rotation around Y,
        # snapped to a whole degree
        rotation = R.from_quat(quat_xyzw)
        euler_angles = rotation.as_euler('yxz', degrees=True)
        rotation_degree = np.round(euler_angles[0], decimals=0)

        # Encode the heading as a direction in the XZ plane, shifted from
        # [-1, 1] to [0, 1]
        rad_rotation = np.deg2rad(rotation_degree)
        speaker_rotation_sh = np.array([np.cos(rad_rotation), 0, np.sin(rad_rotation)])
        speaker_rotation_sh = (speaker_rotation_sh + 1.0) / 2.0

        mic_poses_list.append(np.array(rx_coords))
        source_poses_list.append(np.array(speaker_position))
        rots_list.append(speaker_rotation_sh)
        rot_degrees_list.append(rotation_degree)

    # Stack the per-recording values into arrays
    return {
        'rot': np.array(rots_list),
        'mic_pose': np.array(mic_poses_list),
        'source_pose': np.array(source_poses_list),
        'rot_degree': np.array(rot_degrees_list)
    }

In [None]:
# Read the split description shipped with the dataset
split_file = os.path.join(room, 'metadata/data-split.json')
with open(split_file, 'r') as f:
    split_files = json.load(f)

# The recording names are the first element stored under each split key
split_files_train, split_files_test = (
    split_files[split_key][0] for split_key in ('train', 'test')
)

# Merge both splits so that every recording is covered
split_files_all = split_files_train + split_files_test

print(f"Loaded {len(split_files_all)} recording files from dataset")

In [None]:
# Gather the poses of every recording (train + test)
poses = _process_poses(split_files_all, room)

# Bounding box of the microphone positions: row 0 = per-axis min, row 1 = per-axis max
aabb = np.stack([bound(poses['mic_pose'], axis=0) for bound in (np.min, np.max)])

In [None]:
# Shorthand handles on the microphone and speaker positions
mic_positions, speaker_positions = poses["mic_pose"], poses["source_pose"]

In [None]:
# ============================================================================
# Define the trajectory
# ============================================================================
# The path is hand-crafted as a chain of straight legs with short pauses.
# Every waypoint below is specific to this room: edit the _add_x_leg /
# _add_y_leg / _pause calls for a different layout.

step = 0.08  # Distance between frames along the trajectory

x_trajectory = []
y_trajectory = []


def _sweep(start, stop):
    """Sample the half-open range [start, stop) every `step`, heading towards `stop`."""
    signed_step = step if stop > start else -step
    return np.arange(start, stop, signed_step).tolist()


def _add_x_leg(x_start, x_stop, y_fixed):
    """Append a leg that sweeps x while y is held at `y_fixed`."""
    xs = _sweep(x_start, x_stop)
    x_trajectory.extend(xs)
    y_trajectory.extend([y_fixed] * len(xs))


def _add_y_leg(y_start, y_stop, x_fixed):
    """Append a leg that sweeps y while x is held at `x_fixed`."""
    ys = _sweep(y_start, y_stop)
    x_trajectory.extend([x_fixed] * len(ys))
    y_trajectory.extend(ys)


def _pause(n_frames=20):
    """Repeat the current position for `n_frames` frames."""
    x_trajectory.extend([x_trajectory[-1]] * n_frames)
    y_trajectory.extend([y_trajectory[-1]] * n_frames)


_add_x_leg(-1.8, -3.2, y_fixed=3)
_add_y_leg(3, 0.8, x_fixed=-3.2)
_add_x_leg(-3.2, -1.3, y_fixed=0.8)
_add_y_leg(0.8, -1.5, x_fixed=-1.3)
_pause()  # pause at corner

_add_y_leg(-1.5, -3.5, x_fixed=-1.3)
_add_x_leg(-1.3, 0.5, y_fixed=-3.5)
_add_y_leg(-3.5, -0.6, x_fixed=0.5)
_add_x_leg(0.5, -1, y_fixed=-0.6)
_pause()  # pause again

_add_x_leg(-1, -2.6, y_fixed=-0.6)
_add_y_leg(-0.6, -3.5, x_fixed=-2.6)

# Short diagonal: x and y advance together
x_trajectory.extend(_sweep(-2.6, -2.4))
y_trajectory.extend(_sweep(-3.5, -3.3))

_add_y_leg(-3.3, -3.1, x_fixed=-2.4)
_pause()  # final pause

# Constant height (average of the microphone positions' second coordinate)
z_trajectory = [np.mean(mic_positions[:, 1])] * len(y_trajectory)

# The first recording's speaker pose is used as the sound source
speaker_position = speaker_positions[0]
speaker_rotation = poses["rot"][0]

# Top-down 2D preview of the trajectory against the recorded poses
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(mic_positions[:, 2], mic_positions[:, 0], s=0.5, label='Training poses', alpha=0.5)
ax.scatter(speaker_positions[:, 2], speaker_positions[:, 0], s=0.5, label='Speaker positions', alpha=0.5)
ax.plot(y_trajectory, x_trajectory, 'r-', linewidth=2, label='Camera trajectory')
ax.scatter(speaker_position[2], speaker_position[0], c='red', s=100, marker='*', label='Speaker location')
ax.legend()
ax.set_xlabel('Y position')
ax.set_ylabel('X position')
ax.set_title('Top-down view of room with camera trajectory')
ax.axis('equal')
plt.show()

In [None]:
# Assemble one (x, height, y) row per frame
trajectory_data = np.stack([x_trajectory, z_trajectory, y_trajectory], axis=1)

# Bundle the moving microphone path with the fixed speaker pose
trajectory_dict = dict(
    rots=speaker_rotation,
    mic_poses=trajectory_data,
    source_poses=speaker_position,
)

# np.save stores the dict as a pickled object; it is read back with allow_pickle=True
os.makedirs(output_path_traj, exist_ok=True)
output_file = os.path.join(output_path_traj, "test_trajectory_emptyroom.npy")
np.save(output_file, trajectory_dict)

print(f"Trajectory saved to: {output_file}")
print(f"Trajectory length: {len(trajectory_data)} frames")

## Step 2: Generate Audio from NeRAF 

Use the NeRAF model to synthesize room impulse responses for each pose along the trajectory.

### Render Audio 

Use the `AVN_RENDER_POSES` environment variable to specify the trajectory for rendering. Execute the following command in your terminal:

```bash
AVN_RENDER_POSES=/path/to/trajectory.npy \
  ns-eval \
  --load-config ../weights/RAF/EmptyRoom_NeRAF/NeRAF/2024-10-10_041356/config.yml \
  --render-output-path ./path/to/output/
```


## Step 3: Generate Images from NeRAF 

Render images from the trained NeRAF model along the camera trajectory.

### Step 3a: Open Nerfstudio Viewer

Open the nerfstudio viewer in your terminal:
```bash
ns-viewer --load-config ../weights/RAF/EmptyRoom_NeRAF/NeRAF/2024-10-10_041356/config.yml
```

In [None]:
# Scene contraction using the infinity norm.
# NOTE(review): not referenced by the other cells visible in this notebook.
scene_contraction = SceneContraction(order=np.inf)

In [None]:
# Re-open the trajectory file used for rendering.
# The file holds a pickled dict wrapped in a 0-d object array, hence
# allow_pickle=True followed by .item().
# NOTE: only load .npy files you produced yourself; unpickling untrusted
# files can execute arbitrary code.
trajectory_file = os.path.join(output_path_traj, "test_trajectory_emptyroom.npy")
trajectory_archive = np.load(trajectory_file, allow_pickle=True)
trajectory = trajectory_archive.item()

print(f"Loaded trajectory with {len(trajectory['mic_poses'])} frames")

### Step 3b: Nerfstudio Trajectory Calibration 

The following step transforms the trajectory poses to align with the Nerfstudio viewer.

⚠️ **Important:** The calibration parameters (angle corrections, translations, scaling factors) are specific to each scene and must be adjusted by visual inspection in the viewer.


In [None]:
# ============================================================================
# Map trajectory poses into the Nerfstudio viewer coordinate system
# ============================================================================

# CALIBRATION PARAMETERS - Adjust these for your room/model
angle_correction = 14  # Rotation correction - specific to EmptyRoom
translation_offset = np.array([1.6, -0.5, -1.4])  # Position offset - specific to EmptyRoom
scale_factor = 2.3  # Scale factor for coordinate space - specific to EmptyRoom

viewpoint_positions = trajectory['mic_poses']
final_frame_idx = len(viewpoint_positions) - 1

# The same Z rotation is applied to every position, so build it once
angle_rot = R.from_euler('xyz', [0, 0, angle_correction], degrees=True)

extrinsics = []
last_angle = 0  # yaw reused while the camera is standing still

for frame_idx, camera_pos in enumerate(viewpoint_positions):
    # Heading = direction towards the pose 5 frames ahead (clamped to the last frame)
    next_pos = viewpoint_positions[min(frame_idx + 5, final_frame_idx)]

    is_moving = next_pos[0] != camera_pos[0] or next_pos[2] != camera_pos[2]
    if is_moving:
        yaw_angle = np.rad2deg(np.arctan2(next_pos[0] - camera_pos[0],
                                          next_pos[2] - camera_pos[2]))
    else:
        # Pause point (look-ahead pose is identical): keep the previous heading
        yaw_angle = last_angle
    last_angle = yaw_angle

    # Orientation from Euler angles: 90 deg about x, 0 about y, heading about z
    quat_xyzw = R.from_euler(
        'xyz', [90, 0, yaw_angle - 180 + angle_correction], degrees=True
    ).as_quat()
    # Rotate (x, y, z, w) -> (w, x, y, z)
    rotation_quat = np.roll(quat_xyzw, 1)

    # Re-order the axes (second and third swapped, one negated), then apply
    # the calibration rotation and translation
    position = np.array([camera_pos[0], -camera_pos[2], camera_pos[1]])
    position = angle_rot.apply(position) + translation_offset

    # Build the camera pose and keep its matrix form
    camera_pose = tf.SE3.from_rotation_and_translation(
        tf.SO3(rotation_quat),
        (position / VISER_NERFSTUDIO_SCALE_RATIO) * scale_factor,
    )
    extrinsics.append(camera_pose.as_matrix())

# Duplicate last frame
extrinsics.append(extrinsics[-1])

print(f"Computed extrinsics for {len(extrinsics)} frames")

In [None]:
# Start from an existing camera-path file so that any keys not set below are kept
example_json_path = "./example.json"
with open(example_json_path, 'r') as f:
    camera_path_data = json.load(f)

# Override the render settings (key order is preserved for the written JSON)
camera_path_config = {
    **camera_path_data,
    "default_fov": 90,
    "default_transition_sec": 1,
    "fps": 20,
}
camera_path_config["seconds"] = len(extrinsics) / camera_path_config["fps"]
camera_path_config.update({
    "render_width": 512,
    "render_height": 512,
    "keyframes": [],
    "camera_path": [],
})

# Per-pose entry templates; only the matrix differs between poses
keyframe_template = {
    'matrix': [],
    'fov': 90.0,
    'aspect': 1.0,
    'override_transition_enabled': False,
    'override_transition_sec': None
}

camera_path_template = {
    'camera_to_world': [],
    'fov': 90.0,
    'aspect': 1.0
}

# One keyframe and one camera-path entry per extrinsic matrix
for extrinsic_matrix in extrinsics:
    matrix_flat = extrinsic_matrix.ravel().tolist()
    camera_path_config["keyframes"].append(
        {**keyframe_template, 'matrix': matrix_flat}
    )
    camera_path_config["camera_path"].append(
        {**camera_path_template, 'camera_to_world': matrix_flat}
    )

# Write the camera path next to the dataset
save_path = os.path.join(room, "camera_paths/test-sim-traj.json")
os.makedirs(os.path.dirname(save_path), exist_ok=True)

with open(save_path, 'w') as f:
    json.dump(camera_path_config, f, indent=2)

print(f"Camera path saved to: {save_path}")

### Step 3c: Validate and Generate Final Rendering Command

1. **Load and validate the trajectory in the viewer:**
   - In the nerfstudio viewer, go to the "Render" panel
   - Click "Load path" and select the generated camera path JSON file
   - Verify the camera positions align correctly with the room geometry

2. **Adjust calibration if needed:**
   - If the path doesn't align, return to the previous cell and modify:
     - `angle_correction`: Rotation offset in degrees
     - `translation_offset`: Position offset (x, y, z)
     - `scale_factor`: Overall coordinate scaling
   - Re-run the trajectory generation and reload in the viewer

3. **Configure rendering parameters in the viewer:**
   - FOV: 90°
   - Resolution: 512×512
   - Spline tension: 0.5  
   - FPS: 20
   - Transition duration: 0.1 seconds (for RAF dataset)

4. **Generate the rendering command:**
   - In the viewer panel, click "Generate command" to interpolate keyframes
   - The command will be displayed in the terminal
   - Copy and modify it as needed:

```bash
ns-render camera-path \
  --load-config ./path/to/config.yml \
  --camera-path-filename ./path/to/camera-path.json \
  --output-format images \
  --output-path ./output_directory/
```

**Expected output:** Sequential frame files in the output directory

## Step 4: Assemble Audio-Visual Video

Combine the synthesized audio and rendered images to create the final video file.

### Prepare for Video Assembly

This section convolves the synthesized RIRs with an anechoic audio file.

In [None]:
# Audio processing configuration
SAMPLE_RATE = 48000  # Hz
STFT_N_FFT = 1024
STFT_HOP_LENGTH = 256
STFT_WIN_LENGTH = 512

# Griffin-Lim converts a magnitude spectrogram (power=1) back to a waveform.
# Run it on the GPU when one is available for speed, otherwise fall back to
# the CPU so the notebook still runs on machines without CUDA.
audio_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
griffin_lim = GriffinLim(n_fft=STFT_N_FFT, hop_length=STFT_HOP_LENGTH,
                         win_length=STFT_WIN_LENGTH, power=1).to(audio_device)

print("Audio processing configuration initialized")

In [None]:
def _sorted_outputs(pattern):
    """Return the sorted paths inside `output_path_video` that match `pattern`."""
    matches = glob.glob(os.path.join(output_path_video, pattern))
    matches.sort()
    return matches


# Dry source signal, resampled to SAMPLE_RATE and mixed down to mono
anechoic_audio_path = "./DontMeanAthing_Mix.wav"
input_audio, sample_rate_audio = librosa.load(anechoic_audio_path, sr=SAMPLE_RATE, mono=True)

# Rendered frames and generated audio features (STFTs).
# NOTE(review): the sort is lexicographic, so frame order is only correct if
# the numeric part of the file names is zero-padded - confirm for eval_*.npy.
rendered_images = _sorted_outputs('*.jpg')
generated_stfts = _sorted_outputs('eval_*.npy')

print(f"Found {len(rendered_images)} images and {len(generated_stfts)} STFT files")

In [None]:
# Synchronize audio with video frames
fps = 10  # Frames per second of generated audio/images
time_between_frames = 1 / fps  # Duration of each frame in seconds

num_audio_frames = len(generated_stfts)
if num_audio_frames == 0:
    raise ValueError(
        "generated_stfts is empty: no 'eval_*.npy' files were found, "
        "so there is no audio to synchronize."
    )
if len(input_audio) == 0:
    raise ValueError("input_audio is empty: the anechoic source has no samples.")

# One timestamp per frame start plus one for the end of the final frame.
# Built from an integer range: np.arange with a float step can return one
# element too many (e.g. np.arange(0, 3 * 0.1, 0.1) has 4 items), which
# would make the audio one frame longer than intended.
frame_timestamps = np.arange(num_audio_frames + 1) * time_between_frames

# Prepare audio to match the total duration.
# round() guards against float products landing just below the integer.
total_duration_seconds = frame_timestamps[-1]
total_samples_needed = int(round(total_duration_seconds * SAMPLE_RATE))

# Loop the source signal until it is long enough, then trim to the exact length
repeat_count = int(np.ceil(total_samples_needed / len(input_audio)))
input_audio = np.tile(input_audio, repeat_count)[:total_samples_needed]

print(f"Synchronized audio: {total_duration_seconds:.2f}s ({total_samples_needed} samples)")

In [None]:
# ============================================================================
# Generate audio by convolving RIRs with anechoic signal
# ============================================================================

# round() instead of a bare int(): float products such as 0.1 * 48000 can
# land just below the intended integer and would be truncated by one sample,
# which makes the last frame's window overrun the output buffer.
max_output_samples = int(round(frame_timestamps[-1] * SAMPLE_RATE))
output_audio = np.zeros((2, max_output_samples))  # Stereo output

# Overlap-add windowing for smooth transitions between frames: each frame's
# window is two frames long and consecutive windows start one frame apart
frame_length = int(round(time_between_frames * SAMPLE_RATE))
window_overlap = frame_length // 2  # 50% overlap between frames
hann_window = scipy.signal.windows.hann(int(frame_length + window_overlap * 2), sym=False)
# Running sum of the applied windows. It is accumulated for inspection only;
# the output is not divided by it.
hann_normalization = np.zeros((1, max_output_samples))

# Feed Griffin-Lim tensors on whatever device the module itself lives on
griffin_lim_device = griffin_lim.window.device

print(f"Processing {len(generated_stfts)} audio frames...")

for frame_idx in tqdm.tqdm(range(len(generated_stfts))):
    # Load the STFT magnitude spectrum for this frame
    stft_magnitude = np.load(generated_stfts[frame_idx])

    # Start of this frame's window in the output (half a frame early)
    time_position = int(frame_idx * frame_length - window_overlap)

    # Special handling for first and last frames
    if frame_idx == 0:
        # Nothing precedes the first frame: drop the part of the window that
        # would fall before t=0 and hold the gain at 1 instead of fading in
        time_position = 0
        window = hann_window[window_overlap:].copy()
        window[:frame_length // 2] = 1.0
    elif frame_idx == len(generated_stfts) - 1:
        # Nothing follows the last frame: drop the tail of the window and
        # hold the gain at 1 instead of fading out
        window = hann_window[:-window_overlap].copy()
        window[-frame_length // 2:] = 1.0
    else:
        window = hann_window

    # Accumulate window values
    hann_normalization[0, time_position:time_position + len(window)] += window

    # Undo the log scaling of the stored magnitude, clip it, and convert it
    # back to a time-domain waveform using Griffin-Lim
    magnitude_clipped = np.clip(np.exp(stft_magnitude) - 1e-3, 0.0, 10000.0)
    magnitude_tensor = torch.from_numpy(magnitude_clipped).to(griffin_lim_device)
    rir_waveform = griffin_lim(magnitude_tensor).cpu().numpy()

    # Convolve the whole anechoic signal with the first channel of the RIR
    # (assumes the waveform is laid out as (channel, samples) - TODO confirm)
    convolved_0 = fftconvolve(input_audio, rir_waveform[0, :])
    # RAF dataset is mono, so duplicate for stereo (this is different in SoundSpaces)
    output_audio_frame = np.vstack((convolved_0, convolved_0))

    # Keep only this frame's time span, apply the window and overlap-add
    output_audio_frame = output_audio_frame.astype(np.float32)
    output_audio_frame = output_audio_frame[:, time_position:time_position + len(window)]
    output_audio_frame = output_audio_frame * window

    output_audio[:, time_position:time_position + len(window)] += output_audio_frame

# Peak-normalize; skip the division for an all-zero signal to avoid NaNs
peak_amplitude = np.max(np.abs(output_audio))
if peak_amplitude > 0:
    output_audio = output_audio / peak_amplitude
output_audio = output_audio.astype(np.float32)

# Save the final spatialized audio (wavfile expects samples x channels)
audio_output_path = os.path.join(output_path_video, "audio.wav")
wavfile.write(audio_output_path, SAMPLE_RATE, output_audio.T)
print(f"Audio saved to: {audio_output_path}")

In [None]:
# Combine rendered images and spatialized audio into final video
video_framerate = 20  # fps for final video
output_video_path = os.path.join(output_path_video, "video.mp4")

# Argument list instead of a shell string: paths containing spaces or shell
# metacharacters are passed to ffmpeg verbatim, with no shell involved.
ffmpeg_command = [
    "ffmpeg", "-y",  # -y: overwrite an existing output file
    "-r", str(video_framerate),
    "-i", os.path.join(output_path_video, "%05d.jpg"),
    "-i", audio_output_path,
    "-c:v", "libx264", "-c:a", "aac", "-pix_fmt", "yuv420p", "-shortest",
    output_video_path,
]

print("Executing FFmpeg to create video...")
# check=True raises CalledProcessError when ffmpeg exits with an error, so
# the success message below is only printed for a video that was written.
subprocess.run(ffmpeg_command, check=True)
print(f"✓ Video saved to: {output_video_path}")

## Step 5: Generate Trajectory Visualization Mini-Maps (Optional)

Create a video showing the camera trajectory overlaid on a top-down view of the room for visualization purposes.

This step is optional but useful for visualization. It generates a synchronized mini-map showing the camera path over a top-down view of the scene, which can be inserted as a corner graphic in the final video.

In [None]:
# ============================================================================
# Prepare top-down view for trajectory visualization
# ============================================================================

def _room_to_pixels(room_values, room_min, pixels_per_unit, pixel_min, pixel_shift):
    """Affine map from room coordinates to image-pixel coordinates."""
    return ((room_values - room_min) * pixels_per_unit + pixel_min) + pixel_shift


# Background image (top-down view of the room)
topview_image_path = "./top_view_empty.png"
topview_image = plt.imread(topview_image_path)

# Pixel rectangle of the room inside the image.
# These values must be determined by visual inspection of the image.
image_bounds = dict(
    x_min=300,    # left edge
    x_max=1510,   # right edge
    y_min=200,    # top edge
    y_max=970,    # bottom edge
)

# Matching room extents, taken from the bounding box of the microphone poses
# (aabb row 0 = min, row 1 = max)
room_bounds_x = [aabb[0][2], aabb[1][2]]  # third pose coordinate -> image x
room_bounds_y = [aabb[0][0], aabb[1][0]]  # first pose coordinate -> image y

# Pixels per room unit, with hand-tuned corrections (-43 / -30)
image_width_px = image_bounds['x_max'] - image_bounds['x_min']
image_height_px = image_bounds['y_max'] - image_bounds['y_min']
scale_x = (image_width_px / (room_bounds_x[1] - room_bounds_x[0])) - 43
scale_y = (image_height_px / (room_bounds_y[1] - room_bounds_y[0])) - 30

# Trajectory in image coordinates (hand-tuned shifts of +100 / +200 pixels)
trajectory_x_image = _room_to_pixels(
    np.asarray(x_trajectory), room_bounds_y[0], scale_y, image_bounds['y_min'], 100
)
trajectory_y_image = _room_to_pixels(
    np.asarray(y_trajectory), room_bounds_x[0], scale_x, image_bounds['x_min'], 200
)

# Speaker position in image coordinates
speaker_x_image = _room_to_pixels(
    speaker_position[0], room_bounds_y[0], scale_y, image_bounds['y_min'], 100
)
speaker_y_image = _room_to_pixels(
    speaker_position[2], room_bounds_x[0], scale_x, image_bounds['x_min'], 200
)

# Preview the trajectory on the map (vertical axis flipped via -value + 1150)
fig, ax = plt.subplots()
ax.imshow(topview_image)
ax.plot(trajectory_y_image, -trajectory_x_image + 1150, 'r-')
ax.scatter(speaker_y_image, -speaker_x_image + 1150, c='red')
plt.show()

In [None]:
# ============================================================================
# Generate mini-map video frames
# ============================================================================

minimap_output_dir = os.path.join(output_path_video, "maps_ER_video")
os.makedirs(minimap_output_dir, exist_ok=True)

num_minimap_frames = len(trajectory_x_image)
print(f"Generating {len(trajectory_x_image)} minimap frames...")

# Flipped vertical coordinates are the same for every frame; compute them once
trajectory_rows = -trajectory_x_image + 1150
speaker_row = -speaker_x_image + 1150

for frame_index in tqdm.tqdm(range(num_minimap_frames)):
    fig, ax = plt.subplots()

    # Background image
    ax.imshow(topview_image)

    # Path travelled so far (up to and including the current frame)
    shown = slice(None, frame_index + 1)
    ax.scatter(trajectory_y_image[shown], trajectory_rows[shown],
               c='green', label='Trajectory', s=20)

    # Sound source
    ax.scatter(speaker_y_image, speaker_row,
               c='blue', label='Source', s=20, marker='x')

    # Hide the axes and crop to the room rectangle (y limits reversed: image rows grow downwards)
    ax.axis("off")
    ax.set_xlim([image_bounds['x_min'], image_bounds['x_max']])
    ax.set_ylim([image_bounds['y_max'], image_bounds['y_min']])
    ax.legend(fontsize=10)

    # Write the frame and release the figure to keep memory bounded
    frame_path = os.path.join(minimap_output_dir, f"map_{frame_index:05d}.png")
    fig.savefig(frame_path, transparent=False)
    plt.close(fig)

print(f"Generated {len(trajectory_x_image)} minimap frames")

In [None]:
# Create mini-map video from generated frames
minimap_video_path = os.path.join(minimap_output_dir, "minimaps.mp4")
minimap_fps = 10  # frames per second

# Argument list instead of a shell string: paths containing spaces or shell
# metacharacters are passed to ffmpeg verbatim, with no shell involved.
ffmpeg_minimap_command = [
    "ffmpeg", "-y",  # -y: overwrite an existing output file
    "-r", str(minimap_fps),
    "-i", os.path.join(minimap_output_dir, "map_%05d.png"),
    "-c:v", "libx264", "-pix_fmt", "yuv420p",
    minimap_video_path,
]

print("Creating mini-map video...")
# check=True raises CalledProcessError when ffmpeg exits with an error, so
# the success message below is only printed for a video that was written.
subprocess.run(ffmpeg_minimap_command, check=True)
print(f"✓ Mini-map video saved to: {minimap_video_path}")