In [None]:
%pip install opencv-python

In [None]:
# %%
import os
import subprocess
import csv
import random
import cv2
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import shutil # Though not explicitly used in the final version, it's good for file ops
from tqdm import tqdm
import warnings
from concurrent.futures import ProcessPoolExecutor, as_completed
# import os # already imported

# Suppress FutureWarning from sklearn.cluster.KMeans regarding n_init
warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn.cluster._kmeans")

In [None]:
# %%
# --- General Parameters ---
# Upper bound on how many dish IDs are drawn from a split file.
# Lower the train value (e.g. to 10) for a quick smoke test of the pipeline.
NUM_DISHES_TO_DOWNLOAD_TRAIN = 10000
NUM_DISHES_TO_DOWNLOAD_TEST = 10   # Example for test set, not used in this script's main loop
RANDOM_SAMPLE_FROM_SPLIT = True    # True to randomly sample from split files, False to take first N

# --- Base Local Directory ---
# Root folder for everything this notebook reads and writes locally.
# Set the NUTRITION5K_DIR environment variable to relocate it on another
# machine; when unset (or empty) the original default location is used.
LOCAL_BASE_DIR = os.environ.get("NUTRITION5K_DIR") or "/Data/nutrition5k"

# --- Google Cloud Storage Bucket ---
GSUTIL_BUCKET_BASE = "gs://nutrition5k_dataset/nutrition5k_dataset/"

# --- Metadata and Split Files ---
METADATA_DIR = os.path.join(LOCAL_BASE_DIR, "metadata")
TRAIN_SPLIT_FILE_RGB = os.path.join(LOCAL_BASE_DIR, "dish_ids/splits/rgb_train_ids.txt")
TEST_SPLIT_FILE_RGB = os.path.join(LOCAL_BASE_DIR, "dish_ids/splits/rgb_test_ids.txt")

# --- Overhead Imagery (Realsense) ---
# Remote subdirectory on GCS for overhead images
IMAGERY_SUBDIR_REMOTE = "imagery/realsense_overhead"
# Local subdirectory to store overhead images
IMAGERY_SUBDIR_LOCAL = "imagery/realsense_overhead"
IMAGERY_DIR_LOCAL_FULL = os.path.join(LOCAL_BASE_DIR, IMAGERY_SUBDIR_LOCAL)
# Filenames of overhead images to download per dish
FILENAME_ON_BUCKET = ["depth_color.png", "depth_raw.png", "rgb.png"]

# --- Video Processing (Side Angles) ---
# Remote subdirectory on GCS for videos
VIDEO_SUBDIR_REMOTE = "imagery/side_angles"
# Local subdirectory to store videos (temporarily) and extracted frames
VIDEO_SUBDIR_LOCAL = "imagery/side_angles"
VIDEO_DIR_LOCAL_FULL = os.path.join(LOCAL_BASE_DIR, VIDEO_SUBDIR_LOCAL)
# Subdirectory within each dish's video folder to save extracted frames
FRAMES_SUBDIR = "extracted_frames"
# Filenames of videos to download and process per dish
VIDEO_FILENAMES = ["camera_A.h264", "camera_B.h264", "camera_C.h264", "camera_D.h264"]

# --- Frame Extraction Parameters ---
NUM_FRAMES_PER_VIDEO = 5     # Number of diverse frames to extract per video
SAMPLE_EVERY_NTH_FRAME = 1   # Sample every Nth frame from video for diversity analysis pool

# Number of parallel worker processes; os.cpu_count() may return None, hence the fallback.
MAX_WORKERS = os.cpu_count() or 4

In [None]:
# %%
# Create every local directory the pipeline writes into; directories that
# already exist are left untouched.
for required_dir in (
    METADATA_DIR,
    IMAGERY_DIR_LOCAL_FULL,                              # overhead images
    VIDEO_DIR_LOCAL_FULL,                                # videos and their extracted frames
    os.path.join(LOCAL_BASE_DIR, "dish_ids", "splits"),  # split files
):
    os.makedirs(required_dir, exist_ok=True)

print(f"Base local directory set to: {LOCAL_BASE_DIR}")
print(f"Overhead imagery will be stored in: {IMAGERY_DIR_LOCAL_FULL}")
print(f"Video frames will be stored in subdirectories under: {VIDEO_DIR_LOCAL_FULL}")

In [None]:
import cv2
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA # Assuming PCA might be used

def extract_frame_features(frame): # This is the more robust version used by GMM
    """
    Build the feature vector used to compare frames for diversity.

    The frame is shrunk to 64x64, then four 32-bin histograms are computed,
    each normalized to sum to 1 (or left all-zero when empty), and
    concatenated in this order: hue, saturation, value, Sobel edge magnitude.

    Args:
        frame: Image passed to ``cv2.resize`` and converted with
            ``COLOR_BGR2HSV`` / ``COLOR_BGR2GRAY`` (i.e. treated as BGR).

    Returns:
        1-D numpy array of 128 values (4 histograms x 32 bins).
    """
    def _unit_sum(hist):
        # Normalize to a distribution; an all-zero histogram stays all-zero.
        flat = hist.flatten()
        total = np.sum(hist)
        return flat / total if total > 0 else flat * 0

    thumb = cv2.resize(frame, (64, 64))

    # Colour: one histogram per HSV channel (OpenCV hue spans 0..180).
    hsv_thumb = cv2.cvtColor(thumb, cv2.COLOR_BGR2HSV)
    channel_ranges = ((0, [0, 180]), (1, [0, 256]), (2, [0, 256]))
    raw_color_hists = [
        cv2.calcHist([hsv_thumb], [channel], None, [32], value_range)
        for channel, value_range in channel_ranges
    ]
    color_hists = [_unit_sum(hist) for hist in raw_color_hists]

    # Texture: histogram of gradient magnitudes from 3x3 Sobel filters.
    gray_thumb = cv2.cvtColor(thumb, cv2.COLOR_BGR2GRAY)
    grad_x = cv2.Sobel(gray_thumb, cv2.CV_64F, 1, 0, ksize=3)
    grad_y = cv2.Sobel(gray_thumb, cv2.CV_64F, 0, 1, ksize=3)
    magnitudes = np.sqrt(grad_x**2 + grad_y**2).flatten()
    usable_magnitudes = magnitudes[magnitudes >= 0]

    if len(usable_magnitudes) == 0:
        edge_hist = np.zeros(32)
    else:
        # Clip the histogram range at the 99.9th percentile so a few extreme
        # gradients do not squash everything into the first bins.
        range_top = np.percentile(usable_magnitudes, 99.9) + 1e-5
        if range_top <= 1e-5:
            range_top = 256  # Fallback if percentile is 0 or very small
        edge_hist, _ = np.histogram(usable_magnitudes, bins=32, range=(0, range_top))

    return np.concatenate(color_hists + [_unit_sum(edge_hist)])


def select_diverse_frames_gmm(video_path, num_frames_to_select=5, sample_every_nth=5, max_gmm_components_eval=10):
    """
    Select diverse frames from a video using GMM and BIC for model selection.

    Every ``sample_every_nth``-th decoded frame is described with
    ``extract_frame_features``. The feature matrix is optionally reduced with
    PCA, Gaussian mixtures with an increasing number of components are compared
    by BIC, and a final mixture is fitted with at most ``num_frames_to_select``
    components. From each non-empty component the frame closest to the
    component mean is returned.

    Note that all sampled frames are kept in memory until the function returns.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        num_frames_to_select: Upper bound on the number of frames returned.
            Fewer are returned when BIC prefers fewer components or when a
            component ends up with no members.
        sample_every_nth: Stride (in decoded frames) of the candidate pool.
        max_gmm_components_eval: Largest component count compared by BIC.

    Returns:
        (selected_frames_list, selected_frame_indices_list, messages_list).
        Frames and indices are sorted by frame index. ``messages_list`` holds
        informational, warning and error strings instead of printing them.
        On any fallback path the first ``num_frames_to_select`` pooled frames
        are returned; if the video cannot be opened or yields no usable
        frame, both lists are empty.
    """
    messages = []
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        messages.append(f"Error: Could not open video {video_path}")
        return [], [], messages

    frames_pool = []
    features_pool = []
    frame_indices_pool = []
    frame_count = 0

    # Release the capture handle even if decoding raises unexpectedly.
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count % sample_every_nth == 0:
                try:
                    current_features = extract_frame_features(frame)
                    frames_pool.append(frame) # Add frame only if feature extraction succeeds
                    features_pool.append(current_features)
                    frame_indices_pool.append(frame_count)
                except Exception as e:
                    messages.append(f"Warning: Could not extract features for frame {frame_count} from {video_path}. Error: {e}")
            frame_count += 1
    finally:
        cap.release()

    messages.append(f'Sampled {len(frames_pool)} usable frames from video (total frames processed: {frame_count})')

    if not frames_pool:
        messages.append(f"Warning: No usable frames successfully sampled from {video_path}")
        return [], [], messages

    if len(frames_pool) <= num_frames_to_select:
        messages.append(f"Info: Sampled frames ({len(frames_pool)}) <= requested ({num_frames_to_select}). Returning all sampled frames.")
        return frames_pool, frame_indices_pool, messages

    features_array = np.array(features_pool)
    n_samples, n_features = features_array.shape

    if n_samples <= 1:
        messages.append(f"Info: Only {n_samples} frame available post-sampling. Returning this frame.")
        return frames_pool[:n_samples], frame_indices_pool[:n_samples], messages

    # Dimensionality reduction: up to 50 components for large pools, up to 10
    # for small ones; skipped entirely when fewer than 2 components would remain.
    if n_features > 50 and n_samples > 50:
        pca_n_components = min(50, n_samples - 1, n_features - 1)
    elif n_samples > 2 and n_features > 1:
        pca_n_components = min(10, n_samples - 1, n_features - 1)
    else:
        pca_n_components = 0

    if pca_n_components > 1:
        pca = PCA(n_components=pca_n_components, random_state=42)
        features_array = pca.fit_transform(features_array)
        n_samples, n_features = features_array.shape
        messages.append(f"Info: PCA applied. New feature dimensions: {n_features}")

    # Component counts to compare: 1 .. min(eval cap, pool size, requested + 5),
    # but always at least 1..2 (n_samples >= 2 is guaranteed at this point).
    upper_bound_gmm_test = min(max_gmm_components_eval, n_samples, num_frames_to_select + 5)
    upper_bound_gmm_test = max(1, upper_bound_gmm_test)
    if upper_bound_gmm_test == 1 and n_samples > 1:
        upper_bound_gmm_test = 2
    n_components_range = range(1, upper_bound_gmm_test + 1)

    # np.inf rather than np.infty: the latter alias no longer exists in NumPy 2.x.
    lowest_bic = np.inf
    actual_n_components_for_gmm = 0
    messages.append(f"Info: Evaluating GMM components in range: {list(n_components_range)}")

    for n_comp in n_components_range:
        if n_comp > n_samples:
            continue
        try:
            gmm = GaussianMixture(n_components=n_comp, random_state=42, covariance_type='diag', n_init=3)
            gmm.fit(features_array)
            bic_val = gmm.bic(features_array)
        except ValueError as e:
            messages.append(f"Warning: Error fitting GMM with {n_comp} components: {e}. Skipping.")
            continue
        if bic_val < lowest_bic:
            lowest_bic = bic_val
            actual_n_components_for_gmm = n_comp

    if actual_n_components_for_gmm == 0:
        messages.append(f"Warning: GMM fitting failed to find a suitable model. Fallback: Returning first {num_frames_to_select} frames from pool.")
        return frames_pool[:num_frames_to_select], frame_indices_pool[:num_frames_to_select], messages

    messages.append(f"Info: Best GMM by BIC has {actual_n_components_for_gmm} components (BIC: {lowest_bic:.2f})")

    # Never use more components than frames requested or samples available.
    final_n_gmm_components = min(actual_n_components_for_gmm, num_frames_to_select)
    final_n_gmm_components = max(1, final_n_gmm_components)
    final_n_gmm_components = min(final_n_gmm_components, n_samples)

    messages.append(f"Info: Fitting final GMM with {final_n_gmm_components} components.")
    try:
        final_gmm = GaussianMixture(n_components=final_n_gmm_components, random_state=42, covariance_type='diag', n_init=3)
        final_gmm.fit(features_array)
        labels = final_gmm.predict(features_array)
        component_means = final_gmm.means_
    except ValueError as e:
        messages.append(f"Error: Fitting final GMM with {final_n_gmm_components} components failed: {e}. Fallback.")
        return frames_pool[:num_frames_to_select], frame_indices_pool[:num_frames_to_select], messages

    # One representative per non-empty component: the member nearest its mean.
    picked = []  # (frame_index_in_video, frame)
    for i in range(final_n_gmm_components):
        cluster_member_indices = np.where(labels == i)[0]
        if len(cluster_member_indices) == 0:
            continue
        distances = np.linalg.norm(features_array[cluster_member_indices] - component_means[i], axis=1)
        pool_idx = cluster_member_indices[np.argmin(distances)]
        picked.append((frame_indices_pool[pool_idx], frames_pool[pool_idx]))

    # Return in chronological order (frame indices are unique, so the sort is unambiguous).
    picked.sort(key=lambda pair: pair[0])
    selected_frames_out = [frame for _, frame in picked]
    selected_frame_numbers_out = [frame_number for frame_number, _ in picked]

    return selected_frames_out, selected_frame_numbers_out, messages

In [None]:
def download_file_gsutil(remote_full_path, local_full_path, description="file"):
    """
    Copy one object from Google Cloud Storage to a local path with ``gsutil cp``.

    Args:
        remote_full_path: Source URI passed to gsutil.
        local_full_path: Destination file path. Missing parent directories are
            created; a bare filename (no directory part) targets the current
            working directory.
        description: Human-readable label used in error messages.

    Returns:
        ``(True, "")`` on success, otherwise ``(False, error_message)``.
        Failures are reported through the return value rather than raised.
    """
    remote_name = os.path.basename(remote_full_path)

    # os.makedirs("") raises, so only create a parent when the path has one.
    parent_dir = os.path.dirname(local_full_path)
    if parent_dir:
        try:
            os.makedirs(parent_dir, exist_ok=True)
        except OSError as e:
            return False, f"Error creating directory {parent_dir} for {description} ({remote_name}): {e}"

    command = ["gsutil", "-q", "cp", remote_full_path, local_full_path]
    try:
        # Using a timeout for gsutil can be beneficial
        subprocess.run(command, check=True, capture_output=True, text=True, timeout=300) # 5 min timeout
        return True, ""
    except subprocess.CalledProcessError as e:
        stderr_text = (e.stderr or "").strip()
        error_msg = f"Error downloading {description} ({remote_name}): gsutil stderr: {stderr_text}"
        return False, error_msg
    except subprocess.TimeoutExpired:
        error_msg = f"Timeout downloading {description} ({remote_name})"
        return False, error_msg
    except FileNotFoundError:
        # Raised by subprocess when the gsutil executable itself cannot be found.
        error_msg = "Error: gsutil command not found. Is it installed and in your PATH?"
        return False, error_msg
    except Exception as e: # Catch any other potential errors
        error_msg = f"Unexpected error downloading {description} ({remote_name}): {str(e)}"
        return False, error_msg

In [None]:
def download_dish_imagery(dish_id,
                         remote_imagery_subdir,
                         local_imagery_dir_full,
                         image_filename_on_bucket):
    """
    Download a single overhead image of one dish from the dataset bucket.

    The remote object is ``<bucket>/<remote_imagery_subdir>/<dish_id>/<filename>``
    and it is stored at ``<local_imagery_dir_full>/<dish_id>/<filename>``.

    Returns:
        The ``(success, message)`` pair produced by ``download_file_gsutil``.
    """
    # Normalize slashes so the pieces can be joined with exactly one '/'.
    bucket_root = GSUTIL_BUCKET_BASE.rstrip('/')
    remote_subdir = remote_imagery_subdir.strip('/')
    source_uri = f"{bucket_root}/{remote_subdir}/{dish_id}/{image_filename_on_bucket}"

    destination = os.path.join(local_imagery_dir_full, dish_id, image_filename_on_bucket)

    return download_file_gsutil(
        source_uri,
        destination,
        description=f"image {dish_id}/{image_filename_on_bucket}",
    )

In [None]:
def download_and_process_dish_videos(dish_id, num_frames_per_video_to_extract, dish_processing_messages_list):
    """
    Download each side-angle video of a dish, save its selected frames, delete the video.

    For every name in ``VIDEO_FILENAMES`` the video is fetched into a per-dish
    ``temp_videos`` folder, frames are chosen with ``select_diverse_frames_gmm``
    and written as PNG files named ``<camera>_frame_<index>.png`` into the
    dish's ``FRAMES_SUBDIR`` folder. The downloaded video is removed afterwards.

    Args:
        dish_id: Dish identifier used in remote and local paths.
        num_frames_per_video_to_extract: Passed to the frame selector as
            ``num_frames_to_select``.
        dish_processing_messages_list: List that is appended to in place with
            progress, warning and error messages.

    Returns:
        True when every video was downloaded and processed without a download
        failure, an exception, or a failed frame write; False otherwise.
        A video that yields no selected frames does not count as a failure.
    """
    dish_video_temp_dir = os.path.join(VIDEO_DIR_LOCAL_FULL, dish_id, "temp_videos")
    dish_frames_output_dir = os.path.join(VIDEO_DIR_LOCAL_FULL, dish_id, FRAMES_SUBDIR)
    os.makedirs(dish_video_temp_dir, exist_ok=True)
    os.makedirs(dish_frames_output_dir, exist_ok=True)

    all_videos_processed_successfully_flag = True # True if all videos downloaded and processed without critical errors

    if not VIDEO_FILENAMES: # Global config check
        dish_processing_messages_list.append(f"Dish {dish_id}: Video processing skipped (VIDEO_FILENAMES is empty).")
        return True # No videos to process means success in this context

    for video_filename in VIDEO_FILENAMES:
        video_remote_path = f"{GSUTIL_BUCKET_BASE.rstrip('/')}/{VIDEO_SUBDIR_REMOTE.strip('/')}/{dish_id}/{video_filename}"
        video_local_path = os.path.join(dish_video_temp_dir, video_filename)

        download_success, dl_msg = download_file_gsutil(video_remote_path, video_local_path,
                                                description=f"video {video_filename} for dish {dish_id}")

        if not download_success:
            dish_processing_messages_list.append(f"Dish {dish_id}, Video {video_filename}: Download failed. {dl_msg}")
            all_videos_processed_successfully_flag = False # Mark as failure for this video
            continue

        try:
            selected_frames, frame_numbers, gmm_messages_list = select_diverse_frames_gmm(
                video_local_path,
                num_frames_to_select=num_frames_per_video_to_extract,
                sample_every_nth=SAMPLE_EVERY_NTH_FRAME
            )

            # Forward the selector's log lines, prefixed with dish and video.
            for gmm_msg_item in gmm_messages_list:
                dish_processing_messages_list.append(f"Dish {dish_id}, Video {video_filename} GMM: {gmm_msg_item}")

            if selected_frames:
                camera_id = os.path.splitext(video_filename)[0]
                saved_count = 0
                for frame_content, frame_num in zip(selected_frames, frame_numbers):
                    frame_filename = f"{camera_id}_frame_{frame_num:06d}.png"
                    frame_path = os.path.join(dish_frames_output_dir, frame_filename)
                    # cv2.imwrite signals failure through its return value, so it
                    # must be checked. No encoder flags are passed: the output is
                    # PNG (lossless), for which a JPEG quality flag has no meaning.
                    if cv2.imwrite(frame_path, frame_content):
                        saved_count += 1
                    else:
                        dish_processing_messages_list.append(f"Dish {dish_id}, Video {video_filename}: Failed to write frame {frame_path}")
                        all_videos_processed_successfully_flag = False
                dish_processing_messages_list.append(f"Dish {dish_id}, Video {video_filename}: Saved {saved_count} frames.")
            else:
                # Not treated as a failure; the GMM messages above explain why.
                dish_processing_messages_list.append(f"Dish {dish_id}, Video {video_filename}: No frames selected/saved by GMM (see GMM logs for details).")

        except Exception as e: # Catch unexpected errors during frame selection/saving
            dish_processing_messages_list.append(f"Dish {dish_id}, Video {video_filename}: CRITICAL frame extraction/saving error: {e}")
            all_videos_processed_successfully_flag = False # Mark as failure for this video
        finally:
            # The raw video is only needed for frame extraction; always remove it.
            if os.path.exists(video_local_path):
                try:
                    os.remove(video_local_path)
                except OSError as e_del:
                    dish_processing_messages_list.append(f"Dish {dish_id}, Video {video_filename}: Warning - Failed to delete temp video {video_local_path}: {e_del}")

    # Cleanup temp video directory
    if os.path.exists(dish_video_temp_dir):
        try:
            leftovers = os.listdir(dish_video_temp_dir)
            if not leftovers: # Only remove if empty
                os.rmdir(dish_video_temp_dir)
            else:
                dish_processing_messages_list.append(f"Dish {dish_id}: Warning - Temp video dir {dish_video_temp_dir} not empty after processing. Contents: {leftovers}")
        except OSError as e_rmdir:
            dish_processing_messages_list.append(f"Dish {dish_id}: Warning - Could not remove temp video dir {dish_video_temp_dir}: {e_rmdir}")

    return all_videos_processed_successfully_flag


def download_dish_data(dish_id_tuple):
    """
    Worker entry point for ProcessPoolExecutor: fetch everything for one dish.

    Args:
        dish_id_tuple: ``(dish_id, num_frames_per_video)``.

    Returns:
        ``(dish_id, success_status, list_of_messages)``. ``success_status`` is
        False if any overhead image failed to download or video processing
        reported a failure. The message list also receives the messages
        appended by ``download_and_process_dish_videos``.
    """
    dish_id, num_frames_per_video = dish_id_tuple
    error_messages = []
    dish_ok = True

    # Overhead images (skipped when the configured list is empty).
    for img_filename in FILENAME_ON_BUCKET or ():
        success, msg = download_dish_imagery(dish_id, IMAGERY_SUBDIR_REMOTE, IMAGERY_DIR_LOCAL_FULL, img_filename)
        if success:
            continue
        error_messages.append(f"Dish {dish_id}, Image {img_filename}: {msg}")
        dish_ok = False

    # Side-angle videos; details of any failure are appended to error_messages by the callee.
    if VIDEO_FILENAMES and not download_and_process_dish_videos(dish_id, num_frames_per_video, error_messages):
        dish_ok = False

    return dish_id, dish_ok, error_messages

In [None]:
def get_dish_ids_from_split_file(filepath, limit, random_sample=False, seed=None):
    """
    Read dish IDs (one per line) from a split file and return at most ``limit`` of them.

    Blank lines are skipped and surrounding whitespace is stripped.

    Args:
        filepath: Path of the split file.
        limit: Maximum number of IDs to return; ``None`` returns all of them.
        random_sample: When True and the file holds more than ``limit`` IDs,
            return a random sample instead of the first ``limit`` IDs.
        seed: Optional seed for a reproducible random sample. When ``None``
            the module-level ``random`` state is used.

    Returns:
        List of dish ID strings; empty if the file is missing or has no IDs
        (a message is printed in both cases).

    Raises:
        ValueError: If ``limit`` is negative. A negative slice bound would
            otherwise silently drop IDs from the end of the list.
    """
    if limit is not None and limit < 0:
        raise ValueError(f"limit must be non-negative or None, got {limit}")

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            all_dish_ids = [line.strip() for line in f if line.strip()]
    except FileNotFoundError:
        print(f"Error: Split file {filepath} not found.")
        return []

    if not all_dish_ids:
        print(f"Warning: Split file {filepath} is empty.")
        return []

    # Nothing to trim: return everything in file order.
    if limit is None or len(all_dish_ids) <= limit:
        return all_dish_ids

    if random_sample:
        sampler = random.Random(seed) if seed is not None else random
        return sampler.sample(all_dish_ids, limit)
    return all_dish_ids[:limit]

In [None]:
# %%
# NOTE(review): despite its name (and the "training" wording in the messages
# below), train_dish_ids is currently filled from TEST_SPLIT_FILE_RGB.
# Pass TRAIN_SPLIT_FILE_RGB instead to process the training split — confirm
# which split this run is meant to download.
train_dish_ids = get_dish_ids_from_split_file(
    filepath=TEST_SPLIT_FILE_RGB,
    limit=NUM_DISHES_TO_DOWNLOAD_TRAIN,
    random_sample=RANDOM_SAMPLE_FROM_SPLIT,
)

if not train_dish_ids:
    print("No training dish IDs loaded. Check split file path and content.")
else:
    print(f"Selected {len(train_dish_ids)} dish IDs for training set processing.")
    print(f"First few IDs: {train_dish_ids[:5]}")

In [None]:
# Download and process every selected dish in parallel, then print a summary.
if not train_dish_ids:
    # Deliberately no exit() here: inside a notebook it shuts the kernel down
    # instead of just skipping this cell.
    print("No dish IDs loaded. Check split file paths and content. Nothing to process.")
else:
    print(f"Selected {len(train_dish_ids)} dish IDs for processing.")
    print(f"\nStarting download and processing for {len(train_dish_ids)} dishes.")
    print(f"Using up to {MAX_WORKERS} worker processes.")
    print(f"Overhead images to download per dish: {FILENAME_ON_BUCKET if FILENAME_ON_BUCKET else 'None'}")
    print(f"Videos to process per dish: {len(VIDEO_FILENAMES)} ({VIDEO_FILENAMES if VIDEO_FILENAMES else 'None'})")
    print(f"Frames to extract per video: {NUM_FRAMES_PER_VIDEO}")

    successfully_processed_dish_count = 0
    failed_dishes_info = [] # One {'id': dish_id, 'errors': [messages]} dict per failed dish

    # Each worker receives a (dish_id, NUM_FRAMES_PER_VIDEO) tuple.
    tasks = [(dish_id, NUM_FRAMES_PER_VIDEO) for dish_id in train_dish_ids]

    # NOTE(review): download_dish_data is defined in a notebook cell; this
    # presumably relies on a fork-based process start method — confirm before
    # running on platforms that spawn worker processes.
    with ProcessPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # submit + as_completed yields results as they finish, which keeps tqdm accurate.
        futures = {executor.submit(download_dish_data, task): task[0] for task in tasks}

        for future in tqdm(as_completed(futures), total=len(tasks), desc="Processing Dishes"):
            processed_dish_id = futures[future] # The dish_id this future was submitted for
            try:
                # download_dish_data returns (dish_id, success_status, list_of_messages)
                returned_dish_id, success, errors = future.result()
                if success:
                    successfully_processed_dish_count += 1
                else:
                    failed_dishes_info.append({'id': returned_dish_id, 'errors': errors})
                # Messages are logged even for successful dishes (they include informational lines).
                for error_msg in errors:
                    print(f"Log: {error_msg}")
            except Exception as e:
                # Raised when the worker itself crashed or let an exception escape.
                failed_dishes_info.append({'id': processed_dish_id, 'errors': [f"Critical error during processing: {e}"]})
                print(f"Dish {processed_dish_id} CRITICAL FAILURE: {e}")

    print(f"\n--- Processing Complete ---")
    print(f"Successfully processed data for {successfully_processed_dish_count}/{len(train_dish_ids)} dishes.")

    if failed_dishes_info:
        print(f"\n--- Issues Encountered for {len(failed_dishes_info)} Dishes ---")
        for failure in failed_dishes_info:
            print(f"Dish ID: {failure['id']}")
            for err in failure['errors']:
                print(f"  - {err}")

In [None]:
# %%
%pip install matplotlib seaborn

In [None]:
# %%
# --- Visualization Setup ---
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler # For feature scaling before PCA

# Configure matplotlib for inline display in Jupyter
%matplotlib inline
sns.set_theme(style="whitegrid")

# Helper function to plot frames
def plot_frames(frames, titles=None, figsize=(15, 5), max_cols=5):
    """
    Show a list of frames in a grid of subplots.

    Args:
        frames: Sequence of images; each is converted with ``COLOR_BGR2RGB``
            before display (i.e. treated as BGR).
        titles: Optional sequence of subplot titles, matched to frames by position.
        figsize: Figure size forwarded to ``plt.subplots``.
        max_cols: Maximum number of grid columns.

    Returns:
        None. Prints a notice and returns early when ``frames`` is empty.
    """
    total = len(frames)
    if total == 0:
        print("No frames to plot.")
        return

    # Grid shape: as many columns as allowed, rows by ceiling division.
    n_cols = min(total, max_cols)
    n_rows = (total + n_cols - 1) // n_cols

    fig, grid = plt.subplots(n_rows, n_cols, figsize=figsize)
    flat_axes = np.array(grid).ravel()  # works for a single Axes as well as an array of them

    for slot, (ax, frame) in enumerate(zip(flat_axes, frames)):
        # Convert BGR (OpenCV default) to RGB for matplotlib
        ax.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        ax.axis('off')
        if titles and slot < len(titles):
            ax.set_title(titles[slot])

    # Hide the unused cells of the grid.
    for spare_ax in flat_axes[total:]:
        spare_ax.axis('off')

    plt.tight_layout()
    plt.show()

# --- Select a sample dish and video for detailed walkthrough ---
SAMPLE_DISH_ID_VIS = "dish_1551235699"
SAMPLE_VIDEO_FILENAME_VIS = "camera_A.h264" # A common camera angle

# 1. Resolve SAMPLE_DISH_ID_VIS
# The two fallbacks below only take effect when SAMPLE_DISH_ID_VIS above is
# set to None: first the first entry of train_dish_ids (if that list exists
# and is non-empty), then a fixed default ID.
if SAMPLE_DISH_ID_VIS is None and 'train_dish_ids' in globals() and train_dish_ids:
    potential_dish_id = train_dish_ids[0]
    print(f"Could not confirm processed frames for initial sample of 'train_dish_ids'.")
    print(f"Proceeding with dish '{potential_dish_id}' but its video might need re-downloading if already processed and deleted.")
    SAMPLE_DISH_ID_VIS = potential_dish_id

if SAMPLE_DISH_ID_VIS is None:
    SAMPLE_DISH_ID_VIS = "00001" # Fallback dish ID
    print(f"Falling back to default sample dish ID: '{SAMPLE_DISH_ID_VIS}'.")

# 2. Paths for the visualization video
# The walkthrough reads the raw video from the dish's 'temp_videos' folder.
VIS_VIDEO_TEMP_DIR_FOR_DISH = os.path.join(VIDEO_DIR_LOCAL_FULL, SAMPLE_DISH_ID_VIS, "temp_videos")
VIS_VIDEO_PATH = os.path.join(VIS_VIDEO_TEMP_DIR_FOR_DISH, SAMPLE_VIDEO_FILENAME_VIS)
os.makedirs(VIS_VIDEO_TEMP_DIR_FOR_DISH, exist_ok=True)

# 3. Make sure the video is available locally, downloading it if it is missing
if os.path.exists(VIS_VIDEO_PATH):
    print(f"Video for visualization ({VIS_VIDEO_PATH}) already exists locally.")
else:
    print(f"Video for visualization ({VIS_VIDEO_PATH}) not found locally.")
    print(f"Attempting to download {SAMPLE_VIDEO_FILENAME_VIS} for dish {SAMPLE_DISH_ID_VIS} for visualization purposes...")

    video_remote_gcs_path = f"{GSUTIL_BUCKET_BASE.rstrip('/')}/{VIDEO_SUBDIR_REMOTE.strip('/')}/{SAMPLE_DISH_ID_VIS}/{SAMPLE_VIDEO_FILENAME_VIS}"
    success, msg = download_file_gsutil(video_remote_gcs_path, VIS_VIDEO_PATH, description="sample video for visualization")

    if not success:
        # Later cells that read the raw video will have nothing to work with.
        print(f"Failed to download sample video for visualization: {msg}")
    else:
        print(f"Successfully downloaded {SAMPLE_VIDEO_FILENAME_VIS} for dish {SAMPLE_DISH_ID_VIS} to {VIS_VIDEO_PATH}.")

# Final status message
if not os.path.exists(VIS_VIDEO_PATH):
    print(f"CRITICAL WARNING: Video file for visualization ({VIS_VIDEO_PATH}) is NOT available. "
          f"The step-by-step raw video processing visualization will likely fail or be skipped.")
else:
    print(f"Using video file for visualization: {VIS_VIDEO_PATH}")

# Folder where the main pipeline stores this dish's extracted frames. It is
# created for reference only; the walkthrough re-processes VIS_VIDEO_PATH.
sample_dish_processed_frames_dir = os.path.join(VIDEO_DIR_LOCAL_FULL, SAMPLE_DISH_ID_VIS, FRAMES_SUBDIR)
os.makedirs(sample_dish_processed_frames_dir, exist_ok=True)

In [None]:
# %%
# --- Step 2a: Load video and extract candidate frames & features ---
# Three parallel lists: the sampled frame, its feature vector, and its index in the video.
candidate_frames_pool = []
candidate_features_pool = []
candidate_frame_indices_pool = []

if not os.path.exists(VIS_VIDEO_PATH):
    print(f"Video {VIS_VIDEO_PATH} not found. Skipping candidate frame extraction visualization.")
else:
    cap_vis = cv2.VideoCapture(VIS_VIDEO_PATH)
    if cap_vis.isOpened():
        frame_count_vis = 0
        ret, frame = cap_vis.read()
        while ret:
            # Same sampling stride as the main pipeline.
            if frame_count_vis % SAMPLE_EVERY_NTH_FRAME == 0:
                try:
                    # Keep the frame for display and its features for analysis;
                    # a frame is pooled only if feature extraction succeeds.
                    features = extract_frame_features(frame)
                    candidate_frames_pool.append(frame)
                    candidate_features_pool.append(features)
                    candidate_frame_indices_pool.append(frame_count_vis)
                except Exception as e:
                    print(f"Warning: Could not extract features for frame {frame_count_vis} from {VIS_VIDEO_PATH}. Error: {e}")
            frame_count_vis += 1
            ret, frame = cap_vis.read()
        cap_vis.release()
        print(f"Extracted {len(candidate_frames_pool)} candidate frames (and their features) from {VIS_VIDEO_PATH}.")

        if not candidate_frames_pool:
            print("No candidate frames could be extracted. Further visualization steps might fail.")
        else:
            print("Showing first 5 candidate frames from the pool:")
            plot_frames(
                candidate_frames_pool[:5],
                titles=[f"Frame {frame_index}" for frame_index in candidate_frame_indices_pool[:5]],
            )
    else:
        print(f"Error: Could not open video {VIS_VIDEO_PATH} for visualization.")

candidate_features_array = np.array(candidate_features_pool)
# Standardize features before PCA; needs a 2-D array with at least two rows.
if candidate_features_array.ndim > 1 and candidate_features_array.shape[0] > 1:
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(candidate_features_array)
else:
    scaled_features = candidate_features_array # Not enough data to scale

In [None]:
# %%
# --- Step 2c: PCA and Visualize Feature Space ---
# A 2-component PCA needs at least 2 samples and 2 feature columns.
pca_inputs_ok = (
    'scaled_features' in globals()
    and scaled_features.shape[0] > 1
    and scaled_features.shape[1] > 1
)

if not pca_inputs_ok:
    print("Not enough data or features to perform PCA and visualize feature space.")
    features_2d = None # So later cells can check
else:
    # Project the scaled features onto their first two principal components.
    pca_vis = PCA(n_components=2, random_state=42)
    features_2d = pca_vis.fit_transform(scaled_features)

    # One scatter point per candidate frame.
    plt.figure(figsize=(10, 7))
    sns.scatterplot(x=features_2d[:, 0], y=features_2d[:, 1], alpha=0.7)
    plt.title(f'2D PCA of Frame Features from {os.path.basename(VIS_VIDEO_PATH)}')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.show()

    print("Each point in the scatter plot represents a frame from our candidate pool.")
    print("Clusters of points suggest groups of visually similar frames.")

In [None]:
# %%
# --- Step 2d: GMM Clustering and BIC for Model Selection ---
# Fit diagonal-covariance GMMs on the scaled features for a range of component counts,
# keep the count with the lowest BIC, then colour the 2D PCA scatter by the clusters
# of a final GMM fitted with that count.
if 'features_2d' in globals() and features_2d is not None and 'scaled_features' in globals() and scaled_features.shape[0] > 1:
    n_samples_vis = scaled_features.shape[0]

    # Try 1..10 components, capped at n_samples - 1 and never below 1.
    # (n_samples_vis > 1 is guaranteed by the guard above.)
    max_components_to_test = max(1, min(10, n_samples_vis - 1))
    test_n_components_range = range(1, max_components_to_test + 1)

    bics_vis = []
    lowest_bic_vis = np.inf  # np.inf, not the np.infty alias that NumPy 2.0 removed
    best_n_components_vis = 0  # 0 means no model was fitted successfully

    print(f"Evaluating GMM with {list(test_n_components_range)} components using BIC...")
    for n_comp_vis in test_n_components_range:
        try:
            # Make sure GaussianMixture is imported
            gmm_vis = GaussianMixture(n_components=n_comp_vis, random_state=42, covariance_type='diag', n_init=3)
            gmm_vis.fit(scaled_features)
            bic_val = gmm_vis.bic(scaled_features)
        except ValueError as e:
            print(f"Warning: GMM with {n_comp_vis} components failed: {e}")
            bics_vis.append(np.nan)
            continue

        bics_vis.append(bic_val)
        if bic_val < lowest_bic_vis:
            lowest_bic_vis = bic_val
            best_n_components_vis = n_comp_vis

    if best_n_components_vis > 0:
        # Drop failed fits (NaN BIC) before plotting; at least one valid entry exists here.
        valid_bics_indices = [i for i, bic in enumerate(bics_vis) if not np.isnan(bic)]
        valid_components_range = [test_n_components_range[i] for i in valid_bics_indices]
        valid_bics_values = [bics_vis[i] for i in valid_bics_indices]

        plt.figure(figsize=(10, 6))
        plt.plot(valid_components_range, valid_bics_values, marker='o')
        plt.title('BIC Scores for GMM Components')
        plt.xlabel('Number of Components')
        plt.ylabel('BIC Score (Lower is Better)')
        plt.xticks(valid_components_range)
        plt.axvline(best_n_components_vis, color='r', linestyle='--', label=f'Best N Components: {best_n_components_vis}')
        plt.legend()
        plt.show()

        print(f"Best number of GMM components according to BIC: {best_n_components_vis}")

        # Fit the final model with the BIC-selected component count, so the clusters
        # shown match the number reported in the message and the plot title.
        final_gmm_vis = GaussianMixture(n_components=best_n_components_vis, random_state=42, covariance_type='diag', n_init=3)
        final_gmm_vis.fit(scaled_features)
        labels_vis = final_gmm_vis.predict(scaled_features)

        plt.figure(figsize=(12, 8))
        sns.scatterplot(x=features_2d[:, 0], y=features_2d[:, 1], hue=labels_vis, palette='viridis', alpha=0.7, legend='full')
        plt.title(f'Frame Clusters (GMM) on 2D PCA ({best_n_components_vis} clusters found)')
        plt.xlabel('Principal Component 1')
        plt.ylabel('Principal Component 2')
        plt.show()
    else:
        print("Could not determine the best number of GMM components. Skipping GMM visualization.")
else:
    print("Skipping GMM visualization as previous steps might have failed or yielded insufficient data.")

In [None]:
# %%
# --- Step 2d (extended): GMM Clustering, BIC Model Selection, and Component Ellipses ---
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.mixture import GaussianMixture
from matplotlib.patches import Ellipse # For drawing ellipses
# import matplotlib.colors # Might be useful for advanced color handling, not strictly needed here

# Helper function to draw GMM ellipses
def plot_gmm_ellipse(ax, mean_pca, covariance_pca, color, n_std=2.0, **kwargs):
    """
    Draw the n_std-sigma contour of a 2D Gaussian as an unfilled ellipse on `ax`.

    Args:
        ax: Matplotlib axes object that receives the patch.
        mean_pca: 2D mean in PCA space; used as the ellipse center.
        covariance_pca: 2x2 covariance matrix in PCA space.
        color: Edge color of the ellipse.
        n_std: Number of standard deviations spanned by each semi-axis.
        **kwargs: Extra keyword arguments forwarded to the Ellipse patch.
    """
    # eigh is meant for symmetric matrices; clamp eigenvalues that round-off pushed below zero.
    variances, directions = np.linalg.eigh(covariance_pca)
    variances = np.maximum(variances, 0)

    # Put the largest-variance axis first.
    descending = variances.argsort()[::-1]
    variances, directions = variances[descending], directions[:, descending]

    # The leading eigenvector is (cos, sin) of the major-axis direction.
    major_x, major_y = directions[:, 0]
    rotation_deg = np.degrees(np.arctan2(major_y, major_x))

    # Ellipse expects full diameters: 2 * n_std * standard deviation along each axis.
    major_diameter, minor_diameter = 2 * n_std * np.sqrt(variances)

    ax.add_patch(
        Ellipse(xy=mean_pca, width=major_diameter, height=minor_diameter, angle=rotation_deg,
                facecolor='none', edgecolor=color, lw=2, **kwargs)
    )


# Requires 'features_2d' and 'scaled_features' from the previous cells, plus the fitted
# 2-component PCA transformer used to project GMM parameters into the plot's coordinates.
# The Step 2c cell stores that transformer as 'pca_vis'; 'pca_2d' is still honoured first
# so a transformer defined under that name elsewhere keeps working.
pca_for_ellipses = globals().get('pca_2d')
if pca_for_ellipses is None:
    pca_for_ellipses = globals().get('pca_vis')

if ('features_2d' in globals() and features_2d is not None and
    'scaled_features' in globals() and scaled_features is not None and
    pca_for_ellipses is not None and # Crucial for projecting GMM parameters
    scaled_features.shape[0] > 1): # Ensure there's enough data

    n_samples_vis = scaled_features.shape[0]

    # Try 1..10 components, capped at n_samples - 1 and never below 1.
    # (n_samples_vis > 1 is guaranteed by the guard above.)
    max_components_to_test = max(1, min(10, n_samples_vis - 1))
    test_n_components_range = range(1, max_components_to_test + 1)

    bics_vis = []
    lowest_bic_vis = np.inf  # np.inf, not the np.infty alias that NumPy 2.0 removed
    best_n_components_vis = 0 # 0 means no model was fitted successfully

    print(f"Evaluating GMM with {list(test_n_components_range)} components using BIC...")
    for n_comp_vis in test_n_components_range:
        try:
            gmm_vis = GaussianMixture(n_components=n_comp_vis, random_state=42,
                                      covariance_type='diag', n_init=10) # n_init=10 for robustness
            gmm_vis.fit(scaled_features)
            bic_val = gmm_vis.bic(scaled_features)
        except ValueError as e:
            print(f"Warning: GMM with {n_comp_vis} components failed: {e}")
            bics_vis.append(np.nan)
            continue

        bics_vis.append(bic_val)
        if not np.isnan(bic_val) and bic_val < lowest_bic_vis:
            lowest_bic_vis = bic_val
            best_n_components_vis = n_comp_vis

    if best_n_components_vis > 0: # Proceed only if a best model was found
        # Drop failed fits (NaN BIC) before plotting; at least one valid entry exists here.
        valid_indices = [i for i, bic in enumerate(bics_vis) if not np.isnan(bic)]
        plot_components = [test_n_components_range[i] for i in valid_indices]
        plot_bics = [bics_vis[i] for i in valid_indices]

        plt.figure(figsize=(10, 6))
        plt.plot(plot_components, plot_bics, marker='o')
        plt.title('BIC Scores for GMM Components')
        plt.xlabel('Number of Components')
        plt.ylabel('BIC Score (Lower is Better)')
        plt.xticks(plot_components)
        plt.axvline(best_n_components_vis, color='r', linestyle='--',
                    label=f'Best N Components: {best_n_components_vis}')
        plt.legend()
        plt.show()

        print(f"Best number of GMM components according to BIC: {best_n_components_vis}")

        # Fit the final GMM with the BIC-selected number of components.
        final_gmm_vis = GaussianMixture(n_components=best_n_components_vis,
                                        random_state=42, covariance_type='diag', n_init=10)
        final_gmm_vis.fit(scaled_features)
        labels_vis = final_gmm_vis.predict(scaled_features)

        fig, ax = plt.subplots(figsize=(12, 8))

        # One colour per GMM component. Passing a {label: colour} mapping (instead of a
        # bare list) keeps point colours tied to the component index even when some
        # component receives no points, so each ellipse matches its cluster's colour.
        palette = sns.color_palette('viridis', n_colors=best_n_components_vis)
        palette_by_label = dict(enumerate(palette))

        sns.scatterplot(x=features_2d[:, 0], y=features_2d[:, 1], hue=labels_vis,
                        palette=palette_by_label, alpha=0.7, legend='full', ax=ax)

        ax.set_title(f'Frame Clusters (GMM) on 2D PCA ({best_n_components_vis} clusters found)')
        ax.set_xlabel('Principal Component 1')
        ax.set_ylabel('Principal Component 2')

        # Project the component means into the 2D PCA space: (n_components, 2).
        means_pca_space = pca_for_ellipses.transform(final_gmm_vis.means_)
        # Rows of components_ are the principal axes: (2, n_features).
        projection = pca_for_ellipses.components_

        for i in range(best_n_components_vis):
            # With covariance_type='diag', covariances_[i] holds the per-feature variances;
            # expand to a full diagonal matrix and project: C_pca = P @ C_orig @ P.T -> (2, 2).
            cov_orig_full_diag_i = np.diag(final_gmm_vis.covariances_[i])
            cov_pca_component = projection @ cov_orig_full_diag_i @ projection.T

            plot_gmm_ellipse(ax, means_pca_space[i], cov_pca_component,
                             color=palette_by_label[i], n_std=2.0) # 2 standard deviations

        plt.show()
    else:
        print("Could not determine the best number of GMM components. Skipping GMM visualization.")
else:
    # Report which prerequisite is missing so the skip is easy to diagnose.
    missing_reasons = []
    if 'features_2d' not in globals() or features_2d is None:
        missing_reasons.append("features_2d (PCA-reduced data) is missing")
    if 'scaled_features' not in globals() or scaled_features is None:
        missing_reasons.append("scaled_features (input to GMM) is missing")
    elif scaled_features.shape[0] <= 1:
        missing_reasons.append("scaled_features has insufficient samples (<=1)")
    if pca_for_ellipses is None:
        missing_reasons.append("pca_2d / pca_vis (PCA transformer object) is missing, needed for ellipse projection")

    if missing_reasons:
        print(f"Skipping GMM visualization. Reasons: {'; '.join(missing_reasons)}.")
    else:
        # Not expected to be reached: every failing guard above adds a reason.
        print("Skipping GMM visualization as previous steps might have failed or yielded insufficient data.")

In [None]:
# %%
# --- Step 2e: Selecting representative frames (simplified illustration) ---
# For every GMM component, pick the member frame whose scaled feature vector lies
# closest (Euclidean distance) to the component mean, then show the picks in frame order.
selected_vis_frames = []
selected_vis_frame_indices = []

gmm_results_ready = (
    'final_gmm_vis' in globals()
    and 'labels_vis' in globals()
    and 'candidate_frames_pool' in globals()
    and bool(candidate_frames_pool)
)

if not gmm_results_ready:
    print("GMM results not available, skipping illustrative frame selection.")
else:
    num_actual_clusters = final_gmm_vis.n_components

    for i in range(num_actual_clusters):
        # Positions (within the candidate pool) of the frames assigned to component i.
        cluster_member_indices_in_pool = np.where(labels_vis == i)[0]
        if len(cluster_member_indices_in_pool) == 0:
            continue

        distances = np.linalg.norm(
            scaled_features[cluster_member_indices_in_pool] - final_gmm_vis.means_[i],
            axis=1,
        )
        original_pool_idx = cluster_member_indices_in_pool[np.argmin(distances)]

        selected_vis_frames.append(candidate_frames_pool[original_pool_idx])
        selected_vis_frame_indices.append(candidate_frame_indices_pool[original_pool_idx])

    if not selected_vis_frames:
        print("No frames selected in this illustrative step (perhaps GMM failed or no clusters found).")
    else:
        # Present the representatives in order of their frame index.
        sorted_indices = np.argsort(selected_vis_frame_indices)
        selected_vis_frames_sorted = [selected_vis_frames[i] for i in sorted_indices]
        selected_vis_frame_indices_sorted = [selected_vis_frame_indices[i] for i in sorted_indices]

        print(f"Selected {len(selected_vis_frames_sorted)} representative frames based on GMM clusters (simplified illustration):")
        frame_titles = [f"Cluster Rep. (Frame {idx})" for idx in selected_vis_frame_indices_sorted]
        plot_frames(selected_vis_frames_sorted,
                    titles=frame_titles,
                    max_cols=max(1, len(selected_vis_frames_sorted))) # Show all in one row if possible

In [None]:
# %%
# --- Step 3: Show results using the script's main function ---
# Ensure NUM_FRAMES_PER_VIDEO and SAMPLE_EVERY_NTH_FRAME are defined from original notebook
# Default if not (though they should be):
# Keep any values already defined earlier in the notebook; otherwise fall back to 5.
NUM_FRAMES_PER_VIDEO = globals().get('NUM_FRAMES_PER_VIDEO', 5)
SAMPLE_EVERY_NTH_FRAME = globals().get('SAMPLE_EVERY_NTH_FRAME', 5)


print(f"Running the script's 'select_diverse_frames_gmm' function for {VIS_VIDEO_PATH}...")
print(f"Targeting NUM_FRAMES_PER_VIDEO = {NUM_FRAMES_PER_VIDEO}")

if not os.path.exists(VIS_VIDEO_PATH):
    print(f"Video {VIS_VIDEO_PATH} not found. Cannot run 'select_diverse_frames_gmm'.")
else:
    # Make sure select_diverse_frames_gmm is defined (from original notebook)
    final_selected_frames, final_selected_frame_numbers, messages = select_diverse_frames_gmm(
        VIS_VIDEO_PATH,
        num_frames_to_select=NUM_FRAMES_PER_VIDEO,
        sample_every_nth=SAMPLE_EVERY_NTH_FRAME,
    )

    print("\nMessages from select_diverse_frames_gmm:")
    for msg in messages:
        print(f"- {msg}")
    print("\n")

    if not final_selected_frames:
        print("No frames were selected by 'select_diverse_frames_gmm'. Check messages above for reasons.")
    else:
        print(f"Final selected diverse frames ({len(final_selected_frames)} frames):")

        # Show the first sampled frame for reference when the candidate pool exists.
        pool_available = 'candidate_frames_pool' in globals() and candidate_frames_pool
        first_frame_display = candidate_frames_pool[0] if pool_available else None

        if first_frame_display is not None:
            plot_frames([first_frame_display], titles=["Original Video (First Sampled Frame)"], figsize=(6,4), max_cols=1)

        selected_titles = [f"Selected Frame {num}" for num in final_selected_frame_numbers]
        plot_frames(final_selected_frames,
                    titles=selected_titles,
                    max_cols=max(1, NUM_FRAMES_PER_VIDEO))