In [None]:
# Import necessary libraries
import cv2  # For image processing
import numpy as np  # For numerical operations
import os  # For handling file paths and directories
import matplotlib.pyplot as plt  # For plotting and visualization
import pandas as pd  # For handling CSV files and data manipulation

# Default locations used when this file is run as a script.
# NOTE: IMAGE_DIR and SAVE_DIR are currently the same folder, so the filtered
# CSV produced by main() is written alongside the input images.
IMAGE_DIR = './sdss_explore/batch1'  # Directory scanned for input images (.png/.jpg/.jpeg)
SAVE_DIR = './sdss_explore/batch1'  # Directory that receives the filtered CSV
os.makedirs(SAVE_DIR, exist_ok=True)  # Runs at import time; no error if the directory already exists
CSV_FILE_PATH = 'doubleIn4bandsMain_updated.csv'  # Metadata CSV; filter_csv_by_filenames() looks for an 'objid' column in it

# Module-level accumulator that main() fills with the filenames whose red-channel
# histogram peak falls in the range [100, 255]
filenames_with_peaks = []

# Function to check if the highest peak intensity of the red channel falls within the specified range
def check_highest_peak_in_range(red_channel, lower=100, upper=255):
    """Report whether the most frequent intensity lies in ``[lower, upper]``.

    A 256-bin histogram (one bin per integer intensity 0..255) is built from
    ``red_channel`` and the bin holding the most pixels is taken as the peak.
    When several bins tie for the largest count, the lowest intensity wins
    (``np.argmax`` returns the first maximum).

    Args:
        red_channel: Array-like of pixel intensities, typically one 8-bit
            channel of an image.
        lower: Smallest accepted peak intensity (inclusive). Defaults to 100.
        upper: Largest accepted peak intensity (inclusive). Defaults to 255.

    Returns:
        bool: True if the peak intensity is within the inclusive range.
        An input without any pixels has no peak and yields False.
    """
    pixels = np.asarray(red_channel)
    if pixels.size == 0:
        # Without this guard an all-zero histogram would report a "peak" at
        # intensity 0, which is meaningless for an empty input.
        return False

    # One bin per intensity level; np.histogram flattens the input itself.
    counts, _bin_edges = np.histogram(pixels, bins=256, range=(0, 256))
    peak_intensity = int(np.argmax(counts))
    return lower <= peak_intensity <= upper

# Function to process a single image and check the red channel's peak intensity
def process_image(image_path):
    """Decide whether the image at ``image_path`` passes the red-peak check.

    Returns False (after printing a message) when OpenCV cannot read the
    file. Otherwise returns the verdict of ``check_highest_peak_in_range``
    applied to the image's red channel.
    """
    bgr_pixels = cv2.imread(image_path)
    if bgr_pixels is None:
        # cv2.imread reports an unreadable file by returning None.
        print(f"Failed to load image: {image_path}")
        return False

    # OpenCV loads colour images in BGR order; once reordered to RGB the red
    # plane is index 0 along the last axis.
    rgb_pixels = cv2.cvtColor(bgr_pixels, cv2.COLOR_BGR2RGB)
    return check_highest_peak_in_range(rgb_pixels[..., 0])

# Function to filter a CSV file based on matching filenames
def filter_csv_by_filenames(csv_file_path, filenames):
    """Return the CSV rows whose 'objid' matches one of the given filenames.

    Filenames are compared without their extension, so '123.png' selects the
    row whose 'objid' is '123'.

    Args:
        csv_file_path: Path of the CSV file to read.
        filenames: Iterable of image filenames (with or without extension).

    Returns:
        A DataFrame holding the matching rows, with 'objid' as strings, or
        None (after printing a message) when the CSV has no 'objid' column.
    """
    # Read 'objid' as text so long identifiers are compared digit-for-digit.
    # If pandas inferred float64 for the column (for instance because some
    # cells are blank), large integer ids would be rounded and never match.
    df = pd.read_csv(csv_file_path, dtype={'objid': str})
    if 'objid' not in df.columns:
        print("Column 'objid' not found in CSV.")
        return None

    # astype(str) keeps the original behaviour for blank cells ('nan');
    # strip() tolerates padding around the id in the file.
    df['objid'] = df['objid'].astype(str).str.strip()

    # A set gives constant-time membership checks and drops duplicate stems.
    wanted_ids = set(remove_extension(filenames))
    return df[df['objid'].isin(wanted_ids)]

# Helper function to remove file extensions from filenames
def remove_extension(filenames):
    """Strip the final extension from every name in ``filenames``.

    Only the last suffix is dropped (``os.path.splitext`` semantics), so
    'a.tar.gz' becomes 'a.tar' and names without an extension are returned
    unchanged. The result is a new list in the same order as the input.
    """
    return [stem for stem, _suffix in map(os.path.splitext, filenames)]

# Function to display images that passed the filtering
def show_images(image_dir, filenames):
    """Display the named images from ``image_dir`` in a near-square grid.

    The grid is ``ceil(sqrt(n))`` cells on each side. Every cell has its axes
    hidden; cells beyond the last image, and cells whose image cannot be
    loaded, stay blank. A message is printed for each image that fails to
    load.

    Args:
        image_dir: Directory that contains the image files.
        filenames: Sequence of image filenames relative to ``image_dir``.

    Returns:
        None. With an empty ``filenames`` nothing is drawn.
    """
    num_images = len(filenames)
    if num_images == 0:
        # A 0x0 subplot grid is rejected by matplotlib, and there is nothing
        # to draw anyway.
        return

    grid_size = int(np.ceil(np.sqrt(num_images)))
    # squeeze=False always returns a 2-D array of Axes, even for a 1x1 grid,
    # so the single-image case needs no special handling before flattening.
    _fig, axes = plt.subplots(
        grid_size, grid_size, figsize=(15, 15), squeeze=False
    )
    axes = axes.ravel()

    # Hide the frame and ticks of every cell up front; this also covers
    # unused cells and cells whose image fails to load.
    for ax in axes:
        ax.axis('off')

    for ax, filename in zip(axes, filenames):
        image_path = os.path.join(image_dir, filename)
        image = cv2.imread(image_path)
        if image is None:  # cv2.imread returns None for unreadable files
            print(f"Failed to load image: {image_path}")
            continue
        # OpenCV loads BGR; matplotlib expects RGB.
        ax.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        ax.set_title(filename, fontsize=8)

    plt.tight_layout()
    plt.show()

# Main function to process images, filter filenames, and update the CSV
def main(image_dir, csv_file_path, save_dir):
    """Select images by red-channel peak, display them and export their CSV rows.

    Steps:
      1. Scan ``image_dir`` for .png/.jpg/.jpeg files and keep those for
         which ``process_image`` returns True. The result is stored in the
         module-level ``filenames_with_peaks`` list.
      2. Print the selected filenames and show them in a grid.
      3. Select the rows of ``csv_file_path`` whose 'objid' matches a selected
         filename and, if any exist, write them to
         ``<save_dir>/falsePositivesBatch3.csv``.

    Args:
        image_dir: Directory containing the candidate images.
        csv_file_path: Path of the metadata CSV file.
        save_dir: Directory that receives the filtered CSV (created if
            missing).
    """
    global filenames_with_peaks  # Results are exposed through the module-level list

    image_extensions = ('.png', '.jpg', '.jpeg')

    # Start from an empty list so that calling main() again (e.g. re-running a
    # notebook cell) does not accumulate duplicates from earlier runs. The
    # list is cleared in place so existing references to it stay valid.
    filenames_with_peaks.clear()

    # os.listdir returns entries in arbitrary order; sort for reproducible
    # output and a stable image grid.
    for filename in sorted(os.listdir(image_dir)):
        if not filename.lower().endswith(image_extensions):
            continue  # Not an image file
        if process_image(os.path.join(image_dir, filename)):
            filenames_with_peaks.append(filename)

    if filenames_with_peaks:
        print("Images with highest Red pixel intensity peak in the range [100, 255]:")
        for filename in filenames_with_peaks:
            print(filename)
        # Only draw when there is something to show; an empty selection
        # would otherwise request a 0x0 subplot grid.
        show_images(image_dir, filenames_with_peaks)
    else:
        print("No images with highest Red pixel intensity peak in the range [100, 255] found.")

    # Filter the CSV file based on the selected filenames
    filtered_rows = filter_csv_by_filenames(csv_file_path, filenames_with_peaks)
    if filtered_rows is not None and not filtered_rows.empty:
        print("Filtered rows:")
        print(filtered_rows)
        # The caller may pass a directory other than the one created at
        # import time, so make sure it exists before writing.
        os.makedirs(save_dir, exist_ok=True)
        # NOTE(review): the output name says 'Batch3' while the module-level
        # default directories point at 'batch1' - confirm this is intended.
        filtered_csv_path = os.path.join(save_dir, 'falsePositivesBatch3.csv')
        filtered_rows.to_csv(filtered_csv_path, index=False)
        print(f"Filtered rows saved to {filtered_csv_path}")
    else:
        print("No matching rows found in CSV.")

# Run the whole pipeline with the module-level default paths, but only when
# this file is executed directly (not when it is imported as a module).
if __name__ == "__main__":
    main(IMAGE_DIR, CSV_FILE_PATH, SAVE_DIR)
