In [4]:
# data_processing.py

import h5py
import numpy as np
from synthesizer.conversions import lnu_to_absolute_mag
import pandas as pd
import unyt
from unyt import erg, Hz, s
import cmasher as cmr
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import os
import sys
import glob

sys.path.append("/home/jovyan/camels/proj1/")

from setup_params import get_photometry, get_luminosity_function, get_colour_distribution#, get_colour_dir_name, get_magnitude_mask # get_safe_name
from variables_config import get_config

def get_safe_name(name, filter_system_only=False):
    """
    Make a label usable in a path, optionally keeping only its leading token.

    Args:
        name: Label to convert (e.g., "GALEX FUV" or "UV1500")
        filter_system_only: If True, return only the text that precedes the
            first underscore of the converted label (e.g., "GALEX" for
            "GALEX FUV"). A label with no space or underscore, such as
            "UV1500", comes back whole.

    Returns:
        The label with every space replaced by an underscore
        (e.g., "GALEX_FUV"), or just its leading token (e.g., "GALEX")
    """
    # Splitting on a literal space and re-joining swaps each space for "_"
    underscored = "_".join(name.split(" "))

    if not filter_system_only:
        return underscored

    # Everything ahead of the first underscore is treated as the filter system
    leading_token, _, _ = underscored.partition("_")
    return leading_token

def get_colour_dir_name(band1, band2):
    """
    Build the directory label used for the colour formed by two bands.

    Each band is cut at its first space into a (system, filter) pair; a band
    with no space serves as both its own system and its own filter. When the
    two systems match, the system is written once, followed by both filters;
    otherwise the two path-safe band names are joined with a hyphen.

    Examples:
        ("GALEX FUV", "GALEX NUV") -> "GALEX_FUV-NUV"
        ("UVM2", "SUSS") -> "UVM2-SUSS"
    """
    def _system_and_filter(band):
        # partition() yields an empty separator when the band has no space
        head, space, tail = band.partition(' ')
        return (head, tail) if space else (band, band)

    system1, filter1 = _system_and_filter(band1)
    system2, filter2 = _system_and_filter(band2)

    # Different systems: keep both full band names
    if system1 != system2:
        return f"{get_safe_name(band1)}-{get_safe_name(band2)}"

    # Shared system: shortened form with the system named only once
    return f"{get_safe_name(system1)}_{filter1}-{filter2}"


   
def get_magnitude_mask(photo, filters, mag_limits=None):
    """
    Combine per-band magnitude cuts into a single boolean selection.

    Args:
        photo (dict): Photometry arrays keyed by band name
        filters (list): Bands to consider; the first one sets the mask length
        mag_limits (dict): Limit per band; bands without an entry are not cut

    Returns:
        numpy.ndarray: Boolean array that is True where every limited band is
        strictly below its limit, or None when mag_limits is empty or None
    """
    if not mag_limits:
        return None

    # Begin with everything selected, sized from the first listed band
    n_entries = len(photo[filters[0]])
    keep = np.ones(n_entries, dtype=bool)

    # Only bands that actually have a limit contribute a cut
    limited_bands = [band for band in filters if band in mag_limits]
    for band in limited_bands:
        keep &= photo[band] < mag_limits[band]

    return keep


def _write_luminosity_functions(photo, filters, uvlf_limits, n_bins_lf, lf_data_dir,
                                category, sim_name, redshift_label, spec_type):
    """Write one tab-separated luminosity-function table per band in ``filters``."""
    for band in filters:
        phi, phi_sigma, hist, bin_lims = get_luminosity_function(
            photo, band, *uvlf_limits, n_bins=n_bins_lf
        )

        # Tabulate against bin midpoints rather than the bin edges
        bin_centers = 0.5 * (bin_lims[1:] + bin_lims[:-1])
        uvlf_df = pd.DataFrame({
            'magnitude': bin_centers,
            'phi': phi,
            'phi_sigma': phi_sigma,
            'hist': hist
        })

        filter_system = get_safe_name(band, filter_system_only=True)
        output_dir = os.path.join(lf_data_dir[category][filter_system], redshift_label)
        os.makedirs(output_dir, exist_ok=True)

        uvlf_filename = f"UVLF_{sim_name}_{get_safe_name(band)}_{redshift_label}_{spec_type}.txt"
        output_path = os.path.join(output_dir, uvlf_filename)
        print("Outputted LF to:", output_path)
        uvlf_df.to_csv(output_path, index=False, sep='\t')


def _write_colour_distributions(photo, colour_pairs, colour_limits, n_bins_colour,
                                colour_data_dir, category, mag_limits,
                                sim_name, redshift_label, spec_type):
    """Write one tab-separated colour-distribution table per pair with both bands loaded."""
    for band1, band2 in colour_pairs:
        # Pairs that need a band which was not loaded are skipped
        if band1 not in photo or band2 not in photo:
            continue

        mask = get_magnitude_mask(photo, [band1, band2], mag_limits)
        colour_dist, bin_lims = get_colour_distribution(
            photo, band1, band2, *colour_limits,
            n_bins=n_bins_colour, mask=mask
        )

        bin_centers = 0.5 * (bin_lims[1:] + bin_lims[:-1])
        colour_df = pd.DataFrame({
            'colour': bin_centers,
            'distribution': colour_dist
        })

        filter_system = get_colour_dir_name(band1, band2)
        output_dir = os.path.join(colour_data_dir[category],
                                  filter_system,
                                  redshift_label)
        os.makedirs(output_dir, exist_ok=True)

        colour_filename = f"Colour_{sim_name}_{filter_system}_{redshift_label}_{spec_type}.txt"
        output_path = os.path.join(output_dir, colour_filename)
        print("Outputted colours to:", output_path)
        colour_df.to_csv(output_path, index=False, sep='\t')


def process_data(input_dir, redshift_values, uvlf_limits, n_bins_lf, lf_data_dir,
                colour_limits, n_bins_colour, colour_data_dir, category, bands,
                colour_pairs=None, mag_limits=None, simulation=None, dataset="CV"):
    """Write luminosity-function and colour-distribution tables for each photometry file.

    For every ``*_photometry.hdf5`` file in ``input_dir`` and every entry of
    ``redshift_values``, photometry for ``bands`` is loaded with
    ``get_photometry``. One UVLF table is written per band and, when
    ``colour_pairs`` is given, one colour table per pair whose two bands were
    both loaded. A failure for one file/snapshot combination is reported on
    stdout and processing moves on to the next combination.

    Args:
        input_dir: Directory scanned for files ending in ``_photometry.hdf5``;
            also handed to ``get_photometry`` as ``photo_dir``.
        redshift_values (dict): Maps a snapshot identifier to a dict holding a
            ``'label'`` entry, which is used in output directory and file names.
        uvlf_limits: Sequence unpacked as positional arguments to
            ``get_luminosity_function`` after the band.
        n_bins_lf: Passed to ``get_luminosity_function`` as ``n_bins``.
        lf_data_dir: Nested mapping ``[category][filter_system]`` giving the
            base output directory for UVLF tables.
        colour_limits: Sequence unpacked as positional arguments to
            ``get_colour_distribution`` after the two bands.
        n_bins_colour: Passed to ``get_colour_distribution`` as ``n_bins``.
        colour_data_dir: Mapping from category to the base output directory
            for colour tables.
        category: ``"intrinsic"`` selects the intrinsic photometry; any other
            value selects the attenuated photometry. Also used as the key into
            ``lf_data_dir`` and ``colour_data_dir``.
        bands: A single band name or a list of band names. With ``None``
            nothing is loaded, so nothing is written.
        colour_pairs: Optional iterable of ``(band1, band2)`` tuples.
        mag_limits: Optional per-band limits forwarded to ``get_magnitude_mask``.
        simulation: Simulation name; ``"SIMBA"`` is mapped to the prefix
            ``"Simba"``, any other value is used as the prefix unchanged.
        dataset: Not used by this function; kept so existing callers that pass
            it keep working.

    Returns:
        None
    """
    sim_prefix = "Simba" if simulation == "SIMBA" else simulation
    # Depends only on the category, so work it out once rather than per snapshot
    spec_type = "intrinsic" if category == "intrinsic" else "attenuated"
    print('uvlf limits: ',uvlf_limits)

    # os.listdir returns entries in arbitrary order; sort for reproducible runs
    photo_files = sorted(f for f in os.listdir(input_dir) if f.endswith('_photometry.hdf5'))

    # Without bands no photometry is loaded, so neither UVLFs nor colours can be
    # produced. Say so up front instead of failing later on an unbound name.
    if bands is None:
        print(f"No bands requested for category '{category}'; nothing to process.")
        return
    filters_to_process = [bands] if isinstance(bands, str) else bands

    for filename in photo_files:
        sim_name = filename.replace(f'{sim_prefix}_', '').replace('_photometry.hdf5', '')
        for snap, redshift_info in redshift_values.items():
            try:
                print('working on:', filters_to_process)
                photo = get_photometry(
                    sim_name=sim_name,
                    spec_type=spec_type,
                    snap=snap,
                    sps="BC03",
                    model=sim_prefix,
                    filters=filters_to_process,
                    photo_dir=input_dir
                )
                redshift_label = get_safe_name(redshift_info['label'])

                _write_luminosity_functions(
                    photo, filters_to_process, uvlf_limits, n_bins_lf, lf_data_dir,
                    category, sim_name, redshift_label, spec_type
                )

                if colour_pairs:
                    _write_colour_distributions(
                        photo, colour_pairs, colour_limits, n_bins_colour,
                        colour_data_dir, category, mag_limits,
                        sim_name, redshift_label, spec_type
                    )
            except Exception as e:
                # Best-effort batch run: one bad file/snapshot must not stop the
                # rest, but the failure has to be visible rather than dropped.
                print(f"Skipping {sim_name} snap {snap} ({spec_type}): "
                      f"{type(e).__name__}: {e}")

def process_all_data(input_dir, redshift_values, uvlf_limits, n_bins_lf, lf_data_dir, 
                    colour_limits, n_bins_colour, colour_data_dir, mag_limits, 
                    simulation=None, dataset="CV"):
    """Run process_data for the attenuated and then the intrinsic category.

    The bands for each category and the colour pairs are read from
    get_config(dataset=..., simulation=...) under the "filters" and
    "colour_pairs" keys; every other argument is forwarded unchanged.
    """
    settings = get_config(dataset=dataset, simulation=simulation)
    filters_by_category = settings["filters"]
    pairs = settings["colour_pairs"]

    # Arguments that are identical for both categories
    shared_kwargs = dict(
        input_dir=input_dir,
        redshift_values=redshift_values,
        uvlf_limits=uvlf_limits,
        n_bins_lf=n_bins_lf,
        lf_data_dir=lf_data_dir,
        colour_limits=colour_limits,
        n_bins_colour=n_bins_colour,
        colour_data_dir=colour_data_dir,
        colour_pairs=pairs,
        mag_limits=mag_limits,
        simulation=simulation,
        dataset=dataset,
    )

    for category in ("attenuated", "intrinsic"):
        # Only the category and its band list change between the two passes
        process_data(
            category=category,
            bands=filters_by_category[category],
            **shared_kwargs
        )


In [5]:


# sim_name="CV_0",
# spec_type="attenuated",
# snap=None,  # Made this None by default
# sps="BC03",
# model="IllustrisTNG",
# # photo_dir="/disk/xray15/aem2/data/6pams/",
# photo_dir="/home/jovyan/Data/Photometry",
# filters=[
#     "GALEX FUV",
#     "GALEX NUV",
# ]
# if snap is None:
#     # Use first available snapshot if none specified
#     snap = get_available_snapshots(photo_dir)[0]

# # photo_file = f"{photo_dir}/alice_galex_LH.h5"
# photo_file = f"{photo_dir}/{model}_{sim_name}_photometry.hdf5"
# print("accessing: ", photo_file)
# photo = {}
# with h5py.File(photo_file, "r") as hf:
#     for filt in filters:
#         # Updated path to match your file structure
#         photo[filt] = hf[
#             f"{sim_name}/snap_{snap}/{sps}/photometry/luminosity/{spec_type}/{filt}"
#         ][:]
#         photo[filt] *= unyt_quantity.from_string("1 erg/s/Hz")
#         photo[filt] = lnu_to_absolute_mag(photo[filt])

# photo


In [7]:
if __name__ == "__main__":
    # Script entry point: process every simulation/dataset combination below.
    simulations = ["SIMBA", "IllustrisTNG", "Astrid", "Swift-EAGLE"]
    datasets = ["1P"]  # "CV" is currently commented out

    # Config entries handed to process_all_data as same-named keyword arguments
    forwarded_keys = (
        "input_dir",
        "redshift_values",
        "uvlf_limits",
        "n_bins_lf",
        "lf_data_dir",
        "colour_limits",
        "n_bins_colour",
        "colour_data_dir",
        "mag_limits",
    )

    for simulation in simulations:
        for dataset in datasets:
            config = get_config(dataset=dataset, simulation=simulation)
            print(f"\nProcessing {dataset} {simulation}")

            print("N. LF bins: ", config["n_bins_lf"])

            # Luminosity functions and colours are both handled by this call
            run_settings = {key: config[key] for key in forwarded_keys}
            process_all_data(
                simulation=simulation,
                dataset=dataset,
                **run_settings
            )


Processing CV SIMBA
N. LF bins:  15
uvlf limits:  (-25, -14)
working on: ['GALEX FUV', 'GALEX NUV']
Outputted LF to: /home/jovyan/camels/proj1/CV_set/CV_outputs/LFs/SIMBA/attenuated/GALEX/z2.0/UVLF_CV_10_GALEX_FUV_z2.0_attenuated.txt
Outputted LF to: /home/jovyan/camels/proj1/CV_set/CV_outputs/LFs/SIMBA/attenuated/GALEX/z2.0/UVLF_CV_10_GALEX_NUV_z2.0_attenuated.txt
Outputted colours to: /home/jovyan/camels/proj1/CV_set/CV_outputs/colours/SIMBA/attenuated/GALEX_FUV-NUV/z2.0/Colour_CV_10_GALEX_FUV-NUV_z2.0_attenuated.txt
working on: ['GALEX FUV', 'GALEX NUV']
Outputted LF to: /home/jovyan/camels/proj1/CV_set/CV_outputs/LFs/SIMBA/attenuated/GALEX/z1.5/UVLF_CV_10_GALEX_FUV_z1.5_attenuated.txt
Outputted LF to: /home/jovyan/camels/proj1/CV_set/CV_outputs/LFs/SIMBA/attenuated/GALEX/z1.5/UVLF_CV_10_GALEX_NUV_z1.5_attenuated.txt
Outputted colours to: /home/jovyan/camels/proj1/CV_set/CV_outputs/colours/SIMBA/attenuated/GALEX_FUV-NUV/z1.5/Colour_CV_10_GALEX_FUV-NUV_z1.5_attenuated.txt
working on