## Creating NumPy (`.npy`) arrays for the labels

In [11]:
import pandas as pd
import numpy as np
import os
import re
from tqdm import tqdm

In [19]:
# Directory configuration: dataset root plus the JPEG folder of each split
BASE_DIR = '../Datasets_DeepShadows/'
JPEG_DIRS = dict(
    train=os.path.join(BASE_DIR, 'Jpeg_data/Training/'),
    val=os.path.join(BASE_DIR, 'Jpeg_data/Validation/'),
    test=os.path.join(BASE_DIR, 'Jpeg_data/Test/'),
)

In [20]:
# Output folder: the y_<split>.npy label arrays and the missing_in_ref_<split>.csv
# reports produced by create_label_arrays are written here.
LABEL_DIR = os.path.join(BASE_DIR, 'Galaxies_data/')
# Create the folder up front; exist_ok=True makes re-running this cell a no-op.
os.makedirs(LABEL_DIR, exist_ok=True)


In [21]:
# Reference catalogs read by load_reference_catalogs:
#   LSB_PATH      -> rows labelled 1 (LSB galaxies)
#   ARTIFACT_PATH -> rows labelled 0 (artifacts / non-galaxies)
LSB_PATH = os.path.join(BASE_DIR, 'Datasets/random_LSBGs_all.csv')
ARTIFACT_PATH = os.path.join(BASE_DIR, 'Datasets/random_negative_all_2.csv')

In [22]:
# Paths to the per-split dataset CSV files, keyed by split name.
# NOTE(review): not referenced by the other cells visible in this section.
DATASET_PATHS = dict(
    train=os.path.join(BASE_DIR, 'Datasets/Baseline_training.csv'),
    val=os.path.join(BASE_DIR, 'Datasets/Baseline_validation.csv'),
    test=os.path.join(BASE_DIR, 'Datasets/Baseline_test.csv'),
)

In [23]:
# Helper that recovers sky coordinates from an image file name
def parse_filename(filename):
    """
    Extract RA and DEC from a name shaped like {ra}_{dec}_{index}_256pix.jpeg
    Example: 0.16791_-59.994767_3422_256pix.jpeg

    Returns a (ra, dec) tuple of floats. When the name does not follow the
    pattern, or either captured field is not a valid float, the result is
    (None, None).
    """
    found = re.match(r'^([\d\.\-]+)_([\d\.\-]+)_\d+_256pix\.jpe?g', filename, re.IGNORECASE)
    if found is None:
        return (None, None)

    ra_text, dec_text = found.group(1), found.group(2)
    try:
        # The character class also admits strings such as "1.2.3" or "1-2",
        # which float() rejects; those fall through to the (None, None) result.
        return (float(ra_text), float(dec_text))
    except ValueError:
        return (None, None)

In [24]:
# Load the reference catalogs
def load_reference_catalogs(lsb_path=None, artifact_path=None):
    """
    Load the galaxy and artifact catalogs and merge them into one labelled table.

    Parameters
    ----------
    lsb_path : str or path-like, optional
        CSV with at least 'ra' and 'dec' columns; its rows get label 1.
        Defaults to the module-level LSB_PATH.
    artifact_path : str or path-like, optional
        CSV with at least 'ra' and 'dec' columns; its rows get label 0.
        Defaults to the module-level ARTIFACT_PATH.

    Returns
    -------
    pandas.DataFrame
        Columns 'ra', 'dec', 'label' with a fresh 0..n-1 index. Galaxy rows
        come first, then artifact rows, each in catalog order. A position
        present in both catalogs is kept once, as a galaxy.
    """
    # Resolve the defaults lazily so callers can point at other catalog files.
    if lsb_path is None:
        lsb_path = LSB_PATH
    if artifact_path is None:
        artifact_path = ARTIFACT_PATH

    # LSB galaxies -> label 1
    lsb_df = (
        pd.read_csv(lsb_path)[['ra', 'dec']]
        .drop_duplicates()
        .dropna()
        .assign(label=1)
    )
    print(f"Loaded {len(lsb_df)} LSB galaxies")

    # Artifacts (non-galaxies) -> label 0
    art_df = (
        pd.read_csv(artifact_path)[['ra', 'dec']]
        .drop_duplicates()
        .dropna()
        .assign(label=0)
    )
    print(f"Loaded {len(art_df)} artifacts")

    # Galaxies are concatenated first, which is what gives them priority below.
    ref_df = pd.concat([lsb_df, art_df], ignore_index=True)

    # Each catalog was de-duplicated on its own, so any repeated (ra, dec) here
    # is a position listed in both catalogs; the later occurrence is the
    # artifact row.
    in_both = ref_df.duplicated(subset=['ra', 'dec'], keep='first')
    conflict_count = int(in_both.sum())
    if conflict_count > 0:
        print(f"Warning: Found {conflict_count} objects in both catalogs!")
        # Keep the galaxy entry and drop the artifact one. No re-sort is
        # needed, so the row order stays deterministic.
        ref_df = ref_df[~in_both].reset_index(drop=True)

    print(f"Total reference objects: {len(ref_df)}")
    return ref_df

In [25]:
# Helper: find the reference row that corresponds to one image position
def _closest_reference_index(ref_ra, ref_dec, ra, dec, atol, rtol):
    """
    Return the position (into ref_ra / ref_dec) of the reference entry that
    matches (ra, dec), or None when no entry lies within the tolerance.

    A row is a candidate when both coordinates pass np.isclose, i.e.
    |ref - value| <= atol + rtol * |value| on each axis. If several rows
    qualify, the nearest one is chosen; on an exact tie the earliest wins.
    """
    within = (
        np.isclose(ref_ra, ra, rtol=rtol, atol=atol)
        & np.isclose(ref_dec, dec, rtol=rtol, atol=atol)
    )
    candidates = np.flatnonzero(within)
    if candidates.size == 0:
        return None
    if candidates.size == 1:
        return int(candidates[0])

    # Small-angle separation: RA offsets shrink by cos(dec) on the sky.
    d_ra = (ref_ra[candidates] - ra) * np.cos(np.radians(dec))
    d_dec = ref_dec[candidates] - dec
    return int(candidates[np.argmin(d_ra ** 2 + d_dec ** 2)])


# Main routine that builds the label arrays
def create_label_arrays(ref_df, atol=1e-5, rtol=1e-5):
    """
    Build and save one label array per dataset split ('train', 'val', 'test').

    For each split the image names in JPEG_DIRS[split] are parsed with
    parse_filename, matched by position against ref_df, and the labels are
    saved to LABEL_DIR/y_<split>.npy as int32. Images with no reference match
    are labelled 0 and listed in LABEL_DIR/missing_in_ref_<split>.csv.

    Parameters
    ----------
    ref_df : pandas.DataFrame
        Reference table with 'ra', 'dec' and 'label' columns.
    atol, rtol : float, optional
        Absolute and relative tolerances handed to np.isclose. The defaults
        reproduce the previous behaviour, where rtol was numpy's implicit
        1e-5, so the effective tolerance is atol + rtol * |coordinate|.
        Pass rtol=0 for a purely absolute tolerance.

    Notes
    -----
    Labels follow the order returned by os.listdir, and files whose names
    cannot be parsed are skipped, so the saved array can be shorter than the
    image list.
    NOTE(review): confirm the image arrays are built with the same ordering
    and the same skips.
    """
    # Pull the reference columns out once instead of once per image.
    ref_ra = ref_df['ra'].to_numpy(dtype=float)
    ref_dec = ref_df['dec'].to_numpy(dtype=float)
    ref_labels = ref_df['label'].to_numpy()

    for set_name in ['train', 'val', 'test']:
        print(f"\n{'='*50}")
        print(f"Processing {set_name} set")
        print(f"{'='*50}")

        # Collect the image file names of this split
        jpeg_dir = JPEG_DIRS[set_name]
        image_files = [f for f in os.listdir(jpeg_dir)
                       if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
        print(f"Found {len(image_files)} images")

        labels = []
        missing_in_ref = []
        skipped = []

        for filename in tqdm(image_files, desc="Processing images"):
            ra, dec = parse_filename(filename)
            if ra is None or dec is None:
                # Name does not encode coordinates: no label can be assigned.
                skipped.append(filename)
                continue

            idx = _closest_reference_index(ref_ra, ref_dec, ra, dec, atol, rtol)
            if idx is None:
                # Keep for the report and assume artifact when unreferenced.
                missing_in_ref.append((filename, ra, dec))
                labels.append(0)
            else:
                labels.append(ref_labels[idx])

        # Convert to a numpy array and save
        labels_array = np.array(labels, dtype=np.int32)
        output_path = os.path.join(LABEL_DIR, f'y_{set_name}.npy')
        np.save(output_path, labels_array)

        # Report statistics (guarding the percentages against an empty split)
        total = len(labels_array)
        galaxy_count = int(np.sum(labels_array == 1))
        artifact_count = int(np.sum(labels_array == 0))
        missing_count = len(missing_in_ref)
        galaxy_frac = galaxy_count / total if total else 0.0
        artifact_frac = artifact_count / total if total else 0.0

        print(f"\nSaved {total} labels to {output_path}")
        print(f"Galaxies (LSB): {galaxy_count} ({galaxy_frac:.2%})")
        print(f"Artifacts: {artifact_count} ({artifact_frac:.2%})")
        print(f"Images not found in reference: {missing_count}")
        if skipped:
            print(f"Warning: skipped {len(skipped)} files with unparseable names; "
                  f"the label array is shorter than the image list")

        # Save the list of images that had no reference match
        if missing_in_ref:
            missing_df = pd.DataFrame(missing_in_ref, columns=['filename', 'ra', 'dec'])
            missing_csv = os.path.join(LABEL_DIR, f'missing_in_ref_{set_name}.csv')
            missing_df.to_csv(missing_csv, index=False)
            print(f"Saved list of missing images to {missing_csv}")

In [26]:
# Main execution: build the merged reference table, then write one label
# array per split. ref_df is left in the notebook namespace for later cells.
if __name__ == "__main__":
    print("Loading reference catalogs...")
    ref_df = load_reference_catalogs()
    
    print("\nGenerating label arrays...")
    create_label_arrays(ref_df)
    
    print("\nAll label arrays created successfully!")

Loading reference catalogs...
Loaded 19995 LSB galaxies
Loaded 20000 artifacts
Total reference objects: 39216

Generating label arrays...

Processing train set
Found 29580 images


Processing images: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29580/29580 [00:10<00:00, 2885.03it/s]



Saved 29580 labels to ../Datasets_DeepShadows/Galaxies_data/y_train.npy
Galaxies (LSB): 15210 (51.42%)
Artifacts: 14370 (48.58%)
Images not found in reference: 5
Saved list of missing images to ../Datasets_DeepShadows/Galaxies_data/missing_in_ref_train.csv

Processing val set
Found 4990 images


Processing images: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4990/4990 [00:01<00:00, 3000.50it/s]



Saved 4990 labels to ../Datasets_DeepShadows/Galaxies_data/y_val.npy
Galaxies (LSB): 2595 (52.00%)
Artifacts: 2395 (48.00%)
Images not found in reference: 1
Saved list of missing images to ../Datasets_DeepShadows/Galaxies_data/missing_in_ref_val.csv

Processing test set
Found 4983 images


Processing images: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4983/4983 [00:01<00:00, 2916.94it/s]


Saved 4983 labels to ../Datasets_DeepShadows/Galaxies_data/y_test.npy
Galaxies (LSB): 2645 (53.08%)
Artifacts: 2338 (46.92%)
Images not found in reference: 0

All label arrays created successfully!



