# 1.0 Creación de Dataset Inicial

## Inputs

### Conexión con Google Drive

In [None]:
# Mount Google Drive at /content/drive (Colab-only API) so that the
# /content/drive/MyDrive/... paths used later in the notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Direcciones e Inputs

In [None]:
# Split parameters and file locations
MASTER_CSV = "/content/drive/MyDrive/Proyecto_Integrador/data/dataset.csv"
OUT_DIR    = "/content/drive/MyDrive/Proyecto_Integrador/Deteccion/datasets"

# Train / validation / test fractions (they sum to 1.0)
R_TRAIN, R_VAL, R_TEST = 0.70, 0.15, 0.15

# Thresholds for the binary label; scores strictly between the two values
# are treated as ambiguous (see to_bin)
THRESH_NEG = 0.25   # <= 0.25  -> class 0 (no bar)
THRESH_POS = 0.50   # >= 0.50  -> class 1 (bar)

# Seed used for the stratified split and for NumPy's global RNG
SEED = 42

### Librerías

In [None]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GroupShuffleSplit, StratifiedShuffleSplit

# Make sure the output folder exists before any CSV is written into it.
os.makedirs(OUT_DIR, exist_ok=True)
# Seed NumPy's global RNG; np.random.choice in missing_for draws from it.
np.random.seed(SEED)


## Pre-Procesamiento

### Limpieza y etiquetas

In [None]:
# Load the master catalogue.
df = pd.read_csv(MASTER_CSV)

# Fail early if the master file lacks a column the pipeline relies on.
need = {"name","objra","objdec","Bars"}
missing = need - set(df.columns)
assert not missing, f"Faltan columnas en el maestro: {missing}"

# Normalise dtypes and add the column names used by the rest of the pipeline.
df = df.copy()
# astype(str) on its own turns a missing name into the literal string "nan",
# which dropna() below cannot detect. Mask those rows back to NaN so that
# records without an identifier are really removed.
df["image_id"] = df["name"].astype(str).where(df["name"].notna())
# Non-numeric values become NaN (errors="coerce") and are dropped below.
df["ra"]  = pd.to_numeric(df["objra"], errors="coerce")
df["dec"] = pd.to_numeric(df["objdec"], errors="coerce")
df["label_str"] = pd.to_numeric(df["Bars"], errors="coerce")

# Basic cleaning: drop rows missing any required field, then keep the first
# row for each image_id.
before = len(df)
df = df.dropna(subset=["image_id","ra","dec","label_str"]).drop_duplicates(subset=["image_id"])
print(f"Filas originales: {before} | tras limpieza: {len(df)}")

# Binary label, with a separate marker for ambiguous scores
def to_bin(x):
    """Map a numeric bar score to a class id.

    Returns 0 when ``x <= THRESH_NEG``, otherwise 1 when ``x >= THRESH_POS``,
    otherwise -1 (ambiguous). The negative threshold is tested first.
    """
    return 0 if x <= THRESH_NEG else (1 if x >= THRESH_POS else -1)

# Assign each row its class id (-1 marks an ambiguous score).
df["label_bin"] = df["label_str"].map(to_bin)

print("Recuento label_bin (incluye ambiguos):")

# Rows per class, NaN included, ordered by class value.
vc = df["label_bin"].value_counts(dropna=False).sort_index()
print(vc)



Filas originales: 10126 | tras limpieza: 10126
Recuento label_bin (incluye ambiguos):
label_bin
0    6783
1    3343
Name: count, dtype: int64


### Balance

In [None]:
# Rows with a definite class (0 or 1) versus rows flagged ambiguous (-1).
df_clean = df.loc[df["label_bin"].isin([0,1])].copy()
df_amb   = df.loc[df["label_bin"].eq(-1)].copy()

print("Limpio:", len(df_clean), "| Ambiguos:", len(df_amb))

# Class proportions among the rows with a definite class.
print("Balance limpio:")
print(df_clean["label_bin"].value_counts(normalize=True).rename("ratio"))


Limpio: 10126 | Ambiguos: 0
Balance limpio:
label_bin
0    0.66986
1    0.33014
Name: ratio, dtype: float64


## Train | Test | Val Datasets Split

### Balanceo y función

In [None]:
def split_stratified_clean(df_clean, r_train=0.7, r_val=0.15, r_test=0.15, seed=42):
    """Split a labelled frame into train / val / test, stratified on ``label_bin``.

    The split is done in two passes with ``train_test_split``: first train
    against a hold-out pool, then the pool into validation and test. Both
    passes use the same ``seed`` and stratify on the ``label_bin`` column.

    Parameters
    ----------
    df_clean : pandas.DataFrame
        Frame containing a ``label_bin`` column.
    r_train, r_val, r_test : float
        Fractions for each split; asserted to sum to 1.0 (within 1e-8).
    seed : int
        ``random_state`` handed to both ``train_test_split`` calls.

    Returns
    -------
    tuple
        ``(train, val, test)`` frames, each with a fresh 0..n-1 index.
    """
    assert abs(r_train + r_val + r_test - 1.0) < 1e-8

    # Pass 1: carve the training set off; the remainder is the hold-out pool.
    train_part, holdout = train_test_split(
        df_clean,
        test_size=(1 - r_train),
        stratify=df_clean["label_bin"],
        random_state=seed,
    )

    # Pass 2: divide the pool; the validation share is relative to the pool.
    val_share = r_val / (r_val + r_test)
    val_part, test_part = train_test_split(
        holdout,
        test_size=(1 - val_share),
        stratify=holdout["label_bin"],
        random_state=seed,
    )

    return tuple(
        part.reset_index(drop=True)
        for part in (train_part, val_part, test_part)
    )

# Split the labelled rows using the ratios and seed from the config cell.
df_train, df_val, df_test = split_stratified_clean(
    df_clean,
    r_train=R_TRAIN,
    r_val=R_VAL,
    r_test=R_TEST,
    seed=SEED,
)

# Report the size of each split and its class proportions.
print("Tamaños → train/val/test:", len(df_train), len(df_val), len(df_test))
for name, d in zip(("train", "val", "test"), (df_train, df_val, df_test)):
    balance = d["label_bin"].value_counts(normalize=True).rename("ratio")
    print(f"\n{name} balance:")
    print(balance)


Tamaños → train/val/test: 7088 1519 1519

train balance:
label_bin
0    0.669865
1    0.330135
Name: ratio, dtype: float64

val balance:
label_bin
0    0.670178
1    0.329822
Name: ratio, dtype: float64

test balance:
label_bin
0    0.669519
1    0.330481
Name: ratio, dtype: float64


###

### Guardar Dataframes

In [None]:
# Columns written to every CSV; the original "name" and "Bars" columns are
# appended when the master frame has them.
cols_out = ["image_id","label_bin","label_str","ra","dec"]
cols_out += [c for c in ("name","Bars") if c in df.columns and c not in cols_out]

# Destination files inside OUT_DIR.
train_csv = os.path.join(OUT_DIR, "train.csv")
val_csv   = os.path.join(OUT_DIR, "val.csv")
test_csv  = os.path.join(OUT_DIR, "test.csv")
amb_csv   = os.path.join(OUT_DIR, "ambiguous.csv")

# Write each frame without its index, in train / val / test / ambiguous order.
for frame, path in (
    (df_train, train_csv),
    (df_val, val_csv),
    (df_test, test_csv),
    (df_amb, amb_csv),
):
    frame[cols_out].to_csv(path, index=False)

print("Guardados:")
for path in (train_csv, val_csv, test_csv):
    print(" -", path)
print(" -", amb_csv, "(referencia)")


Guardados:
 - /content/drive/MyDrive/Proyecto_Integrador/Deteccion/datasets/train.csv
 - /content/drive/MyDrive/Proyecto_Integrador/Deteccion/datasets/val.csv
 - /content/drive/MyDrive/Proyecto_Integrador/Deteccion/datasets/test.csv
 - /content/drive/MyDrive/Proyecto_Integrador/Deteccion/datasets/ambiguous.csv (referencia)


### Revisión de faltantes en la descarga (informativa; no afecta a los CSV guardados)

In [None]:
# Folder that holds one sub-folder per band with the image files.
# NOTE(review): this is a relative path, so it resolves against the kernel's
# working directory rather than the mounted Drive; the recorded output reports
# every sampled file as missing. Confirm this is the intended location.
ROOT  = "data"
# Band sub-folders checked for each image id.
BANDS = ["g","r","z"]
# A band counts as present if a file with any of these extensions exists.
EXTS  = [".fits", ".png"]

def missing_for(df_subset, sample=200, root=None, bands=None, exts=None):
    """List the (image_id, band) pairs that have no image file on disk.

    A band counts as present for an id when ``<root>/<band>/<image_id><ext>``
    exists for at least one extension in ``exts``.

    Parameters
    ----------
    df_subset : pandas.DataFrame
        Frame with an ``image_id`` column; duplicate ids are checked once.
    sample : int or None, default 200
        Maximum number of distinct ids to check. When there are more, that
        many are drawn without replacement from NumPy's global RNG. ``None``
        checks every id.
    root : str, optional
        Base folder; defaults to the module-level ``ROOT``.
    bands : iterable of str, optional
        Band sub-folders; defaults to the module-level ``BANDS``.
    exts : iterable of str, optional
        Accepted file extensions; defaults to the module-level ``EXTS``.

    Returns
    -------
    list of tuple
        ``(image_id, band)`` for every checked id/band with no file, in id
        order and then band order.
    """
    # Fall back to the notebook-level settings only when no override is given,
    # so existing calls such as missing_for(d) behave exactly as before.
    root = ROOT if root is None else root
    bands = BANDS if bands is None else bands
    exts = EXTS if exts is None else exts

    ids = df_subset["image_id"].unique()
    if sample is not None and len(ids) > sample:
        ids = np.random.choice(ids, size=sample, replace=False)

    return [
        (img_id, band)
        for img_id in ids
        for band in bands
        if not any(
            os.path.exists(os.path.join(root, band, f"{img_id}{ext}"))
            for ext in exts
        )
    ]

# Run the on-disk check for each split and show up to five missing pairs.
for name, d in zip(("train", "val", "test"), (df_train, df_val, df_test)):
    missing = missing_for(d)
    print(f"[{name}] faltantes (muestra): {len(missing)}")
    examples = missing[:5]
    if examples:
        print("Ejemplos:", examples)


[train] faltantes (muestra): 600
Ejemplos: [('manga-8319-12705', 'g'), ('manga-8319-12705', 'r'), ('manga-8319-12705', 'z'), ('manga-8591-3704', 'g'), ('manga-8591-3704', 'r')]
[val] faltantes (muestra): 600
Ejemplos: [('manga-11745-9102', 'g'), ('manga-11745-9102', 'r'), ('manga-11745-9102', 'z'), ('manga-8449-1902', 'g'), ('manga-8449-1902', 'r')]
[test] faltantes (muestra): 600
Ejemplos: [('manga-11942-12704', 'g'), ('manga-11942-12704', 'r'), ('manga-11942-12704', 'z'), ('manga-12675-1902', 'g'), ('manga-12675-1902', 'r')]
