In [None]:
# IMPORTS Y PARÁMETROS

import os
import re
import glob
import shutil
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

# Paths (relative to the notebook's working directory)
NPZ_DIR = Path("../data/Tinamidae_5s_npz")         # per-audio embeddings (.npz)
WAV_DIR = Path("../data/Tinamidae_5s")             # WAV clips
TIPS_CSV = Path("../traits/traits_high-level.csv")  # medians (robust tips) for ASR
OUT_ROOT = Path("../results/asr_KNN")              # base output folder

# KNN parameters
K_NEIGHBORS = 100
METRIC = "cosine"

# Output folders
OUT_KNN_DIR = OUT_ROOT / "KNN_results_TIPS"
OUT_ORIG = OUT_ROOT.joinpath("NN_AUDIO", "ORIG", "TIPS")

# Create both output folders up front; a no-op when they already exist.
for _out_dir in (OUT_KNN_DIR, OUT_ORIG):
    _out_dir.mkdir(parents=True, exist_ok=True)

def _safe_name(s: str) -> str:
    s = str(s).strip().replace(" ", "_")
    s = re.sub(r"[^A-Za-z0-9._-]+", "_", s)
    return s


In [None]:
# LOAD TIP EMBEDDINGS FROM NPZ
#
# Builds:
#   X         -- one row per .npz file, holding its flattened "embedding" array
#   wav_paths -- object array of Paths, aligned row-for-row with X

npz_files = sorted(NPZ_DIR.rglob("*.npz"))
print("NPZ files:", len(npz_files))

if not npz_files:
    # Fail here with an actionable message rather than later inside np.stack.
    raise FileNotFoundError(f"No .npz files found under {NPZ_DIR}")

X_list = []
wav_paths = []

for f in npz_files:
    # np.load on an .npz returns an NpzFile that keeps the archive open;
    # the context manager releases the handle after each file.
    with np.load(f) as d:
        # astype() copies, so the vector stays valid after the archive closes.
        emb = d["embedding"].astype(np.float32).ravel()  # expected (1280,) per the original note
    X_list.append(emb)

    # The WAV tree mirrors the NPZ tree: same relative path, ".wav" suffix.
    wav_path = WAV_DIR / f.relative_to(NPZ_DIR).with_suffix(".wav")
    wav_paths.append(wav_path)

X = np.stack(X_list, axis=0)  # (n_wavs, embedding_dim)
wav_paths = np.array(wav_paths, dtype=object)

print("X shape:", X.shape)
print("Example wav:", wav_paths[0], "| exists:", Path(wav_paths[0]).exists())


In [None]:
# LOAD TARGETS (MEDIANS) FROM traits_high-level.csv

df_tips = pd.read_csv(TIPS_CSV)
n_tip_rows, n_tip_cols = df_tips.shape
print("TIPS rows:", n_tip_rows, "| cols:", n_tip_cols)
print(df_tips.head())

# Detect the ID column (species/tip): the first known name present in the CSV wins.
candidate_id_cols = ["species", "sp", "taxon", "tip", "label", "name"]
id_col = next((name for name in candidate_id_cols if name in df_tips.columns), None)

if id_col is None:
    # Fallback: first non-numeric column.
    nonnum = [
        col for col in df_tips.columns
        if not pd.api.types.is_numeric_dtype(df_tips[col])
    ]
    if not nonnum:
        raise ValueError("No encontré columna ID (species/sp/...) y todas las columnas parecen numéricas.")
    id_col = nonnum[0]

tip_ids = df_tips[id_col].astype(str).values

# Embedding columns: every numeric column except the ID column.
emb_cols = [
    col for col in df_tips.columns
    if col != id_col and pd.api.types.is_numeric_dtype(df_tips[col])
]

Y = df_tips.loc[:, emb_cols].to_numpy(dtype=np.float32)  # (n_tips, n_numeric_cols)
print("ID col:", id_col)
print("Y shape:", Y.shape)
print("First tip:", tip_ids[0])


In [None]:
# COMPUTE K-NN (TIPS) AND SAVE CSV
#
# Fits a nearest-neighbour index on the per-WAV embeddings (X) and queries it
# with the per-tip vectors (Y). The CSV stores, for each tip, the row indices
# into X / wav_paths of its neighbours, nearest first.

# Query vectors must live in the same space as the fitted embeddings.
if Y.shape[1] != X.shape[1]:
    raise ValueError(
        f"Tip vectors have {Y.shape[1]} features but WAV embeddings have {X.shape[1]}; "
        "check which numeric columns were selected from the tips CSV."
    )

# kneighbors() raises when n_neighbors exceeds the number of fitted samples,
# so cap k at the number of available embeddings.
n_neighbors = min(K_NEIGHBORS, X.shape[0])
if n_neighbors < K_NEIGHBORS:
    print(f"Only {X.shape[0]} embeddings available; using k={n_neighbors} instead of {K_NEIGHBORS}")

knn = NearestNeighbors(n_neighbors=n_neighbors, metric=METRIC)
knn.fit(X)

distances, indices = knn.kneighbors(Y)  # each (n_tips, n_neighbors)

# Column labels follow the effective k so they always match `indices`.
knn_df = pd.DataFrame(indices, columns=[f"knn_{i+1}" for i in range(n_neighbors)])
knn_df.insert(0, "tip", tip_ids)

knn_csv_path = OUT_KNN_DIR / "knn_results_TIPS.csv"
knn_df.to_csv(knn_csv_path, index=False)

print("Saved:", knn_csv_path)
print(knn_df.head())


In [None]:
# COPY KNN WAVs PER TIP
#
# For every tip, copies its neighbour WAVs into OUT_ORIG/tip_<name>/ using the
# file name "<rank>_d<distance>_<species-folder>_<original-stem>.wav", and
# records each copy in a manifest CSV. Source files that do not exist are
# counted and skipped.

# Explicit schema so the manifest keeps its header even when nothing is copied.
manifest_columns = ["tip", "rank", "wav_index", "distance", "src_wav", "dst_wav"]

rows = []
missing = 0

for i, tip in enumerate(tip_ids):
    tip_dirname = _safe_name(tip)
    tip_dir = OUT_ORIG / f"tip_{tip_dirname}"
    tip_dir.mkdir(parents=True, exist_ok=True)

    # r is the 1-based neighbour rank; j indexes wav_paths / X; d is the distance.
    for r, (j, d) in enumerate(zip(indices[i], distances[i]), start=1):
        src = Path(wav_paths[j])
        if not src.exists():
            missing += 1
            continue

        # The parent folder name of the WAV is used as the species label.
        sp  = src.parent.name.replace(" ", "_")
        dst = tip_dir / f"{r:03d}_d{d:.6f}_{sp}_{src.stem}.wav"
        shutil.copy2(src, dst)

        rows.append({
            "tip": str(tip),
            "rank": int(r),
            "wav_index": int(j),
            "distance": float(d),
            "src_wav": str(src),
            "dst_wav": str(dst),
        })

manifest = pd.DataFrame(rows, columns=manifest_columns)
manifest_path = OUT_ORIG / "neighbors_manifest_ORIG_TIPS.csv"
manifest.to_csv(manifest_path, index=False)

print("Saved:", manifest_path)
print("Missing WAVs skipped:", missing)
# Guarded so an empty tips table does not raise IndexError here.
if len(tip_ids) > 0:
    print("Example folder:", OUT_ORIG / f"tip_{_safe_name(tip_ids[0])}")
