In [None]:

# IMPORTS AND PARAMETERS

import os
import glob
import shutil
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

#  Paths (all relative to the notebook's working directory)
DATA_DIR    = Path("../data")
RESULTS_DIR = Path("../results")

NPZ_DIR  = DATA_DIR / "Tinamidae_5s_npz"                       # per-audio embeddings (.npz)
WAV_DIR  = DATA_DIR / "Tinamidae_5s"                           # source WAV files
ASR_CSV  = RESULTS_DIR / "asr_MCC_OU_nodes_high-level.csv"     # ASR node embeddings (CSV)
OUT_ROOT = RESULTS_DIR / "asr_KNN"                             # root folder for KNN outputs

#  KNN parameters
K_NEIGHBORS = 100        # number of neighbours retrieved per ASR node
METRIC      = "cosine"   # distance metric handed to NearestNeighbors


In [None]:

# LOAD TIP EMBEDDINGS FROM NPZ
#
# Builds:
#   X         -- float32 matrix with one flattened embedding per .npz file
#   wav_paths -- object array of WAV paths, row-aligned with X

npz_files = sorted(NPZ_DIR.rglob("*.npz"))
print("NPZ files:", len(npz_files))

if not npz_files:
    # Fail early with a clear message instead of an opaque np.stack error below.
    raise FileNotFoundError(f"No .npz files found under {NPZ_DIR}")

X_list = []
wav_paths = []

for f in npz_files:
    # np.load on an .npz returns an NpzFile that keeps the archive open;
    # the context manager closes it so we do not leak one handle per file.
    with np.load(f) as d:
        # astype() makes a copy, so the array stays valid after the archive closes
        emb = d['embedding'].astype(np.float32).ravel()
    X_list.append(emb)

    # Mirror the .npz path (relative to NPZ_DIR) under WAV_DIR with a .wav suffix
    wav_path = WAV_DIR / f.relative_to(NPZ_DIR).with_suffix(".wav")
    wav_paths.append(wav_path)

# np.stack raises ValueError if the embeddings do not all share one length
X = np.stack(X_list, axis=0)  # (n_tips, embedding_dim)
wav_paths = np.array(wav_paths, dtype=object)

print("X shape:", X.shape)
print("Example wav path:", wav_paths[0])
print("Example wav exists:", wav_paths[0], Path(wav_paths[0]).exists())


In [None]:

# LOAD ASR TARGETS FROM CSV
#
# Builds:
#   node_ids -- node labels as strings, one per CSV row
#   Y        -- float32 matrix holding every non-'node' column, row-aligned with node_ids

df_asr = pd.read_csv(ASR_CSV)
print("ASR rows:", len(df_asr), "| cols:", len(df_asr.columns))
print(df_asr.head())

# Every column other than 'node' is treated as one embedding dimension
emb_cols = [col for col in df_asr.columns if col != "node"]
node_ids = df_asr["node"].astype(str).values
Y = df_asr[emb_cols].to_numpy(dtype=np.float32)  # (n_nodes, n_embedding_columns)

print("Y shape:", Y.shape)
print("First node:", node_ids[0])


In [None]:
# COMPUTE K-NN AND SAVE RESULTS
#
# For every ASR node (row of Y) find its nearest tip embeddings (rows of X)
# and write the neighbour indices to a CSV.

# Create the output directory for the KNN results
out_knn_dir = OUT_ROOT / "KNN_results"
out_knn_dir.mkdir(parents=True, exist_ok=True)

# Give an actionable error when the ASR CSV carries extra or missing columns
if X.shape[1] != Y.shape[1]:
    raise ValueError(
        f"Embedding dimension mismatch: tips have {X.shape[1]} features, "
        f"ASR nodes have {Y.shape[1]}. Check the columns of {ASR_CSV}."
    )

# kneighbors() raises if n_neighbors exceeds the number of fitted samples,
# so clamp k to the number of available tips.
k_eff = min(K_NEIGHBORS, X.shape[0])
if k_eff < K_NEIGHBORS:
    print(f"WARNING: only {X.shape[0]} tips available; using k={k_eff} instead of {K_NEIGHBORS}")

# Fit the neighbour index on the tip embeddings
knn = NearestNeighbors(n_neighbors=k_eff, metric=METRIC)
knn.fit(X)

# Query with the ASR nodes: distances and indices are both (n_nodes, k_eff)
distances, indices = knn.kneighbors(Y)

# One row per node; knn_1 is the closest tip (values are row indices into X / wav_paths)
knn_df = pd.DataFrame(indices, columns=[f"knn_{i+1}" for i in range(k_eff)])
knn_df["node"] = node_ids

# Save the table as CSV
knn_csv_path = out_knn_dir / "knn_results.csv"
knn_df.to_csv(knn_csv_path, index=False)

print(f"KNN results saved to: {knn_csv_path}")
print(knn_df.head())


In [None]:
# COPY KNN WAVs PER NODE
#
# For each ASR node, copy its neighbour WAVs into OUT_ORIG/node_<id>/ with the
# rank and distance encoded in the file name, and write a manifest CSV.

OUT_ORIG = OUT_ROOT / "NN_AUDIO" / "ORIG" / "ASR"
OUT_ORIG.mkdir(parents=True, exist_ok=True)

MANIFEST_COLUMNS = ["node", "rank", "tip_index", "distance", "src_wav", "dst_wav"]

rows = []
missing = []  # neighbour WAVs that could not be found on disk

for i, node in enumerate(node_ids):
    node_dir = OUT_ORIG / f"node_{node}"
    node_dir.mkdir(parents=True, exist_ok=True)

    # r is the 1-based neighbour rank, j the tip row index, d the distance
    for r, (j, d) in enumerate(zip(indices[i], distances[i]), start=1):
        src = Path(wav_paths[j])
        if not src.is_file():
            # A single missing WAV must not abort the run and lose the manifest;
            # record it and carry on with the remaining neighbours.
            missing.append(src)
            continue

        # The parent folder name is used as a label in the file name
        sp  = src.parent.name.replace(" ", "_")
        dst = node_dir / f"{r:03d}_d{d:.6f}_{sp}_{src.stem}.wav"
        shutil.copy2(src, dst)

        rows.append({
            "node": node,
            "rank": r,
            "tip_index": int(j),
            "distance": float(d),
            "src_wav": str(src),
            "dst_wav": str(dst),
        })

# Explicit columns keep the CSV header stable even when nothing was copied
manifest = pd.DataFrame(rows, columns=MANIFEST_COLUMNS)
manifest_path = OUT_ORIG / "neighbors_manifest_ORIG.csv"
manifest.to_csv(manifest_path, index=False)

if missing:
    print(f"WARNING: {len(missing)} neighbor WAVs were missing and skipped; e.g. {missing[0]}")

print("Saved:", manifest_path)
print("Example node folder:", OUT_ORIG / f"node_{node_ids[0]}")
