# Definición del Problema
## Objetivo: Clasificar segmentos de audio como "música" o "no música".

## Métricas principales:
- Precision (evitar falsos positivos en música, ej: no etiquetar ruido como música).
- Recall (capturar la mayor cantidad de música real).
- F1-score (balance entre ambas).

## Dataset:
- Número de ejemplos (14,661 no-música / 7,499 música).
- Features: Embeddings de audio (shape [seq_length, 128]).

## Librerías

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
import os
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from tensorflow.keras.metrics import Metric
from tensorflow.keras.models import load_model
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_curve, classification_report, confusion_matrix, f1_score
from sklearn.model_selection import GridSearchCV, train_test_split, learning_curve
from sklearn.preprocessing import StandardScaler

## Paths

In [50]:
# === Descarga local temporalmente ===
balanced_train_segments_path = "c:/Users/sbrxb/waveled/data/raw/balanced_train_segments.csv"
ontology_path = "c:/Users/sbrxb/waveled/data/raw/ontology.json"
dir_bal_train = "c:/Users/sbrxb/waveled/data/raw/bal_train" 
dir_eval = "c:/Users/sbrxb/waveled/data/raw/eval" 
class_labels_indices_path = "c:/Users/sbrxb/waveled/data/raw/class_labels_indices.csv"

## Dataframes

In [51]:
with open(balanced_train_segments_path, "r", encoding="utf-8") as f:
    lines = f.readlines()

# Leer el archivo línea por línea y reconstruir las filas correctamente
fixed_rows = []
for line in lines[3:]:  
    parts = line.strip().split(",")  
    if len(parts) >= 4:  
        fixed_rows.append([parts[0], parts[1], parts[2], ",".join(parts[3:])]) 

# Crear un DataFrame
df_segments = pd.DataFrame(fixed_rows, columns=["YTID", "start_seconds", "end_seconds", "positive_labels"])

In [52]:
# Cargar el JSON ontology
with open(ontology_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Crear el DataFrame
df_ontology = pd.DataFrame(data)

keywords = ["music", "musical", "song", "instrument", "singing"]

# Crear expresión regular con OR entre palabras
pattern = "|".join(keywords)

# Crear la columna is_music basada en las palabras clave
df_ontology["is_music"] = df_ontology["name"].str.lower().str.contains(pattern).astype(int)

df_ontology.head()

Unnamed: 0,id,name,description,citation_uri,positive_examples,child_ids,restrictions,is_music
0,/m/0dgw9r,Human sounds,Sounds produced by the human body through the ...,,[],"[/m/09l8g, /m/01w250, /m/09hlz4, /m/0bpl036, /...",[abstract],0
1,/m/09l8g,Human voice,The human voice consists of sound made by a hu...,http://en.wikipedia.org/wiki/Human_voice,[],"[/m/09x0r, /m/07p6fty, /m/03qc9zr, /m/02rtxlg,...",[abstract],0
2,/m/09x0r,Speech,Speech is the vocalized form of human communic...,http://en.wikipedia.org/wiki/Speech,"[youtu.be/8uI9H5jGRV8?start=30&end=40, youtu.b...","[/m/05zppz, /m/02zsn, /m/0ytgt, /m/01h8n0, /m/...",[],0
3,/m/05zppz,"Male speech, man speaking",Speech uttered by an adult male human.,,"[youtu.be/6niRPYpLOpQ?start=30&end=40, youtu.b...",[],[],0
4,/m/02zsn,"Female speech, woman speaking",Speech uttered by an adult female human.,,"[youtu.be/4l05nCOnIRg?start=30&end=40, youtu.b...",[],[],0


In [53]:
# Cargar el CSV en un DataFrame
df_class_labels_indices = pd.read_csv(class_labels_indices_path)

df_class_labels_indices.head()

Unnamed: 0,index,mid,display_name
0,0,/m/09x0r,Speech
1,1,/m/05zppz,"Male speech, man speaking"
2,2,/m/02zsn,"Female speech, woman speaking"
3,3,/m/0ytgt,"Child speech, kid speaking"
4,4,/m/01h8n0,Conversation


In [54]:
# Merge entre 'class_labels_indices' y 'ontology'
df_ontology_labels = pd.merge(df_class_labels_indices, df_ontology, left_on='mid', right_on='id', how='left')

# Eliminamos las columnas 'mid' y 'display_name'
df_ontology_labels = df_ontology_labels.drop(columns=['mid', 'display_name'])

# Diccionario con los índices como claves y las id como valores
id_labels_dict = df_ontology_labels.set_index('index')['id'].to_dict()

# Establecemos 'index' como índice
df_ontology_labels.set_index('index', inplace=True)

df_ontology_labels.head()

Unnamed: 0_level_0,id,name,description,citation_uri,positive_examples,child_ids,restrictions,is_music
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,/m/09x0r,Speech,Speech is the vocalized form of human communic...,http://en.wikipedia.org/wiki/Speech,"[youtu.be/8uI9H5jGRV8?start=30&end=40, youtu.b...","[/m/05zppz, /m/02zsn, /m/0ytgt, /m/01h8n0, /m/...",[],0
1,/m/05zppz,"Male speech, man speaking",Speech uttered by an adult male human.,,"[youtu.be/6niRPYpLOpQ?start=30&end=40, youtu.b...",[],[],0
2,/m/02zsn,"Female speech, woman speaking",Speech uttered by an adult female human.,,"[youtu.be/4l05nCOnIRg?start=30&end=40, youtu.b...",[],[],0
3,/m/0ytgt,"Child speech, kid speaking","Speech uttered by a human child, i.e. a human ...",,"[youtu.be/iPIGoScYduI?start=210&end=220, youtu...",[],[],0
4,/m/01h8n0,Conversation,"Interactive, spontaneous spoken communication ...",http://en.wikipedia.org/wiki/Conversation,"[youtu.be/4FQxw_49xAk?start=30&end=40, youtu.b...",[],[],0


# Funciones para Preprocesamiento

In [55]:
@tf.function(reduce_retracing=True)
def parse_music_example(example_proto, music_ids, id_labels_dict, seq_length=10):
    """
    Procesa un ejemplo de música serializado en formato TFRecord.
    
    Args:
        example_proto: Ejemplo serializado en formato TFRecord
        music_ids: Tensor con los IDs de música que queremos detectar
        id_labels_dict: Diccionario que mapea índices numéricos a IDs semánticos
        seq_length: Longitud máxima de la secuencia de audio (default: 10)
        
    Returns:
        audio_embeddings: Tensor con los embeddings de audio normalizados y paddeados [seq_length, 128]
        is_music: Tensor float32 (0.0 o 1.0) indicando si el ejemplo contiene alguna música de interés
    """
    # Convertir el diccionario a tensor constante
    id_labels_tensor = tf.constant(list(id_labels_dict.values()))

    # Definición de características del TFRecord
    context_features = {
        "video_id": tf.io.FixedLenFeature([], tf.string),
        "labels": tf.io.VarLenFeature(tf.int64)
    }
    sequence_features = {
        "audio_embedding": tf.io.FixedLenSequenceFeature([], tf.string)
    }

    # Parsear el ejemplo protobuf
    context, sequences = tf.io.parse_single_sequence_example(
        example_proto, context_features=context_features, sequence_features=sequence_features
    )

    # Procesamiento de embeddings de audio
    audio_embeddings = tf.io.decode_raw(sequences['audio_embedding'], tf.uint8)
    audio_embeddings = tf.reshape(audio_embeddings, [-1, 128])
    audio_embeddings = (tf.cast(audio_embeddings, tf.float32) - 127.5) / 127.5
    audio_embeddings = audio_embeddings[:seq_length]
    padding = [[0, seq_length - tf.shape(audio_embeddings)[0]], [0, 0]]
    audio_embeddings = tf.pad(audio_embeddings, padding)
    audio_embeddings.set_shape([seq_length, 128])

    # Procesamiento de etiquetas
    labels = tf.sparse.to_dense(context['labels'])

    # Convertir labels a IDs y comparar con music_ids
    id_labels = tf.gather(id_labels_tensor, labels)
    
    # Verificar si alguna etiqueta coincide con music_ids
    is_music = tf.reduce_any(tf.equal(tf.expand_dims(id_labels, -1), music_ids))
    
    return audio_embeddings, tf.cast(is_music, tf.float32)

In [56]:
def create_dataset(tfrecord_dir, music_ids, batch_size=32, seq_length=10, is_training=True):
    """
    Crea un pipeline de datos TensorFlow configurable para entrenamiento o evaluación.
    
    Args:
        tfrecord_dir: Ruta al directorio con archivos .tfrecord
        music_ids: Lista de IDs de música a detectar
        batch_size: Tamaño del batch (default: 32)
        seq_length: Longitud de secuencia para embeddings de audio (default: 10)
        is_training: Bool (True para dataset de entrenamiento, False para test) (default: True)
        
    Returns:
        Dataset configurado con la estructura:
        - audio_embeddings: [batch_size, seq_length, 128]
        - is_music: [batch_size] (0.0 o 1.0)
    """
    # Validación de archivos
    tfrecord_files = tf.io.gfile.glob(os.path.join(tfrecord_dir, "*.tfrecord"))
    if not tfrecord_files:
        raise ValueError(f"No se encontraron archivos .tfrecord en {tfrecord_dir}")
        
    # Preprocesamiento de music_ids (convertir a tensor constante una sola vez)
    music_ids_tensor = tf.constant([str(id) for id in music_ids], dtype=tf.string)
    
    # Crear dataset base desde los archivos TFRecord
    dataset = tf.data.TFRecordDataset(tfrecord_files, num_parallel_reads=tf.data.AUTOTUNE)
    
    # Definir y aplicar función de parseo
    parse_fn = lambda x: parse_music_example(x, music_ids_tensor, id_labels_dict, seq_length)
    dataset = dataset.map(parse_fn, num_parallel_calls=tf.data.AUTOTUNE)
    
    '''
    # Configuración específica por modo
    if is_training:
        # Aumento de datos (por revisar)
    '''
    
    # Creación de batches
    dataset = dataset.batch(batch_size)
    
    # Prefetch para ambos modos
    return dataset.prefetch(tf.data.AUTOTUNE)

# Modelos y Ajuste de Hiperparámetros

In [64]:
music_ids = set(df_ontology_labels[df_ontology_labels["is_music"] == 1]["id"].astype(str))

# Crear dataset
full_dataset = create_dataset(
    tfrecord_dir=dir_bal_train,
    music_ids=music_ids 
)

# Tamaño del Dataset
count = 0
for batch in full_dataset:
    count += len(batch)  # Tamaño real de cada batch
    
dataset_size = count

# Dividir dataset
val_size = int(.2 * dataset_size)
train_ds = full_dataset.skip(val_size).shuffle(buffer_size=1000, seed=10)  
val_ds = full_dataset.take(val_size)

In [65]:
# Convertir TF Dataset a arrays
def dataset_to_numpy(dataset):
    X, y = [], []
    for audio_emb, label in dataset.unbatch():
        X.append(audio_emb.numpy().flatten())  # Aplanar [seq_length, 128] a [seq_length * 128]
        y.append(label.numpy())
    return np.array(X), np.array(y)

X_train, y_train = dataset_to_numpy(train_ds)
X_val, y_val = dataset_to_numpy(val_ds)

In [68]:
# Normalización
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

## Regresión Logística

In [69]:
# Búsqueda de hiperparámetros
params_lr = {
    'C': [0.01, 0.1, 1, 10], 
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

logreg = LogisticRegression(class_weight='balanced', random_state=10)
grid_lr = GridSearchCV(logreg, params_lr, cv=3, scoring='f1')
grid_lr.fit(X_train_scaled, y_train)

print("Mejores parámetros (Regresión Logística):", grid_lr.best_params_)

Mejores parámetros (Regresión Logística): {'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'}


## Máquinas de Vectores de Soporte (SVM)

In [None]:
params_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

svm = SVC(class_weight='balanced', random_state=10)
grid_svm = GridSearchCV(svm, params_svm, cv=3, scoring='f1')
grid_svm.fit(X_train_scaled, y_train)

print("Mejores parámetros (SVM):", grid_svm.best_params_)

## Random Forest

In [None]:
params_rf = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

rf = RandomForestClassifier(class_weight='balanced', random_state=10)
grid_rf = GridSearchCV(rf, params_rf, cv=3, scoring='f1')
grid_rf.fit(X_train, y_train)  

print("Mejores parámetros (Random Forest):", grid_rf.best_params_)

## Gradient Boosting (XGBoost)

In [None]:
params_xgb = {
    'n_estimators': [50, 100],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 6],
    'subsample': [0.8, 1.0]
}

xgb = XGBClassifier(
    scale_pos_weight=len(y_train[y_train == 0]) / len(y_train[y_train == 1]),  
    random_state=10
)
grid_xgb = GridSearchCV(xgb, params_xgb, cv=3, scoring='f1')
grid_xgb.fit(X_train, y_train)  

print("Mejores parámetros (XGBoost):", grid_xgb.best_params_)

## Evaluación y Comparación de Modelos

In [None]:
models = {
    "Regresión Logística": grid_lr.best_estimator_,
    "SVM": grid_svm.best_estimator_,
    "Random Forest": grid_rf.best_estimator_,
    "XGBoost": grid_xgb.best_estimator_
}

results = []
for name, model in models.items():
    if name in ["Regresión Logística", "SVM"]:
        y_pred = model.predict(X_val_scaled)
    else:
        y_pred = model.predict(X_val)
    f1 = f1_score(y_val, y_pred)
    precision = classification_report(y_val, y_pred, output_dict=True)['1']['precision']
    recall = classification_report(y_val, y_pred, output_dict=True)['1']['recall']
    results.append({
        'Modelo': name,
        'F1-score': round(f1, 4),
        'Precision': round(precision, 4),
        'Recall': round(recall, 4)
    })

# Resultados en tabla
results_df = pd.DataFrame(results)
print(results_df)

In [None]:
models = {
    "Regresión Logística": grid_lr.best_estimator_,
    "SVM": grid_svm.best_estimator_,
    "Random Forest": grid_rf.best_estimator_,
    "XGBoost": grid_xgb.best_estimator_
}

plt.figure(figsize=(12, 8))
colors = ['blue', 'green', 'red', 'purple']
linestyles = ['-', '--', '-.', ':']

# Iterar sobre cada modelo
for i, (name, model) in enumerate(models.items()):
    # Usar X_train_scaled para modelos que requieren escalado
    X_train_plot = X_train_scaled if name in ["Regresión Logística", "SVM"] else X_train
    
    # Calcular curva de aprendizaje
    train_sizes, train_scores, val_scores = learning_curve(
        estimator=model,
        X=X_train_plot,
        y=y_train,
        cv=3,
        scoring='f1',
        train_sizes=np.linspace(0.1, 1.0, 5),
        n_jobs=-1
    )
    
    # Calcular media y desviación estándar
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    val_mean = np.mean(val_scores, axis=1)
    val_std = np.std(val_scores, axis=1)
    
    # Graficar
    plt.plot(train_sizes, train_mean, 
             color=colors[i], linestyle=linestyles[i], 
             marker='o', label=f'{name} (Train)')
    plt.fill_between(train_sizes, 
                    train_mean - train_std,
                    train_mean + train_std,
                    alpha=0.1, color=colors[i])
    
    plt.plot(train_sizes, val_mean, 
             color=colors[i], linestyle=linestyles[i], 
             marker='s', label=f'{name} (Val)')
    plt.fill_between(train_sizes,
                    val_mean - val_std,
                    val_mean + val_std,
                    alpha=0.1, color=colors[i])

plt.title('Curvas de Aprendizaje Comparativas', fontsize=14)
plt.xlabel('Tamaño del Conjunto de Entrenamiento', fontsize=12)
plt.ylabel('F1-Score', fontsize=12)
plt.legend(loc='best', fontsize=10)
plt.grid(True, linestyle='--', alpha=0.7)
plt.ylim(0.5, 1.0) 
plt.show()

# Modelo regresión logística PRUEBAS

In [57]:
music_ids = set(df_ontology_labels[df_ontology_labels["is_music"] == 1]["id"].astype(str))

# Crear dataset
full_dataset = create_dataset(
    tfrecord_dir=dir_bal_train,
    music_ids=music_ids 
)

# Tamaño del Dataset
count = 0
for batch in full_dataset:
    count += len(batch)  # Tamaño real de cada batch
    
dataset_size = count

# Dividir dataset
val_size = int(.2 * dataset_size)
train_ds = full_dataset.skip(val_size).shuffle(buffer_size=1000, seed=10)  
val_ds = full_dataset.take(val_size)

# Verificar tamaños
print(f"Total ejemplos: {dataset_size}")
print(f"Entrenamiento: {dataset_size - val_size}")
print(f"Validación: {val_size}")

Total ejemplos: 1386
Entrenamiento: 1109
Validación: 277


In [58]:
# Construir modelo
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(10, 128)),
    tf.keras.layers.Dense(64, activation='relu', kernel_regularizer='l2'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

  super().__init__(**kwargs)


In [59]:
# Compilar
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', 
             tf.keras.metrics.Precision(name='precision'),
             tf.keras.metrics.Recall(name='recall')]
)

In [60]:
# Conteo de clases
pos = 0
neg = 0
for _, y in full_dataset.unbatch():  
    if y.numpy() == 1.0:
        pos += 1
    else:
        neg += 1

print("\n=== Distribución de Clases ===")
print(f"Ejemplos positivos (música): {pos}")
print(f"Ejemplos negativos (NO música): {neg}")

total_samples = pos + neg 
class_weight = {
    0: total_samples / (2 * neg),  # Peso para NO música
    1: total_samples / (2 * pos)    # Peso para música
}
print(f"\nPesos de clase para balanceo: {class_weight}")


=== Distribución de Clases ===
Ejemplos positivos (música): 7499
Ejemplos negativos (NO música): 14661

Pesos de clase para balanceo: {0: 0.7557465384353046, 1: 1.477530337378317}


In [61]:
# Entrenar
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=10,
    class_weight=class_weight 
)

Epoch 1/10
    403/Unknown [1m4s[0m 3ms/step - accuracy: 0.7664 - loss: 1.1845 - precision: 0.6291 - recall: 0.8257



[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.7681 - loss: 1.1735 - precision: 0.6310 - recall: 0.8259 - val_accuracy: 0.8395 - val_loss: 0.5433 - val_precision: 0.7312 - val_recall: 0.8457
Epoch 2/10
[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.8565 - loss: 0.4913 - precision: 0.7471 - recall: 0.8649 - val_accuracy: 0.8447 - val_loss: 0.4913 - val_precision: 0.7414 - val_recall: 0.8444
Epoch 3/10
[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.8593 - loss: 0.4563 - precision: 0.7524 - recall: 0.8657 - val_accuracy: 0.8430 - val_loss: 0.4956 - val_precision: 0.7411 - val_recall: 0.8375
Epoch 4/10
[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8587 - loss: 0.4519 - precision: 0.7481 - recall: 0.8764 - val_accuracy: 0.8450 - val_loss: 0.4837 - val_precision: 0.7444 - val_recall: 0.8388
Epoch 5/10
[1m416/416[0m [32m━━━

In [62]:
# Crear dataset de test
test_ds = create_dataset(
    tfrecord_dir=dir_eval,  
    music_ids=music_ids,
    batch_size=32
)

In [None]:
y_true = []
y_probs = []  

for x, y in test_ds.unbatch():
    y_true.append(y.numpy())
    y_probs.append(model.predict(tf.expand_dims(x, axis=0), verbose=0)[0][0])

y_true = np.array(y_true)
y_probs = np.array(y_probs)

# Umbral óptimo
fpr, tpr, thresholds = roc_curve(y_true, y_probs)
optimal_idx = np.argmax(tpr - fpr)  
optimal_threshold = thresholds[optimal_idx]

print(f"\nUmbral óptimo según ROC: {optimal_threshold:.4f}")

y_pred_optimal = (y_probs > optimal_threshold).astype(int)

# Métricas con el nuevo umbral
print("\n=== Métricas con Umbral Óptimo ===")
print(classification_report(y_true, y_pred_optimal, target_names=["No música", "Música"]))

# Comparación con el umbral por defecto (0.5)
print("\n=== Métricas con Umbral=0.5 ===")
print(classification_report(y_true, (y_probs > 0.5).astype(int), target_names=["No música", "Música"]))

# Matriz de confusión
conf_mat = confusion_matrix(y_true, y_pred_optimal)

plt.figure(figsize=(8, 6))
sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Predicción: No música', 'Predicción: Música'],
            yticklabels=['Real: No música', 'Real: Música'])
plt.title('Matriz de Confusión - Umbral Óptimo')
plt.ylabel('Etiqueta Real')
plt.xlabel('Etiqueta Predicha')
plt.show()

# Para el umbral por defecto (0.5)
conf_mat_default = confusion_matrix(y_true, (y_probs > 0.5).astype(int))

plt.figure(figsize=(8, 6))
sns.heatmap(conf_mat_default, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Predicción: No música', 'Predicción: Música'],
            yticklabels=['Real: No música', 'Real: Música'])
plt.title('Matriz de Confusión - Umbral=0.5')
plt.ylabel('Etiqueta Real')
plt.xlabel('Etiqueta Predicha')
plt.show()


Umbral óptimo según ROC: 0.4768

=== Métricas con Umbral Óptimo ===
              precision    recall  f1-score   support

   No música       0.92      0.83      0.87     13612
      Música       0.71      0.85      0.77      6759

    accuracy                           0.84     20371
   macro avg       0.82      0.84      0.82     20371
weighted avg       0.85      0.84      0.84     20371


Matriz de Confusión:
[[11313  2299]
 [ 1029  5730]]

=== Métricas con Umbral=0.5 ===
              precision    recall  f1-score   support

   No música       0.91      0.84      0.87     13612
      Música       0.72      0.84      0.77      6759

    accuracy                           0.84     20371
   macro avg       0.82      0.84      0.82     20371
weighted avg       0.85      0.84      0.84     20371

