## Librerías 

In [31]:
import tensorflow as tf
import os
import pandas as pd
import json

## Paths

In [34]:
balanced_train_segments_path = "c:/Users/sbrxb/waveled/data/raw/balanced_train_segments.csv"
ontology_path = "c:/Users/sbrxb/waveled/data/raw/ontology.json"
dir_bal_train = "c:/Users/sbrxb/waveled/data/raw/bal_train" 

## Dataframe de balanced train segments

In [28]:
with open(balanced_train_segments_path, "r", encoding="utf-8") as f:
    lines = f.readlines()

# Leer el archivo línea por línea y reconstruir las filas correctamente
fixed_rows = []
for line in lines[3:]:  # Omitir los primeros 3 encabezados
    parts = line.strip().split(",")  # Separar por comas
    if len(parts) >= 4:  # Asegurar que haya al menos 4 partes
        fixed_rows.append([parts[0], parts[1], parts[2], ",".join(parts[3:])])  # Unir la última parte como string

# Crear un DataFrame manualmente
df_segments = pd.DataFrame(fixed_rows, columns=["YTID", "start_seconds", "end_seconds", "positive_labels"])

# Mostrar las primeras filas
df_segments.head()

Unnamed: 0,YTID,start_seconds,end_seconds,positive_labels
0,--PJHxphWEs,30.0,40.0,"""/m/09x0r,/t/dd00088"""
1,--ZhevVpy1s,50.0,60.0,"""/m/012xff"""
2,--aE2O5G5WE,0.0,10.0,"""/m/03fwl,/m/04rlf,/m/09x0r"""
3,--aO5cdqSAg,30.0,40.0,"""/t/dd00003,/t/dd00005"""
4,--aaILOrkII,200.0,210.0,"""/m/032s66,/m/073cg4"""


Estructura de los datos:

YTID: ID del video en YouTube.

start_seconds: Segundo de inicio del segmento de audio.

end_seconds: Segundo de fin del segmento de audio.

positive_labels: Etiquetas asociadas al audio.

## Dataframe de ontology JSON

The AudioSet ontology is a collection of sound events organized in a hierarchy. The ontology covers a wide range of everyday sounds, from human and animal sounds, to natural and environmental sounds, to musical and miscellaneous sounds.

In [21]:
# Cargar el JSON ontology
with open(ontology_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Crear el DataFrame
df_ontology = pd.DataFrame(data)

# Filtrar por la palabra 'music' en la columna 'name'
df_ontology = df_ontology[df_ontology["name"].str.contains("music", case=False, na=False)]

df_ontology.head(5)


Unnamed: 0,id,name,description,citation_uri,positive_examples,child_ids,restrictions
156,/m/04rlf,Music,Music is an art form and cultural activity who...,http://en.wikipedia.org/wiki/Music,"[youtu.be/E83fRA1wxKU?start=420&end=430, youtu...","[/m/04szw, /m/0kpv1t, /t/dd00027, /t/dd00028, ...",[]
157,/m/04szw,Musical instrument,Sounds specifically associated with instrument...,http://en.wikipedia.org/wiki/Musical_instrument,"[youtu.be/7umb9yJpkKU?start=70&end=80, youtu.b...","[/m/0fx80y, /m/05148p4, /m/0l14md, /m/05pd6, /...",[]
171,/m/05148p4,Keyboard (musical),Sounds of a musical instrument played using a ...,http://en.wikipedia.org/wiki/Keyboard_instrument,"[youtu.be/11bWvQaQhrM?start=440&end=450, youtu...","[/m/05r5c, /m/013y1f, /m/0l14qv, /m/03q5t]",[]
244,/m/05229,Musical ensemble,The sound of a group of people who perform ins...,http://en.wikipedia.org/wiki/Musical_ensemble,[],[],[blacklist]
247,/m/0kpv1t,Music genre,Portmanteau class holding categories that repr...,,[],"[/m/064t9, /m/0glt670, /m/06by7, /m/06j6l, /m/...",[abstract]


## Dataframe de segments con sólo ids de música

In [30]:
ids_music = set(df_ontology["id"])

df_segments_filtered = df_segments[df_segments["positive_labels"].apply(lambda x: any(id_ in x.split(",") for id_ in ids_music))]
df_segments_filtered


Unnamed: 0,YTID,start_seconds,end_seconds,positive_labels
2,--aE2O5G5WE,0.000,10.000,"""/m/03fwl,/m/04rlf,/m/09x0r"""
11,-0SdAVK79lg,30.000,40.000,"""/m/0155w,/m/01lyv,/m/0342h,/m/042v_gx,/m/04r..."
13,-0mG4W5Hlq8,270.000,280.000,"""/m/04rlf,/m/05fw6t,/m/07r4k75,/m/09x0r,/m/0y..."
17,-1TLtjPtnms,10.000,20.000,"""/m/03lty,/m/04rlf,/m/07szfh9"""
43,-5xOcMJpTUk,70.000,80.000,"""/m/018vs,/m/0342h,/m/042v_gx,/m/04rlf,/m/04s..."
...,...,...,...,...
22095,zo1D2kSxnxs,60.000,70.000,"""/m/04rlf,/m/07lnk,/m/07s72n,/m/0cfdd"""
22105,zqga01RTsB4,20.000,30.000,"""/m/0283d,/m/03mb9,/m/04rlf,/m/07gxw,/m/07lnk..."
22121,ztTVZolaOAg,80.000,90.000,"""/m/0342h,/m/04rlf,/m/04szw,/m/05r5wn,/m/0fx80y"""
22134,zw-cQWbrGBE,20.000,30.000,"""/g/122z_qxw,/m/04rlf,/m/07qsvvw"""


## Directorio bal_train con .tfrecords

In [51]:
# Obtener los YTID filtrados como un conjunto para búsqueda eficiente
ytid_filtrados = set(df_segments_filtered["YTID"])

In [52]:
# Listar los archivos .tfrecord
tfrecord_files = [os.path.join(dir_bal_train, f) for f in os.listdir(dir_bal_train) if f.endswith(".tfrecord")]

In [53]:
# Función para parsear los registros del TFRecord
def _parse_function(proto):
    feature_description = {
        "video_id": tf.io.FixedLenFeature([], tf.string),  
        "start_time_seconds": tf.io.FixedLenFeature([], tf.float32),  
        "end_time_seconds": tf.io.FixedLenFeature([], tf.float32),  
        "label": tf.io.VarLenFeature(tf.int64)  
    }
    
    parsed_features = tf.io.parse_single_example(proto, feature_description)
    
    # Convertir labels de int64 a string (si es necesario)
    labels = tf.sparse.to_dense(parsed_features["label"])
    labels = tf.strings.as_string(labels)
    
    parsed_features["label"] = labels
    return parsed_features


In [57]:

# Ruta de un archivo .tfrecord como ejemplo
tfrecord_file = dir_bal_train + "/__.tfrecord"

raw_dataset = tf.data.TFRecordDataset(tfrecord_file)

for raw_record in raw_dataset.take(1):  # Tomamos un solo registro
    example = tf.train.Example()
    example.ParseFromString(raw_record.numpy())
    print(example)


features {
  feature {
    key: "video_id"
    value {
      bytes_list {
        value: "__OPriJqvWs"
      }
    }
  }
  feature {
    key: "start_time_seconds"
    value {
      float_list {
        value: 360
      }
    }
  }
  feature {
    key: "labels"
    value {
      int64_list {
        value: 0
        value: 393
      }
    }
  }
  feature {
    key: "end_time_seconds"
    value {
      float_list {
        value: 370
      }
    }
  }
}



## Dataframe con las video ids asociadas con música

In [54]:
filtered_records = []

for file in tfrecord_files:
    raw_dataset = tf.data.TFRecordDataset(file)
    parsed_dataset = raw_dataset.map(_parse_function)

    for record in parsed_dataset:
        yt_id = record["video_id"].numpy().decode("utf-8")
        
        if yt_id in ytid_filtrados:
            filtered_records.append({
                "YTID": yt_id,
                "start_seconds": record["start_time_seconds"].numpy(),
                "end_seconds": record["end_time_seconds"].numpy(),
                "labels": [l.numpy().decode("utf-8") for l in record["label"].numpy()]
            })

In [56]:
df_filtered = pd.DataFrame(filtered_records)

# Mostrar los primeros registros
df_filtered

Unnamed: 0,YTID,start_seconds,end_seconds,labels
0,--aE2O5G5WE,0.0,10.0,[]
1,-0mG4W5Hlq8,270.0,280.0,[]
2,-0SdAVK79lg,30.0,40.0,[]
3,-1TLtjPtnms,10.0,20.0,[]
4,-5xOcMJpTUk,70.0,80.0,[]
...,...,...,...,...
2266,_vwBe9ZXWXE,10.0,20.0,[]
2267,_WD9mbwAcrQ,130.0,140.0,[]
2268,_WS68gpLg7U,180.0,190.0,[]
2269,_zQTlTCqMzs,130.0,140.0,[]
