# Libraries, paths and directories

In [8]:
import tensorflow as tf
import pandas as pd
import json
import os
import re

In [147]:
# file paths

# base data directory
base_dir = '../data/'

# raw data directory
raw_dir = os.path.join(base_dir, 'raw/')

# iterim data directory
interim_dir = os.path.join(base_dir, 'interim/')

# processed data directory
proc_dir =  os.path.join(base_dir, 'processed/')

# splits 
splits = ['bal_train/', 'eval/', 'unbal_train/']

# segments files
segments = ['balanced_train_segments.csv', 'unbalanced_train_segments.csv', 'eval_segments.csv']

# labels file path
labels_path = os.path.join(raw_dir, 'class_labels_indices.csv')

# onthology file path
ont_path = os.path.join(raw_dir, 'ontology.json')

# [raw/interim/processed][split]
data_path = {'raw': {}, 'interim': {}, 'processed': {}}  

# [raw/interim/processed][split]
data_dir = {'raw': {}, 'interim': {}, 'processed': {}}  

for i, seg in enumerate(splits):
    
    seg_rm = seg.replace('/', '')
    
    raw = os.path.join(raw_dir, splits[i])
    data_dir['raw'][seg_rm] = raw
    raw = os.path.join(raw, segments[i])
    data_path['raw'][seg_rm] = raw
    
    interim = os.path.join(interim_dir, splits[i])
    data_dir['interim'][seg_rm] = interim
    interim = os.path.join(interim, segments[i])
    data_path['interim'][seg_rm] = interim
    
    processed = os.path.join(proc_dir, splits[i])
    data_dir['processed'][seg_rm] = processed
    processed = os.path.join(processed, segments[i])
    data_path['processed'][seg_rm] = processed

In [126]:
print(f"data file: {data_path['raw']['bal_train']}")
print(f"is in the directory: {data_dir['raw']['bal_train']}")

data file: ../data/raw/bal_train/balanced_train_segments.csv
is in the directory: ../data/raw/bal_train/


# Preprocessing

## Loading data

In [127]:
# Read the file and replace problematic commas
with open(data_path['raw']['bal_train'], "r", encoding="utf-8") as f:
    # Read all lines from the input file
    lines = f.readlines()
    
    # Remove the first two rows
    lines = lines[2:]
    
    # Remove the first two characters ("# ") from the third row (now the first row after removing the first two)
    lines[0] = lines[0][2:]

    # Replace commas that are NOT followed by a space
    fixed_lines = [re.sub(r',(?! )', ';', line) for line in lines]

# Save the modified file
with open(data_path['interim']['bal_train'], "w", encoding="utf-8") as f:
    f.writelines(fixed_lines)

print(f"File has been cleaned and saved in {data_dir['interim']['bal_train']}")

File has been cleaned and saved in ../data/interim/bal_train/


In [128]:
# step 3: Read the modified file
df = pd.read_csv(
    data_path['interim']['bal_train'],
    quotechar = '"',
    delimiter = ",",
    quoting = 1,
    index_col = 0,
    header = 0,
    dtype = {'positive_labels': str} 
)

# keep the index as a column
df = df.reset_index()

# drop it
df = df.reset_index(drop=True)

df.head()

Unnamed: 0,YTID,start_seconds,end_seconds,positive_labels
0,--PJHxphWEs,30.0,40.0,"""/m/09x0r;/t/dd00088"""
1,--ZhevVpy1s,50.0,60.0,"""/m/012xff"""
2,--aE2O5G5WE,0.0,10.0,"""/m/03fwl;/m/04rlf;/m/09x0r"""
3,--aO5cdqSAg,30.0,40.0,"""/t/dd00003;/t/dd00005"""
4,--aaILOrkII,200.0,210.0,"""/m/032s66;/m/073cg4"""


'../data/interim/bal_train/'

In [134]:
#os.listdir(data_dir['raw']['bal_train'])

In [142]:
# tfrecord files list for bal_train
split_dir = data_dir['raw']['bal_train']
tfrecord_files = [os.path.join(split_dir, f) for f in os.listdir(split_dir) if f.endswith('.tfrecord')]

In [143]:
def inspect_tfrecord_n(file_path, n):
    raw_dataset = tf.data.TFRecordDataset(file_path)
    file_name = os.path.basename(file_path)
    
    for i, raw_record in enumerate(raw_dataset.take(n), start=1):
        example = tf.train.Example()
        example.ParseFromString(raw_record.numpy())
        
        print(f"Features del registro numero {i} de {file_name}:")
        for key, feature in example.features.feature.items():
            if feature.HasField("bytes_list"):
                value = feature.bytes_list.value[0].decode('utf-8') if feature.bytes_list.value else ""
            elif feature.HasField("float_list"):
                value = feature.float_list.value[0] if feature.float_list.value else 0.0
            elif feature.HasField("int64_list"):
                value = feature.int64_list.value[0] if feature.int64_list.value else 0
            else:
                value = "Unknown Type"
            print(f'"{key}": {value}')
        print('')

We inspect an arbitrary record from the tfrecord list

In [144]:
file_path = tfrecord_files[0]
inspect_tfrecord_n(file_path, 3)

Features del registro numero 1 de --.tfrecord:
"labels": 399
"video_id": --cB2ZVjpnA
"start_time_seconds": 30.0
"end_time_seconds": 40.0

Features del registro numero 2 de --.tfrecord:
"end_time_seconds": 40.0
"video_id": --PJHxphWEs
"start_time_seconds": 30.0
"labels": 0

Features del registro numero 3 de --.tfrecord:
"end_time_seconds": 40.0
"video_id": --ekDLDTUXA
"start_time_seconds": 30.0
"labels": 27



In [145]:
test_1 = df[df['YTID'].str.contains('--cB2ZVjpnA')] 
test_1

Unnamed: 0,YTID,start_seconds,end_seconds,positive_labels
5,--cB2ZVjpnA,30.0,40.0,"""/m/01y3hg"""


In [148]:

# Open and read the JSON onthology file
with open(ont_path, 'r') as file:
    data = json.load(file)

data[0]

{'id': '/m/0dgw9r',
 'name': 'Human sounds',
 'description': 'Sounds produced by the human body through the actions of the individual.',
 'citation_uri': '',
 'positive_examples': [],
 'child_ids': ['/m/09l8g',
  '/m/01w250',
  '/m/09hlz4',
  '/m/0bpl036',
  '/m/0160x5',
  '/m/0k65p',
  '/m/01jg02',
  '/m/04xp5v',
  '/t/dd00012'],
 'restrictions': ['abstract']}

In [149]:
# Target string to search for
target_string = "/m/01y3hg"

# Function to search for the target string in 'id' and 'child_ids'
def search_data(data, target):
    results = []
    for item in data:
        # Check if the target matches the 'id' field
        if item['id'] == target:
            results.append(item['name'])
        # Check if the target matches any of the 'child_ids'
        if target in item.get('child_ids', []):
            results.append(item['name'])
    return results

# Perform the search
matching_names = search_data(data, target_string)

# Print the results
if matching_names:
    print(f"The target '{target_string}' was found in the following names: {matching_names}")
else:
    print(f"The target '{target_string}' was not found.")

The target '/m/01y3hg' was found in the following names: ['Alarm', 'Smoke detector, smoke alarm']


features del registro numero 1 de "--.tfrecord", con "video_id" de --cB2ZVjpnA tiene en la columna de "positive_labels" del archivo segments.csv, un valor de  "/m/01y3hg", el cual al buscarlo en el diccionario ontology.json obtenemos que tiene los valores de "['Alarm', 'Smoke detector, smoke alarm']"