In [1]:
import os
import math
import torch
import pandas as pd
from dataset.create_all_embedding import *
from utils.constant_mapping import *

In [2]:
def get_file_size(file_path):
    """
    Return the size of a file as a human-readable string.

    :param file_path: The path to the file.
    :return: The size formatted as "<value> <unit>" using 1024-based units
        (e.g. "500.0 B", "1.5 KB"), "0 B" for an empty file, or the string
        "does not exist" when the path is missing.
    """
    if not os.path.exists(file_path):
        return "does not exist"

    def convert_size(size_bytes):
        """Format a byte count with the largest unit that keeps the value below 1024."""
        if size_bytes == 0:
            return "0 B"
        size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
        last_unit = len(size_name) - 1

        # Pick the unit with exact integer comparisons; a floating-point
        # logarithm can land just below an integer at exact powers of 1024.
        i = 0
        while i < last_unit and size_bytes >= 1024 ** (i + 1):
            i += 1

        s = round(size_bytes / 1024 ** i, 2)
        # Rounding can push a value just under the boundary up to 1024.0
        # (e.g. 1048575 bytes -> "1024.0 KB"); promote it to the next unit.
        if s >= 1024 and i < last_unit:
            i += 1
            s = round(size_bytes / 1024 ** i, 2)
        return f"{s} {size_name[i]}"

    return convert_size(os.path.getsize(file_path))
    
def set_device(device_index=None):
    """
    Choose the torch device to run on and announce the choice.

    :param device_index: Index of the CUDA device to use, or None for CPU.
    :return: ``torch.device("cpu")`` when no index is given or CUDA is
        unavailable; ``torch.device("cuda")`` (after making ``device_index``
        the current device) when the index is in range; otherwise
        ``torch.device("cuda:0")``.
    """
    # Guard clause: no index requested, or no CUDA at all -> CPU.
    if device_index is None or not torch.cuda.is_available():
        print("CUDA is not available or GPU index is not specified. Using CPU.")
        return torch.device("cpu")

    if torch.cuda.device_count() > device_index:
        torch.cuda.set_device(device_index)
        chosen = torch.device("cuda")
        print(f"Using GPU {torch.cuda.current_device()}")
        return chosen

    # Requested index exceeds the number of visible GPUs: fall back to GPU 0.
    torch.cuda.set_device(0)
    print("Specified GPU index is out of range. Using the first GPU.")
    return torch.device("cuda:0")

In [3]:
# Root folder holding the raw audio corpora; one sub-folder per corpus.
root_path = "/home/braveenan/voice_dataset"
root_speechcommand, root_voxceleb, root_iemocap = (
    os.path.join(root_path, corpus_name)
    for corpus_name in ("SpeechCommand", "VoxCeleb", "IEMOCAP")
)

# Request GPU 0; set_device falls back to CPU when CUDA is unavailable.
device = set_device(0)

Using GPU 0


In [4]:
# Experiment configuration: which upstream model and which frame pooling to use
upstream_model_type = "wavlm_large"
frame_pooling_type = "energy"

# Build both summary lines first, then echo them in the same order as before
upstreammodel_text = f"Upstream model type: {upstream_model_type}"
frame_pooling_text = f"Frame pooling type: {frame_pooling_type}"
print(upstreammodel_text)
print(frame_pooling_text)

Upstream model type: wavlm_large
Frame pooling type: energy


In [5]:
# Embeddings live beside the dataset root, under embedding1/<model>/<pooling>.
root_emb_path = root_path.replace(
    "/voice_dataset",
    f"/embedding1/{upstream_model_type}/{frame_pooling_type}",
)
root_speechcommand_emb, root_voxceleb_emb, root_iemocap_emb = (
    os.path.join(root_emb_path, corpus_name)
    for corpus_name in ("SpeechCommand", "VoxCeleb", "IEMOCAP")
)

# Create every output folder up front; existing folders are left alone.
for emb_dir in (root_speechcommand_emb, root_voxceleb_emb, root_iemocap_emb):
    os.makedirs(emb_dir, exist_ok=True)

In [6]:
# Look up the bundle registered for the selected upstream model type, then
# build the model object from it.
# NOTE(review): ModelMapping arrives via a wildcard import (presumably from
# utils.constant_mapping); whether get_model() loads or downloads weights is
# not visible in this notebook — confirm.
bundle = ModelMapping.get_model_bundle(upstream_model_type)
upstream_model = bundle.get_model()

In [7]:
# Constructor arguments shared by every VoxCeleb1 split; only `subset` differs.
voxceleb_common_args = dict(
    root=root_voxceleb,
    download=False,
    model=upstream_model,
    device=device,
    frame_pooling_type=frame_pooling_type,
)

training_data = VoxCeleb1Embedding(subset='train', **voxceleb_common_args)
validating_data = VoxCeleb1Embedding(subset='dev', **voxceleb_common_args)
testing_data = VoxCeleb1Embedding(subset='test', **voxceleb_common_args)

# Report how many samples each split contains.
dataset_length_text = f"No of training data samples: {len(training_data)} \nNo of validating data samples: {len(validating_data)} \nNo of testing data samples: {len(testing_data)}"
print(dataset_length_text)

No of training data samples: 138361 
No of validating data samples: 6904 
No of testing data samples: 8251


In [8]:
# Create a function to process data and write to CSV
def process_and_write_data(data, subset, csv_file, max_samples=1):
    """
    Save embeddings from a dataset to disk and log one CSV row per sample.

    Each item yielded by ``data`` is unpacked as
    ``(embedding, emblength, wavlength, label, wavpath)``. The embedding is
    written with ``torch.save`` below ``root_voxceleb_emb``, mirroring
    ``wavpath`` but with the file extension replaced by
    ``_<upstream_model_type>_<frame_pooling_type>.pt``. A comma-separated row
    describing the sample is then appended to ``csv_file``.

    Relies on the notebook globals ``root_voxceleb_emb``,
    ``upstream_model_type`` and ``frame_pooling_type``, and on
    ``get_file_size``.

    :param data: Iterable of 5-tuples as described above.
    :param subset: Subset name written into each row and echoed in the
        progress line.
    :param csv_file: Open, writable text file object that receives the rows.
    :param max_samples: Maximum number of samples to process. Defaults to 1,
        which matches the previous hard-coded ``break`` after the first
        sample; pass ``None`` to process the whole dataset.
    """
    from itertools import islice

    emb_suffix = f"_{upstream_model_type}_{frame_pooling_type}.pt"

    # islice stops after max_samples items without fetching an extra one, so
    # no additional (potentially expensive) sample is computed.
    for index, data_point in enumerate(islice(data, max_samples)):
        (embedding, emblength, wavlength, label, wavpath) = data_point
        # Whole seconds; assumes 16 kHz audio — TODO confirm the sample rate.
        duration = wavlength // 16000

        # Swap only the file extension (str.replace would also rewrite any
        # ".wav" appearing earlier in the path, and would leave non-.wav
        # names unchanged).
        wav_stem, _ = os.path.splitext(wavpath)
        embpath = os.path.join(root_voxceleb_emb, wav_stem + emb_suffix)
        os.makedirs(os.path.dirname(embpath), exist_ok=True)
        torch.save(embedding, embpath)
        file_size_readable = get_file_size(embpath)

        # Write the data directly to the CSV file
        csv_file.write(f"{wavpath},{label},{wavlength},{duration},{emblength},{subset},{file_size_readable}\n")

        print(f"{subset} {index+1} {wavpath}")

        # Release cached GPU blocks between samples.
        torch.cuda.empty_cache()

In [9]:
# Create and open the CSV file for writing
output_csv_file = os.path.join(root_voxceleb_emb, f"voxceleb_{upstream_model_type}_{frame_pooling_type}.csv")
with open(output_csv_file, 'w') as csv_file:
    # Header row; the column order must match the rows written by
    # process_and_write_data. No space after the commas, otherwise CSV
    # readers produce a column literally named " File size".
    csv_file.write("Audio path,Audio label,Audio length,Audio duration,Sequence length,Data subset,File size\n")

    # Append rows for each split; the second argument is the value stored in
    # the "Data subset" column.
    process_and_write_data(training_data, "train", csv_file)
    process_and_write_data(validating_data, "validation", csv_file)
    process_and_write_data(testing_data, "test", csv_file)

train 1 id10001/1zcIwhmdeo4/00001.wav
validation 1 id10001/7w0IBEWc9Qw/00001.wav
test 1 id10001/Y8hIVOBuels/00001.wav


In [10]:
# Display GPU memory usage before and after emptying the cache
def _gpu_memory_gb():
    """Return (allocated, reserved) CUDA memory, each divided by 1e9 to give GB."""
    return (
        torch.cuda.memory_allocated() / 1e9,
        torch.cuda.memory_reserved() / 1e9,
    )


allocated_before, cached_before = _gpu_memory_gb()
print(f"Before empty_cache - Allocated: {allocated_before:.4f} GB, Cached: {cached_before:.4f} GB")

torch.cuda.empty_cache()

allocated_after, cached_after = _gpu_memory_gb()
print(f"After empty_cache - Allocated: {allocated_after:.4f} GB, Cached: {cached_after:.4f} GB")

Before empty_cache - Allocated: 1.2704 GB, Cached: 2.0133 GB
After empty_cache - Allocated: 1.2704 GB, Cached: 1.3191 GB
