# Everything Data

In [1]:
### IMPORTS ###

import os
import requests
import zipfile
import shutil
import os
from tqdm import tqdm
import librosa
import soundfile as sf
import pandas as pd
import json
import jsonpickle
import pickle
from sklearn.model_selection import train_test_split
import numpy as np
import h5py
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer
import sentencepiece
import gc
from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate

## Download and process data

In [2]:
def download_vctk():
    """Download the VCTK 0.92 archive and unpack it under ``./data/VCTK/raw``.

    The archive is streamed to ``./data/VCTK/raw/VCTK-Corpus-0.92.zip`` and
    extracted into ``./data/VCTK/raw``. If extraction produced a
    ``VCTK-Corpus-0.92`` folder it is renamed to ``VCTK``; otherwise the
    extracted files are left where they are and a notice is printed.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection problems or timeouts.
        zipfile.BadZipFile: if the downloaded file is not a valid zip archive.
    """
    # Define the URL and the target paths
    url = 'https://datashare.is.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip'
    data_dir = './data/VCTK/raw'
    download_path = os.path.join(data_dir, 'VCTK-Corpus-0.92.zip')
    extract_path = os.path.join(data_dir, 'VCTK')

    # Ensure the data directory exists
    os.makedirs(data_dir, exist_ok=True)

    # Download the dataset
    print(f"Downloading VCTK dataset from {url}...")
    # Stream into a temporary name first so an interrupted transfer never
    # leaves a truncated file under the final archive name.
    partial_path = download_path + '.part'
    with requests.get(url, stream=True, timeout=60) as response:
        # Fail loudly on 4xx/5xx instead of saving an error page as the zip.
        response.raise_for_status()
        with open(partial_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
    os.replace(partial_path, download_path)
    print("Download complete.")

    # Unzip the file
    print(f"Extracting {download_path} to {data_dir}...")
    with zipfile.ZipFile(download_path, 'r') as zip_ref:
        zip_ref.extractall(data_dir)
    print("Extraction complete.")

    # Find the extracted folder and rename it to "VCTK"
    extracted_folder_name = 'VCTK-Corpus-0.92'
    original_extract_path = os.path.join(data_dir, extracted_folder_name)

    if os.path.exists(original_extract_path):
        os.rename(original_extract_path, extract_path)
        print(f"Renamed {original_extract_path} to {extract_path}")
        ready_path = extract_path
    else:
        print(f"Expected extracted folder {original_extract_path} not found")
        # Nothing was renamed, so the extracted files live directly in data_dir.
        ready_path = data_dir

    print(f"VCTK dataset is ready at {ready_path}")

In [3]:
def process_data(target_sample_rate):
    """Resample every ``*_mic1.flac`` recording and store it as a ``.wav`` file.

    Input is read from ``./data/VCTK/raw/wav48_silence_trimmed``; output goes to
    ``./data/VCTK/raw/wav<kHz>`` (``<kHz>`` is ``target_sample_rate`` floor-divided
    by 1000), mirroring the input sub-folder layout. The ``_mic1`` suffix is
    dropped from the output file names.

    Args:
        target_sample_rate: sample rate passed to the resampler and written
            into the output files.
    """
    source_root = './data/VCTK/raw/wav48_silence_trimmed'
    dest_root = './data/VCTK/raw/wav{}'.format(int(target_sample_rate // 1e3))

    # The output root is created up front, even when nothing ends up in it.
    os.makedirs(dest_root, exist_ok=True)

    # Gather (folder, file name) pairs first so the progress bar knows the total.
    mic1_recordings = [
        (folder, name)
        for folder, _subdirs, names in os.walk(source_root)
        for name in names
        if name.endswith("_mic1.flac")
    ]

    for folder, name in tqdm(mic1_recordings, desc="Processing files", unit="file"):
        flac_path = os.path.join(folder, name)

        # sr=None keeps the file's native rate; the rate change happens explicitly below.
        waveform, native_rate = librosa.load(flac_path, sr=None)
        resampled = librosa.resample(waveform, orig_sr=native_rate, target_sr=target_sample_rate)

        # Rebuild the same sub-folder under the output root.
        sub_folder = os.path.dirname(os.path.relpath(flac_path, source_root))
        wav_path = os.path.join(dest_root, sub_folder, name.replace('_mic1.flac', '.wav'))
        os.makedirs(os.path.dirname(wav_path), exist_ok=True)

        sf.write(wav_path, resampled, target_sample_rate)

In [4]:
# Only fetch the corpus when the trimmed 48 kHz recordings are not on disk yet.
vctk_audio_present = os.path.exists("./data/VCTK/raw/wav48_silence_trimmed")
if not vctk_audio_present:
    download_vctk()

In [5]:
# Sample rate every recording is resampled to. It also determines the
# "wav<kHz>" folder name and the "<split>_<kHz>.csv" file names used below.
target_sample_rate = 8_000

In [6]:
# Resample the corpus once and reuse the output folder on later runs.
# NOTE: process_data creates this folder before writing any file, so a folder
# left behind by an interrupted run is also treated as "already processed".
resampled_dir = f"./data/VCTK/raw/wav{int(target_sample_rate // 1e3)}"
if not os.path.exists(resampled_dir):
    process_data(target_sample_rate)

## Make Dataset

In [7]:
def read_speaker_info(speaker_info_path='./data/VCTK/raw/speaker-info.txt'):
    """Parse the whitespace-separated speaker table into a dict keyed by speaker id.

    The first line is treated as a header and skipped. On every other
    non-blank line the fields are taken positionally: speaker id, age, gender,
    accent, region, and everything after the fifth field joined with single
    spaces as the comment. Missing trailing fields become empty strings.

    NOTE(review): fields are split on whitespace, so a multi-word region ends
    up partly in ``region`` and partly in ``comment`` — confirm that this is
    acceptable for the consumers of this table.

    Args:
        speaker_info_path: location of the speaker table; defaults to the
            file shipped with the corpus under ``./data/VCTK/raw``.

    Returns:
        dict mapping speaker id to a dict with the string keys ``age``,
        ``gender``, ``accent``, ``region`` and ``comment``.
    """
    field_names = ("age", "gender", "accent", "region")
    speaker_info = {}
    with open(speaker_info_path, 'r') as file:
        next(file, None)  # Skip the header (no-op on an empty file)
        for line in file:
            parts = line.split()
            if not parts:
                # Blank line (e.g. trailing newline at end of file): nothing to record.
                continue
            speaker_id = parts[0]
            # Pad short rows so a missing column yields "" instead of IndexError.
            values = parts[1:5]
            values += [""] * (len(field_names) - len(values))
            record = dict(zip(field_names, values))
            record["comment"] = " ".join(parts[5:])
            speaker_info[speaker_id] = record
    return speaker_info

def create_dataset(target_sample_rate):
    """Build train/val/test DataFrames from the resampled audio and cache them as CSV.

    Walks ``./data/VCTK/raw/wav<kHz>`` for ``.wav`` files and pairs each one
    with ``./data/VCTK/raw/txt/<speaker>/<speaker>_<text id>.txt``. Files
    without a transcript are reported and skipped. Every kept utterance
    becomes a row with the columns ``speaker_id``, ``file_path``, ``audio``
    (the waveform as a list of floats) and ``text``.

    The rows are split with ``split_dataset`` and written to
    ``./data/VCTK/processed/{train,val,test}_<kHz>.csv`` — the paths that
    ``load_split`` reads.

    Args:
        target_sample_rate: rate the audio is loaded at; also selects the
            ``wav<kHz>`` input folder and the CSV file names.

    Returns:
        Tuple ``(train_df, val_df, test_df)``.

    Raises:
        FileNotFoundError: if no audio/transcript pair was found.
    """
    rate_khz = int(target_sample_rate // 1e3)

    # Define paths
    wav_dir = f'./data/VCTK/raw/wav{rate_khz}'
    txt_dir = './data/VCTK/raw/txt'
    processed_dir = './data/VCTK/processed'

    files_to_process = []
    for root, dirs, files in os.walk(wav_dir):
        for file in files:
            if file.endswith(".wav"):
                files_to_process.append((root, file))

    dataset = []
    for root, file in tqdm(files_to_process, desc="Creating dataset", unit="file"):
        file_path = os.path.join(root, file)
        file_name = os.path.basename(file)

        # File names look like "<speaker>_<text id>.wav".
        name_parts = file_name.split("_")
        speaker_id = name_parts[0]
        text_id = name_parts[1].split(".")[0]
        text_file_path = os.path.join(txt_dir, speaker_id, "{}_{}.txt".format(speaker_id, text_id))

        # Check for the transcript before decoding the audio, so skipped
        # files cost no decoding work.
        if not os.path.exists(text_file_path):
            print(f"Text file not found for {file_name}, skipping...")
            continue

        with open(text_file_path, 'r') as text_file:
            text = text_file.read().strip()

        audio, sr = librosa.load(file_path, sr=target_sample_rate)

        dataset.append({
            "speaker_id": speaker_id,
            "file_path": file_path,
            "audio": audio.tolist(),  # Plain list so the waveform survives text serialization
            "text": text,
        })

    if not dataset:
        # Without this check the failure would surface later as an opaque
        # KeyError on 'speaker_id' inside split_dataset.
        raise FileNotFoundError(f"No .wav/transcript pairs found under {wav_dir}")

    df = pd.DataFrame(dataset)
    train_df, val_df, test_df = split_dataset(df)

    # Persist each split where the notebook's existence check and load_split
    # look for it. index=False keeps the row index out of the file, so a
    # later read_csv does not produce an extra "Unnamed: 0" column.
    os.makedirs(processed_dir, exist_ok=True)
    for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
        csv_path = os.path.join(processed_dir, "{}_{}.csv".format(split_name, rate_khz))
        split_df.to_csv(csv_path, index=False)

    return train_df, val_df, test_df
    
def split_dataset(df, train_size=0.7, val_size=0.15, test_size=0.15, random_state=42):
    """Split ``df`` into train/val/test so that no speaker appears in two splits.

    The unique values of ``df['speaker_id']`` are partitioned first (train
    versus the rest, then the rest into validation and test); rows are then
    assigned according to their speaker. The fractions therefore apply to
    the number of speakers, not the number of rows.

    Args:
        df: DataFrame with a ``speaker_id`` column.
        train_size, val_size, test_size: fractions of speakers per split;
            must sum to 1.0.
        random_state: seed forwarded to both ``train_test_split`` calls.

    Returns:
        Tuple ``(train_df, val_df, test_df)`` of row subsets of ``df``.

    Raises:
        AssertionError: if the three fractions do not sum to 1.0.
    """
    # Compare with a tolerance: valid fractions such as 0.6 + 0.3 + 0.1 do not
    # add up to exactly 1.0 in binary floating point.
    total_size = train_size + val_size + test_size
    assert abs(total_size - 1.0) < 1e-9, "Train, validation, and test sizes must sum to 1.0"

    # Get unique speakers
    speakers = df['speaker_id'].unique()

    # Split speakers into train and temp (val + test)
    train_speakers, temp_speakers = train_test_split(speakers, train_size=train_size, random_state=random_state)

    # Share of the remaining speakers that goes to validation
    val_proportion = val_size / (val_size + test_size)

    # Split temp_speakers into validation and test sets
    val_speakers, test_speakers = train_test_split(temp_speakers, train_size=val_proportion, random_state=random_state)

    # Assign entries to the respective sets
    train_df = df[df['speaker_id'].isin(train_speakers)]
    val_df = df[df['speaker_id'].isin(val_speakers)]
    test_df = df[df['speaker_id'].isin(test_speakers)]

    return train_df, val_df, test_df

In [8]:
def load_split(target_sample_rate):
    """Load the cached train/val/test splits from ``./data/VCTK/processed``.

    Reads ``train_<kHz>.csv``, ``val_<kHz>.csv`` and ``test_<kHz>.csv``. CSV
    stores each waveform as the text of a list, so the ``audio`` column (when
    present) is decoded back into a list of floats; otherwise
    ``torch.tensor(row['audio'])`` in the dataset class would receive a string.

    Args:
        target_sample_rate: selects the ``<kHz>`` part of the file names.

    Returns:
        Tuple ``(train_df, val_df, test_df)``.
    """
    rate_khz = int(target_sample_rate // 1e3)

    def _decode_audio(cell):
        # Leave non-string cells (e.g. NaN for an empty field) untouched.
        return json.loads(cell) if isinstance(cell, str) else cell

    frames = []
    for split_name in ("train", "val", "test"):
        frame = pd.read_csv("./data/VCTK/processed/{}_{}.csv".format(split_name, rate_khz))
        if "audio" in frame.columns:
            frame["audio"] = frame["audio"].map(_decode_audio)
        frames.append(frame)

    train_df, val_df, test_df = frames
    return train_df, val_df, test_df


In [9]:
### ACTUALLY LOAD/CREATE DATAFRAMES

# When the cached CSVs already exist, the frames are only loaded if this is True.
return_dfs = True

split_csv_paths = [
    "./data/VCTK/processed/{}_{}.csv".format(split_name, int(target_sample_rate // 1e3))
    for split_name in ("train", "val", "test")
]

if not all(os.path.exists(csv_path) for csv_path in split_csv_paths):
    # At least one split is missing: rebuild all of them from the audio files
    train_df, val_df, test_df = create_dataset(target_sample_rate)
elif return_dfs:
    # Every split is cached: read them back
    train_df, val_df, test_df = load_split(target_sample_rate)
    

Creating dataset:  81%|████████████▉   | 35948/44455 [04:13<00:59, 141.88file/s]


KeyboardInterrupt: 

In [None]:
# Cache file for the speaker id -> integer index mapping
json_file_path = './data/VCTK/raw/speaker_to_idx.json'

if os.path.exists(json_file_path):
    # Reuse the stored mapping so indices stay the same across runs
    with open(json_file_path, 'r') as json_file:
        speaker_to_idx = json.load(json_file)
else:
    # Speakers in order of first appearance: train, then val, then test
    unique_speakers = []
    for split_frame in (train_df, val_df, test_df):
        unique_speakers.extend(split_frame['speaker_id'].unique().tolist())

    speaker_to_idx = dict(zip(unique_speakers, range(len(unique_speakers))))

    # Save the mapping to a JSON file
    with open(json_file_path, 'w') as json_file:
        json.dump(speaker_to_idx, json_file)

In [None]:
# Plain-text file with one transcript per line; the tokenizer is trained on it
text_file_path = "./data/VCTK/raw/text.txt"

if os.path.exists(text_file_path):
    # Reuse the transcripts written by an earlier run
    with open(text_file_path, 'r') as file:
        text = [line.strip() for line in file]
else:
    # Collect the transcripts of all three splits and write them out
    text = train_df['text'].tolist() + val_df['text'].tolist() + test_df['text'].tolist()

    with open(text_file_path, 'w') as file:
        for item in text:
            file.write(f"{item}\n")

In [None]:
### CREATE TOKENIZER

# Trainer options; each entry is turned into a "--key=value" flag below.
# The model is loaded back from "<model_prefix>.model" after training.
args = dict(
    pad_id=0,
    bos_id=1,
    eos_id=2,
    unk_id=3,
    input="./data/VCTK/raw/text.txt",
    vocab_size=4000,
    model_prefix="Multi30k",
    # model_type="word",
)
combined_args = " ".join(f"--{key}={value}" for key, value in args.items())
sentencepiece.SentencePieceTrainer.Train(combined_args)

vocab = sentencepiece.SentencePieceProcessor()
vocab.Load("Multi30k.model")

In [None]:
print("Vocabulary size:", vocab.GetPieceSize())
print()

# Round-trip the first three transcripts through the tokenizer, both as
# pieces and as ids, and show the decoded text next to the original.
for sentence in text[:3]:
    pieces = vocab.EncodeAsPieces(sentence)
    indices = vocab.EncodeAsIds(sentence)
    print(
        sentence,
        pieces,
        vocab.DecodePieces(pieces),
        indices,
        vocab.DecodeIds(indices),
        sep="\n",
    )
    print()

# Piece <-> id lookup for a single word
piece = vocab.EncodeAsPieces("the")[0]
index = vocab.PieceToId(piece)
print(piece, index, vocab.IdToPiece(index), sep="\n")

In [None]:
# train_df / val_df / test_df are intentionally kept alive here: the
# DataFrame-backed VCTK datasets built further down wrap these frames, so
# deleting them at this point makes a fresh "Restart & Run All" fail with a
# NameError. Only reclaim garbage left over from the earlier cells.
gc.collect()

In [None]:
### FILE-BASED DATASET (HDF5 variant, currently disabled)

# class VCTK(Dataset):
#     def __init__(self, hdf5_file_path):
#         self.hdf5_file_path = hdf5_file_path
        
#         # Open the HDF5 file to get the number of samples
#         with h5py.File(hdf5_file_path, 'r') as f:
#             self.num_samples = len(f.keys())
#             self.idx_to_keys = {idx: key for idx, key in enumerate(f.keys())}
    
#     def __len__(self):
#         return self.num_samples
    
#     def __getitem__(self, idx):
#         with h5py.File(self.hdf5_file_path, 'r') as f:
#             group = f[self.idx_to_keys[idx]]
#             audio = torch.tensor(group['audio'], dtype=torch.float32)
#             text = group.attrs['text']
#             speaker_id = speaker_to_idx[group.attrs['speaker_id']]
            
#             # Tokenize text
#             tokens = vocab.EncodeAsIds(text)
            
#             sample = {
#                 'audio': audio,
#                 'input_ids': tokens,  # Token IDs
#                 'speaker_id': torch.tensor(int(speaker_id), dtype=torch.long),  # Numeric speaker ID
#             }
        
#         return sample

# # Example usage
# # Assuming `train_df`, `val_df`, and `test_df` are your DataFrames

# train_hdf5_path = './data/VCTK/processed/train_8.h5'
# val_hdf5_path = './data/VCTK/processed/val_8.h5'
# test_hdf5_path = './data/VCTK/processed/test_8.h5'

# train_dataset = VCTK(train_hdf5_path)

# val_dataset = VCTK(val_hdf5_path)

# test_dataset = VCTK(test_hdf5_path)

# train_dataset[0]

In [None]:
class VCTK(Dataset):
    """Map-style dataset over a DataFrame with one utterance per row.

    Each row must provide the columns ``audio`` (numeric sequence), ``text``
    and ``speaker_id``.

    Args:
        df: DataFrame holding the utterances.
        tokenizer: object exposing ``EncodeAsIds(text)``. When omitted, the
            notebook-level ``vocab`` is looked up each time an item is read.
        speaker_map: mapping from speaker id to integer index. When omitted,
            the notebook-level ``speaker_to_idx`` is looked up each time an
            item is read.
    """

    def __init__(self, df, tokenizer=None, speaker_map=None):
        self.df = df
        self.tokenizer = tokenizer
        self.speaker_map = speaker_map

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the sample at positional index ``idx``.

        Returns:
            dict with ``audio`` (float32 tensor), ``tokens`` (whatever the
            tokenizer's ``EncodeAsIds`` returns for the row's text) and
            ``speaker_id`` (scalar long tensor).
        """
        row = self.df.iloc[idx]

        # Resolve the collaborators lazily so the notebook globals are only
        # required when nothing was injected.
        tokenizer = vocab if self.tokenizer is None else self.tokenizer
        speaker_map = speaker_to_idx if self.speaker_map is None else self.speaker_map

        # Convert audio data to a PyTorch tensor
        audio = torch.tensor(row['audio'], dtype=torch.float32)

        # Tokenize text
        tokens = tokenizer.EncodeAsIds(row['text'])

        speaker_id = speaker_map[row['speaker_id']]

        sample = {
            'audio': audio,
            'tokens': tokens,  # Token IDs
            'speaker_id': torch.tensor(speaker_id, dtype=torch.long),  # Numeric speaker ID
        }

        return sample
    
# Wrap each split's DataFrame in its own dataset object
train_dataset, val_dataset, test_dataset = (
    VCTK(split_frame) for split_frame in (train_df, val_df, test_df)
)

In [None]:
# Spot-check one training sample: a dict with 'audio', 'tokens' and 'speaker_id'
train_dataset[4]

In [None]:
# Smoke test: pull every validation sample through a DataLoader once,
# discarding the batches.
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=True, pin_memory=True)

for _batch in tqdm(val_loader):
    pass

In [None]:
# Make a dataset/dataloader like B-VAE

In [None]:
# Start on Training process
## Make loss functions
## Design models
## Get train loop working

In [None]:
# Work on inference process
## Make inference dataloaders?
## Make inference pipeline