# Wav Preprocessing
Preproc for wavLearning, including stat, random cutting, aligned cutting, sound2tensor


Let's call it wavLearning, but in fact, after checking, I still found that mfcc was the choice for many. 

No! Direct usage of MFCC does not really work, the problem is that we cannot transfer between MFCC and audio with no information loss, therefore, the mfcc-to-audio output is quite noisy compared with the input. Therefore I referred to other sound autoencoder works for how they dealt with this problem. (the problem to directly work with audio input is that they contain to many frames)

- the extraction of alignment infomation done in alignment_extract.ipynb
- length stats done in length_stat.ipynb
- this notebook will cut sounds

In [1]:
import torch
import torchaudio
import torchaudio.transforms as transforms
import os
import math
from multiprocessing import Pool

In [2]:
from paths import *
from mio import *
from sampler import *
from my_utils import *
from sound_proc import *

## Load Distribution Parameters

In [3]:
params = load_gamma_params("phones_length_gamma.param")

## Define Open and Cut Functions

### Ground Truth Cut

In [4]:
def open_and_cut_phones(wave_path, annos_path, params):
    sp = Sound_Proc()
    filtered_df = filter_tokens_and_get_df(annos_path, keepSIL=False)
    flat_starts, flat_ends, c_duration = filtered_df["start_time"].to_numpy(), filtered_df["end_time"].to_numpy(), filtered_df["duration"].to_numpy()
    
    rec, sample_rate = torchaudio.load(wave_path)
    cut_recs = sp.cut_rec(rec, flat_starts, flat_ends)
    
    tokens = filtered_df["token"].to_numpy()
    
    cst, cet = flat_starts, flat_ends
    
    # Framify
    # Create a dictionary with the three lists as values and the column names as keys
    data = {'rec': os.path.splitext(os.path.basename(wave_path))[0], "idx": list(map("{:08d}".format, range(len(c_duration)))), 'start_time': cst, 'end_time': cet, 'token': tokens, 'duration': c_duration}
    # Create a Pandas DataFrame from the dictionary
    df = pd.DataFrame(data)
    
    return cut_recs, df

In [5]:
# cr, df = open_and_cut_phones(os.path.join(wav_path, "s0101a.wav"), os.path.join(phones_extract_path, "s0101a.csv"), params)

### Random Sampling Cut

In [6]:
def open_and_cut_phones_random_sampling(wave_path, anno_path, params): 
    sp = Sound_Proc()
    metadata = torchaudio.info(wave_path)
    rec_len = sp.get_rec_length(metadata)
    samples = gamma_samples_sum(rec_len, params, shift=0.0125)

    flat_starts, flat_ends = samples2idx_with_se(samples)
    
    rec, sample_rate = torchaudio.load(wave_path)
    cut_recs = sp.cut_rec(rec, flat_starts, flat_ends)
    
    cst, cet = flat_starts, flat_ends
    c_duration = [cet[i] - cst[i] for i in range(len(cst))]
    
    # Framify
    # Create a dictionary with the three lists as values and the column names as keys
    data = {'rec': os.path.splitext(os.path.basename(wave_path))[0], "idx": list(map("{:08d}".format, range(len(c_duration)))), 'start_time': cst, 'end_time': cet, 'token': "", 'duration': c_duration}
    # Create a Pandas DataFrame from the dictionary
    df = pd.DataFrame(data)
    
    return cut_recs, df

### Multiprocessing
To make processing easier, both open-and-cut functions return the same output: `cut_recs` (a list of NumPy arrays) and a `token_list` (a Pandas DataFrame).

In order to speed up the processing time, you can use multiprocessing to plan the work and distribute it to the two open-and-cut functions. This will allow each function to work on a separate process, which can be run simultaneously, potentially reducing the overall processing time.

In [7]:
def collaboration_single_work(my_work_pool, fun, my_wave_dir, my_anno_dir, my_save_dir, my_log_dir, my_params): 
    print("Working from {} to {}".format(my_work_pool[0], my_work_pool[-1]))
    for rec_name in my_work_pool: 
        rec_raw, ext = os.path.splitext(rec_name)
        cut_recs, corr_df = fun(
            os.path.join(my_wave_dir, rec_name), 
            os.path.join(my_anno_dir, rec_raw + ".csv"),
            my_params
        )
        save_cut_waves_and_log(
            save_dir=my_save_dir, 
            log_dir=my_log_dir, 
            cut_list=cut_recs, 
            corr_df=corr_df, 
        )
    print("Work from {} to {} ends".format(my_work_pool[0], my_work_pool[-1]))

In [8]:
class MultiprocessManager: 
    def __init__(self, fun, my_wave_dir, my_anno_dir, my_save_dir, my_log_dir, my_params, num_workers=4): 
        self.fun = fun
        self.my_wave_dir = my_wave_dir
        self.my_anno_dir = my_anno_dir
        self.my_save_dir = my_save_dir
        self.my_log_dir = my_log_dir
        self.my_params = my_params
        self.num_workers = num_workers
    
    def divide_work(self, work):
        # determine the number of items per worker
        items_per_worker = math.ceil(len(work) / self.num_workers)

        # divide the work into chunks
        work_chunks = [work[i:i + items_per_worker] for i in range(0, len(work), items_per_worker)]

        return work_chunks
    
    def collaboration_work(self): 
        flat_tasks = os.listdir(self.my_wave_dir)
        task_pools = self.divide_work(flat_tasks)
        print(self.num_workers)
        p = Pool(self.num_workers)
        for i in range(self.num_workers):
            p.apply_async(collaboration_single_work, args=(task_pools[i], self.fun, self.my_wave_dir, self.my_anno_dir, self.my_save_dir, self.my_log_dir, self.my_params, ))
        print('Waiting for all subprocesses done...')
        p.close()
        p.join()
        print('All subprocesses done.')

## Run 

### Random Sampling

In [8]:
mpm = MultiprocessManager(open_and_cut_phones_random_sampling, wav_path, phones_extract_path, phone_seg_random_path, phone_seg_random_log_path, params, num_workers=8)

In [9]:
mpm.collaboration_work()

8
Working from s4002a.wav to s1101a.wavWorking from s2402b.wav to s3601b.wavWorking from s0504a.wav to s2502a.wavWorking from s1001b.wav to s1202a.wavWorking from s0203b.wav to s3701a.wavWorking from s1901a.wav to s0101b.wavWorking from s0202b.wav to s3502b.wavWorking from s3402a.wav to s0302a.wav







Waiting for all subprocesses done...
Work from s0504a.wav to s2502a.wav ends
Work from s0203b.wav to s3701a.wav ends
Work from s4002a.wav to s1101a.wav ends
Work from s2402b.wav to s3601b.wav ends
Work from s1901a.wav to s0101b.wav ends
Work from s0202b.wav to s3502b.wav ends
Work from s1001b.wav to s1202a.wav ends
Work from s3402a.wav to s0302a.wav ends
All subprocesses done.


#### Bind csvs into one

In [13]:
# List all the CSV files in the directory that start with 's'
directory = phone_seg_random_log_path
csv_files = sorted([f for f in os.listdir(directory) if f.startswith('s') and f.endswith('.csv')])

# Read and concatenate the CSV files using pandas
dfs = []
for file in csv_files:
    df = pd.read_csv(os.path.join(directory, file))
    dfs.append(df)

concatenated_df = pd.concat(dfs, ignore_index=True)

# Save the concatenated dataframe as "log.csv"
concatenated_df.to_csv(os.path.join(directory, 'log.csv'), index=False)

### Aligned Cutting

In [9]:
mpm = MultiprocessManager(open_and_cut_phones, wav_path, phones_extract_path, phone_seg_anno_path, phone_seg_anno_log_path, params, num_workers=8)

In [10]:
mpm.collaboration_work()

8
Working from s4002a.wav to s1101a.wavWorking from s0504a.wav to s2502a.wavWorking from s2402b.wav to s3601b.wavWorking from s1901a.wav to s0101b.wavWorking from s0202b.wav to s3502b.wavWorking from s0203b.wav to s3701a.wavWorking from s1001b.wav to s1202a.wav

Working from s3402a.wav to s0302a.wav





Waiting for all subprocesses done...
Work from s0504a.wav to s2502a.wav ends
Work from s0203b.wav to s3701a.wav ends
Work from s0202b.wav to s3502b.wav ends
Work from s1001b.wav to s1202a.wav ends
Work from s4002a.wav to s1101a.wav ends
Work from s1901a.wav to s0101b.wav ends
Work from s3402a.wav to s0302a.wav ends
Work from s2402b.wav to s3601b.wav ends
All subprocesses done.


#### Bind csvs into one

In [11]:
# List all the CSV files in the directory that start with 's'
directory = phone_seg_anno_log_path
csv_files = sorted([f for f in os.listdir(directory) if f.startswith('s') and f.endswith('.csv')])

# Read and concatenate the CSV files using pandas
dfs = []
for file in csv_files:
    df = pd.read_csv(os.path.join(directory, file))
    dfs.append(df)

concatenated_df = pd.concat(dfs, ignore_index=True)

# Save the concatenated dataframe as "log.csv"
concatenated_df.to_csv(os.path.join(directory, 'log.csv'), index=False)

### Zip to ease later data transfer

In [10]:
import zipfile

def zipdir(path, ziph):
    # Iterate over all the files in the directory
    for root, dirs, files in os.walk(path):
        for file in files:
            # Get the full path of the file
            file_path = os.path.join(root, file)
            # Add the file to the zip archive
            ziph.write(file_path)

# Name of the zip file to create
zip_name = 'phone_seg_random.zip'

# Path of the directory to be zipped
dir_path = phone_seg_random_path

# Create a ZipFile object with the zip file name and mode
zipf = zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED)

# Call the zipdir function to add the directory to the zip archive
zipdir(dir_path, zipf)

# Close the zip file
zipf.close()

print(f'{dir_path} has been zipped to {zip_name}!')

../src/bsc/phone_seg_random/ has been zipped to phone_seg_random.zip!
