# Data Preprocessing

1. Read the raw audio files
2. Equalize their volume
3. Cut them into segments of fixed length
4. (Optionally) Extract features
5. Save to files

In [None]:
# Autoreloading makes development easier
%load_ext autoreload
%autoreload 2

In [None]:
import os
import shutil
import sys
import numpy as np
from tools.audio_tools import read_audio, write_audio
from tools.constants import raw_classical_path, raw_jazz_path, cut_classical_path, cut_jazz_path, npy_data_path, npy_classical_path, npy_jazz_path, n_secs, default_sample_rate, npy_classical_path, npy_jazz_path
from tools.feature_tools import compute_mels, compute_imels

The function below cuts the audio file to fixed lengths and (possibly) immediately extracts the features as well.

In [None]:
def cut_audio_file(source_file: str, source_dir: str, target_dir: str, segment_duration: int = n_secs, target_loudness: float = -20, sr=default_sample_rate, force_power_of_2 = True, to_npy=True):
  """Cut one audio file into fixed-length, loudness-equalized segments and save them.

  Segments are written to ``target_dir`` as ``<name>_<i>.npy`` (the output of
  ``compute_mels``) when ``to_npy`` is true, otherwise as ``<name>_<i>.wav``.
  Trailing samples that do not fill a whole segment are dropped. A file is
  treated as already processed, and skipped, when its segment 0 already
  exists in ``target_dir``.

  Args:
    source_file: File name (with extension) inside ``source_dir``.
    source_dir: Directory containing the source file.
    target_dir: Directory receiving the segments; created if missing.
    segment_duration: Requested segment length in seconds.
    target_loudness: Target RMS level in dB relative to an amplitude of 1.0.
    sr: Sample rate passed to ``read_audio`` and ``write_audio``.
    force_power_of_2: Round the number of samples per segment to a power of 2
      (nearest on a log2 scale).
    to_npy: Save ``compute_mels`` features as npy instead of wav audio.

  Returns:
    The number of segments written; 0 when the file was skipped, is empty or
    is shorter than one segment.

  Raises:
    ValueError: If ``sr * segment_duration`` is less than one sample.
  """
  # File name & path
  file_name = os.path.splitext(source_file)[0]
  source_path = os.path.join(source_dir, source_file)
  extension = "npy" if to_npy else "wav"
  if os.path.exists(os.path.join(target_dir, f"{file_name}_0.{extension}")):
    print(f"File already processed: {source_file}")
    return 0
  print(f"Processing file: {source_file}")

  # Compute the number of samples per segment (validated before the costly read)
  samples_per_segment = int(sr * segment_duration)
  if samples_per_segment < 1:
    raise ValueError(f"Segment length must be at least one sample, got sr={sr}, segment_duration={segment_duration}")
  if force_power_of_2:
    # Pick the power of 2 nearest (on a log2 scale) to the requested length (better for FFT transforms).
    # int() guarantees a plain Python int that is usable as a slice bound.
    samples_per_segment = 2**int(round(np.log2(samples_per_segment)))

  # Read input file, resampled to `sr` by read_audio (presumably mono -- TODO confirm in tools.audio_tools)
  audio = read_audio(source_path, sr=sr)
  if len(audio) == 0:
    # Nothing to cut; also avoids taking the mean of an empty array
    return 0

  # Equalize loudness
  current_rms = np.sqrt(np.mean(audio**2))
  target_rms = 10**(target_loudness/20)
  if current_rms > 0:
    audio_normalized = audio * target_rms / current_rms
  else:
    # Completely silent input: scaling would divide by zero and fill the output with NaN
    audio_normalized = audio

  # Only whole segments are kept
  n_segments = len(audio_normalized) // samples_per_segment

  # Make sure the output directory exists before writing
  os.makedirs(target_dir, exist_ok=True)

  # Split into segments & save to file
  for i in range(n_segments):
    segment = audio_normalized[i*samples_per_segment:(i+1)*samples_per_segment]
    target_path = os.path.join(target_dir, f"{file_name}_{i}.{extension}")

    if to_npy:
      # Extract the features and store those instead of the raw audio
      np.save(target_path, compute_mels(segment))
    else:
      write_audio(segment, target_path, sr=sr)

  return n_segments

The cell below cleans up the preprocessing directory.  
Only run if changes need to be made to the preprocessed data.

In [None]:
# Safety guard: comment out the assert below if you really want to run this cell
assert False, "Sure you want to run this?"

# Reset preprocessed data
if os.path.exists(npy_data_path):
  # Remove each genre directory only when it is present, so a partially
  # populated data directory does not abort the cleanup with FileNotFoundError
  for genre_dir in (npy_classical_path, npy_jazz_path):
    if os.path.isdir(genre_dir):
      shutil.rmtree(genre_dir)

Here, the actual preprocessing is performed.  
As the code is now, the audio will be cut and features immediately extracted and saved to disk in npy format.

In [None]:
# Bulk processing: n randomly chosen samples per genre
n_samples_per_genre = 5

os.makedirs(npy_classical_path, exist_ok=True)
os.makedirs(npy_jazz_path, exist_ok=True)
raw_paths = [raw_classical_path, raw_jazz_path]
cut_paths = [npy_classical_path, npy_jazz_path]
for raw_path, cut_path in zip(raw_paths, cut_paths):
  # List the raw files; sub-directories are skipped because they cannot be read as audio
  raw_files = [entry for entry in os.listdir(raw_path) if os.path.isfile(os.path.join(raw_path, entry))]

  # Process a limited, random selection of files (without replacement)
  n_samples = min(n_samples_per_genre, len(raw_files))
  raw_files = np.random.choice(raw_files, size=n_samples, replace=False)

  for raw_file in raw_files:
    # np.random.choice yields numpy strings; pass a plain str on to the file handling
    cut_audio_file(str(raw_file), raw_path, cut_path, to_npy=True)

If the data were saved as cut wave files, the code snippet below can be used to convert those files to npy format.  
This is preferred, as it considerably speeds up loading of the data.

In [None]:
# Cut data -> NPY Features
for cut_dir, npy_dir in zip([cut_classical_path, cut_jazz_path], [npy_classical_path, npy_jazz_path]):
    print(f"Processing {cut_dir}")
    # Make sure npy dir exists
    os.makedirs(npy_dir, exist_ok=True)
    # Read cut_dir
    cut_files = os.listdir(cut_dir)
    n_files = len(cut_files)
    # Refresh the progress display about once per percent. The step is at least 1,
    # so directories with fewer than 100 files do not cause a modulo by zero.
    progress_step = max(1, n_files // 100)
    for i, cut_file in enumerate(cut_files):
        # Split file name from extension
        file_name, file_ext = os.path.splitext(cut_file)
        if file_ext != ".wav":
            print(f"Warning: skipping file {cut_file}")
            continue
        npy_path = os.path.join(npy_dir, f"{file_name}.npy")
        if os.path.exists(npy_path):
            # Features were already extracted for this segment
            continue
        # Read audio & compute mels
        audio = read_audio(os.path.join(cut_dir, cut_file))
        mels = compute_mels(audio)
        np.save(npy_path, mels)

        if i % progress_step == 0:
            # The carriage return rewrites the same console line on every update
            sys.stdout.write(f"\r{(i+1) / n_files * 100:.0f} %")
            sys.stdout.flush()
    print()