In [11]:
import pandas as pd
import os

In [12]:
# File name of the metadata index that describes the source dataset.
CSV_METADATA_FILE_NAME = "metadata.csv"
# Name of this dataset's sub-directory inside the metadata directory.
DATASET_METADATA_DIR = "TESS"
# Name of the directory that receives the transformed copy of the dataset.
DATASET_DIR_NAME = DATASET_METADATA_DIR + "_transformed"
# Low-pass cutoff frequency in Hz (alternative value: 16384).
CUTOFF = 8192

In [13]:
# Parent of the current working directory, expressed relative to the working
# directory itself.
project_dir = os.path.relpath(os.pardir)

# Location of the dataset's metadata CSV:
# <project_dir>/metadata/<DATASET_METADATA_DIR>/<CSV_METADATA_FILE_NAME>
csv_metadata_path = os.path.join(
    project_dir,
    "metadata",
    DATASET_METADATA_DIR,
    CSV_METADATA_FILE_NAME,
)

In [14]:
# Read the source dataset's metadata index and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=csv_metadata_path)
df.head(n=5)

Unnamed: 0,name,extension,path,dirs
0,OAF_back_angry,wav,C:\users\user\studia\audio_clustering_autoenco...,OAF_angry
1,OAF_bar_angry,wav,C:\users\user\studia\audio_clustering_autoenco...,OAF_angry
2,OAF_base_angry,wav,C:\users\user\studia\audio_clustering_autoenco...,OAF_angry
3,OAF_bath_angry,wav,C:\users\user\studia\audio_clustering_autoenco...,OAF_angry
4,OAF_bean_angry,wav,C:\users\user\studia\audio_clustering_autoenco...,OAF_angry


In [15]:
import librosa
from scipy import signal
import soundfile as sf

In [16]:
# Directory that holds the metadata of every dataset.
metadata_dir = os.path.join(project_dir, "metadata")

# Metadata directory of this dataset; the transformed-metadata CSV is saved here.
dataset_metadata_dir = os.path.join(metadata_dir, DATASET_METADATA_DIR)
# Ensure the directory that will receive the CSV exists. The previous call
# created DATASET_DIR_NAME relative to the working directory instead, which
# left a stray folder next to the notebook and never ensured this directory.
os.makedirs(dataset_metadata_dir, exist_ok=True)

# Metadata file describing the transformed copy of the dataset.
csv_file_name = "metadata_transformed.csv"
csv_file_path = os.path.join(dataset_metadata_dir, csv_file_name)

In [17]:
# Root directory of all datasets, and the output directory for the
# transformed audio files inside it.
datasets_dir = os.path.join(project_dir, "datasets")
dataset_dir = os.path.join(datasets_dir, DATASET_DIR_NAME)

# Create the output directory up front; an already existing one is fine.
os.makedirs(name=dataset_dir, exist_ok=True)

In [18]:
def load_audio_from_path(path):
    """Load an audio file at its native sampling rate.

    Args:
        path: Path of the audio file to read.

    Returns:
        The ``(samples, sample_rate)`` tuple returned by ``librosa.load``.
    """
    # sr=None tells librosa to keep the file's own sampling rate, so the file
    # does not have to be opened a second time just to query the rate.
    return librosa.load(path, sr=None)


def get_output_file(parent_dirs, file_name, base_dir=None):
    """Build the output ``.wav`` path for a transformed audio file.

    Args:
        parent_dirs: Sub-directory (relative to the output root) the file
            belongs to. An empty string or a non-string value (such as the NaN
            pandas yields for an empty CSV cell) means "no parent directory".
        file_name: File name without extension.
        base_dir: Output root. Defaults to the notebook-level ``dataset_dir``.

    Returns:
        ``<root>/<parent_dirs>/<file_name>.wav`` when a parent directory is
        given, otherwise ``<root>/<file_name>/<file_name>.wav``.
    """
    root = dataset_dir if base_dir is None else base_dir
    wav_name = f"{file_name}.wav"

    # os.path.join uses the platform separator instead of a hard-coded "\\".
    if isinstance(parent_dirs, str) and parent_dirs != '':
        return os.path.join(root, parent_dirs, wav_name)

    # The fallback used to read the global ``name`` set by the calling loop,
    # which there equals ``file_name``; the argument is used directly so the
    # function no longer depends on hidden notebook state.
    # NOTE(review): the per-file sub-directory is kept for compatibility -
    # confirm whether ``<root>/<file_name>.wav`` was actually intended.
    return os.path.join(root, file_name, wav_name)


# Filter audio file for given cutoff (for example to work in range 0kHz - 8kHz)
def low_pass_filter(y, sr, cutoff=CUTOFF):
    """Low-pass filter a signal with an 8th-order Butterworth filter.

    The filter is applied forward and backward (``sosfiltfilt``), so the
    output has the same length as the input.

    Args:
        y: Audio samples to filter.
        sr: Sampling rate of ``y`` in Hz.
        cutoff: Cutoff frequency in Hz (defaults to ``CUTOFF``).

    Returns:
        The filtered samples.

    Raises:
        ValueError: If ``cutoff`` is not strictly between 0 and the Nyquist
            frequency (``sr / 2``); a digital filter cannot be designed there.
    """
    nyquist = 0.5 * sr
    if not 0 < cutoff < nyquist:
        raise ValueError(
            f"cutoff must be between 0 and the Nyquist frequency "
            f"({nyquist} Hz for sr={sr}), got {cutoff}"
        )

    # Cutoff expressed as a fraction of the Nyquist frequency, as expected by
    # scipy's digital filter design.
    normal_cutoff = cutoff / nyquist

    # Second-order sections are numerically more robust than the (b, a)
    # transfer-function form for a filter of this order.
    sos = signal.butter(8, normal_cutoff, btype='low', analog=False, output='sos')

    # Apply the filter to the audio data
    filtered_audio = signal.sosfiltfilt(sos, y)
    return filtered_audio

In [19]:
# Copy of the metadata in which 'path' is replaced by the location of each
# transformed file and 'extension' is set to 'wav' for every row.
df_copy = df.copy()
df_copy['path'] = ''
df_copy['extension'] = 'wav'

for index, row in df.iterrows():
    name = row['name']
    path = row['path']
    parent_dirs = row['dirs']

    y, sr = load_audio_from_path(path)
    output_file = get_output_file(parent_dirs, name)

    filtered_audio = low_pass_filter(y, sr, CUTOFF)

    # Let a directory-creation failure propagate: swallowing it only delayed
    # the crash to sf.write below, with a less informative error.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    sf.write(output_file, filtered_audio, sr, format='WAV')

    # Record where the transformed file was written.
    df_copy.at[index, 'path'] = os.path.abspath(output_file)

    # Progress message every 50 rows.
    if index % 50 == 0:
        print(f"Transformed {index} files")


# Save the metadata of the transformed dataset.
df_copy.to_csv(csv_file_path, index=False)

Transformed 0 files
Transformed 50 files
Transformed 100 files
Transformed 150 files
Transformed 200 files
Transformed 250 files
Transformed 300 files
Transformed 350 files
Transformed 400 files
Transformed 450 files
Transformed 500 files
Transformed 550 files
Transformed 600 files
Transformed 650 files
Transformed 700 files
Transformed 750 files
Transformed 800 files
Transformed 850 files
Transformed 900 files
Transformed 950 files
Transformed 1000 files
Transformed 1050 files
Transformed 1100 files
Transformed 1150 files
Transformed 1200 files
Transformed 1250 files
Transformed 1300 files
Transformed 1350 files
Transformed 1400 files
Transformed 1450 files
Transformed 1500 files
Transformed 1550 files
Transformed 1600 files
Transformed 1650 files
Transformed 1700 files
Transformed 1750 files
Transformed 1800 files
Transformed 1850 files
Transformed 1900 files
Transformed 1950 files
Transformed 2000 files
Transformed 2050 files
Transformed 2100 files
Transformed 2150 files
Transformed