In [1]:
import torch
print(torch.cuda.is_available())         # should return True
# get_device_name(0) raises when no CUDA device is visible, so only query
# the device name once availability has been confirmed.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))     # should say "NVIDIA GeForce RTX 3060"
else:
    print("No CUDA device available; running on CPU")


True
NVIDIA GeForce RTX 3060


IMPORTS

In [3]:
# For data manipulation and analysis 
import pandas as pd

# For numerical operations and array handling 
import numpy as np

# For creating plots and visualisations
import matplotlib.pylab as plt

# For advanced data visualisations 
#import seaborn as sns

# For file pattern matching 
import glob as glob

# For audio processing 
import librosa

# For displaying audio data visually 
import librosa.display

# For playing audio directly in notebooks 
import IPython.display as ipd


In [None]:
import os
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split

# --- Configuration -----------------------------------------------------
RANDOM_SEED = 42
BASE_DIR = "datasets/release_in_the_wild"
META_CSV = os.path.join(BASE_DIR, "meta.csv")
WAV_DIR = BASE_DIR

# One destination folder per split, each directly under BASE_DIR
OUTPUT_DIRS = {
    split: os.path.join(BASE_DIR, split)
    for split in ("train", "val", "test")
}

# Make sure every destination folder exists (no error if already present)
for path in OUTPUT_DIRS.values():
    os.makedirs(path, exist_ok=True)

# Read the metadata and strip stray whitespace from the text columns
df = pd.read_csv(META_CSV, header=0)
for text_column in ("file", "label", "speaker"):
    df[text_column] = df[text_column].str.strip()

# Stratified 70/15/15 split: hold out 30 % of the rows, then cut that
# hold-out in half to obtain the validation and test sets.
# NOTE(review): stratification is on "label" only, so rows sharing a
# "speaker" value can land in different splits -- confirm this is intended.
train_df, temp_df = train_test_split(
    df,
    test_size=0.30,
    random_state=RANDOM_SEED,
    stratify=df["label"],
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=RANDOM_SEED,
    stratify=temp_df["label"],
)

# Move files
def move_files(subset_df, split_name):
    """Move the audio files listed in ``subset_df`` into a split folder.

    Each value of the ``"file"`` column is looked up under ``WAV_DIR`` and
    moved to ``OUTPUT_DIRS[split_name]``.

    Parameters
    ----------
    subset_df : pandas.DataFrame
        Metadata rows of one split; must contain a ``"file"`` column.
    split_name : str
        Key into ``OUTPUT_DIRS`` selecting the destination folder.

    Returns
    -------
    list of str
        File names found neither at the source nor at the destination.
        An empty list means every listed file is in place.

    Raises
    ------
    KeyError
        If ``split_name`` is not a key of ``OUTPUT_DIRS``.
    """
    destination_dir = OUTPUT_DIRS[split_name]
    missing = []

    # Only the "file" column is needed, so iterate it directly instead of
    # materialising a Series per row with iterrows().
    for file_name in subset_df["file"]:
        src = os.path.join(WAV_DIR, file_name)
        dst = os.path.join(destination_dir, file_name)
        if os.path.exists(src):
            shutil.move(src, dst)
        elif not os.path.exists(dst):
            # Absent from both places: record it rather than skipping silently.
            # A file already at dst (e.g. from a previous run) is fine.
            missing.append(file_name)

    if missing:
        print(f"Warning: {len(missing)} file(s) listed for '{split_name}' were not found and could not be moved.")

    return missing

# Pair every split name with its DataFrame and its metadata file name
split_outputs = (
    ("train", train_df, "train_meta.csv"),
    ("val", val_df, "val_meta.csv"),
    ("test", test_df, "test_meta.csv"),
)

# Relocate the audio of all three splits first ...
for split_name, split_df, meta_name in split_outputs:
    move_files(split_df, split_name)

# ... then save each split's metadata directly under BASE_DIR
for split_name, split_df, meta_name in split_outputs:
    split_df.to_csv(os.path.join(BASE_DIR, meta_name), index=False)

print("Dataset successfully split into train/val/test (70/15/15).")

import os

def count_files(folder_path):
    """Return how many regular files sit directly inside ``folder_path``.

    Sub-directories are not counted and are not descended into.
    """
    return sum(
        os.path.isfile(os.path.join(folder_path, entry_name))
        for entry_name in os.listdir(folder_path)
    )

# Report the number of files now present in each split folder
for split_label, split_folder in (
    ("Train", 'datasets/release_in_the_wild/train'),
    ("Val", 'datasets/release_in_the_wild/val'),
    ("Test", 'datasets/release_in_the_wild/test'),
):
    print(f"{split_label} files: {count_files(split_folder)}")


# The file counts add up to the correct total, so no files were lost in the process.

Dataset successfully split into train/val/test (70/15/15).
Train files: 22245
Val files: 4767
Test files: 4767


In [None]:
import librosa
import noisereduce as nr
import numpy as np
import soundfile as sf
from pathlib import Path

def preprocess_audio(y, sr, target_duration=6.0, apply_preemphasis=False, apply_reduction=False, coef=0.5, normalise='rms'):
    """Clean one audio signal and force it to a fixed length.

    Steps, in order: trim leading/trailing silence, optional noise
    reduction, optional pre-emphasis, optional level normalisation, then
    zero-pad or truncate to ``int(sr * target_duration)`` samples.

    Parameters
    ----------
    y : np.ndarray
        Audio samples.
    sr : int
        Sample rate of ``y`` in Hz.
    target_duration : float
        Length of the returned signal in seconds.
    apply_preemphasis : bool
        Apply ``librosa.effects.preemphasis`` with coefficient ``coef``.
    apply_reduction : bool
        Apply ``noisereduce.reduce_noise``.
    coef : float
        Pre-emphasis coefficient (only used when ``apply_preemphasis``).
    normalise : str or None
        ``'rms'`` divides by the signal's RMS, ``'peak'`` divides by its
        maximum absolute value; any other value leaves the level untouched.

    Returns
    -------
    np.ndarray
        Signal of exactly ``int(sr * target_duration)`` samples. An empty
        signal yields all zeros.
    """
    y = np.asarray(y)

    # Trim leading/trailing silence (nothing to trim on an empty signal)
    if y.size > 0:
        y, _ = librosa.effects.trim(y)

    # Everything below needs at least one sample: np.max raises on an empty
    # array and np.mean returns NaN with a warning. An empty signal simply
    # falls through to the zero-padding step.
    if y.size > 0:
        # Apply noise reduction / dereverberation
        if apply_reduction:
            y = nr.reduce_noise(y=y, sr=sr)

        # Apply pre-emphasis filter
        if apply_preemphasis:
            y = librosa.effects.preemphasis(y, coef=coef)

        # Normalisation method (the 1e-6 term guards against division by zero)
        if normalise == 'rms':
            rms = np.sqrt(np.mean(y**2))
            y = y / (rms + 1e-6)
        elif normalise == 'peak':
            y = y / (np.max(np.abs(y)) + 1e-6)

    # Duration control: pad or truncate
    target_length = int(sr * target_duration)
    if len(y) < target_length:
        y = np.pad(y, (0, target_length - len(y)))
    else:
        y = y[:target_length]

    return y

def preprocess_folder(input_folder, output_folder, sr=22050, target_duration=6.0, apply_preemphasis=False, apply_reduction=False, coef=0.5, normalise='rms', subtype="FLOAT"):
    """Preprocess every ``*.wav`` under ``input_folder`` into ``output_folder``.

    Files are discovered recursively, loaded at sample rate ``sr``, passed
    through ``preprocess_audio`` and written to the same relative path below
    ``output_folder``. A file that fails is reported and skipped; the
    remaining files are still processed.

    Parameters
    ----------
    input_folder, output_folder : str or Path
        Source and destination root folders.
    sr : int
        Sample rate used for loading and for writing.
    target_duration, apply_preemphasis, apply_reduction, coef, normalise
        Forwarded unchanged to ``preprocess_audio``.
    subtype : str or None
        Sample encoding passed to ``soundfile.write``. The default
        ``"FLOAT"`` stores samples as floating point, so values outside
        [-1.0, 1.0] survive the round trip. This matters for
        ``normalise='rms'``, which scales to unit RMS and therefore produces
        peaks above 1.0 that an integer PCM encoding cannot represent.
        Pass ``None`` to use soundfile's default for the format.
    """
    input_folder = Path(input_folder)
    output_folder = Path(output_folder)
    # Materialise the listing up front so files written during the loop are
    # never picked up as new inputs.
    wav_files = list(input_folder.rglob("*.wav"))

    print(f"Found {len(wav_files)} audio files in {input_folder}")

    for wav_file in wav_files:
        try:
            y, _ = librosa.load(wav_file, sr=sr)
            y = preprocess_audio(
                y, sr,
                target_duration=target_duration,
                apply_preemphasis=apply_preemphasis,
                apply_reduction=apply_reduction,
                coef=coef,
                normalise=normalise
            )

            # Save processed file to the same structure
            relative_path = wav_file.relative_to(input_folder)
            out_path = output_folder / relative_path
            out_path.parent.mkdir(parents=True, exist_ok=True)

            # Explicit subtype: integer PCM would clip out-of-range samples
            sf.write(out_path, y, sr, subtype=subtype)

        except Exception as e:
            # Best effort: report the failing file and continue with the rest
            print(f"Failed: {wav_file} → {e}")
