In [2]:
import numpy as np
import librosa
import joblib  # For saving models to download
from datasets import load_dataset, Audio
# from sklearnex import patch_sklearn
# patch_sklearn()
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# --- INSTALL LIBRARIES (COLAB) ---
# !pip install datasets librosa polars scikit-learn-intelex joblib



# --- CONFIGURATION ---
SAMPLE_RATE = 16000
WINDOW_SECONDS = 1.0
WINDOW_SAMPLES = int(SAMPLE_RATE * WINDOW_SECONDS) # 16000 samples

def enforce_window_size(audio, target_len):
    """
    Makes any audio file exactly `target_len` samples long.
    """
    curr_len = len(audio)
    
    if curr_len == target_len:
        return [audio]
    
    # CASE 1: Audio is too LONG (e.g. Background Noise)
    # Slice it into multiple non-overlapping chunks
    elif curr_len > target_len:
        chunks = []
        num_chunks = curr_len // target_len
        for i in range(num_chunks):
            start = i * target_len
            chunks.append(audio[start : start + target_len])
        return chunks
    
    # CASE 2: Audio is too SHORT (e.g. The Event)
    # Center the event in the window and pad with silence (zeros)
    else:
        pad_total = target_len - curr_len
        pad_left = pad_total // 2
        pad_right = pad_total - pad_left
        padded = np.pad(audio, (pad_left, pad_right), mode='constant')
        return [padded]

def load_training_data(dataset_name, split="train", max_chunks_per_class=5000):
    ds = load_dataset(dataset_name, split=split, streaming=False)
    ds = ds.cast_column("audio", Audio(sampling_rate=SAMPLE_RATE))
    
    data = {0: [], 1: []}
    
    print("Streaming and processing...")
    for item in ds:
        # Stop if full
        if len(data[0]) >= max_chunks_per_class and len(data[1]) >= max_chunks_per_class:
            break
            
        try:
            audio = item['audio']['array']
            label = item['label']
            
            # Skip empty/bad files
            if len(audio) < 100: continue
            
            # Skip if we already have enough of this class
            if len(data[label]) >= max_chunks_per_class: continue

            # --- CRITICAL: Enforce Window Size ---
            # This returns a list of 1.0s chunks
            chunks = enforce_window_size(audio, WINDOW_SAMPLES)
            
            for chunk in chunks:
                # Extract MFCC for this chunk
                mfcc = librosa.feature.mfcc(y=chunk, sr=SAMPLE_RATE, n_mfcc=13)
                # Average over time -> (13,) vector
                mfcc_vec = np.mean(mfcc.T, axis=0)
                
                data[label].append(mfcc_vec)
                
                # Check limit again inside loop
                if len(data[label]) >= max_chunks_per_class: break
                
        except Exception:
            continue
            
    print(f"Final Counts: Class 0={len(data[0])}, Class 1={len(data[1])}")
    
    X = np.array(data[0] + data[1])
    y = np.array([0]*len(data[0]) + [1]*len(data[1]))
    return X, y



  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# --- EXECUTION ---
#1. Load Data
ds_name = "geronimobasso/drone-audio-detection-samples"  # <--- PUT YOUR DATASET HERE
#X, y = load_training_data(ds_name)


In [None]:
ds = load_dataset(ds_name, split="train", streaming=False)


In [8]:
ds.save_to_disk("./my_offline_ds")

Saving the dataset (15/15 shards): 100%|██████████| 180320/180320 [00:41<00:00, 4368.11 examples/s] 


In [6]:
# --- 1. PREPROCESSING (Loop Padding Fix) ---
def enforce_window_size(audio, target_len):
    curr_len = len(audio)
    if curr_len == target_len:
        return [audio]
    elif curr_len > target_len:
        # Cut into chunks
        chunks = []
        num_chunks = curr_len // target_len
        for i in range(num_chunks):
            start = i * target_len
            chunks.append(audio[start : start + target_len])
        return chunks
    else:
        # LOOP PADDING (The Fix)
        repeats = int(np.ceil(target_len / curr_len))
        tiled = np.tile(audio, repeats)
        padded = tiled[:target_len]
        # Tiny noise to prevent perfect duplicate artifacts
        padded = padded + np.random.normal(0, 0.001, len(padded)) 
        return [padded]

In [7]:
data = []
labels = []
counts = {0: 0, 1: 0}
max_samples=1e5
for item in ds:
    if counts[0] >= max_samples and counts[1] >= max_samples:
        break
        
    label = item['label']
    if counts[label] >= max_samples: continue

    try:
        # Manual Decode
        audio_bytes = item['audio']['bytes']
        audio, _ = librosa.load(io.BytesIO(audio_bytes), sr=SAMPLE_RATE)
        
        # Pad/Cut
        chunks = enforce_window_size(audio, WINDOW_SAMPLES)
        
        for chunk in chunks:
            # Extract MFCC
            mfcc = librosa.feature.mfcc(y=chunk, sr=SAMPLE_RATE, n_mfcc=13)
            mfcc_vec = np.mean(mfcc.T, axis=0) # (13,)
            
            data.append(mfcc_vec)
            labels.append(label)
            
            # Update counts inside loop (chunks generate more samples)
            counts[label] += 1
            if counts[label] >= max_samples: break
            
    except Exception:
        continue
        
print(f"Final Counts: Class 0={counts[0]}, Class 1={counts[1]}")
return np.array(data), np.array(labels)

RuntimeError: Could not load libtorchcodec. Likely causes:
          1. FFmpeg is not properly installed in your environment. We support
             versions 4, 5, 6, 7, and 8.
          2. The PyTorch version (2.9.1+cpu) is not compatible with
             this version of TorchCodec. Refer to the version compatibility
             table:
             https://github.com/pytorch/torchcodec?tab=readme-ov-file#installing-torchcodec.
          3. Another runtime dependency; see exceptions below.
        The following exceptions were raised as we tried to load libtorchcodec:
        
[start of libtorchcodec loading traceback]
FFmpeg version 8: Could not load this library: C:\Users\Mostafa\Downloads\Leopard detection project\Code\GMM\.venv\Lib\site-packages\torchcodec\libtorchcodec_core8.dll
FFmpeg version 7: Could not load this library: C:\Users\Mostafa\Downloads\Leopard detection project\Code\GMM\.venv\Lib\site-packages\torchcodec\libtorchcodec_core7.dll
FFmpeg version 6: Could not load this library: C:\Users\Mostafa\Downloads\Leopard detection project\Code\GMM\.venv\Lib\site-packages\torchcodec\libtorchcodec_core6.dll
FFmpeg version 5: Could not load this library: C:\Users\Mostafa\Downloads\Leopard detection project\Code\GMM\.venv\Lib\site-packages\torchcodec\libtorchcodec_core5.dll
FFmpeg version 4: Could not load this library: C:\Users\Mostafa\Downloads\Leopard detection project\Code\GMM\.venv\Lib\site-packages\torchcodec\libtorchcodec_core4.dll
[end of libtorchcodec loading traceback].

In [4]:

# 2. Train Scaler (Crucial for realtime)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)



NameError: name 'X' is not defined

In [None]:
# 3. Train Models
# print("Training GMMs...")
# gmm0 = GaussianMixture(n_components=16, covariance_type='diag', random_state=42)
# gmm0.fit(X_scaled[y == 0])
# gmm1 = GaussianMixture(n_components=16, covariance_type='diag', random_state=42)
# gmm1.fit(X_scaled[y == 1])



In [None]:
# 4. SAVE EVERYTHING
# joblib.dump(scaler, 'scaler.pkl')
# joblib.dump(gmm0, 'gmm0.pkl')
# joblib.dump(gmm1, 'gmm1.pkl')
# print("Models saved! Download .pkl files to your laptop.")

In [None]:
X, y = load_segmented_data("your_dataset_name", target_samples=5000)