In [None]:
# %%
# ==============================================================================
# PRE-COMPUTATION WITH CUSTOM AUGMENTATION
# ==============================================================================
# This script uses my custom fine-tuned augmentation strategy to
# create the final pre-computed feature set. It involved solving an optimization
# problem: increase the number of clips in such a way that the mean class count
# is as close as possible to the un-augmented clip count of the largest class,
# while making sure the balanced distribution has as low a standard deviation
# as possible.
# As the clips are multi-label, simply repeating clips might also increase
# the count of the largest class, which we want to leave as is.
## THIS IS CUSTBAL2.1: the groups are divided properly for a more balanced
## 70-15-15 split.
# ==============================================================================

import pandas as pd
import numpy as np
import librosa
from pathlib import Path
import os
from tqdm.auto import tqdm
import joblib

tqdm.pandas()

from sklearn.model_selection import GroupShuffleSplit

import torch
from transformers import Wav2Vec2Processor, Wav2Vec2Model

!pip install --no-deps audiomentations
!pip install numpy_minmax numpy_rms python_stretch
from audiomentations import Compose, AddGaussianNoise, PitchShift

2025-07-28 11:07:31.610571: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753700851.802070      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753700851.855256      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Collecting audiomentations
  Downloading audiomentations-0.42.0-py3-none-any.whl.metadata (11 kB)
Downloading audiomentations-0.42.0-py3-none-any.whl (86 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.5/86.5 kB[0m [31m648.3 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: audiomentations
Successfully installed audiomentations-0.42.0
Collecting numpy_minmax
  Downloading numpy_minmax-0.5.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.0 kB)
Collecting numpy_rms
  Downloading numpy_rms-0.6.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.5 kB)
Collecting python_stretch
  Downloading python_stretch-0.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting numpy<3,>=2 (from numpy_minmax)
  Downloading numpy-2.3.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.wh

In [None]:
# ---- Configuration: everything tunable lives in this cell ----

# Input locations (Kaggle dataset mount)
CLIPS_DIR = Path("/kaggle/input/sep-28k/clips/stuttering-clips/clips")
LABEL_FILE = "/kaggle/input/sep-28k/SEP-28k_labels.csv"

# Output directory and the artefacts written into it
OUTPUT_DIR = Path("./output_wav2vec_custom_augmented")
OUTPUT_DIR.mkdir(exist_ok=True)
CLEANED_DF_PATH = OUTPUT_DIR.joinpath("df_multilabel.parquet")
TRAIN_DATA_PATH = OUTPUT_DIR.joinpath("train_data.npz")
VAL_DATA_PATH = OUTPUT_DIR.joinpath("val_data.npz")
TEST_DATA_PATH = OUTPUT_DIR.joinpath("test_data.npz")

# Feature extractor settings
MODEL_NAME = "facebook/wav2vec2-base-960h"
SAMPLING_RATE = 16000
MAX_SEQ_LEN = 150      # frames kept per clip (shorter sequences are zero-padded downstream)
EMBEDDING_DIM = 768    # width of each frame embedding stored downstream
BATCH_SIZE = 32        # clips (before augmentation) per forward pass

# Prefer the GPU when one is visible, otherwise fall back to the CPU
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {DEVICE}")

# Label columns: the five stutter types, plus the derived "no stutter" class
LABEL_COLS = ['Prolongation', 'Block', 'SoundRep', 'WordRep', 'Interjection']
ALL_LABELS = [*LABEL_COLS, 'NoStutter']

Using device: cuda


In [None]:
print("--- Preparing and Splitting Data... ---")

# Reuse the cached, cleaned label table when a previous run already built it;
# otherwise build it from the raw label CSV and cache it for next time.
if not os.path.exists(CLEANED_DF_PATH):
    df = pd.read_csv(LABEL_FILE)

    # Clip identifier "<Show>_<EpId>_<ClipId>", compared below against the wav file stems.
    df['Clip'] = df['Show'].astype(str).str.cat(
        [df['EpId'].astype(str), df['ClipId'].astype(str)], sep='_'
    )

    # Keep only rows whose audio file is actually present in CLIPS_DIR.
    clips_in_folder = {wav_path.stem for wav_path in CLIPS_DIR.glob("*.wav")}
    df = df[df['Clip'].isin(clips_in_folder)].copy()

    def get_duration(clip_name):
        """Return the clip's duration as reported by librosa, or None if it cannot be read."""
        try:
            return librosa.get_duration(path=CLIPS_DIR / f"{clip_name}.wav")
        except Exception:
            return None

    df['duration'] = df['Clip'].progress_apply(get_duration)

    # Drop unreadable clips, then keep only durations strictly between 2.95 and 3.05.
    df = df.dropna(subset=['duration'])
    within_window = df['duration'].gt(2.95) & df['duration'].lt(3.05)
    df = df[within_window].copy()

    # Binarise the label columns: any value above zero counts as a positive.
    df[LABEL_COLS] = (df[LABEL_COLS] > 0).astype(int)

    # A clip is "NoStutter" exactly when none of the stutter labels is set.
    df['NoStutter'] = df[LABEL_COLS].sum(axis=1).eq(0).astype(int)

    df.to_parquet(CLEANED_DF_PATH)
else:
    df = pd.read_parquet(CLEANED_DF_PATH)

# The show name is used as the grouping key for the splits.
df['speaker_id'] = df['Show']

--- Preparing and Splitting Data... ---


  0%|          | 0/28177 [00:00<?, ?it/s]

In [None]:
# Hand-picked assignment of shows to splits (each show appears in exactly one split).
manual_splits = dict(
    train=['IStutterSoWhat', 'MyStutteringLife', 'StrongVoices', 'StutteringIsCool', 'WomenWhoStutter'],
    val=['HeStutters', 'HVSA'],
    test=['StutterTalk'],
)

# Materialise one independent DataFrame per split by filtering on the show key.
train_df, val_df, test_df = (
    df.loc[df['speaker_id'].isin(manual_splits[split_name])].copy()
    for split_name in ('train', 'val', 'test')
)

print(f"Train set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(test_df)}")
print("Data splitting complete using manual splits.")


Train set size: 18544
Validation set size: 4300
Test set size: 5056
Data splitting complete using manual splits.


In [None]:
# Per-label positive counts for each split, largest first, before any augmentation.
for heading, split_frame in (
    ("\nOriginal Label Distribution in Training Set (Before Augmentation) ", train_df),
    ("\n--- Original Label Distribution in Val Set ", val_df),
    ("\n--- Original Label Distribution in Test Set", test_df),
):
    print(heading)
    label_totals = split_frame[ALL_LABELS].sum()
    print(label_totals.sort_values(ascending=False))


Original Label Distribution in Training Set (Before Augmentation) 
Block           7998
Interjection    5909
Prolongation    5569
NoStutter       4162
SoundRep        3448
WordRep         2752
dtype: int64

--- Original Label Distribution in Val Set 
Interjection    1828
Block           1809
Prolongation    1522
SoundRep        1193
WordRep          914
NoStutter        624
dtype: int64

--- Original Label Distribution in Test Set
Block           2024
Interjection    1859
Prolongation    1353
NoStutter       1150
WordRep          929
SoundRep         902
dtype: int64


In [None]:
def _num_augmentations(row):
    """Return how many augmented copies the custom balancing rule assigns to one clip.

    This is the single source of truth for the rule. It is used both to size the
    output arrays and to generate the copies, so the two can never disagree.

    Args:
        row: A mapping-like row (e.g. a pandas Series) holding the 0/1 columns
            named in ``LABEL_COLS``.

    Returns:
        int: Number of extra, augmented copies to create for this clip.
    """
    num_augmentations = 0
    if row['Block'] == 0:
        if row['WordRep'] == 1 or row['SoundRep'] == 1:
            num_augmentations = 3
        elif row['Prolongation'] == 1:
            num_augmentations = 1
    elif row['Block'] == 1:
        # Either repetition type co-occurring with Block earns a single copy.
        if row['SoundRep'] == 1 or row['WordRep'] == 1:
            num_augmentations = 1

    # Extra copies for clips whose only stutter labels are the repetition classes.
    n_stutter_labels = row[LABEL_COLS].sum()
    if n_stutter_labels == 1 and row['WordRep'] == 1:
        num_augmentations += 3
    if n_stutter_labels == 2 and row['WordRep'] == 1 and row['SoundRep'] == 1:
        num_augmentations += 2
    if n_stutter_labels == 1 and row['SoundRep'] == 1:
        num_augmentations += 2

    return num_augmentations


def extract_wav2vec_sequences(dataframe, desc="Extracting", augment=False):
    """Extract fixed-length Wav2Vec2 hidden-state sequences for every clip in ``dataframe``.

    Each clip is loaded at ``SAMPLING_RATE`` and run through ``MODEL_NAME``. Its
    ``last_hidden_state`` is zero-padded or truncated to ``MAX_SEQ_LEN`` frames.
    When ``augment`` is true, each clip is followed by the number of augmented
    copies given by ``_num_augmentations``, all sharing the original's labels.

    Args:
        dataframe: DataFrame with a ``Clip`` column (wav file stem inside
            ``CLIPS_DIR``) and the columns listed in ``ALL_LABELS``.
        desc: Name of the split, used in progress bars and the size message.
        augment: Whether to add augmented copies (intended for the training split).

    Returns:
        tuple: ``(all_sequences, all_labels)`` where ``all_sequences`` is a float32
        array of shape ``(final_size, MAX_SEQ_LEN, EMBEDDING_DIM)`` and
        ``all_labels`` is an int8 array of shape ``(final_size, len(ALL_LABELS))``.

    Raises:
        RuntimeError: If the number of extracted sequences differs from the
            pre-computed dataset size. Previously this would have been silently
            truncated or left as all-zero rows.
    """
    processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
    model = Wav2Vec2Model.from_pretrained(MODEL_NAME).to(DEVICE)
    model.eval()
    # NOTE(review): both transforms use p=0.5, so presumably some "augmented"
    # copies come out unmodified -- confirm this is intended.
    augmenter = Compose([
        AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
        PitchShift(min_semitones=-4, max_semitones=4, p=0.5)
    ])

    # Decide once, per clip, how many augmented copies to make. The same counts
    # drive both the array pre-allocation and the generation loop below.
    n_clips = len(dataframe)
    if augment:
        aug_counts = [
            _num_augmentations(row)
            for _, row in tqdm(dataframe.iterrows(), total=n_clips, desc="Calculating final size")
        ]
    else:
        aug_counts = [0] * n_clips
    final_size = n_clips + sum(aug_counts)

    print(f"Final dataset size for '{desc}' will be {final_size} samples.")

    # Pre-allocate NumPy arrays to save memory
    all_sequences = np.zeros((final_size, MAX_SEQ_LEN, EMBEDDING_DIM), dtype=np.float32)
    all_labels = np.zeros((final_size, len(ALL_LABELS)), dtype=np.int8)

    # Positional views of the columns needed in the hot loop.
    clip_names = dataframe['Clip'].tolist()
    label_matrix = dataframe[ALL_LABELS].to_numpy()

    current_idx = 0

    with torch.no_grad():
        for start in tqdm(range(0, n_clips, BATCH_SIZE), desc=f"{desc} Batches"):
            audio_batch = []
            labels_batch = []

            for pos in range(start, min(start + BATCH_SIZE, n_clips)):
                clip_path = str(CLIPS_DIR / f"{clip_names[pos]}.wav")
                audio, _ = librosa.load(clip_path, sr=SAMPLING_RATE)
                current_labels = label_matrix[pos]

                # The original clip always comes first, followed by its copies.
                audio_batch.append(audio)
                labels_batch.append(current_labels)

                for _ in range(aug_counts[pos]):
                    augmented_audio = augmenter(samples=audio, sample_rate=SAMPLING_RATE)
                    audio_batch.append(augmented_audio)
                    labels_batch.append(current_labels)

            inputs = processor(audio_batch, sampling_rate=SAMPLING_RATE, return_tensors="pt", padding=True)
            inputs = inputs.to(DEVICE)
            sequences = model(**inputs).last_hidden_state.cpu().numpy()

            # Fill the pre-allocated arrays. The buffers start as zeros, so writing
            # only the first n_frames rows is the same as zero-padding.
            for seq, seq_labels in zip(sequences, labels_batch):
                n_frames = min(seq.shape[0], MAX_SEQ_LEN)
                all_sequences[current_idx, :n_frames] = seq[:n_frames]
                all_labels[current_idx] = seq_labels
                current_idx += 1

    # Fail loudly rather than return a truncated or partly-zero dataset.
    if current_idx != final_size:
        raise RuntimeError(
            f"Extracted {current_idx} sequences for '{desc}' but expected {final_size}."
        )

    return all_sequences, all_labels

print("\n--- Starting Wav2Vec2 sequence extraction... ---")

# Training split: the only one that receives augmented copies.
X_train, y_train = extract_wav2vec_sequences(train_df, desc="Training", augment=True)
np.savez_compressed(TRAIN_DATA_PATH, x=X_train, y=y_train)
print(f"Saved compressed training data with shape: {X_train.shape}")

# Per-label totals of the training set once the augmented copies are included.
final_train_labels_df = pd.DataFrame(y_train, columns=ALL_LABELS)
print("\n--- Final Label Distribution in Training Set (After Augmentation) ---")
print(final_train_labels_df.sum().sort_values(ascending=False))

# Release the large arrays before the next split (memory-constrained environments).
del X_train, y_train

# Validation and test splits are extracted without augmentation, one at a time,
# and their arrays are dropped as soon as they have been written to disk.
for split_df, split_desc, split_path in (
    (val_df, "Validation", VAL_DATA_PATH),
    (test_df, "Test", TEST_DATA_PATH),
):
    X_split, y_split = extract_wav2vec_sequences(split_df, desc=split_desc, augment=False)
    np.savez_compressed(split_path, x=X_split, y=y_split)
    print(f"Saved compressed {split_desc.lower()} data with shape: {X_split.shape}")
    del X_split, y_split

print(f"\n--- Feature extraction complete. All files saved in '{OUTPUT_DIR}' directory. ---")

# %%


--- Starting Wav2Vec2 sequence extraction... ---


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Calculating final size:   0%|          | 0/18544 [00:00<?, ?it/s]

Final dataset size for 'Training' will be 33945 samples.


Training Batches:   0%|          | 0/580 [00:00<?, ?it/s]

Saved compressed training data with shape: (33945, 150, 768)

--- Final Label Distribution in Training Set (After Augmentation) ---
Block           10848
Interjection    10824
Prolongation    10800
SoundRep        10576
WordRep         10569
NoStutter        4162
dtype: int64


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Final dataset size for 'Validation' will be 4300 samples.


Validation Batches:   0%|          | 0/135 [00:00<?, ?it/s]

Saved compressed validation data with shape: (4300, 150, 768)


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Final dataset size for 'Test' will be 5056 samples.


Test Batches:   0%|          | 0/158 [00:00<?, ?it/s]

Saved compressed test data with shape: (5056, 150, 768)

--- Feature extraction complete. All files saved in 'output_wav2vec_custom_augmented' directory. ---
