In [None]:
import pandas as pd
import numpy as np
import librosa
from pathlib import Path
import os
from tqdm.auto import tqdm
import joblib

tqdm.pandas()

from sklearn.model_selection import GroupShuffleSplit
!pip install --upgrade transformers huggingface-hub --quiet
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2Model

!pip install --no-deps audiomentations --quiet
!pip install numpy_minmax numpy_rms python_stretch --quiet 
from audiomentations import Compose, AddGaussianNoise, PitchShift

In [None]:
# Config
# Input locations: directory of .wav clips and the label CSV.
CLIPS_DIR = Path("/kaggle/input/sep-28k/clips/stuttering-clips/clips")
LABEL_FILE = "/kaggle/input/sep-28k/SEP-28k_labels.csv"

# Every artefact produced by this notebook is written below OUTPUT_DIR.
OUTPUT_DIR = Path("/kaggle/working/output_wav2vec_custom_augmented")
# parents=True so the notebook also runs when the parent directory does not
# exist yet (plain mkdir raises FileNotFoundError in that case).
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Pretrained checkpoint used for feature extraction and the sampling rate the
# audio is loaded at before being passed to the processor.
MODEL_NAME = "facebook/wav2vec2-large-960h"
SAMPLING_RATE = 16000
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# Shape of one stored feature sequence: every clip is padded/truncated to
# MAX_SEQ_LEN time steps of EMBEDDING_DIM values each.
# NOTE(review): EMBEDDING_DIM presumably has to equal the hidden size of
# MODEL_NAME - confirm when switching checkpoints.
MAX_SEQ_LEN = 150
EMBEDDING_DIM = 1024

# Labels
# The five stutter-event columns, plus a derived "NoStutter" column that is 1
# when none of the five is set.
LABEL_COLS = ['Prolongation', 'Block', 'SoundRep', 'WordRep', 'Interjection']
ALL_LABELS = LABEL_COLS + ['NoStutter']

# Number of source clips processed per extraction batch.
BATCH_SIZE = 32

# Cache / output files.
CLEANED_DF_PATH = OUTPUT_DIR / "df_multilabel.parquet"
TRAIN_DATA_PATH = OUTPUT_DIR / "train_data.npz"
VAL_DATA_PATH = OUTPUT_DIR / "val_data.npz"
TEST_DATA_PATH = OUTPUT_DIR / "test_data.npz"

In [None]:
print(" Preparing and Splitting Data... ")
if not os.path.exists(CLEANED_DF_PATH):
    # No cached frame yet: build it from the raw label CSV and cache it.
    df = pd.read_csv(LABEL_FILE)

    # Clip file stems have the form "<Show>_<EpId>_<ClipId>".
    show_part = df['Show'].astype(str)
    episode_part = df['EpId'].astype(str)
    clip_part = df['ClipId'].astype(str)
    df['Clip'] = show_part + '_' + episode_part + '_' + clip_part

    # Keep only the rows whose .wav file is present in CLIPS_DIR.
    clips_in_folder = set(wav_path.stem for wav_path in CLIPS_DIR.glob("*.wav"))
    df = df[df['Clip'].isin(clips_in_folder)].copy()

    def get_duration(clip_name):
        """Return the clip's duration as reported by librosa, or None on any error."""
        try:
            return librosa.get_duration(path=CLIPS_DIR / f"{clip_name}.wav")
        except Exception:
            # Best effort: unreadable clips are dropped below via dropna.
            return None

    df['duration'] = df['Clip'].progress_apply(get_duration)
    df = df.dropna(subset=['duration'])

    # Only keep clips whose duration lies strictly between 2.95 and 3.05.
    in_window = (df['duration'] > 2.95) & (df['duration'] < 3.05)
    df = df[in_window].copy()

    # Binarise each label column: any positive value becomes 1.
    for col in LABEL_COLS:
        df[col] = df[col].gt(0).astype(int)

    # NoStutter is set exactly when none of the stutter labels is set.
    stutter_label_count = df[LABEL_COLS].sum(axis=1)
    df['NoStutter'] = stutter_label_count.eq(0).astype(int)

    df.to_parquet(CLEANED_DF_PATH)
else:
    # Cached, already-cleaned frame from a previous run.
    df = pd.read_parquet(CLEANED_DF_PATH)

# The show name is used as the grouping key for the splits below.
df['speaker_id'] = df['Show']

In [None]:
# manual splits
# Each show is assigned as a whole to exactly one of train / val / test.
manual_splits = {
    'train': [
        'IStutterSoWhat',
        'MyStutteringLife',
        'StrongVoices',
        'StutteringIsCool',
        'WomenWhoStutter',
    ],
    'val': ['HeStutters', 'HVSA'],
    'test': ['StutterTalk'],
}


# DataFrames creation
def _rows_for_split(split_name):
    """Return an independent copy of the rows of `df` belonging to `split_name`."""
    in_split = df['speaker_id'].isin(manual_splits[split_name])
    return df.loc[in_split].copy()


train_df = _rows_for_split('train')
val_df = _rows_for_split('val')
test_df = _rows_for_split('test')

print(f"Train set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(test_df)}")
print("Data splitting complete using manual splits.")

In [None]:
# distribution before augmentation
# Print, for each split, how many clips carry each label (largest count first).
for _header, _split_frame in (
    ("\nOriginal Label Distribution in Training Set (Before Augmentation) ", train_df),
    ("\n Original Label Distribution in Val Set ", val_df),
    ("\n Original Label Distribution in Test Set", test_df),
):
    print(_header)
    _label_totals = _split_frame[ALL_LABELS].sum()
    print(_label_totals.sort_values(ascending=False))

In [None]:
def _count_augmentations(row):
    """Return how many augmented copies should be generated for one clip.

    Single source of truth for the custom oversampling rules, so the
    size pre-computation and the extraction loop can never disagree.

    Args:
        row: a row of the label DataFrame; must contain the LABEL_COLS
            columns with binary (0/1) values.

    Returns:
        int: number of augmented copies to create (0 means none).
    """
    num_augmentations = 0
    if row['Block'] == 0:
        if row['WordRep'] == 1 or row['SoundRep'] == 1:
            num_augmentations = 3
        elif row['Prolongation'] == 1:
            num_augmentations = 1
    elif row['Block'] == 1:
        if row['SoundRep'] == 1 or row['WordRep'] == 1:
            num_augmentations = 1

    # Extra copies depending on how many stutter labels are set in total.
    n_stutter_labels = row[LABEL_COLS].sum()
    if n_stutter_labels == 1 and row['WordRep'] == 1:
        num_augmentations += 3
    if n_stutter_labels == 2 and row['WordRep'] == 1 and row['SoundRep'] == 1:
        num_augmentations += 2
    if n_stutter_labels == 1 and row['SoundRep'] == 1:
        num_augmentations += 2

    return num_augmentations


def extract_wav2vec_sequences(dataframe, desc="Extracting", augment=False):
    """Extract fixed-length Wav2Vec2 hidden-state sequences for every clip.

    Loads the processor and model named by MODEL_NAME, reads each clip listed
    in `dataframe['Clip']` from CLIPS_DIR, and stores the model's
    `last_hidden_state` for it, zero-padded or truncated to MAX_SEQ_LEN time
    steps. With `augment=True`, each clip is followed by
    `_count_augmentations(row)` randomly augmented copies that share the
    clip's labels.

    Args:
        dataframe: DataFrame with a 'Clip' column and the ALL_LABELS columns.
        desc: label used in progress bars and the size message.
        augment: whether to add augmented copies according to the custom
            oversampling rules.

    Returns:
        tuple: (all_sequences, all_labels) where all_sequences is a float32
        array of shape (final_size, MAX_SEQ_LEN, EMBEDDING_DIM) and all_labels
        is an int8 array of shape (final_size, len(ALL_LABELS)). Rows are in
        dataframe order, each original clip immediately followed by its
        augmented copies.

    Raises:
        ValueError: if EMBEDDING_DIM does not match the model's hidden size.
        RuntimeError: if the number of extracted sequences differs from the
            pre-computed size.
    """
    processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
    model = Wav2Vec2Model.from_pretrained(MODEL_NAME).to(DEVICE)
    model.eval()

    # Fail fast with a clear message instead of a broadcast error after the
    # first (expensive) forward pass.
    if model.config.hidden_size != EMBEDDING_DIM:
        raise ValueError(
            f"EMBEDDING_DIM={EMBEDDING_DIM} does not match the hidden size "
            f"{model.config.hidden_size} of '{MODEL_NAME}'."
        )

    num_clips = len(dataframe)

    # Decide once how many augmented copies each row gets; the same list
    # drives both the pre-allocation size and the extraction loop.
    if augment:
        augmenter = Compose([
            AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
            PitchShift(min_semitones=-4, max_semitones=4, p=0.5)
        ])
        aug_counts = [
            _count_augmentations(row)
            for _, row in tqdm(dataframe.iterrows(), total=num_clips, desc="Calculating final size")
        ]
    else:
        augmenter = None
        aug_counts = [0] * num_clips

    final_size = num_clips + sum(aug_counts)
    print(f"Final dataset size for '{desc}' will be {final_size} samples.")

    # prealloc np arr to save mem; zero-initialised, so short sequences are
    # implicitly zero-padded.
    all_sequences = np.zeros((final_size, MAX_SEQ_LEN, EMBEDDING_DIM), dtype=np.float32)
    all_labels = np.zeros((final_size, len(ALL_LABELS)), dtype=np.int8)

    write_idx = 0

    with torch.no_grad():
        for start in tqdm(range(0, num_clips, BATCH_SIZE), desc=f"{desc} Batches"):
            batch_df = dataframe.iloc[start:start + BATCH_SIZE]
            batch_counts = aug_counts[start:start + BATCH_SIZE]

            audio_batch = []
            labels_batch = []

            for (_, row), n_aug in zip(batch_df.iterrows(), batch_counts):
                clip_path = str(CLIPS_DIR / f"{row['Clip']}.wav")
                audio, _ = librosa.load(clip_path, sr=SAMPLING_RATE)
                current_labels = row[ALL_LABELS].values

                audio_batch.append(audio)
                labels_batch.append(current_labels)

                # n_aug is always 0 when augment is False.
                for _ in range(n_aug):
                    augmented_audio = augmenter(samples=audio, sample_rate=SAMPLING_RATE)
                    audio_batch.append(augmented_audio)
                    labels_batch.append(current_labels)

            # NOTE: with augmentation one forward pass can contain several
            # times BATCH_SIZE clips.
            inputs = processor(audio_batch, sampling_rate=SAMPLING_RATE, return_tensors="pt", padding=True)
            inputs = inputs.to(DEVICE)
            sequences = model(**inputs).last_hidden_state.cpu().numpy()

            # Copy at most MAX_SEQ_LEN frames; the remainder stays zero.
            for seq, labels in zip(sequences, labels_batch):
                n_frames = min(seq.shape[0], MAX_SEQ_LEN)
                all_sequences[write_idx, :n_frames] = seq[:n_frames]
                all_labels[write_idx] = labels
                write_idx += 1

    # Explicit invariant instead of silently truncating / leaving zero rows.
    if write_idx != final_size:
        raise RuntimeError(
            f"Extracted {write_idx} sequences for '{desc}' but expected {final_size}."
        )

    return all_sequences, all_labels

def _extract_and_save(split_df, desc, out_path, split_word, augment=False):
    """Extract sequences for one split, save them compressed, and return the labels.

    The feature array is local to this function, so it is released as soon
    as the function returns.
    """
    features, labels = extract_wav2vec_sequences(split_df, desc=desc, augment=augment)
    np.savez_compressed(out_path, x=features, y=labels)
    print(f"Saved compressed {split_word} data with shape: {features.shape}")
    return labels


print("\n Starting Wav2Vec2 sequence extraction... ")

# Training split: the only one that is augmented.
_train_labels = _extract_and_save(train_df, "Training", TRAIN_DATA_PATH, "training", augment=True)

# distribution after augm
final_train_labels_df = pd.DataFrame(_train_labels, columns=ALL_LABELS)
print("\n Final Label Distribution in Training Set (After Augmentation) ")
print(final_train_labels_df.sum().sort_values(ascending=False))

del _train_labels

# Validation and test splits are extracted without augmentation.
_extract_and_save(val_df, "Validation", VAL_DATA_PATH, "validation", augment=False)
_extract_and_save(test_df, "Test", TEST_DATA_PATH, "test", augment=False)

print(f"\n Feature extraction complete. All files saved in '{OUTPUT_DIR}' directory. ")