In [None]:
import pandas as pd
import numpy as np
import librosa
from pathlib import Path
import os
from tqdm.auto import tqdm
import joblib

tqdm.pandas()

from sklearn.model_selection import GroupShuffleSplit

import torch
from transformers import Wav2Vec2Processor, Wav2Vec2Model

In [None]:
!pip install --no-deps audiomentations
!pip install numpy_minmax numpy_rms python_stretch
from audiomentations import Compose, AddGaussianNoise, PitchShift

In [None]:
# Configuration
# ---- Input data and output folder -------------------------------------------
CLIPS_DIR = Path("/kaggle/input/sep-28k/clips/stuttering-clips/clips")
LABEL_FILE = "/kaggle/input/sep-28k/SEP-28k_labels.csv"
OUTPUT_DIR = Path("./output_wav2vec_precomputed_features")
OUTPUT_DIR.mkdir(exist_ok=True)

# ---- Model and feature geometry ---------------------------------------------
MODEL_NAME = "facebook/wav2vec2-base-960h"
SAMPLING_RATE = 16000
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {DEVICE}")
MAX_SEQ_LEN = 150    # time steps kept per clip (shorter outputs are zero-padded, longer ones cut)
EMBEDDING_DIM = 768  # width of each stored embedding vector

# ---- Labels -----------------------------------------------------------------
LABEL_COLS = ['Prolongation', 'Block', 'SoundRep', 'WordRep', 'Interjection']
# 'NoStutter' is a derived column: it is set when none of LABEL_COLS is active.
ALL_LABELS = [*LABEL_COLS, 'NoStutter']

# Number of source clips read per extraction batch.
BATCH_SIZE = 32

# ---- Artifacts written under OUTPUT_DIR -------------------------------------
CLEANED_DF_PATH = OUTPUT_DIR / "df_multilabel.parquet"

TRAIN_EMBEDDINGS_PATH = OUTPUT_DIR / "train_seq_embeddings.npy"
TRAIN_LABELS_PATH = OUTPUT_DIR / "train_labels.npy"

VAL_EMBEDDINGS_PATH = OUTPUT_DIR / "val_seq_embeddings.npy"
VAL_LABELS_PATH = OUTPUT_DIR / "val_labels.npy"

TEST_EMBEDDINGS_PATH = OUTPUT_DIR / "test_seq_embeddings.npy"
TEST_LABELS_PATH = OUTPUT_DIR / "test_labels.npy"

In [None]:
def _build_clean_dataframe():
    """Build the multi-label clip table from the raw label CSV.

    Keeps only rows whose clip file exists in CLIPS_DIR and whose measured
    duration lies strictly between 2.95 s and 3.05 s, binarises every column
    in LABEL_COLS (any value > 0 becomes 1) and adds a 'NoStutter' column
    that is 1 when none of LABEL_COLS is set.
    """
    labels = pd.read_csv(LABEL_FILE)

    # Clip file stems are "<Show>_<EpId>_<ClipId>".
    show, episode, clip_id = (labels[part].astype(str) for part in ('Show', 'EpId', 'ClipId'))
    labels['Clip'] = show + '_' + episode + '_' + clip_id

    clips_in_folder = {wav.stem for wav in CLIPS_DIR.glob("*.wav")}
    labels = labels[labels['Clip'].isin(clips_in_folder)].copy()

    def get_duration(clip_name):
        # Best effort: a file that cannot be read yields None and is dropped below.
        try:
            return librosa.get_duration(path=CLIPS_DIR / f"{clip_name}.wav")
        except Exception:
            return None

    labels['duration'] = labels['Clip'].progress_apply(get_duration)
    labels = labels.dropna(subset=['duration'])

    longer_than_min = labels['duration'] > 2.95
    shorter_than_max = labels['duration'] < 3.05
    labels = labels[longer_than_min & shorter_than_max].copy()

    for col in LABEL_COLS:
        is_positive = labels[col] > 0
        labels[col] = is_positive.astype(int)

    active_per_row = labels[LABEL_COLS].sum(axis=1)
    labels['NoStutter'] = (active_per_row == 0).astype(int)
    return labels


print("Preparing and Splitting Data")
if os.path.exists(CLEANED_DF_PATH):
    # Reuse the cached table from a previous run.
    print('FOUND')
    df = pd.read_parquet(CLEANED_DF_PATH)
else:
    df = _build_clean_dataframe()
    df.to_parquet(CLEANED_DF_PATH)

# The show name is the grouping key, so all clips of one show land in the same split.
df['speaker_id'] = df['Show']

# First cut: hold out 15% of the groups as the test set.
gss_test = GroupShuffleSplit(n_splits=1, test_size=0.15, random_state=42)
train_val_idx, test_idx = next(gss_test.split(df, groups=df['speaker_id']))
train_val_df, test_df = df.iloc[train_val_idx], df.iloc[test_idx]

# Second cut: hold out 15% of the remaining groups as the validation set.
gss_val = GroupShuffleSplit(n_splits=1, test_size=0.15, random_state=42)
train_idx, val_idx = next(gss_val.split(train_val_df, groups=train_val_df['speaker_id']))
train_df, val_df = train_val_df.iloc[train_idx], train_val_df.iloc[val_idx]
print("Data splitting complete.")

In [None]:

def _augmentation_counts(label_matrix, label_counts, max_count):
    """Return how many augmented copies to create for each row of `label_matrix`.

    For a row, the rarest of its active labels (value == 1) decides the count:
    round(max_count / rarest_count - 1), clamped to the range 0..4. Rows with
    no active label get 0.

    Args:
        label_matrix: 2-D array, one row per clip, columns ordered like ALL_LABELS.
        label_counts: per-label totals, indexable by label name.
        max_count: the largest value in `label_counts`.

    Returns:
        A list of ints with one entry per row of `label_matrix`.
    """
    counts = []
    for row_labels in tqdm(label_matrix, total=len(label_matrix), desc="Calculating final size"):
        active_counts = [label_counts[name] for name, flag in zip(ALL_LABELS, row_labels) if flag == 1]
        if not active_counts:
            counts.append(0)
            continue
        rarest_label_count = min(active_counts)
        counts.append(min(4, max(0, round((max_count / rarest_label_count) - 1))))
    return counts


def extract_wav2vec_sequences(dataframe, desc="Extracting", augment=False):
    """Turn every clip in `dataframe` into a fixed-length Wav2Vec2 embedding sequence.

    Each clip named in the 'Clip' column is loaded from CLIPS_DIR at
    SAMPLING_RATE and passed through the MODEL_NAME Wav2Vec2 model. Its
    `last_hidden_state` is zero-padded or truncated to MAX_SEQ_LEN time steps.

    With `augment=True`, each clip is additionally followed by 0-4 randomly
    augmented copies (Gaussian noise and/or pitch shift). The number of copies
    grows as the clip's rarest active label gets rarer in `dataframe`.

    Args:
        dataframe: table with a 'Clip' column and one 0/1 column per entry of ALL_LABELS.
        desc: name used in progress bars and in the size message.
        augment: whether to add augmented copies.

    Returns:
        (all_sequences, all_labels): a float32 array of shape
        (n_samples, MAX_SEQ_LEN, EMBEDDING_DIM) and an int8 array of shape
        (n_samples, len(ALL_LABELS)), where n_samples is the number of clips
        plus the number of augmented copies.
    """
    processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
    model = Wav2Vec2Model.from_pretrained(MODEL_NAME).to(DEVICE)
    model.eval()

    n_clips = len(dataframe)
    clip_names = dataframe['Clip'].tolist()
    label_matrix = dataframe[ALL_LABELS].to_numpy()

    # The per-row augmentation counts are computed exactly once and then drive
    # both the array allocation and the extraction loop, so the two can never
    # disagree about how many samples exist.
    if augment:
        augmenter = Compose([
            AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
            PitchShift(min_semitones=-4, max_semitones=4, p=0.5)
        ])
        label_counts = dataframe[ALL_LABELS].sum()
        aug_counts = _augmentation_counts(label_matrix, label_counts, label_counts.max())
    else:
        augmenter = None
        aug_counts = [0] * n_clips

    final_size = n_clips + sum(aug_counts)
    print(f"Final dataset size for '{desc}' will be {final_size} samples.")

    # Pre-allocate NumPy arrays; rows start as zeros, which doubles as the padding value.
    all_sequences = np.zeros((final_size, MAX_SEQ_LEN, EMBEDDING_DIM), dtype=np.float32)
    all_labels = np.zeros((final_size, len(ALL_LABELS)), dtype=np.int8)

    current_idx = 0

    with torch.no_grad():
        for start in tqdm(range(0, n_clips, BATCH_SIZE), desc=f"{desc} Batches"):
            stop = min(start + BATCH_SIZE, n_clips)

            audio_batch = []
            labels_batch = []

            # NOTE: BATCH_SIZE counts source clips. With augmentation a single
            # forward pass can hold up to 5x as many waveforms.
            for pos in range(start, stop):
                clip_path = str(CLIPS_DIR / f"{clip_names[pos]}.wav")
                audio, _sr = librosa.load(clip_path, sr=SAMPLING_RATE)

                audio_batch.append(audio)
                labels_batch.append(label_matrix[pos])

                for _copy in range(aug_counts[pos]):
                    audio_batch.append(augmenter(samples=audio, sample_rate=SAMPLING_RATE))
                    labels_batch.append(label_matrix[pos])

            inputs = processor(audio_batch, sampling_rate=SAMPLING_RATE, return_tensors="pt", padding=True)
            inputs = inputs.to(DEVICE)
            sequences = model(**inputs).last_hidden_state.cpu().numpy()

            # Every sequence in the batch has the same number of time steps, so
            # the batch is written with one slice assignment. Steps beyond
            # MAX_SEQ_LEN are dropped; missing steps stay zero.
            n_samples = sequences.shape[0]
            n_steps = min(sequences.shape[1], MAX_SEQ_LEN)
            batch_rows = slice(current_idx, current_idx + n_samples)
            all_sequences[batch_rows, :n_steps] = sequences[:, :n_steps]
            all_labels[batch_rows] = np.asarray(labels_batch)
            current_idx += n_samples

    return all_sequences, all_labels

print("\nStarting Wav2Vec2 sequence extraction")

# One entry per split:
# (frame, progress-bar label, augment?, embeddings file, labels file, name used in the log line).
# Only the training split is augmented.
_SPLIT_JOBS = [
    (train_df, "Training", True, TRAIN_EMBEDDINGS_PATH, TRAIN_LABELS_PATH, "training"),
    (val_df, "Validation", False, VAL_EMBEDDINGS_PATH, VAL_LABELS_PATH, "validation"),
    (test_df, "Test", False, TEST_EMBEDDINGS_PATH, TEST_LABELS_PATH, "test"),
]

for split_df, split_desc, split_augment, embeddings_path, labels_path, split_name in _SPLIT_JOBS:
    X_split, y_split = extract_wav2vec_sequences(split_df, desc=split_desc, augment=split_augment)
    np.save(embeddings_path, X_split)
    np.save(labels_path, y_split)
    print(f"Saved {split_name} embeddings with shape: {X_split.shape}")
    # Drop the references before the next split is extracted, so the arrays of
    # two splits are never held in memory at the same time.
    del X_split, y_split

print("\nFeature extraction complete. ")