In [None]:
pip install -U speechtokenizer

In [None]:
!git clone https://github.com/ZhangXInFD/SpeechTokenizer.git

In [None]:
ls

In [None]:
cd SpeechTokenizer

In [None]:
ls

In [None]:
!pip install .

In [None]:
pip install beartype

In [None]:
pip install einops

In [None]:
from speechtokenizer import SpeechTokenizer
import torchaudio
import torch
# Single-file smoke test: encode one clip and print the shape of the
# averaged code vector.
config_path = '../model_/speechtokenizer_hubert_avg_config.json'
ckpt_path = '../model_/SpeechTokenizer.pt'

# Put the model in eval mode for inference, as the batch cell further down
# does when it loads the same checkpoint.
model = SpeechTokenizer.load_from_checkpoint(config_path, ckpt_path).eval()

wav, sr = torchaudio.load('../../1001_DFA_ANG_XX.wav')

# Keep only the first channel when the file has more than one.
if wav.shape[0] > 1:
    wav = wav[:1, :]

# Match the sample rate the model reports.
if sr != model.sample_rate:
    wav = torchaudio.functional.resample(wav, sr, model.sample_rate)

# Prepend a batch axis.
wav = wav.unsqueeze(0)

with torch.no_grad():
    codes = model.encode(wav)

# Split on the leading axis: first slice vs. all remaining slices.
# NOTE(review): assumes `codes` is laid out as (n_quantizers, batch, frames),
# as described in the SpeechTokenizer docs -- confirm for the installed version.
RVQ_1 = codes[:1, :, :]
RVQ_supplement = codes[1:, :, :]

# Average each group over the quantizer axis (dim 0), then drop the batch axis
# explicitly. A bare `.squeeze()` removes *every* size-1 axis, which made
# `mean(dim=0)` reduce over time for RVQ_1 but over quantizers for
# RVQ_supplement.
RVQ_1_avg = RVQ_1.float().mean(dim=0).squeeze(0)
RVQ_supplement_avg = RVQ_supplement.float().mean(dim=0).squeeze(0)

# Combine both averages
final_avg = (RVQ_1_avg + RVQ_supplement_avg) / 2
print(final_avg.shape)

In [None]:
import os
import torch
import torchaudio
import pandas as pd
from speechtokenizer import SpeechTokenizer

# Where the extracted features are written, and where the input clips live.
output_file = "/kaggle/working/SpeechTokenizer_EmoDb.csv"
folder_path = "/kaggle/input/emodb-crema-d/EmoDb/wav"

# The checkpoint (.pt) and its JSON config are not bundled with this notebook:
# download them from the SpeechTokenizer GitHub project and point these two
# paths at the files.
ckpt_path = '/kaggle/input/model/SpeechTokenizer.pt'
config_path = '/kaggle/input/model/speechtokenizer_hubert_avg_config.json'

# Load the tokenizer, then switch it to evaluation mode.
model = SpeechTokenizer.load_from_checkpoint(config_path, ckpt_path)
model = model.eval()

def preprocess_audio(audio_path, target_sample_rate):
    """Load one audio file and shape it for the tokenizer.

    The file is read with torchaudio; if it has more than one channel only the
    first is kept; if its sample rate differs from ``target_sample_rate`` it is
    resampled; finally a leading batch axis is added.

    Args:
        audio_path: Path of the audio file to read.
        target_sample_rate: Sample rate the returned waveform should have.

    Returns:
        The prepared waveform tensor, or ``None`` if any step raised (the
        error is printed, not re-raised).
    """
    try:
        signal, native_rate = torchaudio.load(audio_path)

        # Multi-channel input: keep channel 0 only (slice keeps the axis).
        has_extra_channels = signal.shape[0] > 1
        if has_extra_channels:
            signal = signal[:1, :]

        # Only resample when the rates actually differ.
        if native_rate != target_sample_rate:
            signal = torchaudio.functional.resample(
                signal, native_rate, target_sample_rate
            )

        batched = signal.unsqueeze(0)
        return batched
    except Exception as e:
        print(f"Error loading audio file {audio_path}: {e}")
        return None

def extract_features(audio_path, model):
    """Encode one audio file and reduce its codes to a single feature vector.

    The clip is prepared with ``preprocess_audio`` at ``model.sample_rate``,
    encoded with ``model.encode`` under ``torch.no_grad()``, and the resulting
    codes are split on their leading axis into the first slice and the
    remaining slices. Each group is averaged over that leading axis and the
    two averages are then averaged together.

    NOTE(review): assumes ``model.encode`` returns codes laid out as
    (n_quantizers, batch, frames), as described in the SpeechTokenizer docs --
    confirm for the installed version. Under that assumption the result has
    one value per encoded frame, so its length varies with clip duration.

    Args:
        audio_path: Path of the audio file to process.
        model: Object exposing ``sample_rate`` and ``encode(waveform)``.

    Returns:
        A NumPy array with the combined average, or ``None`` if loading or
        encoding failed (the error is printed, not re-raised).
    """
    target_sample_rate = model.sample_rate
    audio = preprocess_audio(audio_path, target_sample_rate)
    if audio is None:
        return None

    try:
        # Encode without tracking gradients.
        with torch.no_grad():
            codes = model.encode(audio)

        # Reduce over the quantizer axis (dim 0) for BOTH groups, then drop the
        # batch axis explicitly. A bare `.squeeze()` removes every size-1 axis,
        # which made `mean(dim=0)` reduce over time for the first group
        # (yielding a scalar) but over quantizers for the second.
        RVQ_1_avg = codes[:1, :, :].float().mean(dim=0).squeeze(0)
        RVQ_supplement_avg = codes[1:, :, :].float().mean(dim=0).squeeze(0)

        final_avg = (RVQ_1_avg + RVQ_supplement_avg) / 2
        return final_avg.cpu().numpy()
    except Exception as e:
        print(f"Error processing {audio_path}: {e}")
        return None

# Extract features for every .wav file in `folder_path` and write them to
# `output_file` as a CSV with one row per successfully processed file.
all_features = []
filenames = []

# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of the CSV reproducible from run to run.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".wav"):
        continue

    file_path = os.path.join(folder_path, filename)

    # extract_features returns None on failure; skip those files so the two
    # lists stay aligned.
    features = extract_features(file_path, model)
    if features is None:
        continue

    all_features.append(features)
    filenames.append(filename)

# One row per file, feature values as columns, filename as the first column.
features_df = pd.DataFrame(all_features)
features_df.insert(0, 'filename', filenames)

features_df.to_csv(output_file, index=False)
print(f"Saved all features to {output_file}")