In [1]:
import torch
print(torch.backends.mps.is_available())  # Should return True if MPS is available

True


In [6]:
import os
import librosa
import torch
import pandas as pd
from transformers import Wav2Vec2Processor, HubertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from datasets import Dataset as HFDataset
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm  # For progress tracking

# Path to the dataset folder
dataset_path = "/Users/anshumankumar/.cache/kagglehub/datasets/uwrfkaggler/ravdess-emotional-speech-audio/versions/1"

# Initialize lists to store file names and raw audio data
audio_list = []
file_list = []

# Iterate over the audio files and extract raw audio waveforms
for root, dirs, files in os.walk(dataset_path):
    for file in files:
        if file.endswith(".wav"):
            file_path = os.path.join(root, file)
            
            # Load the raw audio using librosa
            audio, sr = librosa.load(file_path, sr=16000)  # Resample to 16kHz for HuBERT
            print(f"Loaded audio for {file}, shape: {audio.shape}")
            
            # Store the raw audio and file name
            audio_list.append(audio)
            file_list.append(file)

# Convert the raw audio features and file names into a DataFrame
df = pd.DataFrame({'file_name': file_list, 'audio': audio_list})

# Display the DataFrame
print(df.head())

# Load a pre-trained HuBERT model and processor for fine-tuning
processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
model = HubertForSequenceClassification.from_pretrained("facebook/hubert-large-ls960-ft", num_labels=3)  # Assuming 3 sentiment classes

# Set device to MPS if available
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
# device = torch.device("cpu")
model.to(device)

# Function to process audio input for HuBERT
def process_audio(batch):
    # Extract input features using the processor
    inputs = processor(batch["audio"], sampling_rate=16000, return_tensors="pt", padding=True)
    batch["input_values"] = inputs.input_values[0]
    return batch

# Convert pandas DataFrame to Hugging Face Dataset and process audio for HuBERT
hf_dataset = HFDataset.from_pandas(df)
hf_dataset = hf_dataset.map(process_audio)

# Custom Dataset Class for PyTorch
class AudioDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        input_values = torch.tensor(self.dataset[idx]["input_values"], dtype=torch.float32).contiguous()  # Ensure contiguous
        label = torch.tensor(self.dataset[idx].get("label", 0), dtype=torch.long)  # Assuming label 0 for now
        return input_values, label

# Custom collate function for padding sequences in the dataloader
def collate_fn(batch):
    inputs = [item[0] for item in batch]
    labels = torch.tensor([item[1] for item in batch], dtype=torch.long)
    
    # Pad the sequences to have the same length
    inputs_padded = pad_sequence(inputs, batch_first=True)
    
    return inputs_padded, labels  # Do not add extra channel dimension

# Create PyTorch DataLoader with the collate function
train_dataset = AudioDataset(hf_dataset)
train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)  # Reduced batch size

# Define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

# Training loop with progress tracking using tqdm
num_epochs = 3
model.train()


for epoch in range(num_epochs):  # Train for num_epochs epochs
    running_loss = 0.0
    epoch_iterator = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch")
    
    for i, batch in enumerate(epoch_iterator):
        optimizer.zero_grad()
        
        # Move inputs and labels to the device (MPS if available)
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(device)
        
        # Ensure inputs are contiguous and have the correct shape
        print(f"Before reshaping: Inputs shape: {inputs.shape}, Labels shape: {labels.shape}")

        # Reshape inputs if necessary (depends on input structure)
        inputs = inputs.contiguous().reshape(inputs.size(0), -1)  # [batch_size, sequence_length]

        print(f"After reshaping: Inputs shape: {inputs.shape}, Labels shape: {labels.shape}")

        # Forward pass
        outputs = model(input_values=inputs, labels=labels)
        loss = outputs.loss
        print(f"Loss: {loss.item()}")  # Print loss to track
        
        loss.backward()  # Backpropagation
        optimizer.step()  # Update weights

        running_loss += loss.item()
        epoch_iterator.set_postfix(loss=loss.item())

    print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_dataloader)}")

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_hubert_sentiment")
processor.save_pretrained("./fine_tuned_hubert_sentiment")
print("Model training complete!")
print("Thanks for your patience, the model is trained successfully.")

Loaded audio for 03-01-05-01-02-01-16.wav, shape: (62463,)
Loaded audio for 03-01-06-01-02-02-16.wav, shape: (57124,)
Loaded audio for 03-01-06-02-01-02-16.wav, shape: (59793,)
Loaded audio for 03-01-05-02-01-01-16.wav, shape: (63531,)
Loaded audio for 03-01-07-01-01-01-16.wav, shape: (60327,)
Loaded audio for 03-01-04-01-01-02-16.wav, shape: (58192,)
Loaded audio for 03-01-04-02-02-02-16.wav, shape: (59793,)
Loaded audio for 03-01-07-02-02-01-16.wav, shape: (66200,)
Loaded audio for 03-01-08-02-02-01-16.wav, shape: (57124,)
Loaded audio for 03-01-08-01-01-01-16.wav, shape: (56590,)
Loaded audio for 03-01-03-02-02-02-16.wav, shape: (60861,)
Loaded audio for 03-01-03-01-01-02-16.wav, shape: (58726,)
Loaded audio for 03-01-02-02-01-01-16.wav, shape: (61929,)
Loaded audio for 03-01-01-01-02-02-16.wav, shape: (57124,)
Loaded audio for 03-01-02-01-02-01-16.wav, shape: (60861,)
Loaded audio for 03-01-03-02-01-01-16.wav, shape: (60327,)
Loaded audio for 03-01-03-01-02-01-16.wav, shape: (59793

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-large-ls960-ft and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 2880/2880 [01:02<00:00, 46.39 examples/s]
Epoch 1/3:   0%|          | 0/1440 [00:00<?, ?batch/s]

Before reshaping: Inputs shape: torch.Size([2, 68869]), Labels shape: torch.Size([2])
After reshaping: Inputs shape: torch.Size([2, 68869]), Labels shape: torch.Size([2])
Loss: 1.0580354928970337


Epoch 1/3:   0%|          | 0/1440 [00:46<?, ?batch/s]

Thanks for your patience, the model is trained successfully.
Model training complete!



