In [1]:
!pip install peft

Collecting peft
  Downloading peft-0.14.0-py3-none-any.whl (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.8/374.8 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.21.0
  Downloading accelerate-1.3.0-py3-none-any.whl (336 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m336.6/336.6 kB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
Collecting torch>=1.13.0
  Downloading torch-2.5.1-cp310-cp310-manylinux1_x86_64.whl (906.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m906.4/906.4 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting nvidia-cuda-runtime-cu12==12.4.127
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (883 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m44.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cudnn-cu12==9.1.0.70
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x

In [26]:
import torch
import torch.nn as nn
import numpy as np
from transformers import AutoModel,  AutoProcessor
from peft import LoraConfig, get_peft_model

class WhisperAudioClassifier(nn.Module):
    def __init__(self, whisper_model_name='openai/whisper-base', num_emotions=7):
        super().__init__()
        # Load Whisper model and feature extractor
        # self.whisper = WhisperModel.from_pretrained(whisper_model_name)
        # self.feature_extractor = WhisperFeatureExtractor.from_pretrained(whisper_model_name)
        self.feature_extractor = AutoProcessor.from_pretrained(whisper_model_name)
        self.whisper = AutoModel.from_pretrained(whisper_model_name)
        # Freeze original Whisper parameters
        for param in self.whisper.parameters():
            param.requires_grad = False
        
        # LoRA configuration
        lora_config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=['q_proj', 'v_proj'],
            lora_dropout=0.1,
            bias='none',
            task_type='FEATURE_EXTRACTION'
        )
        
        # Apply LoRA
        self.whisper = get_peft_model(self.whisper, lora_config)
        
        # Classification head
        self.classifier = nn.Sequential(
            nn.Linear(self.whisper.config.hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_emotions)
        )
    
    def forward(self, audio):
        # Convert torch tensor to numpy if needed
        if torch.is_tensor(audio):
            audio = audio.numpy()

        print(audio.shape)
        # Preprocess audio using feature extractor
        inputs = self.feature_extractor(
            audio, 
            sampling_rate=16000, 
            return_tensors='pt'
        )
        print(inputs.input_features.shape)
        
        # Extract features from Whisper
        whisper_outputs = self.whisper.encoder(
            input_features=inputs.input_features, 
            attention_mask=torch.ones(inputs.input_features.shape[:2], device=inputs.input_features.device),
            output_hidden_states=True)
        
        # feats = whisper_outputs['hidden_states'][-1
        # print(whisper_outputs["hidden_states"].shape)
        # print(feats.shape)
        # Pool features
        pooled_features = whisper_outputs.last_hidden_state.mean(dim=1)
        
        # Classify emotions
        return self.classifier(pooled_features)

# Example usage
def main():
    # Initialize model
    model = WhisperAudioClassifier(num_emotions=7)
    
    # Create a dummy audio input (numpy array)
    dummy_audio = np.random.randn(16000).astype(np.float32)  # 1 second of audio
    
    # Forward pass
    outputs = model(dummy_audio)
    print(outputs.shape)  # Should output (batch_size, num_emotions)

In [29]:
# Initialize model
model = WhisperAudioClassifier(num_emotions=7)

# Create a dummy audio input (numpy array)
dummy_audio = np.random.randn(16000).astype(np.float32)  # 1 second of audio

# Forward pass
outputs = model(dummy_audio)
print(outputs.shape)  # Should output (batch_size, num_emotions)

(16000,)
torch.Size([1, 80, 3000])
torch.Size([1, 7])


In [30]:
model


WhisperAudioClassifier(
  (whisper): PeftModelForFeatureExtraction(
    (base_model): LoraModel(
      (model): WhisperModel(
        (encoder): WhisperEncoder(
          (conv1): Conv1d(80, 512, kernel_size=(3,), stride=(1,), padding=(1,))
          (conv2): Conv1d(512, 512, kernel_size=(3,), stride=(2,), padding=(1,))
          (embed_positions): Embedding(1500, 512)
          (layers): ModuleList(
            (0-5): 6 x WhisperEncoderLayer(
              (self_attn): WhisperSdpaAttention(
                (k_proj): Linear(in_features=512, out_features=512, bias=False)
                (v_proj): lora.Linear(
                  (base_layer): Linear(in_features=512, out_features=512, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=512, out_features=16, bias=False)
                  )
                  (lora_B): 