In [1]:
!pip install speechbrain

Collecting speechbrain
  Downloading speechbrain-1.0.0-py3-none-any.whl (760 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m760.1/760.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting hyperpyyaml (from speechbrain)
  Downloading HyperPyYAML-1.2.2-py3-none-any.whl (16 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.9->speechbrain)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.9->speechbrain)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.9->speechbrain)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.9->speechbrain)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from 

In [2]:
%%capture
!pip install datasets -U
!pip install librosa
!pip install jiwer

In [None]:
from collections import defaultdict

import torch
import torch.nn as nn
import torch.optim as optim
from datasets import Audio
from speechbrain.inference.speaker import EncoderClassifier
from torch.utils.data import DataLoader
from torchvision.transforms import Compose
from transformers import WhisperForConditionalGeneration, WhisperProcessor, AutoFeatureExtractor, WhisperModel

In [None]:
from datasets import load_dataset, DatasetDict

# Load the English Common Voice 17.0 splits and store them in one DatasetDict,
# so the container printed at the end of the cell actually holds the data
# (previously it was created empty and never filled).
common_voice = DatasetDict(
    {
        "train": load_dataset("fsicoli/common_voice_17_0", "en", split="train"),
        "test": load_dataset("fsicoli/common_voice_17_0", "en", split="test"),
    }
)

# Per-split names used by the following cells.
common_voice_train = common_voice["train"]
common_voice_test = common_voice["test"]

print(common_voice)

In [None]:
## Shared input: one training clip at the sampling rate the models expect
# Both pretrained checkpoints used below are documented as 16 kHz models.
TARGET_SAMPLING_RATE = 16_000

# Re-declare the audio column so the clip is resampled to the target rate when
# it is decoded. Previously the clip was passed on at whatever rate the dataset
# stores it in, without any check.
# NOTE(review): Common Voice clips are usually not stored at 16 kHz - confirm.
sample_audio = common_voice_train.cast_column(
    "audio", Audio(sampling_rate=TARGET_SAMPLING_RATE)
)[0]["audio"]
signal = sample_audio["array"]
fs = sample_audio["sampling_rate"]

## ECAPA encoding
classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")
# Explicit float32 so the waveform dtype does not depend on the audio decoder.
embeddings = classifier.encode_batch(torch.as_tensor(signal, dtype=torch.float32))

## Whisper Encoding
model = WhisperModel.from_pretrained("openai/whisper-base")
feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-base")

# Passing the sampling rate lets the feature extractor verify that the audio
# matches the rate it was configured for instead of silently assuming it.
inputs = feature_extractor(signal, sampling_rate=fs, return_tensors="pt")
input_features = inputs.input_features
decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
# Only the encoder output is inspected here, so no autograd graph is needed.
with torch.no_grad():
    last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).encoder_last_hidden_state
list(last_hidden_state.shape)

In [None]:
class SpeechModel(nn.Module):
    """Two-branch model sketch with a next-word head and a speaker head.

    The same input is passed through a Whisper encoder branch and an ECAPA
    encoder branch. Each branch output is downsampled, the two results are
    concatenated along dimension 1, run through a Transformer encoder layer,
    and finally fed to two linear heads.

    NOTE(review): every layer below is constructed with a literal ``...``
    placeholder argument, so the class cannot be instantiated until real
    layer sizes are filled in.

    Args:
        whisper_model: Object whose ``encoder`` attribute is used as the
            Whisper branch.
        ecapa_model: Object whose ``encoder`` attribute is used as the
            ECAPA branch.
    """

    def __init__(self, whisper_model, ecapa_model):
        super().__init__()

        # Encoder front-ends, one per branch.
        self.whisper_encoder = whisper_model.encoder
        self.ecapa_encoder = ecapa_model.encoder

        # Per-branch downsampling (placeholder arguments).
        self.whisper_downsample = nn.Conv1d(...)
        self.ecapa_downsample = nn.Conv1d(...)

        # Transformer layers (placeholder arguments).
        # NOTE(review): `transformer_decoder` is created but never used in forward().
        self.transformer_encoder = nn.TransformerEncoderLayer(...)
        self.transformer_decoder = nn.TransformerDecoderLayer(...)

        # Task heads (placeholder arguments).
        self.next_word_prediction_head = nn.Linear(...)
        self.speaker_recognition_head = nn.Linear(...)

    def forward(self, x):
        """Run both branches on ``x`` and return the two head outputs.

        Args:
            x: Input handed unchanged to both encoders.

        Returns:
            Tuple ``(next_word_prediction, speaker_recognition)``.
        """
        branch_outputs = []
        # Whisper branch first, then ECAPA; each is encoded and then downsampled.
        for encode, downsample in (
            (self.whisper_encoder, self.whisper_downsample),
            (self.ecapa_encoder, self.ecapa_downsample),
        ):
            branch_outputs.append(downsample(encode(x)))

        # Fuse the branches along dimension 1 and run the shared Transformer layer.
        fused = self.transformer_encoder(torch.cat(branch_outputs, dim=1))

        # Both heads read the same fused representation.
        return self.next_word_prediction_head(fused), self.speaker_recognition_head(fused)

# Define model
# NOTE(review): OpenAIWhisperEncoder, ECAPATDNNModel, preprocess_function and
# CommonVoiceDataset are not defined in the cells shown here, and their
# arguments are `...` placeholders - they must be provided before this cell runs.
# This also rebinds `model`, which the encoding cell used for the Whisper model.
whisper_model = OpenAIWhisperEncoder(...)
ecapa_model = ECAPATDNNModel(...)
model = SpeechModel(whisper_model, ecapa_model)

# Define loss functions (one cross-entropy loss per task)
next_word_loss_function = nn.CrossEntropyLoss()
speaker_recognition_loss_function = nn.CrossEntropyLoss()

# Define optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Define data preprocessing
transform = Compose([preprocess_function])

# Load dataset
dataset = CommonVoiceDataset(root_dir='path_to_commonvoice_dataset', transform=transform)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    for batch in dataloader:
        optimizer.zero_grad()
        # The speaker loss needs its own labels. Previously only two values were
        # unpacked and `speaker_labels` was read without ever being assigned.
        # Assumes each batch is (inputs, next-word targets, speaker labels) -
        # TODO confirm once CommonVoiceDataset is implemented.
        inputs, targets, speaker_labels = batch
        next_word_prediction, speaker_recognition = model(inputs)

        # Calculate next-word loss
        next_word_loss = next_word_loss_function(next_word_prediction, targets)
        # Calculate speaker recognition loss
        speaker_recognition_loss = speaker_recognition_loss_function(speaker_recognition, speaker_labels)

        # Both tasks are weighted equally in the joint objective.
        total_loss = next_word_loss + speaker_recognition_loss

        # Backpropagation
        total_loss.backward()
        optimizer.step()
