In [4]:
import torch

from Modules import LoadingModule
from Modules import Features_encoder
from Modules import quantizationModule
from Modules import wav2vec_transformer
from Modules import ContrastiveLoss

from Modules import TempLibriSpeech

In [2]:
#data loader module init
StandardScalerTransform = LoadingModule.StandardScalerTransform
LargeDataModule = LoadingModule.LargeDataModule("./data/Librispeech", batch_size=16, num_workers=1, transform=StandardScalerTransform)


In [6]:
#Temp import dataloader ### rendre compatible PLightning quand on aura le GPU
# en attendant import manuel
from torch.utils.data import DataLoader



dataset = TempLibriSpeech.LibriSpeech(split="test-clean", target_length=480000, device='cpu')
data_loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

In [7]:
for i, (audio, text) in enumerate(data_loader):
    print(f"Exemple {i+1}")
    print(f"Audio shape: {audio.shape}")
    print(f"Texte: {text}")
    print("-" * 50)
    if i == 1: 
        break

Exemple 1
Audio shape: torch.Size([16, 480000])
Texte: ("MY LORD MISS MILNER'S TASTE IS NOT A DEPRAVED ONE IT IS BUT TOO REFINED", 'FOR IN THE TIMES BEFORE THE GREAT FLOOD ATHENS WAS THE GREATEST AND BEST OF CITIES AND DID THE NOBLEST DEEDS AND HAD THE BEST CONSTITUTION OF ANY UNDER THE FACE OF HEAVEN', 'PLEASE TELL ME ONE THING BARTLEY AT LEAST TELL ME THAT YOU BELIEVE I THOUGHT I WAS MAKING YOU HAPPY', 'EVEN SO I HAD JUST RETURNED FROM AN ARDUOUS JOURNEY EXHAUSTED AND BADLY NEEDING A REST', 'GOOD GRACIOUS HAS THE KING ANY RIGHT TO INTERFERE IN MATTERS OF THAT KIND', 'THERE ARE HOWEVER SEVERAL POINTS IN WHICH SUCH AN ACCOUNT OF RECOGNITION IS INADEQUATE TO BEGIN WITH IT MIGHT SEEM AT FIRST SIGHT MORE CORRECT TO DEFINE RECOGNITION AS I HAVE SEEN THIS BEFORE THAN AS THIS HAS EXISTED BEFORE', "HALF AN HOUR LATER TURNING A DEAF EAR TO ALL REMONSTRANCE HE GAVE THE PROPRIETORS UNTIL FIVE O'CLOCK TO REMOVE THEIR FAMILIES AND PERSONAL PROPERTY FROM THE FREE STATE HOTEL", 'ITS ORIGIN WAS SMALL

In [8]:
### Model dev ###

In [None]:
import torch
import torch.nn as nn

class Model_W2V(nn.Module):
    def __init__(self):

        #EAB
        self.batch_size = 16
        seq_length = 49 # In the paper the output of the encoder block has a frequency of 49Hz, we are assuming that each input is a 1s input 
        embed_size = 64
        self.mask_prob = 0.15
        self.mask_length = 10
        num_heads = 8
        dropout = 0.1
        forward_expansion = 4
        kernel_size = 31
        groups = 16

        self.latent_reps = torch.rand(self.batch_size, seq_length, embed_size)
        

        super(Model_W2V, self).__init__()

        self.FeaturesEncoder = Features_encoder.FeatureEncoder(input_channels=1, feature_dim=512)

        self.masking = wav2vec_transformer.MaskingWithLearnableEmbedding(embed_size)
        
        self.TranformerBlock = wav2vec_transformer.TransformerBlockW(embed_size, num_heads, dropout, forward_expansion, kernel_size, groups)

        self.quantization = quantizationModule.QuantizationModule()

        self.LossItem = ContrastiveLoss.LossW2V(1)

    def forward(self, x):

        x = x.unsqueeze(1)
        print(x.shape)
        x = self.FeaturesEncoder(x)
        print(x.shape)
        quantized_repr = self.quantization(x)

        masked_reps, mask = self.masking(x, self.mask_prob, self.mask_length)
        contextualized_reps = self.TranformerBlock(masked_reps, masked_reps, masked_reps, None)



        loss = self.LossItem.compute_loss(contextualized_reps, quantized_repr, mask, self.batch_size)

        
        return x, loss
    

In [24]:
import torch
from torch.utils.data import DataLoader
import torch.optim as optim
from tqdm import tqdm

def train_model(model, dataset, epochs, learning_rate, device):

    dataloader = DataLoader(dataset, batch_size=model.batch_size, shuffle=True)
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    model.train()
    for epoch in range(epochs):
        epoch_loss = 0
        for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}"):
            inputs, _ = batch 
            optimizer.zero_grad()
            _, loss = model(inputs)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss / len(dataloader)}")


In [25]:
model = Model_W2V()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_model(model, dataset, epochs=1, learning_rate=1e-4, device=device)


Epoch 1/1:   0%|          | 0/164 [00:00<?, ?it/s]

torch.Size([16, 1, 480000])


Epoch 1/1:   0%|          | 0/164 [00:04<?, ?it/s]


RuntimeError: Given normalized_shape=[512], expected input with shape [*, 512], but got input of size[16, 512, 96001]

In [None]:

# Exemple d'utilisation
batch_size = 16
sequence_length = 480000  # Longueur d'une séquence audio
x = torch.randn(batch_size, 1, sequence_length)  # (batch_size, 1, sequence_length)

# Instanciation de l'encodeur
feature_encoder = FeatureEncoder(input_channels=1, feature_dim=512)

# Passage avant
output = feature_encoder(x)
print(output.shape)  # Cela doit être (batch_size, feature_dim, reduced_sequence_length)
