In [1]:
import torch

from Modules import LoadingModule
from Modules import Features_encoder
from Modules import quantizationModule
from Modules import wav2vec_transformer
from Modules import ContrastiveLoss

from Modules import TempLibriSpeech

In [2]:
%pip install pytorch_lightning

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
print("helloworld")

helloworld


In [4]:
#data loader module init
StandardScalerTransform = LoadingModule.StandardScalerTransform
LargeDataModule = LoadingModule.LargeDataModule("./data/Librispeech", batch_size=16, num_workers=1, transform=StandardScalerTransform)


In [5]:
#Temp import dataloader ### rendre compatible PLightning quand on aura le GPU
# en attendant import manuel
from torch.utils.data import DataLoader



dataset = TempLibriSpeech.LibriSpeech(split="test-clean", target_length=48000, device='cpu')
data_loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

In [6]:
for i, (audio, text) in enumerate(data_loader):
    print(f"Exemple {i+1}")
    print(f"Audio shape: {audio.shape}")
    print(f"Texte: {text}")
    print("-" * 50)
    if i == 1: 
        break

Exemple 1
Audio shape: torch.Size([16, 48000])
Texte: ('AND ONE MORE THIS MORNING', 'IN THE NATURE OF THINGS LUXURIES AND THE COMFORTS OF LIFE BELONG TO THE LEISURE CLASS', 'THE ORCHARD WAS SPARKLING AND RIPPLING IN THE SUN', 'THE STRONG POSITION HELD BY THE EDISON SYSTEM UNDER THE STRENUOUS COMPETITION THAT WAS ALREADY SPRINGING UP WAS ENORMOUSLY IMPROVED BY THE INTRODUCTION OF THE THREE WIRE SYSTEM AND IT GAVE AN IMMEDIATE IMPETUS TO INCANDESCENT LIGHTING', 'SO THERE CAME A STEP AND A LITTLE RUSTLING OF FEMININE DRAPERIES THE SMALL DOOR OPENED AND RACHEL ENTERED WITH HER HAND EXTENDED AND A PALE SMILE OF WELCOME', 'THE EUROPE THEY HAD COME FROM LAY OUT THERE BEYOND THE IRISH SEA EUROPE OF STRANGE TONGUES AND VALLEYED AND WOODBEGIRT AND CITADELLED AND OF ENTRENCHED AND MARSHALLED RACES', 'COME AND GET THE BOOLOOROO SHE SAID GOING TOWARD THE BENCHES', 'THEIR MASTERS SAID MISSUS NEVERBEND', 'PEARL SEEING THE ROSE BUSHES BEGAN TO CRY FOR A RED ROSE AND WOULD NOT BE PACIFIED', 'THIS HAS I

In [7]:
### Model dev ###

In [8]:
import torch
import torch.nn as nn

class Model_W2V(nn.Module):
    def __init__(self):

        #EAB
        self.batch_size = 16
        seq_length = 49 # In the paper the output of the encoder block has a frequency of 49Hz, we are assuming that each input is a 1s input 
        embed_size = 64
        self.mask_prob = 0.15
        self.mask_length = 10
        num_heads = 8
        dropout = 0.1
        forward_expansion = 4
        kernel_size = 31
        groups = 16

        self.latent_reps = torch.rand(self.batch_size, seq_length, embed_size)
        

        super(Model_W2V, self).__init__()

        self.FeaturesEncoder = Features_encoder.FeatureEncoder(input_channels=1, feature_dim=512)

        self.masking = wav2vec_transformer.MaskingWithLearnableEmbedding(embed_size)
        
        self.TranformerBlock = wav2vec_transformer.TransformerBlockW(embed_size, num_heads, dropout, forward_expansion, kernel_size, groups)

        self.quantization = quantizationModule.QuantizationModule(num_codebooks=2, num_codes=256)

        self.LossItem = ContrastiveLoss.LossW2V(1)

    def forward(self, x):

        x = x.unsqueeze(1)
        print(x.shape)
        x = self.FeaturesEncoder(x)
        print(x.shape)
        quantized_repr = self.quantization(x)

        masked_reps, mask = self.masking(x, self.mask_prob, self.mask_length)
        contextualized_reps = self.TranformerBlock(masked_reps, masked_reps, masked_reps, None)



        loss = self.LossItem.compute_loss(contextualized_reps, quantized_repr, mask, self.batch_size)

        
        return x, loss
    

In [9]:
import torch
from torch.utils.data import DataLoader
import torch.optim as optim
from tqdm import tqdm

def train_model(model, dataset, epochs, learning_rate, device):

    dataloader = DataLoader(dataset, batch_size=model.batch_size, shuffle=True)
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    model.train()
    for epoch in range(epochs):
        epoch_loss = 0
        for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}"):
            inputs, _ = batch 
            optimizer.zero_grad()
            _, loss = model(inputs)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss / len(dataloader)}")


In [10]:
model = Model_W2V()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_model(model, dataset, epochs=1, learning_rate=1e-4, device=device)


Epoch 1/1:   0%|          | 0/164 [00:00<?, ?it/s]

torch.Size([16, 1, 48000])
torch.Size([16, 4801, 512])
torch.Size([16, 601, 512])


Epoch 1/1:   0%|          | 0/164 [00:01<?, ?it/s]

torch.Size([16, 151, 512])
torch.Size([16, 151, 512])
la 2 256





RuntimeError: The expanded size of the tensor (512) must match the existing size (64) at non-singleton dimension 1.  Target sizes: [10, 512].  Tensor sizes: [64]

In [11]:
device

device(type='cpu')