In [4]:
!mkdir data

In [5]:
%%capture
 #!cat /content/drive/MyDrive/senior_sound/celeb1/vox1_dev* > /content/vox1_dev_wav.zip
!unzip /content/drive/MyDrive/senior_sound/celeb1/vox1_dev_wav.zip -d /content/data

In [None]:
!wget https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_small_960h.pt

In [None]:
!pip install transformers
!pip install adabelief-pytorch==0.2.0
!pip install ranger-adabelief==0.1.0

#!pip install torchtext#==0.8.1
#==0.7.0
!pip install torchaudio
#==1.2.2
!pip install pytorch-lightning
#==3.4.0
!pip install comet-ml

In [15]:
"""
import os
path = "/content/data/wav"

speakers = os.listdir(path)
speakers_file_list = {}
utterences = 0
speakers_number = len(speakers)
for i in range(len(speakers)):
  id_path = os.path.join(path, speakers[i])
  speakers_file_list[speakers[i]] = getListOfFiles(id_path)
  utterences += len(speakers_file_list[speakers[i]])

listsize = []
lenutterences = []
samplerates = []
all_audio = []

for key, values in speakers_file_list.items():
  listsize.append(len(values))
  xrate = 0
  secsize = 0
  for value in values:
    sample_rate, audio = wavfile.read(value)
    all_audio.append(audio.shape[0])
    secsize += audio.shape[0] / sample_rate
    if xrate == 0:
      xrate = sample_rate
    else:
      if sample_rate != xrate:
        print(values)
        print(value)
        break
  samplerates.append(sample_rate)
  lenutterences.append(secsize)


print(min(lenutterences), lenutterences.index(min(lenutterences)), lenutterences[161]) # (267.88281249999994, 726, 361.04281249999997)
print(min(listsize), listsize.index(min(listsize)), listsize[726]) # (45, 161, 45)

all_audio2 = np.array(all_audio)
new_l = np.around(all_audio2/sample_rate, decimals=1)

import matplotlib.pyplot as plt
plt.hist(new_l, bins=np.arange(new_l.min(), new_l.max()+1))

"""

In [10]:
from comet_ml import Experiment
from pytorch_lightning.loggers import CometLogger
#from pytorch_model_summary import summary
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import LearningRateMonitor

import pytorch_lightning as pl
from pytorch_lightning import seed_everything

from scipy.io import wavfile
import os
import numpy as np
import soundfile as sf
from torch.utils.data import Dataset, DataLoader
import random
import torch
from transformers import Wav2Vec2PreTrainedModel, Wav2Vec2Model, Wav2Vec2Processor

from ge2e import GE2ELoss

# First CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")


def seed_e(seed_value):
  """Seed every random number generator this notebook relies on.

  Calls Lightning's ``seed_everything`` and then seeds ``random``, NumPy and
  torch (CPU and CUDA) explicitly with the same value. Finally sets the cuDNN
  ``deterministic`` flag and switches cuDNN ``benchmark`` off.

  seed_value: seed passed unchanged to each seeding call.
  """
  seed_everything(seed_value)

  explicit_seeders = (
      random.seed,
      np.random.seed,
      torch.manual_seed,
      torch.cuda.manual_seed,
      torch.cuda.manual_seed_all,
  )
  for apply_seed in explicit_seeders:
    apply_seed(seed_value)

  cudnn_backend = torch.backends.cudnn
  cudnn_backend.deterministic = True
  cudnn_backend.benchmark = False

In [11]:
def getListOfFiles(dirName):
    """Recursively collect the paths of all non-directory entries below ``dirName``.

    Entries are visited in sorted name order at every directory level, so the
    returned list is the same on every run and machine (``os.listdir`` on its
    own returns entries in arbitrary order, which would make seeded sampling
    from this list non-reproducible).

    dirName: directory to scan.
    Returns: list of paths, each formed by joining ``dirName`` with the names
        of the entries below it. Empty when the directory holds no files.
    """
    allFiles = []
    for entry in sorted(os.listdir(dirName)):
        fullPath = os.path.join(dirName, entry)
        if os.path.isdir(fullPath):
            # extend() grows the list in place; `allFiles = allFiles + ...`
            # copied the whole list once per sub-directory.
            allFiles.extend(getListOfFiles(fullPath))
        else:
            allFiles.append(fullPath)
    return allFiles

In [29]:
class VoxCeleb(Dataset):
    """Speaker-level dataset over a directory that holds one sub-directory per speaker.

    One item is a group of ``utterences_per_speaker`` audio clips that all belong
    to the same speaker, plus a parallel list repeating that speaker's id.
    ``len(dataset)`` is the number of speakers, not the number of audio files.
    """

    def __init__(self, path, sampling_rate = 16000, max_seconds = 8, utterences_per_speaker = 2, full_data = True, window_size = None, step_size = None, 
                 shuffle=True): # path = "/content/data/wav/"
      """Index every audio file below ``path``, grouped by speaker directory.

      path: root directory; each sub-directory name is used as a speaker id.
      sampling_rate: sample rate every file must have (checked in ``read_audio``).
      max_seconds: clips longer than this are randomly cropped to this length.
      utterences_per_speaker: number of clips (M) returned per item.
      full_data: only ``True`` is implemented; ``__getitem__`` raises otherwise.
      window_size, step_size: stored; ``step_size`` is only used by the sanity
          assertion below when ``full_data`` is False.
      shuffle: when True, ``__getitem__`` ignores its index and draws a random speaker.
      """
      # NOTE(review): the limits 45 and 267 presumably are the smallest number of
      # utterances / seconds of any speaker in VoxCeleb1 dev — confirm.
      if full_data:
        assert utterences_per_speaker < 45 # do max 40
      else:
        # Without this guard a missing step_size fails with an opaque TypeError.
        assert step_size, "step_size must be a positive number when full_data is False"
        assert utterences_per_speaker < (267/step_size) - 1 # do max 40 if step size = 2 sec and window = 4 sec

      self.sampling_rate = sampling_rate
      self.max_seconds = max_seconds

      self.M = utterences_per_speaker
      self.full_data = full_data

      self.window_size = window_size
      self.step_size = step_size

      self.shuffle = shuffle

      # Sorted so that index -> speaker is stable across runs (os.listdir order is
      # arbitrary); stray non-directory entries are skipped instead of being treated
      # as speakers.
      self.speakers = sorted(
          entry for entry in os.listdir(path)
          if os.path.isdir(os.path.join(path, entry))
      )
      self.speakers_number = len(self.speakers)
      self.speakers_file_list = {
          speaker: getListOfFiles(os.path.join(path, speaker))
          for speaker in self.speakers
      }
      self.utterences = sum(len(files) for files in self.speakers_file_list.values())

    def sample_audio(self, audio):
      """Return ``audio`` cropped to at most ``max_seconds``, using a random window.

      audio: array whose first axis is time (only ``shape[0]`` and slicing are used).
      Returns the input object unchanged when it is not longer than ``max_seconds``.
      """
      max_samples = int(self.sampling_rate * self.max_seconds)
      seconds = audio.shape[0] / self.sampling_rate
      if seconds > self.max_seconds:
        # randint is inclusive on both ends, so the window that ends exactly at the
        # last sample can be chosen too (the previous range() excluded it).
        start_audio = random.randint(0, audio.shape[0] - max_samples)
        audio = audio[start_audio:start_audio + max_samples]
      return audio

    def read_audio(self, path):
      """Load one file with soundfile, verify its sample rate and crop it.

      Raises Exception when the file's sample rate differs from ``self.sampling_rate``.
      """
      audio, sr = sf.read(path)
      if sr != self.sampling_rate:
        print(path)
        raise Exception("sampling rate broken")
      audio = self.sample_audio(audio)
      return audio

    def __len__(self):
      """Number of speakers."""
      return self.speakers_number

    def __getitem__(self, idx):
      """Return ``(clips, speaker_ids)`` for one speaker.

      clips: list of ``self.M`` audio arrays drawn without replacement from the speaker's files.
      speaker_ids: the speaker id repeated once per clip.
      Raises Exception when ``full_data`` is False (not implemented).
      """
      # NOTE(review): with shuffle=True `idx` is ignored. Index-based subsets (such as
      # the random_split in Dataset_vox.setup) therefore do not restrict which speakers
      # are drawn, and one batch can contain the same speaker more than once.
      if self.shuffle:
        selected_speaker = random.sample(self.speakers, 1)[0]  # select random speaker
      else:
        selected_speaker = self.speakers[idx]

      if not self.full_data:
        raise Exception("Only full data avaulable now")

      chosen_paths = random.sample(self.speakers_file_list[selected_speaker], self.M)
      list_of_audio = [self.read_audio(audio_path) for audio_path in chosen_paths]
      return list_of_audio, [selected_speaker]*len(list_of_audio)

In [30]:
def collate_fn_vox(batch, processor, sampling_rate, max_length = None):
  """Flatten a batch of per-speaker clip groups and run it through ``processor``.

  batch: iterable of ``(clips, speaker_ids)`` pairs, one pair per speaker group.
  processor: callable invoked with the flat clip list and the padding / attention-mask
      keyword arguments below; its result must expose ``input_values`` and
      ``attention_mask``.
  sampling_rate, max_length: forwarded to ``processor``.

  Returns ``(input_values, attention_mask, speakers)``. ``speakers`` holds one id per
  clip and, as its LAST element, the number of groups in the batch.
  """
  group_count = len(batch)

  flat_clips, flat_ids = [], []
  for clips, clip_ids in batch:
    flat_clips.extend(clips)
    flat_ids.extend(clip_ids)

  encoded = processor(
      flat_clips,
      padding = True,
      max_length = max_length,
      return_attention_mask = True,
      sampling_rate = sampling_rate,
      return_tensors="pt",
  )

  # The group count travels with the ids as a trailing element.
  flat_ids.append(group_count)
  return encoded.input_values, encoded.attention_mask, flat_ids

In [31]:
class Dataset_vox(pl.LightningDataModule):
    """Lightning data module that wraps ``VoxCeleb`` and splits it 90% / 10% into train / val."""

    def __init__(self, conf, *args, **kwargs): #*args, **kwargs hparams, steps_per_epoch
      """conf: dict of data parameters; the keys read by this class are looked up via ``self.hparams``."""
      super().__init__()
      # NOTE(review): newer pytorch-lightning releases may expose `hparams` as a
      # read-only property on data modules; confirm against the installed version.
      self.hparams = conf
      self.processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

    def prepare_data(self):
      """Placeholder hook; only prints a reminder."""
      print("can add download here")

    def setup(self, stage=None):
      """Build the dataset and its train / validation split.

      stage: accepted because Lightning invokes this hook with a stage argument;
          the value is not used. Calling ``setup()`` without arguments still works.
      """
      dataset = VoxCeleb(self.hparams["path"], sampling_rate=self.hparams["sampling_rate"], max_seconds=self.hparams["max_seconds"], 
                         utterences_per_speaker=self.hparams["utterences_per_speaker"], full_data=self.hparams["full_data"], 
                         window_size=self.hparams["window_size"], step_size=self.hparams["step_size"], shuffle=self.hparams["shuffle_speakers"])

      size_of_main = len(dataset)
      train_size = int(size_of_main*0.9)
      # Fixed generator seed so the split is the same on every call.
      self.dataset_train, self.dataset_val = torch.utils.data.random_split(dataset, 
                                              [train_size, size_of_main - train_size], 
                                              generator=torch.Generator().manual_seed(42))
      self.dataset_test = None

    def _make_loader(self, dataset, shuffle):
      """Build a DataLoader whose batch size is the number of speakers per batch."""
      return DataLoader(dataset, batch_size=self.hparams["number_of_speakers"], num_workers=self.hparams["num_workers"], 
                        shuffle=shuffle, 
                        collate_fn = lambda x:collate_fn_vox(x, self.processor, self.hparams["sampling_rate"], max_length = self.hparams["max_length"])
                        )

    def train_dataloader(self):
      """Training loader; shuffling is controlled by ``dataloader_shuffle``."""
      return self._make_loader(self.dataset_train, self.hparams["dataloader_shuffle"])

    def val_dataloader(self):
      """Validation loader; never shuffled."""
      return self._make_loader(self.dataset_val, False)

    def test_dataloader(self):
      """Returns ``self.dataset_test``, which ``setup`` sets to None (no test split yet)."""
      test = self.dataset_test
      return test

In [32]:
class Voice_Encoder_pl(pl.LightningModule):
    """Speaker encoder: pretrained wav2vec 2.0 features, a small projection head, GE2E loss."""

    def __init__(self, re_dict, *args, **kwargs): #*args, **kwargs hparams, steps_per_epoch
        """re_dict: dict with at least the keys ``"model_params"`` and ``"training"``.

        Extra keyword arguments (e.g. ``steps_per_epoch``) are expected to end up in
        ``self.hparams`` through ``save_hyperparameters()``.
        """
        super().__init__()
        self.save_hyperparameters(re_dict)
        self.save_hyperparameters()

        self.model_params = self.hparams["model_params"]
        self.learning_params = self.hparams["training"]

        #self.swa_model = None
        #self.swa_mode = False

        #print("mixup set: ", self.learning_params["mixup"])
        #if self.learning_params["data_dropout"]:
        #    print("data_dropout activated")
        #    self.time_drop = torchaudio.transforms.TimeMasking(time_mask_param=self.learning_params["time_l"])
        # self.check_random_mixup = False
        self.feature_extractor = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
        # `torch.nn` is referenced through the imported `torch` package: the notebook
        # never imports a bare `nn`, so `nn.Linear` raised NameError on a fresh kernel.
        # The input width is read from the loaded model's config (768 for this checkpoint).
        feature_dim = self.feature_extractor.config.hidden_size
        self.fc1 = torch.nn.Linear(in_features = feature_dim, out_features = self.model_params["fc1_dim"])
        self.avpool = torch.nn.AvgPool1d(5, stride=3)
        self.fc2 = torch.nn.Linear(in_features = self.model_params["fc1_dim"], out_features = self.model_params["fc2_dim"])
        self.fc3 = torch.nn.Linear(in_features = self.model_params["fc2_dim"], out_features = self.model_params["embeding"])

        self.criterion = GE2ELoss(init_w=10.0, init_b=-5.0, loss_method='softmax')

    def forward(self, audio, attention_mask):
        """Map a padded batch of waveforms to one embedding per waveform.

        audio, attention_mask: passed positionally to the wav2vec 2.0 model.
        Returns a tensor of shape (batch, model_params["embeding"]).
        """
        start_epoch = self.learning_params["start_learning_feature_epoch"]
        # The extractor is frozen when "block" is set, or until the configured start
        # epoch. A start epoch of None means "no warm-up freeze" (it used to raise a
        # TypeError in the comparison).
        frozen = self.learning_params["block"] or (
            start_epoch is not None and self.current_epoch < start_epoch
        )
        if frozen:
            self.feature_extractor.eval()
            with torch.no_grad():
                hidden = self.feature_extractor(audio, attention_mask).last_hidden_state
        else:
            # Follow the module's own mode so that validation does not run the
            # extractor in train mode.
            self.feature_extractor.train(self.training)
            hidden = self.feature_extractor(audio, attention_mask).last_hidden_state

        hidden = self.fc1(hidden)
        # AvgPool1d pools over the last axis, so move time there and back.
        hidden = hidden.transpose(1,2).contiguous()
        hidden = self.avpool(hidden)
        hidden = hidden.transpose(1,2).contiguous()
        hidden = self.fc2(hidden)
        # Collapse the time axis (dim 1) by summation.
        hidden = torch.sum(hidden, dim = 1)
        hidden = self.fc3(hidden)

        return hidden

    def configure_optimizers(self):
        """Build the optimizer named by ``training["optimizer"]`` and, if ``add_sch`` is set, a OneCycleLR schedule.

        Raises ValueError for an unknown optimizer name.
        """
        optimizer_name = self.learning_params["optimizer"]
        if optimizer_name == "belief":
            # Imported here because the notebook installs the package but never imports it.
            from adabelief_pytorch import AdaBelief
            optimizer =  AdaBelief(self.parameters(), lr = self.learning_params["lr"], eps = self.learning_params["eplison_belief"],
                                    weight_decouple = self.learning_params["weight_decouple"], 
                                    weight_decay = self.learning_params["weight_decay"], rectify = self.learning_params["rectify"])
        elif optimizer_name == "ranger_belief":
            from ranger_adabelief import RangerAdaBelief
            optimizer = RangerAdaBelief(self.parameters(), lr = self.learning_params["lr"], eps = self.learning_params["eplison_belief"],
                                       weight_decouple = self.learning_params["weight_decouple"],  weight_decay = self.learning_params["weight_decay"],)
        elif optimizer_name == "adam":
            optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_params["lr"])
        elif optimizer_name == "adamW":
            optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_params["lr"])
        else:
            # Previously an unknown name surfaced later as an UnboundLocalError.
            raise ValueError(
                f"Unknown optimizer {optimizer_name!r}; expected 'belief', 'ranger_belief', 'adam' or 'adamW'"
            )

        if self.learning_params["add_sch"]:
            #CosineScheduler(20, warmup_steps=5, base_lr=0.3, final_lr=0.01)
            #MultiStepLR(trainer, milestones=[15, 30], gamma=0.5)
            lr_scheduler = {'scheduler': torch.optim.lr_scheduler.OneCycleLR(optimizer,
                                                                            max_lr=self.learning_params["lr"],
                                                                            steps_per_epoch=self.hparams.steps_per_epoch, #int(len(train_loader))
                                                                            epochs=self.learning_params["epochs"],
                                                                            anneal_strategy='linear'),
                        'name': 'lr_scheduler_lr',
                        'interval': 'step', # or 'epoch'
                        'frequency': 1,
                        }
            print("sch added")
            return [optimizer], [lr_scheduler]
        return optimizer

    def loss_function(self, x, speakers):
        """GE2E loss over a flat batch of embeddings.

        x: 2-D tensor (N*M, D) of embeddings.
        speakers: list whose LAST element is N, the number of speakers in the batch.
        """
        # N, M, D: N - Number of speakers in a batch, M - Number of utterances for each speaker, D - d-vector
        _, d = x.shape
        speakers_number = speakers[-1] # N - Number of speakers in a batch, 

        x = x.view(speakers_number, -1, d)
        loss = self.criterion(x)

        return loss

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        #also Manual optimization exist
        x, mask, speakers = batch
        output = self(x, mask)
        loss = self.loss_function(output, speakers)
        self.log('train_loss', loss, on_step=True, on_epoch=True, logger=True) # prog_bar=True
        return loss

    #copied
    def get_lr_inside(self, optimizer):
        """Return the learning rate of the optimizer's first parameter group."""
        for param_group in optimizer.param_groups:
            return param_group['lr']

    def training_epoch_end(self, outputs):
        """Log the current epoch index and learning rate once per epoch."""
        self.log('epoch_now', self.current_epoch, on_step=False, on_epoch=True, logger=True)
        current_optimizer = self.optimizers(use_pl_optimizer=True)
        self.log('lr_now', self.get_lr_inside(current_optimizer), on_step=False, on_epoch=True, logger=True)

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch."""
        x, mask, speakers = batch
        output = self(x, mask)
        loss = self.loss_function(output, speakers)

        self.log('val_loss', loss, on_step=False, on_epoch=True, logger=True) #prog_bar=True,
        return {'val_loss': loss}

"""
    def test_step(self, batch, batch_idx):
        
        x, mask, speakers = batch
        output = self(x, mask)
        loss = self.loss_function(output, speakers)

        return {'test_loss': loss, #!!!!!!!!!!


    def test_epoch_end(self, outputs):
        avg_loss = torch.stack([x['test_loss'] for x in outputs]).mean()

        self.log('test_f1_score_weighted', f1_scored_w, on_step=False, on_epoch=True, logger=True) #prog_bar=True,
        self.log('test_f1_score_macro', f1_scored_m, on_step=False, on_epoch=True,  logger=True) #prog_bar=True,

"""

"\n    def test_step(self, batch, batch_idx):\n        \n        x, mask, speakers = batch\n        output = self(x, mask)\n        loss = self.loss_function(output, speakers)\n\n        return {'test_loss': loss, #!!!!!!!!!!\n\n\n    def test_epoch_end(self, outputs):\n        avg_loss = torch.stack([x['test_loss'] for x in outputs]).mean()\n\n        self.log('test_f1_score_weighted', f1_scored_w, on_step=False, on_epoch=True, logger=True) #prog_bar=True,\n        self.log('test_f1_score_macro', f1_scored_m, on_step=False, on_epoch=True,  logger=True) #prog_bar=True,\n\n"

In [33]:
# ---- data: consumed by Dataset_vox / VoxCeleb --------------------------------
data_params = {
    "path": "/content/data/wav/",       # root with one sub-directory per speaker
    "sampling_rate": 16_000,            # every file must have this rate
    "max_seconds": 8,                   # longer clips are randomly cropped
    "max_length": None,                 # forwarded to the processor
    "utterences_per_speaker": 5,        # M
    "full_data": True,                  # only True is implemented
    "window_size": None,
    "step_size": None,
    "shuffle_speakers": True,           # dataset draws a random speaker per item
    "number_of_speakers": 10,           # N, used as the DataLoader batch size
    "num_workers": 2,
    "dataloader_shuffle": True,
}

# ---- model: layer widths of the projection head ------------------------------
model_params = {
    "fc1_dim": 512,
    "fc2_dim": 512,
    "embeding": 256,
}

# ---- training ----------------------------------------------------------------
learning_params = {
    "block": True,                          # keep the feature extractor frozen
    "start_learning_feature_epoch": None,

    "optimizer": "belief",                  # "belief", "ranger_belief", "adam", adamW
    "lr": 3e-4,
    "eplison_belief": 1e-16,
    "beta": [0.9, 0.999],                   # not used
    "weight_decouple": True,
    "weight_decay": 1e-4,
    "rectify": True,

    "add_sch": False,                       # add the OneCycleLR schedule

    "epochs": 10,
}

# Everything bundled under the keys the modules look up.
hparams_encoder = {
    "model_params": model_params,
    "training": learning_params,
    "data_params": data_params,
}

In [34]:
# Smoke test of the data pipeline: build the data module, pull ONE training batch
# and print it together with the tensor shapes.
dataset_pl = Dataset_vox(hparams_encoder["data_params"])
dataset_pl.prepare_data()
dataset_pl.setup()

for batch in dataset_pl.train_dataloader():
  # `x` and `mask` stay defined as notebook globals after this cell; the forward
  # smoke-test cell further down reuses them.
  x, mask, speakers = batch
  print(x, mask, speakers)
  print(x.shape, mask.shape)
  break  # one batch is enough
# Drop the data module; the training cell builds its own instance.
del dataset_pl

can add download here
tensor([[-0.4486, -0.5570, -0.5912,  ..., -0.0089,  0.2642,  0.3124],
        [ 0.6341,  0.6820,  0.5136,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0612,  0.2578,  0.1526,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [ 3.0502,  2.3039,  1.6878,  ...,  0.0000,  0.0000,  0.0000],
        [ 5.0606,  5.7087,  4.4404,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0482,  0.0406,  0.0335,  ..., -0.2669, -0.2357, -0.2463]]) tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]]) ['id10656', 'id10656', 'id10656', 'id10656', 'id10656', 'id10581', 'id10581', 'id10581', 'id10581', 'id10581', 'id10677', 'id10677', 'id10677', 'id10677', 'id10677', 'id10535', 'id10535', 'id10535', 'id10535', 'id10535', 'id11015', 'id11015', 'id11015', 'id11015', 'id11015', 'id10435', 'id10435', 'id10435', 'id10435', 'id104

In [35]:
# Build a throw-away model instance for the forward smoke test below.
# .copy() is shallow: the nested parameter dicts are still shared with hparams_encoder.
re_dict_check = hparams_encoder.copy()
model = Voice_Encoder_pl(re_dict_check) #Updated_Re_model(re_dict_updated) #Updated_old_Re_model(re_dict_check)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=843.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=377667514.0, style=ProgressStyle(descri…




Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
# Forward smoke test. Relies on `x` and `mask` left behind by the data smoke-test
# cell and on `model` from the previous cell. Prints the shape of the embeddings.
hidden = model.forward(x, mask)
print(hidden.shape)

torch.Size([50, 256])


In [None]:
# ---- Training run --------------------------------------------------------------
seed_v = 42
root_dir = "/content/drive/MyDrive/senior_sound/celeb1/encoder_weights"
naming = "encoder_try_1"

seed_e(seed_v)

comet_logger = CometLogger(
  save_dir='/content/log/',
  # Read the key from the environment instead of storing it in the notebook.
  # The key that used to be hard-coded here was published with the notebook and
  # should be revoked.
  api_key=os.environ["COMET_API_KEY"],
  project_name="encoder-voice",
  workspace="etzelkut",
  # rest_api_key=os.environ["COMET_REST_KEY"], # Optional
  experiment_name = naming, # Optional
)

dataset_pl = Dataset_vox(hparams_encoder["data_params"])
dataset_pl.prepare_data()
dataset_pl.setup()
# Number of optimizer steps per epoch; only needed by the OneCycleLR schedule.
steps_per_epoch = int(len(dataset_pl.train_dataloader()))
print(steps_per_epoch)
proj_a = Voice_Encoder_pl(hparams_encoder, steps_per_epoch = steps_per_epoch)

trainer = Trainer(#callbacks=[lr_monitor],
                    logger=comet_logger,
                    gpus=1,
                    profiler=True,
                    #auto_lr_find=True, #set hparams
                    #gradient_clip_val=0.5,
                    check_val_every_n_epoch=1,
                    #early_stop_callback=True,
                    # `re_dict` is not defined anywhere in the notebook; the epoch count
                    # lives in hparams_encoder.
                    max_epochs = hparams_encoder["training"]["epochs"],
                    progress_bar_refresh_rate = 0,
                    deterministic=True,)

# Create the checkpoint directory up front so a missing folder cannot make the
# final save fail after the whole training run has finished.
os.makedirs(root_dir, exist_ok=True)

trainer.fit(proj_a, dataset_pl)

checkpoint_name = os.path.join(root_dir, naming + '.ckpt')
trainer.save_checkpoint(checkpoint_name)

In [None]:
"""
from IPython.display import Audio
display(Audio(signal, rate=dataset.sampling_rate))
"""