In [16]:
from google.colab import drive
drive.mount('drive')

Mounted at drive


In [1]:
!pip install pytube

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
from pytube import YouTube
import os
import soundfile, librosa
import subprocess

# Loading metadata

In [None]:
metadata = pd.read_csv("/content/drive/MyDrive/Deep Learning project/data/metadata/Voice list - Dung.tsv", sep='\t', header=1)

In [None]:
metadata 

Unnamed: 0,Crawled,Voice Name,Playlist URL
0,False,Chu Văn Biên,https://www.youtube.com/watch?v=lb2K7RFaSWg&t=...
1,False,Vted-Đặng Thành Nam,"https://www.youtube.com/watch?v=TaFQlUVmeEM, h..."
2,False,Nguyễn Anh Phong,"https://www.youtube.com/watch?v=3C4SjX4NVDE, h..."
3,False,Cô Vũ Mai Phương Official,"https://www.youtube.com/watch?v=cqjn5XBamRE, h..."
4,False,Bi Huỳnh Senpai,"https://www.youtube.com/watch?v=CC5oWWQvZlk, h..."
5,False,Top Manga,"https://www.youtube.com/watch?v=GOoLS5r5q5k, h..."
6,False,Sakura Review,"https://www.youtube.com/watch?v=xraIbICYMYo, h..."
7,False,Góc Khán Đài (Hoang),"https://www.youtube.com/watch?v=Q0w9rr5JOYM, h..."
8,False,K team,"https://www.youtube.com/watch?v=3h-4nYHw4U0, h..."
9,False,Tạp Hóa Comic,"https://www.youtube.com/watch?v=B4jr-s7qRX4, h..."


In [None]:
metadata = metadata.dropna()

In [None]:
not_crawled_metadata = metadata[metadata["Crawled"] == False]
not_crawled_metadata

Unnamed: 0,Crawled,Voice Name,Playlist URL
0,False,Chu Văn Biên,https://www.youtube.com/watch?v=lb2K7RFaSWg&t=...
1,False,Vted-Đặng Thành Nam,"https://www.youtube.com/watch?v=TaFQlUVmeEM, h..."
2,False,Nguyễn Anh Phong,"https://www.youtube.com/watch?v=3C4SjX4NVDE, h..."
3,False,Cô Vũ Mai Phương Official,"https://www.youtube.com/watch?v=cqjn5XBamRE, h..."
4,False,Bi Huỳnh Senpai,"https://www.youtube.com/watch?v=CC5oWWQvZlk, h..."
5,False,Top Manga,"https://www.youtube.com/watch?v=GOoLS5r5q5k, h..."
6,False,Sakura Review,"https://www.youtube.com/watch?v=xraIbICYMYo, h..."
7,False,Góc Khán Đài (Hoang),"https://www.youtube.com/watch?v=Q0w9rr5JOYM, h..."
8,False,K team,"https://www.youtube.com/watch?v=3h-4nYHw4U0, h..."
9,False,Tạp Hóa Comic,"https://www.youtube.com/watch?v=B4jr-s7qRX4, h..."


In [None]:
prefix = 'd-'
metadata["Voice Name"] = metadata["Voice Name"].apply(lambda x: prefix + x.strip().replace(" ", "-"))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
metadata.head()

Unnamed: 0,Crawled,Voice Name,Playlist URL
0,False,d-Chu-Văn-Biên,https://www.youtube.com/watch?v=lb2K7RFaSWg&t=...
1,False,d-Vted-Đặng-Thành-Nam,"https://www.youtube.com/watch?v=TaFQlUVmeEM, h..."
2,False,d-Nguyễn-Anh-Phong,"https://www.youtube.com/watch?v=3C4SjX4NVDE, h..."
3,False,d-Cô-Vũ-Mai-Phương-Official,"https://www.youtube.com/watch?v=cqjn5XBamRE, h..."
4,False,d-Bi-Huỳnh-Senpai,"https://www.youtube.com/watch?v=CC5oWWQvZlk, h..."


# Downloading mp3 and converting to wav 

In [None]:
metadata_0 = metadata.iloc[10]
metadata_0

Crawled                                                     False
Voice Name                                            d-Bay-Anime
Playlist URL    https://www.youtube.com/watch?v=0ttXv8ZRrUg, h...
Name: 10, dtype: object

In [None]:
root_dir = "/content/drive/MyDrive/Deep Learning project/data/wav/"
save_dir = root_dir + metadata_0["Voice Name"] + '/'
save_dir

'/content/drive/MyDrive/Deep Learning project/data/wav/d-Bay-Anime/'

In [None]:
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

In [None]:
urls = [url.strip() for url in metadata_0["Playlist URL"].split(',')]
urls

['https://www.youtube.com/watch?v=0ttXv8ZRrUg',
 'https://www.youtube.com/watch?v=j_DK_8jKMIk',
 'https://www.youtube.com/watch?v=Du8P-USG7Ho']

In [None]:
def download_mp3(url: str, destination: str):
    yt = YouTube(url)
    
    # extract only audio
    audio = yt.streams.filter(only_audio=True).order_by("abr").desc().first()
    audio_name = audio.default_filename.split('.')[0]

    if os.path.exists(destination + audio_name + '.mp3') or os.path.exists(destination + audio_name + '.wav'):
        mp3_path = ""
    else:
        # download the file
        out_path = audio.download(output_path=destination)
        
        # save the file
        base, extension = os.path.splitext(out_path)
        mp3_path = base + '.mp3'
        os.rename(out_path, mp3_path)

    return mp3_path

In [None]:
def mp3_to_wav(mp3_path):
    wav_path = os.path.splitext(mp3_path)[0] + '.wav'
    subprocess.call(['ffmpeg', '-i', mp3_path, wav_path])

    return wav_path

In [None]:
def resample_wav(wav_path):
    y, sr = librosa.load(wav_path)       
    y_16k = librosa.resample(y, orig_sr=sr, target_sr=16000)
    y_mono = librosa.to_mono(y_16k)
    soundfile.write(wav_path, y_mono, 16000)

In [None]:
for url in urls:
    if not os.path.exists(save_dir + url):
        mp3_path = download_mp3(url, save_dir)
        if mp3_path != "":
            wav_path = mp3_to_wav(mp3_path)
            resample_wav(wav_path)
            os.remove(mp3_path)

# VAD

In [3]:
from torch import hub
from pprint import pprint
from glob import glob

In [None]:
wav_path = "/content/drive/MyDrive/Deep Learning project/data/wav/d-Bay-Anime/34 Sự Thật Zoro - Kiếm Sĩ Đầu Rêu Lạc Lối.wav"
sampling_rate = 16000

In [None]:
model, utils = hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad", force_reload=True)
get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks = utils

  "You are about to download and run code from an untrusted repository. In a future release, this won't "
Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to /root/.cache/torch/hub/master.zip


In [None]:
def vad(wav_path):
    j = 0
    wav = read_audio(wav_path, sampling_rate = sampling_rate)
    speech_timestamps = get_speech_timestamps(wav, model, threshold=0.5, sampling_rate=sampling_rate)

    wav_duration = 0
    k = 0   
    speech_timestamps_concat = []
    wav_concat = []

    while(True):
        wav_duration += speech_timestamps[k]['end'] - speech_timestamps[k]['start'] 
        speech_timestamps_concat.append(speech_timestamps[k])
            
        if k < len(speech_timestamps)-1:
            k += 1
            if wav_duration >= 3*16000:      
                wav_concat.append(collect_chunks(speech_timestamps_concat, wav))
                speech_timestamps_concat.clear()
                wav_duration = 0
                continue
            else:
                continue
        else:
            wav_concat.append(collect_chunks(speech_timestamps_concat, wav))
            speech_timestamps_concat.clear()
            wav_duration = 0
            break   

    wav_dir = os.path.splitext(wav_path)[0] + '/'
    if not os.path.exists(wav_dir):
        os.makedirs(wav_dir)

    for i in wav_concat:
        if not os.path.exists(wav_dir + str(j) + '.wav'):
            save_audio(wav_dir + str(j) + '.wav', i, sampling_rate = sampling_rate)
        j += 1 
    
    return wav_dir

In [None]:
def re_vad(wav_dir):
    files = glob(wav_dir + '*.wav')
    j = len(files) + 1

    for file in files:
        y, sr = soundfile.read(file)
        wav_length = len(y) / sr

        if wav_length >= 10:
            wav = read_audio(file, sampling_rate = sampling_rate)
            speech_timestamps = get_speech_timestamps(wav, model, threshold=0.9, sampling_rate=sampling_rate)
            wav_duration = 0
            k = 0   
            speech_timestamps_concat = []
            wav_concat = []

            if len(speech_timestamps) != 0:
                while(True):
                    wav_duration =  wav_duration + speech_timestamps[k]['end'] - speech_timestamps[k]['start']
                    
                    speech_timestamps_concat.append(speech_timestamps[k])
                    
                    if k < len(speech_timestamps)-1:
                        k += 1
                        if wav_duration >= 3*16000:      
                            wav_concat.append(collect_chunks(speech_timestamps_concat, wav))
                            speech_timestamps_concat.clear()
                            wav_duration = 0
                            continue
                        else:
                            continue
                    else:
                        wav_concat.append(collect_chunks(speech_timestamps_concat, wav))
                        speech_timestamps_concat.clear()
                        wav_duration = 0
                        break
                else:
                    continue   
                        
            for i in wav_concat:
                save_audio(wav_dir + str(j) + '.wav', i, sampling_rate = sampling_rate)
                j += 1 

In [None]:
def remove_wav(wav_dir: str):
    files = glob(wav_dir + '*.wav')
    k = 0

    for file in files:
        y, sr = soundfile.read(file)
        wav_length = len(y) / sr

        if wav_length < 3 or wav_length > 10:
            os.remove(file)
        else:
            new_file = os.path.join(file.rsplit('/', maxsplit=1)[0], f'{k}.wav')
            os.rename(file, new_file)
            k += 1

In [None]:
wav_dir = vad(wav_path)

In [None]:
re_vad(wav_dir)

In [None]:
remove_wav(wav_dir)

# Outlier removal

In [8]:
!pip install speechbrain

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting speechbrain
  Downloading speechbrain-0.5.13-py3-none-any.whl (498 kB)
[K     |████████████████████████████████| 498 kB 9.2 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 63.3 MB/s 
[?25hCollecting hyperpyyaml
  Downloading HyperPyYAML-1.0.1.tar.gz (14 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 58.9 MB/s 
Collecting ruamel.yaml>=0.17.8
  Downloading ruamel.yaml-0.17.21-py3-none-any.whl (109 kB)
[K     |████████████████████████████████| 109 kB 70.5 MB/s 
[?25hCollecting ruamel.yaml.clib>=0.2.6
  Downloading ruamel.yaml.clib-0.2.7-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (500 kB)
[K     |██████████

In [4]:
from speechbrain.pretrained import EncoderClassifier
from tqdm import tqdm
import torch
import torch.nn.functional as F
from scipy.spatial.distance import cdist
import numpy as np
from copy import deepcopy
import torchaudio

In [5]:
device = device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
model = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", run_opts={"device":device})

In [7]:
wav_dir = "/content/drive/MyDrive/Deep Learning project/data/wav/d-Bay-Anime/34 Sự Thật Zoro - Kiếm Sĩ Đầu Rêu Lạc Lối/"

In [8]:
files = glob(wav_dir + '*.wav')
embeddings = {}

In [9]:
for idx, file in tqdm(enumerate(files), total=len(files)):
    signal, _  = torchaudio.load(file)

    # Splited utterance matrix
    max_audio = 3*16000
    feats = []
    startframe = np.linspace(0, signal.shape[1] - max_audio, num=5)

    for asf in startframe:
        feat = signal[0, int(asf): int(asf) + max_audio]
        feats.append(feat)

    feats = np.stack(feats, axis = 0).astype('f')
    data = torch.FloatTensor(feats)

    # Speaker embeddings
    embedding_1 = model.encode_batch(signal.to(device))
    embedding_1 = F.normalize(embedding_1[0, 0, :], p=2, dim=0)
    embedding_2 = model.encode_batch(data.to(device))
    embedding_2 = F.normalize(embedding_2.permute(1, 0, 2)[0, :, :], p=2, dim=1)

    embeddings[file] = [embedding_1, embedding_2]

100%|██████████| 153/153 [00:12<00:00, 12.73it/s]


In [10]:
scores = np.array([ [0]*(len(files) - 1) for i in range(len(files)) ], dtype='f')

In [11]:
for i, file in tqdm(enumerate(files), total=len(files)):
    remaining_files = deepcopy(files)
    remaining_files.remove(file)
    embedding_11, embedding_12 = embeddings[file]

    for j in range(len(remaining_files)):
        embedding_21, embedding_22 = embeddings[remaining_files[j]]
        # Compute the scores
        score_1 = torch.mean(torch.matmul(embedding_11, embedding_21.T))
        score_2 = torch.mean(torch.matmul(embedding_12, embedding_22.T))
        score = (score_1 + score_2) / 2
        score = score.detach().cpu().numpy()
        scores[i, j] = score

  if __name__ == '__main__':
100%|██████████| 153/153 [00:02<00:00, 55.38it/s]


In [12]:
mean_scores = np.mean(scores, axis=1)

In [13]:
mean_scores

array([0.6656892 , 0.7261392 , 0.6321623 , 0.7185927 , 0.70443547,
       0.48974353, 0.6786146 , 0.7106064 , 0.6926557 , 0.642682  ,
       0.7404093 , 0.67249525, 0.72070444, 0.6927182 , 0.6723831 ,
       0.6982084 , 0.7075573 , 0.68033105, 0.65998816, 0.71396184,
       0.7115004 , 0.65340453, 0.6821719 , 0.7233439 , 0.60733676,
       0.7008173 , 0.65330535, 0.72005385, 0.71000177, 0.718649  ,
       0.7225804 , 0.6828216 , 0.65486485, 0.7284385 , 0.73714495,
       0.7106382 , 0.751422  , 0.7406618 , 0.67730755, 0.7408595 ,
       0.75995713, 0.6515819 , 0.746051  , 0.70010835, 0.68956214,
       0.66404635, 0.71837395, 0.69818795, 0.67489654, 0.6434561 ,
       0.6886773 , 0.68376464, 0.70892304, 0.7246654 , 0.7161688 ,
       0.69406986, 0.7274107 , 0.7179249 , 0.7304995 , 0.7124105 ,
       0.70473653, 0.6606819 , 0.7344688 , 0.72595775, 0.69527376,
       0.73522455, 0.7379015 , 0.7191775 , 0.7028798 , 0.6917282 ,
       0.7117623 , 0.6981435 , 0.6441128 , 0.7025701 , 0.68979

In [14]:
threshold = 0.45 #0.5

for i, file in tqdm(enumerate(files), total=len(files)):
    if mean_scores[i] < threshold:
        # print(mean_scores[i], file)
        os.remove(file)

100%|██████████| 153/153 [00:00<00:00, 227805.65it/s]
