In [1]:
import pandas as pd
from pytube import YouTube
import os
import soundfile, librosa
import subprocess

# Loading metadata

In [2]:
# Load the voice list; the file is tab-separated, which is read_table's default.
metadata = pd.read_table("../../data/metadata/Voice list - Dung.tsv")

In [3]:
# Preview the first rows of the voice list as loaded.
metadata.head(n=5)

Unnamed: 0,Crawled,Voice Name,Playlist URL
0,False,Chu Văn Biên,https://www.youtube.com/watch?v=TIIb183IHw8&t...
1,False,Đặng Thành Nam,"https://www.youtube.com/watch?v=rC-Trfxk78I, h..."
2,False,,
3,False,,
4,False,,


In [4]:
# Keep only fully populated rows: a row is dropped if any of its fields is missing.
metadata = metadata.dropna(axis="index", how="any")

In [5]:
# Preview again to confirm the incomplete rows were removed.
metadata.head(n=5)

Unnamed: 0,Crawled,Voice Name,Playlist URL
0,False,Chu Văn Biên,https://www.youtube.com/watch?v=TIIb183IHw8&t...
1,False,Đặng Thành Nam,"https://www.youtube.com/watch?v=rC-Trfxk78I, h..."


In [6]:
# Select the voices whose "Crawled" flag is False.
not_crawled_metadata = metadata.loc[metadata["Crawled"].eq(False)]
not_crawled_metadata

Unnamed: 0,Crawled,Voice Name,Playlist URL
0,False,Chu Văn Biên,https://www.youtube.com/watch?v=TIIb183IHw8&t...
1,False,Đặng Thành Nam,"https://www.youtube.com/watch?v=rC-Trfxk78I, h..."


In [7]:
# Turn each voice name into a directory-friendly id: trim it, replace spaces
# with dashes and put `prefix` in front.
prefix = 'd-'


def _to_voice_id(name, prefix=prefix):
    """Return `name` as a dash-separated id carrying `prefix` exactly once."""
    slug = name.strip().replace(" ", "-")
    # Re-running this cell must not stack prefixes ("d-d-..."), so only add
    # the prefix when it is not already there.
    return slug if slug.startswith(prefix) else prefix + slug


metadata["Voice Name"] = metadata["Voice Name"].apply(_to_voice_id)

In [8]:
# Check the rewritten "Voice Name" values.
metadata.head(n=5)

Unnamed: 0,Crawled,Voice Name,Playlist URL
0,False,d-Chu-Văn-Biên,https://www.youtube.com/watch?v=TIIb183IHw8&t...
1,False,d-Đặng-Thành-Nam,"https://www.youtube.com/watch?v=rC-Trfxk78I, h..."


# Downloading audio and converting to 16 kHz mono WAV

In [9]:
# Work with the first voice (first row, all columns) for the steps below.
metadata_0 = metadata.iloc[0, :]
metadata_0

Crawled                                                     False
Voice Name                                         d-Chu-Văn-Biên
Playlist URL     https://www.youtube.com/watch?v=TIIb183IHw8&t...
Name: 0, dtype: object

In [10]:
# Output folder for this voice. The path is assembled by plain string
# concatenation, so root_dir has to end with "/" and save_dir ends with "/" too.
root_dir = "../../data/wav/"
save_dir = "".join([root_dir, metadata_0["Voice Name"], "/"])
save_dir

'../../data/wav/d-Chu-Văn-Biên/'

In [11]:
# Create the output folder (and any missing parents). exist_ok=True makes the
# cell safe to re-run and avoids the check-then-create race of a separate
# os.path.exists() test.
os.makedirs(save_dir, exist_ok=True)

In [12]:
# The "Playlist URL" field holds one or more comma-separated video URLs.
# Blank entries (e.g. from a trailing comma) are dropped so that no empty
# string is handed to the downloader.
urls = [
    candidate.strip()
    for candidate in metadata_0["Playlist URL"].split(',')
    if candidate.strip()
]
urls

['https://www.youtube.com/watch?v=TIIb183IHw8&t=9s',
 'https://www.youtube.com/watch?v=Ngh995cseAo&t=300s',
 'https://www.youtube.com/watch?v=v1Y89MUh118',
 'https://www.youtube.com/watch?v=yATyswMXjg0']

In [39]:
def download_mp3(url: str, destination: str) -> str:
    """Download the audio-only stream of a YouTube video into `destination`.

    The stream picked is the first one after ordering the audio-only streams
    by ``abr`` in descending order. After the download only the file
    extension is changed to ``.mp3``; the audio data itself is not
    transcoded here.

    Args:
        url: URL of the video.
        destination: Directory the file is written to (with or without a
            trailing path separator).

    Returns:
        Path of the renamed ``.mp3`` file, or ``""`` when a ``.mp3`` or
        ``.wav`` file with the same base name already exists in
        `destination` (nothing is downloaded in that case).

    Raises:
        ValueError: If the video exposes no audio-only stream.
    """
    yt = YouTube(url)

    # extract only audio
    audio = yt.streams.filter(only_audio=True).order_by("abr").desc().first()
    if audio is None:
        raise ValueError("No audio-only stream available for {}".format(url))

    # Use splitext rather than split('.') so a title that itself contains
    # dots keeps its full name and matches the base name of the saved file.
    audio_name = os.path.splitext(audio.default_filename)[0]
    target_base = os.path.join(destination, audio_name)

    # Already downloaded (.mp3) or already converted (.wav): nothing to do.
    if os.path.exists(target_base + '.mp3') or os.path.exists(target_base + '.wav'):
        return ""

    # download the file
    out_path = audio.download(output_path=destination)

    # save the file under an .mp3 extension
    base, _ = os.path.splitext(out_path)
    mp3_path = base + '.mp3'
    os.rename(out_path, mp3_path)

    return mp3_path

In [33]:
def mp3_to_wav(mp3_path):
    """Convert an audio file to WAV by invoking the `ffmpeg` executable.

    The output is written next to the input, with the same base name and a
    ``.wav`` extension.

    Args:
        mp3_path: Path of the input audio file.

    Returns:
        Path of the WAV file that was written.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        FileNotFoundError: If the `ffmpeg` executable cannot be found.
    """
    wav_path = os.path.splitext(mp3_path)[0] + '.wav'
    # -y: overwrite an existing output instead of stopping at ffmpeg's
    #     interactive "Overwrite?" prompt.
    # check=True: fail loudly here rather than letting the caller continue
    #     with a missing or partial WAV file.
    subprocess.run(['ffmpeg', '-y', '-i', mp3_path, wav_path], check=True)

    return wav_path

In [34]:
def resample_wav(wav_path, target_sr=16000):
    """Rewrite a WAV file in place as mono audio at `target_sr` Hz.

    Args:
        wav_path: Path of the WAV file; it is overwritten with the result.
        target_sr: Sample rate of the rewritten file, 16000 by default.

    Returns:
        None.
    """
    # Ask librosa for the target rate and mono directly. Calling
    # librosa.load() without `sr` first resamples to its own default rate, so
    # a separate resample() afterwards would resample the audio twice, and a
    # later to_mono() would have nothing left to do.
    y, _ = librosa.load(wav_path, sr=target_sr, mono=True)
    soundfile.write(wav_path, y, target_sr)

In [40]:
# Download every video of this voice, convert it to WAV, resample it, and
# remove the intermediate download. download_mp3() returns "" for clips that
# already exist in save_dir, and that is the only "skip" check needed: a test
# such as os.path.exists(save_dir + url) looks for a path with the whole URL
# embedded in it and can never match.
for url in urls:
    mp3_path = download_mp3(url, save_dir)
    if mp3_path == "":
        continue
    try:
        wav_path = mp3_to_wav(mp3_path)
        resample_wav(wav_path)
    finally:
        # Always delete the intermediate file, even if conversion failed:
        # a leftover .mp3 makes download_mp3() skip this clip on later runs.
        os.remove(mp3_path)

# Voice activity detection (VAD)