In [None]:
from pytube import YouTube, Playlist
from pytube.innertube import InnerTube
import pytube.exceptions as exceptions
from tqdm.notebook import tqdm
from typing import Union
import os
from fastcore.all import *

In [None]:
@patch
def bypass_age_gate2(self: YouTube):
    """Retry the player request through an 'ANDROID' InnerTube client.

    The client is built with this object's `use_oauth` and
    `allow_oauth_cache` settings. When the response is usable it replaces
    `self._vid_info`.

    Raises:
        exceptions.AgeRestrictedError: if the response still reports the
            video as 'UNPLAYABLE' (tier 3 age restriction).
    """
    android_client = InnerTube(
        client='ANDROID',
        use_oauth=self.use_oauth,
        allow_cache=self.allow_oauth_cache,
    )
    player_response = android_client.player(self.video_id)
    status = player_response['playabilityStatus'].get('status')

    if status != 'UNPLAYABLE':
        # The ANDROID client got through: keep its response as the video info.
        self._vid_info = player_response
        return

    # Still blocked even with this client, so give up.
    raise exceptions.AgeRestrictedError(self.video_id)

In [None]:
import re
def to_snake_case(name):
    """Turn a title into a lowercase, underscore-separated name.

    Every run of spaces, colons and underscores becomes a single underscore,
    so "Title :  Sub" yields "title_sub". (A single `.replace("__", "_")`
    pass would leave "title__sub" for runs of three or more separators.)

    Args:
        name: the text to convert.

    Returns:
        The converted string. Other characters are left untouched.
    """
    return re.sub(r"[ :_]+", "_", name.lower())

In [None]:
def load_video(url, **kwargs):
    """Build a `YouTube` object for `url`.

    OAuth and the OAuth cache are enabled by default. Any extra keyword
    arguments are forwarded to `YouTube` and take precedence over those
    defaults (previously they were accepted but silently dropped).

    Args:
        url: address of the video.
        **kwargs: additional `YouTube` constructor options.

    Returns:
        The constructed `YouTube` object.
    """
    options = {"use_oauth": True, "allow_oauth_cache": True, **kwargs}
    return YouTube(url, **options)

In [None]:
def download_youtube_audio(url, out_dir=".", out_fname=None, best_quality=True):
    """Download the audio-only mp4 stream of a YouTube video.

    Args:
        url: address of the video.
        out_dir: folder the file is written to; created if it is missing.
        out_fname: file name inside `out_dir`. Defaults to the snake-cased
            video title with an ".mp4" suffix.
        best_quality: pick the highest "abr" stream when True, the lowest
            otherwise.

    Returns:
        Whatever the stream's `download` call returns (the saved file's path
        in the recorded runs).

    Raises:
        RuntimeError: if the video exposes no audio-only mp4 stream.
        exceptions.AgeRestrictedError: if the age gate cannot be bypassed.
    """
    yt = load_video(url)
    if out_fname is None:
        out_fname = to_snake_case(yt.title) + ".mp4"
    out_path = os.path.join(out_dir, out_fname)

    try:
        # Touch the streams once so an age restriction surfaces here.
        yt.streams
    except exceptions.AgeRestrictedError as e:
        print(f"ERROR: {e}")
        yt.bypass_age_gate2()

    audio_streams = (yt.streams
        .filter(only_audio=True, file_extension="mp4")
        .order_by("abr"))
    audio_streams = audio_streams.desc() if best_quality else audio_streams.asc()

    stream = audio_streams.first()
    if stream is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise RuntimeError(f"No audio-only mp4 stream available for {url}")

    # The target folder may not exist yet (e.g. a per-list subfolder).
    parent = os.path.dirname(out_path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    return stream.download(filename=out_path)

In [None]:
def download_all(file:Union[str,Path]):
    """Download the audio of every URL listed in a text file.

    The file holds one URL per line. Audio for line `i` is stored as
    `<parent>/<stem>/<stem>_<i>.mp3`, where `<stem>` is the list file's name
    without extension. Entries whose target file already exists are reported
    and skipped.

    Args:
        file: path of the URL list.
    """
    file = file if isinstance(file, Path) else Path(file)
    with open(file, 'r') as infile:
        lines = infile.readlines()

    # Keep each URL's original line index so file numbering stays stable even
    # when blank lines are dropped; strip the trailing newline from each URL.
    entries = [(i, line.strip()) for i, line in enumerate(lines) if line.strip()]
    if not entries:
        print(f"{file} has no urls")
        return

    target_dir = file.parent.joinpath(file.stem)
    for i, url in entries:
        target = target_dir.joinpath(f"{file.stem}_{i}.mp3")
        if target.exists():
            # Report the file that was actually found (not always index 0).
            print(f"{target} Exists")
            continue
        download_youtube_audio(url, out_dir=f"{file.parent}/{file.stem}/", out_fname=f"{file.stem}_{i}.mp3")

In [None]:
# Fetch the audio for every URL listed in ./Audios/Biden.txt.
download_all('./Audios/Biden.txt')

ERROR: d403nALfQrE is age restricted, and can't be accessed without logging in.


# Downloading Audios

In [None]:
# Video used by the single-download and caption cells that follow.
url = "https://www.youtube.com/watch?v=CmekpMxhFmg"

In [None]:
# out_dir keeps its "." default, so the name is resolved relative to the working directory.
download_youtube_audio(url, out_fname='Audios/Biden/biden2.mp3')

'/Users/matu/Documents/Xcode/whisper/Audios/Biden/biden2.mp3'

In [None]:
def download_youtube_subtitles(url, out_dir=".", out_fname=None, best_quality=True):
    """Return the caption tracks of a YouTube video.

    Nothing is written to disk. `out_dir`, `out_fname` and `best_quality`
    are kept in the signature for existing callers but are not used.

    Args:
        url: address of the video.
        out_dir: unused.
        out_fname: unused.
        best_quality: unused.

    Returns:
        The list produced by `yt.captions.all()` (empty in the recorded run).
    """
    yt = load_video(url)
    # No file name is derived here: it was never used, and building it read yt.title for nothing.
    return yt.captions.all()

In [None]:
# NOTE: download_youtube_subtitles only returns the caption list; out_fname does not cause a file to be written.
download_youtube_subtitles(url, out_fname="Bruna.srt")

  srt = yt.captions.all()


[]

In [None]:
# Build the YouTube object directly, with the same arguments load_video passes.
# NOTE(review): the recorded output shows a TypeError for `use_oauth`; presumably the
# installed pytube predates that option. Confirm the required version.
yt = YouTube(url, use_oauth=True, allow_oauth_cache=True)

TypeError: YouTube.__init__() got an unexpected keyword argument 'use_oauth'

In [None]:
# Collect the video's captions into a list (indexed by position in the cells that follow).
srt = yt.captions.all()

  srt = yt.captions.all()


In [None]:
# Save the first caption to disk; with srt=False the recorded run produced an .xml file.
srt[0].download('bruna',srt=False)

'/Volumes/Users/matu/Documents/Xcode/whisper/bruna (a.pt).xml'

In [None]:
# Keep the first caption's `json_captions` value for inspection.
json_capt = srt[0].json_captions

In [None]:
# Keep the first caption's `xml_captions` value for inspection.
xml_cap = srt[0].xml_captions

In [None]:
import json

In [None]:
def save_list(in_file:Union[str,Path], out_dir:str = './',out_fname:Union[str,None]=None, best_quality:bool=True):
    """Download audio and captions for every URL listed in a text file.

    Args:
        in_file: text file with one video URL per line; blank lines are
            ignored. If it does not exist, a message is printed and the
            function returns without downloading anything.
        out_dir: folder used for the default file names.
        out_fname: explicit audio file name, used as given (it is not joined
            with `out_dir`). When the list holds several URLs, the entry's
            position is inserted before the extension so the files do not
            overwrite each other. Defaults to the snake-cased video title
            plus ".mp4" inside `out_dir`.
        best_quality: pick the highest "abr" stream when True, the lowest
            otherwise.

    Returns:
        None.

    Raises:
        RuntimeError: if a video exposes no audio-only mp4 stream.
    """
    in_file = in_file if isinstance(in_file, Path) else Path(in_file)
    if not in_file.exists():
        print(f"File {in_file} does not exist")
        return
    with open(in_file, 'r') as infile:
        # Drop trailing newlines and blank lines before treating lines as URLs.
        urls = [line.strip() for line in infile if line.strip()]

    for idx, url in enumerate(urls):
        yt = load_video(url)
        # Work out the file name per video; reassigning `out_fname` itself
        # would make every later video reuse the first video's name.
        if out_fname is None:
            audio_path = os.path.join(out_dir, to_snake_case(yt.title) + ".mp4")
        elif len(urls) == 1:
            audio_path = out_fname
        else:
            root, ext = os.path.splitext(out_fname)
            audio_path = f"{root}_{idx}{ext}"

        streams = (yt.streams
                    .filter(only_audio=True, file_extension="mp4")
                    .order_by("abr"))
        streams = streams.desc() if best_quality else streams.asc()
        stream = streams.first()
        if stream is None:
            raise RuntimeError(f"No audio-only mp4 stream available for {url}")

        parent = os.path.dirname(audio_path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        stream.download(filename=audio_path)

        # NOTE(review): the recorded outputs echo this call after each run,
        # which looks like a deprecation warning; confirm against pytube.
        captions = yt.captions.all()
        if not captions:
            # Nothing to save; indexing an empty list would raise IndexError.
            print(f"No captions available for {url}")
            continue

        caption_path = audio_path + ".srt"
        try:
            captions[0].download(caption_path)
        except Exception:
            # Fallback when the caption download fails: dump the JSON form instead.
            json_cap = captions[0].json_captions
            with open(caption_path, 'w') as capout:
                capout.write(json.dumps(json_cap))
    return


In [None]:
# Download audio and captions for every URL listed in ./url_list_bruna.txt.
save_list('./url_list_bruna.txt')

  srt = yt.captions.all();
  srt = yt.captions.all();
  srt = yt.captions.all();
  srt = yt.captions.all();


In [None]:
# NOTE(review): `playlist` is not assigned in any visible cell, so this loop relies on
# kernel state from elsewhere. Define it (presumably via the imported Playlist class) before running.
for url in tqdm(playlist):
    download_youtube_audio(url, out_dir="fastai_transcripts")

  0%|          | 0/8 [00:00<?, ?it/s]

In [None]:
# Single download with an explicit file name; out_dir keeps its "." default.
download_youtube_audio("https://www.youtube.com/watch?v=Gh2HnhO2JS0", out_fname="pastry.mp4")

'/home/tcapelle/wandb/whisper/pastry.mp4'