In [None]:
import os
from pytube import YouTube, Playlist
import webvtt  
from fastcore.all import *
from tqdm.notebook import tqdm
import pandas as pd

In [None]:
import re
def to_snake_case(name):
    "Lower-case `name` and replace spaces and colons with underscores"
    snake = name.lower()
    # Order matters: the last pass folds each "__" pair (e.g. left by ": ") into "_".
    for old, new in ((" ", "_"), (":", "_"), ("__", "_")):
        snake = snake.replace(old, new)
    return snake

In [None]:
def download_youtube_audio(url, out_dir=".", out_fname=None, best_quality=True):
    """Download the audio track of a YouTube video as an mp4 file.

    `url` is the video URL. When `out_fname` is None the file is saved in
    `out_dir` (created if missing) under the snake-cased video title;
    otherwise `out_fname` is used as given and `out_dir` is ignored.
    `best_quality` selects the highest-bitrate audio stream, else the lowest.

    Returns the value of pytube's `Stream.download` (documented as the path
    of the saved file). Raises `ValueError` if the video offers no audio-only
    mp4 stream.
    """
    yt = YouTube(url)
    if out_fname is None:
        # The file name handed to pytube already contains the directory, so
        # make sure that directory exists before the download starts.
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        out_fname = os.path.join(out_dir, to_snake_case(yt.title) + ".mp4")
    streams = (yt.streams
                 .filter(only_audio=True, file_extension="mp4")
                 .order_by("abr"))
    streams = streams.desc() if best_quality else streams.asc()
    stream = streams.first()
    if stream is None:
        # first() yields None on an empty query; fail with a readable message
        # instead of an AttributeError on None.
        raise ValueError(f"No audio-only mp4 stream available for {url}")
    return stream.download(filename=out_fname)

In [None]:
def get_videos_info(url):
    "Fetch the title, description and author of the YouTube video at `url`, plus the `url` itself, as a dict"
    yt = YouTube(url)
    # Key order is kept (title, description, author, url) because it becomes
    # the column order when the dicts are turned into a DataFrame.
    return {
        'title': yt.title,
        'description': yt.description,
        'author': yt.author,
        'url': url,
    }

In [None]:
# YouTube playlist to index (the "Practical Deep Learning for Coders 2022"
# lessons, per the metadata table displayed further down).
playlist_url = "https://www.youtube.com/playlist?list=PLfYUBJiXbdtSvpQjSnJJ_PmDQB_VyT5iU"

In [None]:
# pytube Playlist for that URL; the cells below iterate it to get one video URL per entry.
playlist = Playlist(playlist_url)

In [None]:
# Collect one metadata dict per playlist video, in playlist order.
info = list(map(get_videos_info, playlist))

In [None]:
# Tabular preview of the collected metadata (one row per video).
pd.DataFrame(info)

Unnamed: 0,title,description,author,url
0,Lesson 1: Practical Deep Learning for Coders 2022,"Go to https://course.fast.ai for code, noteboo...",Jeremy Howard,https://www.youtube.com/watch?v=8SF_h3xF3cE
1,Lesson 2: Practical Deep Learning for Coders 2022,00:00 - Introduction\n00:55 - Reminder to use ...,Jeremy Howard,https://www.youtube.com/watch?v=F4tvM4Vb3A0
2,Lesson 3: Practical Deep Learning for Coders 2022,"00:00 Introduction and survey\n01:36 ""Lesson 0...",Jeremy Howard,https://www.youtube.com/watch?v=hBBOjCiFcuo
3,Lesson 4: Practical Deep Learning for Coders 2022,00:00:00 - Using Huggingface\n00:03:24 - Finet...,Jeremy Howard,https://www.youtube.com/watch?v=toUgBQv1BT8
4,Lesson 5: Practical Deep Learning for Coders 2022,00:00:00 - Introduction\n00:01:59 - Linear mod...,Jeremy Howard,https://www.youtube.com/watch?v=_rXzeWq4C6w
5,Lesson 6: Practical Deep Learning for Coders 2022,00:00 Review\n02:09 TwoR model\n04:43 How to c...,Jeremy Howard,https://www.youtube.com/watch?v=AdhG64NF76E
6,Lesson 7: Practical Deep Learning for Coders 2022,00:00 - Tweaking first and last layers\n02:47 ...,Jeremy Howard,https://www.youtube.com/watch?v=p4ZZq0736Po
7,Lesson 8 - Practical Deep Learning for Coders ...,00:00 - Neural net from scratch\n04:46 - Param...,Jeremy Howard,https://www.youtube.com/watch?v=htiNBPxcXgo


In [None]:
# Download the audio track of every playlist video into ./fastai_transcripts.
# NOTE(review): get_captions later reads `<snake_cased_title>.vtt` files from
# this same directory; the step that turns the audio into those caption files
# is not shown here -- confirm where they come from.
for url in tqdm(playlist):
    download_youtube_audio(url, out_dir="fastai_transcripts")

  0%|          | 0/8 [00:00<?, ?it/s]

In [None]:
# Directory the audio was downloaded into; get_captions looks for the .vtt caption files here.
file_path = Path('./fastai_transcripts/')

In [None]:
def get_captions(infos: list, max_words: int = 100):
    """Build overlapping text passages from each video's `.vtt` caption file.

    For every video dict in `infos` (the 'title' and 'url' keys are read) the
    file `file_path/<snake_cased_title>.vtt` is parsed. A passage is started at
    every caption; the following captions are appended until the passage holds
    at least `max_words` space-separated tokens or the captions run out.

    Returns a flat list of dicts with the keys 'url' (video URL plus a
    `&t=<seconds>s` offset), 'title', 'text', 'start_second' and 'end_second'.
    """
    data = []
    for video in infos:
        vtt_name = to_snake_case(video['title'])
        captions = webvtt.read(file_path/f"{vtt_name}.vtt").captions
        n_captions = len(captions)
        # Wrap the list itself (not the enumerate object) so tqdm knows the total.
        for first, caption in enumerate(tqdm(captions)):
            start_in_s = caption.start_in_seconds
            passage = ""
            last = first  # index of the last caption merged into `passage`
            nxt = first
            # The bound is n_captions (not n_captions - 1) so that the final
            # caption's text is included too.
            while nxt < n_captions and len(passage.split(' ')) < max_words:
                passage += " " + captions[nxt].raw_text
                last = nxt
                nxt += 1
            data.append({
                'url': f"{video['url']}&t={int(start_in_s)}s",
                'title': video['title'],
                "text": passage,
                "start_second": start_in_s,
                # End time of the last caption actually present in `text`.
                "end_second": captions[last].end_in_seconds,
            })
    return data

In [None]:
# Passages for every video in the playlist, as one flat list of dicts.
all_cap = get_captions(info)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [None]:
# Inspect the second passage to check the record layout.
all_cap[1]

{'url': 'https://www.youtube.com/watch?v=8SF_h3xF3cE&t=6s',
 'title': 'Lesson 1: Practical Deep Learning for Coders 2022',
 'text': "  This is version five of this course.  And it's the first do one we've done in two years.  So we've got a lot of cool things to cover.  It's amazing how much has changed.  Here is a XKCD from the end of 2015.  Who here is saying XKCD comics before?  Pretty much everybody, not surprising.  So the basic joke here is I'll let you read it,  and then I'll come back to it.  So it can be hard to tell what's easy and what's nearly impossible.  And in 2015 or at the end of 2015,",
 'start_second': 6.6,
 'end_second': 61.0}

We will loop through all of these files to give us the initial core dataset consisting of *url*, *title*, *text*, *start_second*, and *end_second*.

In [None]:
import json

In [None]:
# Serialise the passages as JSON Lines: one JSON object per line.
with open("train.jsonl", "w") as f:
    f.writelines(json.dumps(record) + '\n' for record in all_cap)

In [None]:
# Read the file back as raw lines (trailing newlines kept) for a quick sanity check.
with open("train.jsonl") as f:
    d = list(f)

In [None]:
# First serialized record, as stored on disk.
d[:1]

['{"url": "https://www.youtube.com/watch?v=8SF_h3xF3cE&t=0s", "title": "Lesson 1: Practical Deep Learning for Coders 2022", "text": "  Welcome to practical deep learning for coders lesson one.  This is version five of this course.  And it\'s the first do one we\'ve done in two years.  So we\'ve got a lot of cool things to cover.  It\'s amazing how much has changed.  Here is a XKCD from the end of 2015.  Who here is saying XKCD comics before?  Pretty much everybody, not surprising.  So the basic joke here is I\'ll let you read it,  and then I\'ll come back to it.  So it can be hard to tell what\'s easy and what\'s nearly impossible.", "start_second": 0.0, "end_second": 58.0}\n']