# import dependencies

In [1]:
import os
import shutil
from urllib.parse import parse_qs, urlparse

from pytube import Playlist
from pytube import YouTube
from youtube_transcript_api import YouTubeTranscriptApi as yta
# star import kept last so that name shadowing is the same as before
from moviepy.editor import *

# Functions required for making the dataset

In [3]:
def get_720p_video(yt):
    '''Given an object of the class Youtube, return its 720p video-only stream in mp4 format.

    The candidates are the non-progressive, video-only mp4 streams of ``yt``,
    ordered by descending resolution; the first one whose resolution is
    "720p" is returned.

    Raises:
        LookupError: if none of the candidate streams is 720p.
    '''

    video_streams = yt.streams.filter(only_video=True, file_extension="mp4", progressive=False).order_by('resolution').desc()
    for stream in video_streams:
        if stream.resolution == "720p":
            return stream

    # Falling out of the loop used to hit ``return video`` with ``video``
    # never assigned, which surfaced as an opaque UnboundLocalError.
    raise LookupError("no 720p video-only mp4 stream is available for this video")

In [4]:
def get_160kbps_audio(yt):
    '''Given an object of the class Youtube, return its 160kbps audio-only stream in webm format.

    The candidates are the audio-only webm streams of ``yt``, ordered by
    descending bitrate; the first one whose bitrate is "160kbps" is returned.

    Raises:
        LookupError: if none of the candidate streams is 160kbps.
    '''

    audio_streams = yt.streams.filter(only_audio=True, file_extension="webm").order_by("abr").desc()
    for stream in audio_streams:
        if stream.abr == "160kbps":
            return stream

    # Falling out of the loop used to hit ``return audio`` with ``audio``
    # never assigned, which surfaced as an opaque UnboundLocalError.
    raise LookupError("no 160kbps audio-only webm stream is available for this video")

In [160]:
def give_time_stamps_subclips(trans, buffer=0.01):
    '''Given a transcript, return a list containing [start, stop] for every subclip.

    Args:
        trans: list of dictionaries with keys 'text', 'start', 'duration'
            (only 'start' and 'duration' are read).
        buffer: padding, in the same unit as 'start', added on both sides of
            every entry for a smooth transition.

    Returns:
        One ``[start, stop]`` list per transcript entry, in transcript order.
        ``start`` is never negative.
    '''

    time_stamps = []
    for item in trans:
        start, duration = item['start'], item['duration']

        end = start + duration + buffer
        # A caption that begins at (or within `buffer` of) 0 would otherwise get
        # a negative start, which moviepy's subclip() interprets as an offset
        # from the END of the clip rather than "just before the beginning".
        padded_start = max(0.0, start - buffer)
        time_stamps.append([padded_start, end])

    return time_stamps

In [5]:
def give_id(link):
    '''Gives the video id of a youtube link.

    The id is read from the ``v`` query parameter, so extra parameters
    (``&list=...``, ``&t=...``) or a different parameter order no longer leak
    into the result. Short ``youtu.be/<id>`` links are understood as well.

    Raises:
        ValueError: if no video id can be found in ``link``.
    '''

    parsed = urlparse(str(link))

    video_ids = parse_qs(parsed.query).get("v")
    if video_ids:
        return video_ids[0]

    # short links carry the id as the path instead of a query parameter
    short_id = parsed.path.strip("/")
    if parsed.netloc.endswith("youtu.be") and short_id:
        return short_id

    raise ValueError(f"could not find a video id in {link!r}")

In [6]:
def give_title(yt):
    '''Return the title of a Youtube object with every "|" character dropped.'''

    # splitting on the separator and gluing the pieces back together removes it
    pieces = yt.title.split("|")
    return "".join(pieces)

In [167]:
def clip(folder_path, video_path, audio_path, trans, remove_orignal=False):
    '''
    Given the transcript, makes subclips of the video and audio files stored
    at video_path and audio_path, and stores each pair in a separate
    sub-folder of folder_path.

    For the j-th transcript entry the files
    ``<folder_path>/sub_<j>/video.mp4`` and ``<folder_path>/sub_<j>/audio.mp3``
    are written.

    Args:
        folder_path: folder in which the ``sub_<j>`` folders are created.
        video_path: path of the full-length video file.
        audio_path: path of the full-length audio file.
        trans: transcript, as accepted by give_time_stamps_subclips.
        remove_orignal: when True, delete the two full-length files after
            all subclips have been written.
    '''

    time_stamps = give_time_stamps_subclips(trans)

    input_video = VideoFileClip(video_path)
    try:
        input_audio = AudioFileClip(audio_path)
        try:
            for j, (start, stop) in enumerate(time_stamps):
                # Keep every window inside the media: a negative start would be
                # read by subclip() as an offset from the end of the clip, and a
                # stop past the end asks for frames/samples that do not exist.
                # NOTE(review): assumes file-backed clips always report a
                # duration - confirm for the moviepy version in use.
                start = max(0, start)
                sub_video = input_video.subclip(start, min(stop, input_video.duration))
                sub_audio = input_audio.subclip(start, min(stop, input_audio.duration))

                sub_folder_path = os.path.join(folder_path, "sub_" + str(j))
                os.makedirs(sub_folder_path)
                sub_video_path = os.path.join(sub_folder_path, "video.mp4")
                sub_audio_path = os.path.join(sub_folder_path, "audio.mp3")

                sub_video.write_videofile(sub_video_path)
                sub_audio.write_audiofile(sub_audio_path)
        finally:
            input_audio.close()
    finally:
        # Release the readers even when a write fails, and before the source
        # files are deleted below.
        input_video.close()

    if remove_orignal:
        os.remove(video_path)
        os.remove(audio_path)

# making the dataset

In [66]:
# Playlist whose videos are the raw material for the dataset.
playlist_link = "https://www.youtube.com/playlist?list=PLRas9Wj3fdFJc_Gm2n0wjp-GXML6dHp_L"

# main_py is what the download loop iterates over, one video URL per item.
main_py = Playlist(playlist_link)
# size of the playlist, i.e. how many URLs the download loop will visit
print(len(main_py))

3626


In [165]:
# Running totals for the download loop. Each one is also written as the
# "index" field of the records appended to the matching file in text_files/.
len_of_dataset = 0           # videos successfully added to the dataset
num_videos_no_subtitles = 0  # videos recorded in no_subtitles.txt
num_videos_waste = 0         # videos recorded in waste_videos.txt
num_videos_failed = 0        # videos recorded in failed.txt


# exist_ok=True so that re-running this cell does not raise FileExistsError
# once the folder has been created.
os.makedirs("text_files", exist_ok=True)

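- waste_videos.txt : contains information about videos which are either age-restricted, too long (more than 30 minutes) or too short (less than 1 minute, e.g. YouTube Shorts)
- no_subtitles.txt : contains information about videos for which no subtitles (transcript) could be obtained
- info_dataset.txt : contains information about the videos which are part of the dataset
- failed.txt : contains information about videos which could not be accessed through the `YouTube` object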

In [None]:
def _append_record(file_name, record):
    '''Append `record` as one line to the file `file_name` inside text_files/.'''
    with open(os.path.join("text_files", file_name), "a", encoding="utf-8") as f:
        f.write(f"{record}\n")


# Walk through the playlist and, for every usable video, create
# dataset/folder_<n> holding the transcript plus one sub-folder per
# transcript entry (see clip). Every video ends up in exactly one of the
# four files in text_files/.
for playlist_index, video_url in enumerate(main_py):
    # playlist_index is not needed below; it is kept so that a run can be
    # restricted to a slice of the playlist while debugging.

    # `except Exception` (not a bare except) so that Ctrl-C still stops the run
    # instead of being logged as one more failed video.
    try:
        video_id = give_id(video_url)
        yt = YouTube(video_url)

        if yt.age_restricted or not (60 <= yt.length <= 1800):
            # the video is either age restricted, shorter than 1min or longer than 30min
            _append_record("waste_videos.txt", {"index": num_videos_waste, "url" : video_url})
            num_videos_waste += 1

        else:
            # transcript : only if we are able to access the transcript do we download the video.
            # Only this call is guarded, so that download or clipping problems
            # are no longer mislabelled as "no subtitles".
            try:
                # first pref is to download the 'en-IN' transcript
                trans = yta.get_transcript(video_id, languages=['en-IN', 'en'])
            except Exception:
                trans = None

            if trans is None:
                # there might be some videos for which the subtitles are disabled, we need to skip those
                _append_record("no_subtitles.txt", {"index" : num_videos_no_subtitles, "url" : video_url})
                num_videos_no_subtitles += 1

            else:
                folder_path = os.path.join("dataset", "folder_" + str(len_of_dataset))

                # Created outside the cleanup guard below on purpose: if the
                # folder already exists (e.g. from an earlier run) this raises
                # and the existing data is left untouched.
                os.makedirs(folder_path)

                try:
                    # need to store the transcript as well
                    trans_path = os.path.join(folder_path, "trans.txt")
                    with open(trans_path, "w", encoding="utf-8") as f:
                        f.writelines(f"{entry}\n" for entry in trans)

                    # Video and audio. download() returns the path it wrote to,
                    # which is used instead of rebuilding the file name from the
                    # title (the two can differ when the title holds characters
                    # that are not kept in the file name).
                    video = get_720p_video(yt)
                    video_path = video.download(folder_path)

                    audio = get_160kbps_audio(yt)
                    audio_path = audio.download(folder_path)

                    # now let's clip the audio and video according to the transcript
                    clip(folder_path, video_path, audio_path, trans, remove_orignal=True)

                except BaseException:
                    # Remove the half-built folder before re-raising. Otherwise
                    # len_of_dataset is not advanced, the next video maps to the
                    # same folder name and os.makedirs fails for it and for
                    # every video after it.
                    shutil.rmtree(folder_path, ignore_errors=True)
                    raise

                # text file containing index, url and fps of the videos stored in the dataset
                _append_record(
                    "info_dataset.txt",
                    {"index" : len_of_dataset, "url" : video_url, "fps" : video.fps},
                )

                # increment the length
                len_of_dataset += 1

    except Exception:
        # anything that went wrong for this video other than a missing transcript
        _append_record("failed.txt", {"index" : num_videos_failed, "url" : video_url})
        num_videos_failed += 1

    print("\n\n")
    print("-" * 50)
    print("\n\n")

