In [None]:
!pip install pytube --user

## Testing video download

In [1]:
import os
import json
from pytube import YouTube
import re

def remove_special_characters(input_string):
    # Using regex to keep only alphanumeric characters and spaces
    clean_string = re.sub(r'[^a-zA-Z0-9\s]', '', input_string)
    return clean_string

def download_video_info(video_url, output_directory='downloads'):
    try:
        # Create a YouTube object
        yt = YouTube(video_url)

        # Create a directory for downloads if it doesn't exist
        if not os.path.exists(output_directory):
            os.makedirs(output_directory)

        # Get the highest resolution audio stream
        audio_stream = yt.streams.filter(only_audio=True).first()

        # Download the audio stream
        
        file_name = remove_special_characters(yt.title)
        file_name = file_name.replace(" ","_")
        file_name = file_name.replace("…","_")
        file_name = file_name.replace(",","_")
        
        audio_stream.download(output_directory,filename=f'{file_name}.wav')
        audio_path = os.path.join(f"{output_directory}/{file_name}.wav")
        print(f"Downloading audio to {audio_path}...")

        # Collect video information
        video_info = {
            'title': yt.title,
            'duration': yt.length,
            'author': yt.author,
            'views': yt.views,
            'description': yt.description,
            'audio_path': audio_path
        }

        return video_info

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

def process_video_links(file_path):
    with open(file_path, 'r') as file:
        video_links = file.readlines()

    video_data_list = []
    
    # video_links = ["https://www.youtube.com/watch?v=CgruI1RjH_c"]

    for video_link in video_links:
        video_link = video_link.strip()
        video_info = download_video_info(video_link)
        
        if video_info:
            video_data_list.append(video_info)

    # Save video data to a JSON file
    output_json_path = 'video_data.json'
    with open(output_json_path, 'w') as json_file:
        json.dump(video_data_list, json_file, indent=2)

    print(f'Video data saved to {output_json_path}')

# Replace 'YOUR_TEXT_FILE_PATH' with the path to your text file containing video links
text_file_path = "./downloads/Fireship_clone/@Fireship-shorts.txt"

# Process video links and save data to JSON
process_video_links(text_file_path)

Downloading audio to downloads/the_untold_history_of_web_development.wav...
Downloading audio to downloads/my_browser_my_paste.wav...
Downloading audio to downloads/Yo_mama_so_FAT32.wav...
Downloading audio to downloads/A_Day_in_the_Life_of_a_Proompt_Engineer.wav...
Downloading audio to downloads/Uh_oh_AIsearch_engine_for_developers_has_emerged.wav...
Downloading audio to downloads/Meet_SAM_Metas_latest_AI_model.wav...
Downloading audio to downloads/the_PATH_var_of_righteousness.wav...


KeyboardInterrupt: 

## Testing Audio Transcription api

In [2]:
import json
from deepgram import DeepgramClient, PrerecordedOptions

def transcribe_audio(audio_file_path):

    # Your Deepgram API Key
    DEEPGRAM_API_KEY = '7bd6cbddac3ef5b7b8e14b14072d9d978af15e73'

    # Initialize the Deepgram SDK
    deepgram = DeepgramClient(DEEPGRAM_API_KEY)

    # Call the transcribe_file method on the prerecorded class
    with open(audio_file_path, "rb") as file:
        buffer_data = file.read()

    payload = {
        "buffer": buffer_data,
    }

    options = PrerecordedOptions(
        model="nova-2",
        language="en",
        smart_format=True,
        punctuate=True,
        paragraphs=True,
        diarize=True,
        summarize="v2",
        detect_topics=True,
        filler_words=True,
    )

    file_response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)
    file_response = file_response.to_json()

    json_final = json.loads(file_response)

    with open(f"test.json", "w") as file:
        json.dump(json_final, file, indent=4)
        
    return json_final
    

# # Example usage:
# audio_file_path = "./downloads/Fireship_clone/100+_Computer_Science_Concepts_Explained.wav"
# transcribe_audio(audio_file_path)
# print("Transcribing completed successfully")


## Final function

In [3]:
import os
import json
from pytube import YouTube
from tqdm import tqdm
from deepgram import DeepgramClient, PrerecordedOptions

def download_and_transcribe_video(video_url, output_directory='downloads'):
    try:
        # Create a YouTube object
        yt = YouTube(video_url)

        # Create a directory for downloads if it doesn't exist
        if not os.path.exists(output_directory):
            os.makedirs(output_directory)

        # Get the highest resolution audio stream
        audio_stream = yt.streams.filter(only_audio=True).first()

        # Download the audio stream with tqdm progress bar
        file_name = remove_special_characters(yt.title)
        file_name = file_name.replace(" ","_")
        file_name = file_name.replace("…","_")
        file_name = file_name.replace(",","_")
        audio_path = os.path.join(output_directory, f'{file_name}.wav')
        print(f"Downloading audio to {audio_path}...")
        # with tqdm(total=audio_stream.filesize, unit='B', unit_scale=True, desc=f'Downloading {file_name}') as bar:
        #     def on_progress(chunk, _):
        #         bar.update(len(chunk))

        audio_stream.download(output_directory, filename=f'{file_name}.wav')

        transcript = transcribe_audio(audio_path)

        # Collect video information
        video_info = {
            'link': video_url,
            'title': yt.title,
            'duration': yt.length,
            'author': yt.author,
            'views': yt.views,
            'description': yt.description,
            'audio_path': audio_path,
            'transcript': transcript
        }
        
        save_transcript_to_json(video_info, f'{output_directory}/{file_name}_transcript.json')
        append_transcript_to_json(video_info, f'final_json_transcript_final.json')

        return video_info

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

def save_transcript_to_json(transcript, json_path):
    with open(json_path, 'w') as file:
        json.dump(transcript, file, indent=4)
    print(f'Transcript saved to {json_path}')

def append_transcript_to_json(transcript, json_path):
    # Create an empty list if the file doesn't exist yet
    if not os.path.exists(json_path):
        with open(json_path, 'w') as file:
            json.dump([], file)

    # Load existing data from the file
    with open(json_path, 'r') as file:
        data = json.load(file)

    # Append the new transcript to the list
    data.append(transcript)

    # Save the updated list to the file
    with open(json_path, 'w') as file:
        json.dump(data, file, indent=4)

    print(f'Transcript appended to {json_path}')

def process_video_links(file_path):
    with open(file_path, 'r') as file:
        video_list = file.readlines()
        
    video_data_list = []
    videos_to_process = [video_line.strip().split(",") for video_line in video_list if video_line.strip().endswith(',0')]

    # for video_link in tqdm(video_list, desc='Processing videos', unit='video'):
    # for idx, video_line in enumerate(tqdm(video_list, desc='Processing videos', unit='video')):
    for video_link, progress in tqdm(videos_to_process, desc='Processing videos', unit='video'):
        # video_link, progress = video_line.split(",")
        if int(progress) == 0:
            video_link = video_link.strip()
            print(f'\nDownloading and transcribing: {video_link}')
            try:
                video_info = download_and_transcribe_video(video_link)
                video_data_list.append(video_info)
                idx = video_list.index(f'{video_link},0\n')
                video_list[idx] = f'{video_link},1\n'
            except:
                # save failed video links in a text file
                print(f'\nError processing video',video_link)
                with open("logs_file.txt", 'a') as log_file:
                    log_file.write(video_link)
        else:
            print("Video already downloaded and processed")
            
        with open(file_path, "w") as file:
            file.writelines(video_list)

# Replace 'YOUR_TEXT_FILE_PATH' with the path to your text file containing video links
text_file_path = "./downloads/Fireship_clone_2/@Fireship-videos-remaining.txt"

# Process video links and save data to JSON
process_video_links(text_file_path)


Processing videos:   0%|          | 0/100 [00:00<?, ?video/s]


Downloading and transcribing: https://www.youtube.com/watch?v=5ChkQKUzDCs
Downloading audio to downloads\Big_projects_are_ditching_TypeScript_why.wav...
Transcript saved to downloads/Big_projects_are_ditching_TypeScript_why_transcript.json


Processing videos:   1%|          | 1/100 [00:46<1:16:06, 46.12s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=3Ay6qZ88boI
Downloading audio to downloads\PHP_will_make_you_poor_StackOverflow_2023_Results.wav...
Transcript saved to downloads/PHP_will_make_you_poor_StackOverflow_2023_Results_transcript.json


Processing videos:   2%|▏         | 2/100 [01:33<1:16:28, 46.82s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=Uc6lM1Aig9c
Downloading audio to downloads\Apple_makes_risky_bet_on_VRAR_future.wav...
Transcript saved to downloads/Apple_makes_risky_bet_on_VRAR_future_transcript.json


Processing videos:   3%|▎         | 3/100 [02:23<1:17:55, 48.20s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=V4qrvoFodmo
Downloading audio to downloads\Worldcoin_the_shtcoin_for_humanity_just_launched.wav...
Transcript saved to downloads/Worldcoin_the_shtcoin_for_humanity_just_launched_transcript.json


Processing videos:   4%|▍         | 4/100 [03:10<1:16:19, 47.71s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=4_MDP6TcHwU
Downloading audio to downloads\Why_did_my_sidehustle_fail_How_to_validate_business_ideas.wav...
Transcript saved to downloads/Why_did_my_sidehustle_fail_How_to_validate_business_ideas_transcript.json


Processing videos:   5%|▌         | 5/100 [04:01<1:17:46, 49.12s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=nmfRDRNjCnM
Downloading audio to downloads\10_crazy_announcements_from_Google_IO.wav...
Transcript saved to downloads/10_crazy_announcements_from_Google_IO_transcript.json


Processing videos:   6%|▌         | 6/100 [04:50<1:16:26, 48.79s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=LkrI3erbUtw


Processing videos:   7%|▋         | 7/100 [04:51<51:40, 33.34s/video]  

An error occurred: LkrI3erbUtw is age restricted, and can't be accessed without logging in.

Downloading and transcribing: https://www.youtube.com/watch?v=FJACTC9wFhU
Downloading audio to downloads\AWS_CodeWhisperer_The_Copilot_Killer.wav...
Transcript saved to downloads/AWS_CodeWhisperer_The_Copilot_Killer_transcript.json


Processing videos:   8%|▊         | 8/100 [05:39<58:14, 37.98s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=klTvEwg3oJ4
Downloading audio to downloads\Vector_databases_are_so_hot_right_now_WTF_are_they.wav...
Transcript saved to downloads/Vector_databases_are_so_hot_right_now_WTF_are_they_transcript.json


Processing videos:   9%|▉         | 9/100 [06:25<1:01:14, 40.38s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=GXN34w8kju4
Downloading audio to downloads\Twitter_algorithm_opensourced_Is_Elon_playing_5D_chess.wav...
Transcript saved to downloads/Twitter_algorithm_opensourced_Is_Elon_playing_5D_chess_transcript.json


Processing videos:  10%|█         | 10/100 [07:12<1:04:01, 42.68s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=xW9DJTvB3NI
Downloading audio to downloads\Google_Bard_the_ChatGPT_killer.wav...
Transcript saved to downloads/Google_Bard_the_ChatGPT_killer_transcript.json


Processing videos:  11%|█         | 11/100 [08:00<1:05:43, 44.31s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=66tfvFeALBQ


Processing videos:  12%|█▏        | 12/100 [08:02<45:49, 31.25s/video]  

An error occurred: 66tfvFeALBQ is age restricted, and can't be accessed without logging in.

Downloading and transcribing: https://www.youtube.com/watch?v=gUYBFDPZ5qk
Downloading audio to downloads\I_built_a_5_chat_app_with_Pocketbase__Svelte_Will_it_scale.wav...
Transcript saved to downloads/I_built_a_5_chat_app_with_Pocketbase__Svelte_Will_it_scale_transcript.json


Processing videos:  13%|█▎        | 13/100 [08:57<55:53, 38.55s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=eaedq1Jl2fc
Downloading audio to downloads\What_will_AI_Programming_look_like_in_5_Years.wav...
Transcript saved to downloads/What_will_AI_Programming_look_like_in_5_Years_transcript.json


Processing videos:  14%|█▍        | 14/100 [09:43<58:33, 40.86s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=2pfcynxODJc
Downloading audio to downloads\Is_FAANG_fked.wav...
Transcript saved to downloads/Is_FAANG_fked_transcript.json


Processing videos:  15%|█▌        | 15/100 [10:27<59:01, 41.67s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=Zs9Tifup1Bc
Downloading audio to downloads\Is_coding_really_dead_6_trends_that_look_bad.wav...
Transcript saved to downloads/Is_coding_really_dead_6_trends_that_look_bad_transcript.json


Processing videos:  16%|█▌        | 16/100 [11:14<1:00:48, 43.43s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=x2eF3YLiNhY
Downloading audio to downloads\Qwik_the_worlds_first_O1_JavaScript_framework.wav...
Transcript saved to downloads/Qwik_the_worlds_first_O1_JavaScript_framework_transcript.json


Processing videos:  17%|█▋        | 17/100 [11:56<59:19, 42.89s/video]  

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=Wqy3PBEglXQ
Downloading audio to downloads\PocketBase_The_Ultimate_SideHustle_Backend.wav...
Transcript saved to downloads/PocketBase_The_Ultimate_SideHustle_Backend_transcript.json


Processing videos:  18%|█▊        | 18/100 [12:40<58:54, 43.11s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=ciF7WZXmpjU
Downloading audio to downloads\There_aint_no_such_thing_as_a_free_tier.wav...
Transcript saved to downloads/There_aint_no_such_thing_as_a_free_tier_transcript.json


Processing videos:  19%|█▉        | 19/100 [13:24<58:39, 43.45s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=gxBkghlglTg
Downloading audio to downloads\Astro_just_Launched_Could_it_be_the_ultimate_web_framework.wav...
Transcript saved to downloads/Astro_just_Launched_Could_it_be_the_ultimate_web_framework_transcript.json


Processing videos:  20%|██        | 20/100 [14:07<57:52, 43.41s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=-bHK0qmp06c
Downloading audio to downloads\Carbon_Lang_The_C_killer.wav...
Transcript saved to downloads/Carbon_Lang_The_C_killer_transcript.json


Processing videos:  21%|██        | 21/100 [14:51<57:17, 43.51s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=yOP5-3_WFus
Downloading audio to downloads\Is_edge_computing_really_faster.wav...
Transcript saved to downloads/Is_edge_computing_really_faster_transcript.json


Processing videos:  22%|██▏       | 22/100 [15:40<58:31, 45.02s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=HDZWWFSZUF0
Downloading audio to downloads\Web5_The_Web3_Killer.wav...
Transcript saved to downloads/Web5_The_Web3_Killer_transcript.json


Processing videos:  23%|██▎       | 23/100 [16:24<57:31, 44.82s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=g-_hVXzkn0o
Downloading audio to downloads\Is_the_tech_bubble_bursting_right_now.wav...
Transcript saved to downloads/Is_the_tech_bubble_bursting_right_now_transcript.json


Processing videos:  24%|██▍       | 24/100 [17:08<56:20, 44.49s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=noq-ZHTD2Cg
Downloading audio to downloads\The_Nuxt_big_thing_in_web_development.wav...
Transcript saved to downloads/The_Nuxt_big_thing_in_web_development_transcript.json


Processing videos:  25%|██▌       | 25/100 [17:52<55:31, 44.41s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=UgIwjLg4ONk
Downloading audio to downloads\The_Parallax_Effect__5_Minute_WebDev_Project.wav...
Transcript saved to downloads/The_Parallax_Effect__5_Minute_WebDev_Project_transcript.json


Processing videos:  26%|██▌       | 26/100 [18:35<54:17, 44.02s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=iMVgvkVJuDI
Downloading audio to downloads\Time_a_programmers_worst_enemy__The_Code_Report.wav...
Transcript saved to downloads/Time_a_programmers_worst_enemy__The_Code_Report_transcript.json


Processing videos:  27%|██▋       | 27/100 [19:18<53:14, 43.76s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=O9F4K804XC8
Downloading audio to downloads\A_heroic_new_proposal_for_JavaScript__The_Code_Report.wav...
Transcript saved to downloads/A_heroic_new_proposal_for_JavaScript__The_Code_Report_transcript.json


Processing videos:  28%|██▊       | 28/100 [19:59<51:36, 43.01s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=deg8bOoziaE
Downloading audio to downloads\This_video_was_made_with_code_But_how.wav...
Transcript saved to downloads/This_video_was_made_with_code_But_how_transcript.json


Processing videos:  29%|██▉       | 29/100 [20:45<51:52, 43.84s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=3OqQhtLwY9o
Downloading audio to downloads\I_feel_cyber_attacked__The_Code_Report.wav...
Transcript saved to downloads/I_feel_cyber_attacked__The_Code_Report_transcript.json


Processing videos:  30%|███       | 30/100 [21:28<50:50, 43.58s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=1L2hrG-7i2Y
Downloading audio to downloads\The_Shocking_State_of_JavaScript__The_Code_Report.wav...
Transcript saved to downloads/The_Shocking_State_of_JavaScript__The_Code_Report_transcript.json


Processing videos:  31%|███       | 31/100 [22:10<49:39, 43.18s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=KPyeJ5J2a7A
Downloading audio to downloads\What_happens_if_you_hack_45_billion_BTC__The_Code_Report.wav...
Transcript saved to downloads/What_happens_if_you_hack_45_billion_BTC__The_Code_Report_transcript.json


Processing videos:  32%|███▏      | 32/100 [22:53<48:46, 43.03s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=MBqS1kYzwTc
Downloading audio to downloads\BIG_new_feature_lands_in_Nodejs__The_Code_Report.wav...
Transcript saved to downloads/BIG_new_feature_lands_in_Nodejs__The_Code_Report_transcript.json


Processing videos:  33%|███▎      | 33/100 [23:38<48:43, 43.63s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=V7LEihbOv3Y
Downloading audio to downloads\How_I_post_banger_tweets_with_artificial_intelligence__Twitter_Bot_Tutorial.wav...
Transcript saved to downloads/How_I_post_banger_tweets_with_artificial_intelligence__Twitter_Bot_Tutorial_transcript.json


Processing videos:  34%|███▍      | 34/100 [24:26<49:24, 44.92s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=bJUl3OAIT0k
Downloading audio to downloads\How_a_CS_student_tracks_Elons_Private_Jet_with_Python__The_Code_Report.wav...
Transcript saved to downloads/How_a_CS_student_tracks_Elons_Private_Jet_with_Python__The_Code_Report_transcript.json


Processing videos:  35%|███▌      | 35/100 [25:11<48:38, 44.90s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=_oHByo8tiEY
Downloading audio to downloads\I_created_a_Command_Line_Game_for_you__5Minute_Nodejs_CLI_Project.wav...
Transcript saved to downloads/I_created_a_Command_Line_Game_for_you__5Minute_Nodejs_CLI_Project_transcript.json


Processing videos:  36%|███▌      | 36/100 [26:00<49:19, 46.25s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=BrcugNqRwUs
Downloading audio to downloads\How_to_buy_Stocks_with_JavaScript__Algo_Trading_Tutorial_for_Dummies.wav...
Transcript saved to downloads/How_to_buy_Stocks_with_JavaScript__Algo_Trading_Tutorial_for_Dummies_transcript.json


Processing videos:  37%|███▋      | 37/100 [26:49<49:21, 47.01s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=Xg9ihH15Uto
Downloading audio to downloads\How_to_Land_a_100Kyr_Tech_Job__10_Strategies.wav...
Transcript saved to downloads/How_to_Land_a_100Kyr_Tech_Job__10_Strategies_transcript.json


Processing videos:  38%|███▊      | 38/100 [27:40<49:49, 48.22s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=meTpMP0J5E8
Downloading audio to downloads\Build_a_WEB3_app_to_mint_unlimited_NFTs_But_should_you.wav...
Transcript saved to downloads/Build_a_WEB3_app_to_mint_unlimited_NFTs_But_should_you_transcript.json


Processing videos:  39%|███▉      | 39/100 [28:40<52:40, 51.81s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=DOIWQddRD5M
Downloading audio to downloads\Is_Redis_the_ONLY_database_you_need__Fullstack_app_from_scratch_with_Nextjs__Redis.wav...
Transcript saved to downloads/Is_Redis_the_ONLY_database_you_need__Fullstack_app_from_scratch_with_Nextjs__Redis_transcript.json


Processing videos:  40%|████      | 40/100 [29:31<51:22, 51.37s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=R6S-b_k-ZKY
Downloading audio to downloads\The_Dark_Side_of_Open_Source__What_really_happened_to_Fakerjs.wav...
Transcript saved to downloads/The_Dark_Side_of_Open_Source__What_really_happened_to_Fakerjs_transcript.json


Processing videos:  41%|████      | 41/100 [30:12<47:41, 48.50s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=9iU_IE6vnJ8
Downloading audio to downloads\Monorepos__How_the_Pros_Scale_Huge_Software_Projects__Turborepo_vs_Nx.wav...
Transcript saved to downloads/Monorepos__How_the_Pros_Scale_Huge_Software_Projects__Turborepo_vs_Nx_transcript.json


Processing videos:  42%|████▏     | 42/100 [31:06<48:17, 49.96s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=hdHjjBS4cs8
Downloading audio to downloads\Brainfk_in_100_Seconds.wav...
Transcript saved to downloads/Brainfk_in_100_Seconds_transcript.json


Processing videos:  43%|████▎     | 43/100 [31:50<45:42, 48.11s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=wHTcrmhskto
Downloading audio to downloads\Is_Web3_all_Hype_Top_10_Web_30_Questions__Answers.wav...
Transcript saved to downloads/Is_Web3_all_Hype_Top_10_Web_30_Questions__Answers_transcript.json


Processing videos:  44%|████▍     | 44/100 [32:39<45:20, 48.58s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=mAsM9c2sGjA
Downloading audio to downloads\Shopify_built_a_JS_Framework__Hydrogen_First_Look.wav...
Transcript saved to downloads/Shopify_built_a_JS_Framework__Hydrogen_First_Look_transcript.json


Processing videos:  45%|████▌     | 45/100 [33:26<44:06, 48.11s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=pfaSUYaSgRo
Downloading audio to downloads\Ultimate_Tailwind_CSS_Tutorial__Build_a_Discordinspired_Animated_Navbar.wav...
Transcript saved to downloads/Ultimate_Tailwind_CSS_Tutorial__Build_a_Discordinspired_Animated_Navbar_transcript.json


Processing videos:  46%|████▌     | 46/100 [34:17<44:05, 48.98s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=yufqeJLP1rI
Downloading audio to downloads\Auth0_in_100_Seconds__And_beyond_with_a_Nextjs_Authentication_Tutorial.wav...
Transcript saved to downloads/Auth0_in_100_Seconds__And_beyond_with_a_Nextjs_Authentication_Tutorial_transcript.json


Processing videos:  47%|████▋     | 47/100 [35:05<42:56, 48.61s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=Sxxw3qtb3_g
Downloading audio to downloads\How_to_OVER_Engineer_a_Website__What_is_a_Tech_Stack.wav...
Transcript saved to downloads/How_to_OVER_Engineer_a_Website__What_is_a_Tech_Stack_transcript.json


Processing videos:  48%|████▊     | 48/100 [35:55<42:34, 49.12s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=c_8cplBi_gE
Downloading audio to downloads\Content_Platforms_Explained_in_100_Seconds__Build_your_own_with_Sanity.wav...
Transcript saved to downloads/Content_Platforms_Explained_in_100_Seconds__Build_your_own_with_Sanity_transcript.json


Processing videos:  49%|████▉     | 49/100 [36:55<44:33, 52.42s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=cEBkvm0-rg0
Downloading audio to downloads\How_to_Google_It_like_a_Senior_Software_Engineer.wav...
Transcript saved to downloads/How_to_Google_It_like_a_Senior_Software_Engineer_transcript.json


Processing videos:  50%|█████     | 50/100 [37:41<42:00, 50.41s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=SuqU904ZHA4
Downloading audio to downloads\Springy_Animated_Modals__Framer_Motion__React_Tutorial_for_Beginners.wav...
Transcript saved to downloads/Springy_Animated_Modals__Framer_Motion__React_Tutorial_for_Beginners_transcript.json


Processing videos:  51%|█████     | 51/100 [38:33<41:25, 50.72s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=J5x3OMXjgMc
Downloading audio to downloads\I_built_a_decentralized_chat_dapp__GUN_web3_Tutorial.wav...
Transcript saved to downloads/I_built_a_decentralized_chat_dapp__GUN_web3_Tutorial_transcript.json


Processing videos:  52%|█████▏    | 52/100 [39:27<41:24, 51.76s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=lPJVi797Uy0
Downloading audio to downloads\Build_a_Curvaceous_Homepage__Wavy_Background_Tutorial_with_SVG__CSS.wav...
Transcript saved to downloads/Build_a_Curvaceous_Homepage__Wavy_Background_Tutorial_with_SVG__CSS_transcript.json


Processing videos:  53%|█████▎    | 53/100 [40:20<40:50, 52.13s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=cuHDQhDhvPE
Downloading audio to downloads\I_built_the_same_app_10_times__Which_JS_Framework_is_best.wav...
Transcript saved to downloads/I_built_the_same_app_10_times__Which_JS_Framework_is_best_transcript.json


Processing videos:  54%|█████▍    | 54/100 [41:17<41:02, 53.53s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=zd6ffqoK_EU
Downloading audio to downloads\The_Decline_of_Firebase_bundle_sizes__New_JavaScript_SDK_First_Look.wav...
Transcript saved to downloads/The_Decline_of_Firebase_bundle_sizes__New_JavaScript_SDK_First_Look_transcript.json


Processing videos:  55%|█████▌    | 55/100 [42:04<38:51, 51.82s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=4duqI8WyfqE
Downloading audio to downloads\The_Truth_about_Github_Copilot__AI_Programming_First_Look.wav...
Transcript saved to downloads/The_Truth_about_Github_Copilot__AI_Programming_First_Look_transcript.json


Processing videos:  56%|█████▌    | 56/100 [42:56<38:02, 51.88s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=K6Vcfm7TA5U
Downloading audio to downloads\Google_Sheets_Your_Next_Database.wav...
Transcript saved to downloads/Google_Sheets_Your_Next_Database_transcript.json


Processing videos:  57%|█████▋    | 57/100 [43:54<38:20, 53.50s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=7JdcGBSWo50
Downloading audio to downloads\Build_5_Apps_in_5_Minutes_with_Flutter_But_should_you.wav...
Transcript saved to downloads/Build_5_Apps_in_5_Minutes_with_Flutter_But_should_you_transcript.json


Processing videos:  58%|█████▊    | 58/100 [44:45<37:01, 52.90s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=vKJpN5FAeF4
Downloading audio to downloads\Closures_Explained_in_100_Seconds__Tricky_JavaScript_Interview_Prep.wav...
Transcript saved to downloads/Closures_Explained_in_100_Seconds__Tricky_JavaScript_Interview_Prep_transcript.json


Processing videos:  59%|█████▉    | 59/100 [45:33<35:10, 51.49s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=Jv2uxzhPFl4
Downloading audio to downloads\TestDriven_Development__Fun_TDD_Introduction_with_JavaScript.wav...
Transcript saved to downloads/TestDriven_Development__Fun_TDD_Introduction_with_JavaScript_transcript.json


Processing videos:  60%|██████    | 60/100 [46:30<35:15, 52.88s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=Q7AOvWpIVHU
Downloading audio to downloads\Build_a_Mindblowing_3D_Portfolio_Website__Threejs_Beginners_Tutorial.wav...
Transcript saved to downloads/Build_a_Mindblowing_3D_Portfolio_Website__Threejs_Beginners_Tutorial_transcript.json


Processing videos:  61%|██████    | 61/100 [47:25<34:52, 53.66s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=vqs_0W-MSB0
Downloading audio to downloads\How_a_CPU_Works_in_100_Seconds__Apple_Silicon_M1_vs_Intel_i9.wav...
Transcript saved to downloads/How_a_CPU_Works_in_100_Seconds__Apple_Silicon_M1_vs_Intel_i9_transcript.json


Processing videos:  62%|██████▏   | 62/100 [48:21<34:24, 54.32s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=WiwfiVdfRIc
Downloading audio to downloads\Is_Supabase_Legit_Firebase_Alternative_Breakdown.wav...
Transcript saved to downloads/Is_Supabase_Legit_Firebase_Alternative_Breakdown_transcript.json


Processing videos:  63%|██████▎   | 63/100 [49:12<32:59, 53.49s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=v969_M6cWk0
Downloading audio to downloads\Ethical_Hacking_in_100_Seconds__And_why_do_we_need_CORS.wav...
Transcript saved to downloads/Ethical_Hacking_in_100_Seconds__And_why_do_we_need_CORS_transcript.json


Processing videos:  64%|██████▍   | 64/100 [50:04<31:45, 52.92s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=ydkQlJhodio
Downloading audio to downloads\How_to_use_TypeScript_with_React_But_should_you.wav...
Transcript saved to downloads/How_to_use_TypeScript_with_React_But_should_you_transcript.json


Processing videos:  65%|██████▌   | 65/100 [50:55<30:28, 52.25s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=Cz3WcZLRaWc
Downloading audio to downloads\MySQL__The_Basics__Learn_SQL_in_23_Easy_Steps.wav...
Transcript saved to downloads/MySQL__The_Basics__Learn_SQL_in_23_Easy_Steps_transcript.json


Processing videos:  66%|██████▌   | 66/100 [51:53<30:41, 54.16s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=sZif1kuAjcY
Downloading audio to downloads\Get_Paid_with_Crypto_in_your_App__Coinbase_Commerce_Tutorial.wav...
Transcript saved to downloads/Get_Paid_with_Crypto_in_your_App__Coinbase_Commerce_Tutorial_transcript.json


Processing videos:  67%|██████▋   | 67/100 [52:46<29:36, 53.84s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=UTHgr6NLeEw
Downloading audio to downloads\Make_Awesome_SVG_Animations_with_CSS__7_Useful_Techniques.wav...
Transcript saved to downloads/Make_Awesome_SVG_Animations_with_CSS__7_Useful_Techniques_transcript.json


Processing videos:  68%|██████▊   | 68/100 [53:40<28:39, 53.75s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=ShcR4Zfc6Dw
Downloading audio to downloads\Why_so_many_distros_The_Weird_History_of_Linux.wav...
Transcript saved to downloads/Why_so_many_distros_The_Weird_History_of_Linux_transcript.json


Processing videos:  69%|██████▉   | 69/100 [54:34<27:46, 53.76s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=WmR9IMUD_CY
Downloading audio to downloads\WebRTC_in_100_Seconds__Build_a_Video_Chat_app_from_Scratch.wav...
Transcript saved to downloads/WebRTC_in_100_Seconds__Build_a_Video_Chat_app_from_Scratch_transcript.json


Processing videos:  70%|███████   | 70/100 [55:27<26:46, 53.54s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=qF7dkrce-mQ
Downloading audio to downloads\Bitcoin__in_100_Seconds__Build_your_Own_Blockchain.wav...
Transcript saved to downloads/Bitcoin__in_100_Seconds__Build_your_Own_Blockchain_transcript.json


Processing videos:  71%|███████   | 71/100 [56:21<25:59, 53.79s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=yJ5agkia4o8
Downloading audio to downloads\Invite_Only_Use_FOMO_to_grow_your_app__Exclusive_Phone_SignIn_Tutorial.wav...
Transcript saved to downloads/Invite_Only_Use_FOMO_to_grow_your_app__Exclusive_Phone_SignIn_Tutorial_transcript.json


Processing videos:  72%|███████▏  | 72/100 [57:16<25:18, 54.24s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=-MTSQjw5DrM
Downloading audio to downloads\RESTful_APIs_in_100_Seconds__Build_an_API_from_Scratch_with_Nodejs_Express.wav...
Transcript saved to downloads/RESTful_APIs_in_100_Seconds__Build_an_API_from_Scratch_with_Nodejs_Express_transcript.json


Processing videos:  73%|███████▎  | 73/100 [58:09<24:10, 53.73s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=r_MpUP6aKiQ
Downloading audio to downloads\dotfiles_in_100_Seconds.wav...
Transcript saved to downloads/dotfiles_in_100_Seconds_transcript.json


Processing videos:  74%|███████▍  | 74/100 [59:07<23:49, 54.96s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=O0gmXbN7lVE
Downloading audio to downloads\_in_NaN_Seconds.wav...
Transcript saved to downloads/_in_NaN_Seconds_transcript.json


Processing videos:  75%|███████▌  | 75/100 [59:56<22:07, 53.11s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=TNhaISOUy6Q
Downloading audio to downloads\10_React_Hooks_Explained__Plus_Build_your_own_from_Scratch.wav...
Transcript saved to downloads/10_React_Hooks_Explained__Plus_Build_your_own_from_Scratch_transcript.json


Processing videos:  76%|███████▌  | 76/100 [1:00:50<21:20, 53.37s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=QdHvS0D1zAI
Downloading audio to downloads\Raspberry_Pi_versus_AWS__How_to_host_your_website_on_the_RPi4.wav...
Transcript saved to downloads/Raspberry_Pi_versus_AWS__How_to_host_your_website_on_the_RPi4_transcript.json


Processing videos:  77%|███████▋  | 77/100 [1:01:43<20:27, 53.37s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=Sklc_fQBmcs
Downloading audio to downloads\Nextjs_in_100_Seconds__Plus_Full_Beginners_Tutorial.wav...


Processing videos:  78%|███████▊  | 78/100 [1:01:58<15:18, 41.74s/video]

An error occurred: <urlopen error [Errno 11001] getaddrinfo failed>

Downloading and transcribing: https://www.youtube.com/watch?v=sFsRylCQblw
Downloading audio to downloads\Progressive_Web_Apps_in_100_Seconds__Build_a_PWA_from_Scratch.wav...


Processing videos:  79%|███████▉  | 79/100 [1:02:12<11:44, 33.57s/video]

An error occurred: <urlopen error [Errno 11001] getaddrinfo failed>

Downloading and transcribing: https://www.youtube.com/watch?v=N6lYcXjd4pg
Downloading audio to downloads\How_to_Burn_Money_in_the_Cloud__Avoid_AWS_GCP_Azure_Cost_Disasters.wav...


Processing videos:  80%|████████  | 80/100 [1:02:27<09:17, 27.85s/video]

An error occurred: <urlopen error [Errno 11001] getaddrinfo failed>

Downloading and transcribing: https://www.youtube.com/watch?v=e1KpKBHJOrA
Downloading audio to downloads\CSS_Pseudoelements__in_100_Seconds.wav...


Processing videos:  81%|████████  | 81/100 [1:02:42<07:40, 24.21s/video]

An error occurred: <urlopen error [Errno 11001] getaddrinfo failed>

Downloading and transcribing: https://www.youtube.com/watch?v=yrrw6KdGuxc
Downloading audio to downloads\Animated_CSS_Login_Form_w_Password_Validation_Meter.wav...
Transcript saved to downloads/Animated_CSS_Login_Form_w_Password_Validation_Meter_transcript.json


Processing videos:  82%|████████▏ | 82/100 [1:03:48<11:01, 36.75s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=JTOJsU3FSD8
Downloading audio to downloads\7_Full_Stack_App_Ideas_for_Developers_w_Instructions_Included.wav...
Transcript saved to downloads/7_Full_Stack_App_Ideas_for_Developers_w_Instructions_Included_transcript.json


Processing videos:  83%|████████▎ | 83/100 [1:04:44<11:59, 42.29s/video]

Transcript appended to final_json_transcript_final.json

Downloading and transcribing: https://www.youtube.com/watch?v=wvRVfyPKOA0


Processing videos:  85%|████████▌ | 85/100 [1:04:56<05:50, 23.36s/video]

An error occurred: <urlopen error [Errno 11001] getaddrinfo failed>

Downloading and transcribing: https://www.youtube.com/watch?v=-atblwgc63E
An error occurred: <urlopen error [Errno 11001] getaddrinfo failed>

Downloading and transcribing: https://www.youtube.com/watch?v=YOlr79NaAtQ


Processing videos:  87%|████████▋ | 87/100 [1:04:56<02:30, 11.55s/video]

An error occurred: <urlopen error [Errno 11001] getaddrinfo failed>

Downloading and transcribing: https://www.youtube.com/watch?v=scEDHsr3APg
An error occurred: <urlopen error [Errno 11001] getaddrinfo failed>

Downloading and transcribing: https://www.youtube.com/watch?v=rjfO2AKEsA8


Processing videos:  89%|████████▉ | 89/100 [1:04:57<01:03,  5.76s/video]

An error occurred: <urlopen error [Errno 11001] getaddrinfo failed>

Downloading and transcribing: https://www.youtube.com/watch?v=3yqDxhR2XxE
An error occurred: <urlopen error [Errno 11001] getaddrinfo failed>


Processing videos:  90%|█████████ | 90/100 [1:04:57<00:40,  4.09s/video]


Downloading and transcribing: https://www.youtube.com/watch?v=biOMz4puGt8
An error occurred: <urlopen error [Errno 11001] getaddrinfo failed>


Processing videos:  91%|█████████ | 91/100 [1:04:57<00:26,  2.92s/video]


Downloading and transcribing: https://www.youtube.com/watch?v=OWy0e231eMI
An error occurred: <urlopen error [Errno 11001] getaddrinfo failed>

Downloading and transcribing: https://www.youtube.com/watch?v=sSDHdWrSqLY


Processing videos:  93%|█████████▎| 93/100 [1:04:58<00:10,  1.53s/video]

An error occurred: <urlopen error [Errno 11001] getaddrinfo failed>

Downloading and transcribing: https://www.youtube.com/watch?v=gigtS_5KOqo
An error occurred: <urlopen error [Errno 11001] getaddrinfo failed>


Processing videos:  94%|█████████▍| 94/100 [1:04:58<00:06,  1.13s/video]


Downloading and transcribing: https://www.youtube.com/watch?v=tV9byUUfPQM
An error occurred: <urlopen error [Errno 11001] getaddrinfo failed>

Downloading and transcribing: https://www.youtube.com/watch?v=MYHVyl-juUk


Processing videos:  96%|█████████▌| 96/100 [1:04:58<00:02,  1.52video/s]

An error occurred: <urlopen error [Errno 11001] getaddrinfo failed>

Downloading and transcribing: https://www.youtube.com/watch?v=bL3I7Pls-1w
An error occurred: <urlopen error [Errno 11001] getaddrinfo failed>


Processing videos:  97%|█████████▋| 97/100 [1:04:58<00:01,  1.92video/s]


Downloading and transcribing: https://www.youtube.com/watch?v=Zd014DjonqE
An error occurred: <urlopen error [Errno 11001] getaddrinfo failed>


Processing videos:  98%|█████████▊| 98/100 [1:04:59<00:00,  2.36video/s]


Downloading and transcribing: https://www.youtube.com/watch?v=qP5zw7fjQgo
An error occurred: <urlopen error [Errno 11001] getaddrinfo failed>

Downloading and transcribing: https://www.youtube.com/watch?v=HkdAHXoRtos


Processing videos: 100%|██████████| 100/100 [1:04:59<00:00,  3.18video/s]

An error occurred: <urlopen error [Errno 11001] getaddrinfo failed>

Downloading and transcribing: https://www.youtube.com/watch?v=8Wy1AqY5gqE
An error occurred: <urlopen error [Errno 11001] getaddrinfo failed>


Processing videos: 100%|██████████| 100/100 [1:04:59<00:00, 39.00s/video]


In [4]:
# combine all the json files into a single file

import os
import json

def combine_json_files(directory_name, output_file='combined.json'):
    combined_data = []

    # Check if the directory exists
    if not os.path.exists(directory_name) or not os.path.isdir(directory_name):
        print(f"Error: {directory_name} is not a valid directory.")
        return

    # Loop through all files in the directory
    for filename in os.listdir(directory_name):
        file_path = os.path.join(directory_name, filename)

        # Check if the file is a JSON file
        if os.path.isfile(file_path) and filename.endswith('.json'):
            with open(file_path, 'r') as file:
                try:
                    # Load JSON data from the file
                    json_data = json.load(file)

                    # Append the loaded data to the combined_data list
                    combined_data.append(json_data)

                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON in file {filename}: {e}")

    # Write the combined_data to a new JSON file
    with open(output_file, 'w') as output_file:
        json.dump(combined_data, output_file, indent=2)

    print(f"Combined JSON data saved to {output_file.name}")

# Example usage:
directory_name = './downloads/'
combine_json_files(directory_name)

Combined JSON data saved to combined.json


## Huggingface operations

In [5]:
from datasets import load_dataset

dataset = load_dataset("json", data_files="./combined.json")
# dataset2 = load_dataset("json", data_files="./final_json_transcript_final.json")

Generating train split: 0 examples [00:00, ? examples/s]

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['author', 'duration', 'description', 'transcript', 'audio_path', 'link', 'title', 'views'],
        num_rows: 522
    })
})

In [87]:
# Assuming you have the text file named 'input.txt' and the list of video links
# named 'video_link_list'

input_file_path = './downloads/Fireship_clone_2/@Fireship-videos.txt'
output_file_path = 'output.txt'

# Read the existing links from the text file
with open(input_file_path, 'r') as file:
    existing_links = [line.split(',')[0] for line in file]

# Filter out the links that are not in video_link_list
new_links = [link for link in existing_links if link not in dataset["train"]["link"]]

# Write the new links to the output file
with open(output_file_path, 'w') as output_file:
    for link in new_links:
        output_file.write(f"{link},0\n")

print(f"New links written to {output_file_path}")

New links written to output.txt


In [7]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [8]:
dataset.push_to_hub("CognitiveLab/FS_test")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/3.59k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


CommitInfo(commit_url='https://huggingface.co/datasets/CognitiveLab/FS_test/commit/2b573174a0ad7b4f8c917a257dd26f45cc8be28c', commit_message='Upload dataset', commit_description='', oid='2b573174a0ad7b4f8c917a257dd26f45cc8be28c', pr_url=None, pr_revision=None, pr_num=None)

In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['author', 'duration', 'description', 'transcript', 'audio_path', 'link', 'title', 'views'],
        num_rows: 522
    })
})

In [10]:
import pandas as pd
df = pd.DataFrame(dataset['train'])

In [11]:
df.head()

Unnamed: 0,author,duration,description,transcript,audio_path,link,title,views
0,Fireship,787,Learn the fundamentals of Computer Science wit...,"{'metadata': {'channels': 1, 'created': '2024-...",downloads\100+_Computer_Science_Concepts_Expla...,https://www.youtube.com/watch?v=-uleG_Vecis,100+ Computer Science Concepts Explained,2110216
1,Fireship,743,The ultimate 10 minute JavaScript course that ...,"{'metadata': {'channels': 1, 'created': '2024-...",downloads\100+_JavaScript_Concepts_you_Need_to...,https://www.youtube.com/watch?v=lkIFF4maKMU,100+ JavaScript Concepts you Need to Know,1642938
2,Fireship,798,WebDev 101 is a complete introduction into the...,"{'metadata': {'channels': 1, 'created': '2024-...",downloads\100+_Web_Development_Things_you_Shou...,https://www.youtube.com/watch?v=erEgovG9WBs,100+ Web Development Things you Should Know,1296840
3,Fireship,1471,Top 100 Firebase Pro Tips 🔥💯. Optimize your ap...,"{'metadata': {'channels': 1, 'created': '2024-...","downloads\100_Firebase_Tips,_Tricks,_and_Screw...",https://www.youtube.com/watch?v=iWEgpdVSZyg,"100 Firebase Tips, Tricks, and Screw-ups",177364
4,Fireship,246,Google made a ton of exciting announcements at...,"{'metadata': {'channels': 1, 'created': '2024-...",downloads\10_crazy_announcements_from_Google_I...,https://www.youtube.com/watch?v=nmfRDRNjCnM,10 crazy announcements from Google I/O,968111


In [12]:
df.rename(columns={'transcript': 'transcript_json'}, inplace=True)
df.head()

Unnamed: 0,author,duration,description,transcript_json,audio_path,link,title,views
0,Fireship,787,Learn the fundamentals of Computer Science wit...,"{'metadata': {'channels': 1, 'created': '2024-...",downloads\100+_Computer_Science_Concepts_Expla...,https://www.youtube.com/watch?v=-uleG_Vecis,100+ Computer Science Concepts Explained,2110216
1,Fireship,743,The ultimate 10 minute JavaScript course that ...,"{'metadata': {'channels': 1, 'created': '2024-...",downloads\100+_JavaScript_Concepts_you_Need_to...,https://www.youtube.com/watch?v=lkIFF4maKMU,100+ JavaScript Concepts you Need to Know,1642938
2,Fireship,798,WebDev 101 is a complete introduction into the...,"{'metadata': {'channels': 1, 'created': '2024-...",downloads\100+_Web_Development_Things_you_Shou...,https://www.youtube.com/watch?v=erEgovG9WBs,100+ Web Development Things you Should Know,1296840
3,Fireship,1471,Top 100 Firebase Pro Tips 🔥💯. Optimize your ap...,"{'metadata': {'channels': 1, 'created': '2024-...","downloads\100_Firebase_Tips,_Tricks,_and_Screw...",https://www.youtube.com/watch?v=iWEgpdVSZyg,"100 Firebase Tips, Tricks, and Screw-ups",177364
4,Fireship,246,Google made a ton of exciting announcements at...,"{'metadata': {'channels': 1, 'created': '2024-...",downloads\10_crazy_announcements_from_Google_I...,https://www.youtube.com/watch?v=nmfRDRNjCnM,10 crazy announcements from Google I/O,968111


In [13]:
json_string = df.loc[0, 'transcript_json']

# Display the loaded JSON object
print(json_string["results"]["channels"][0]["alternatives"][0]["transcript"])

What's the first thing you should do when your code throws an error? Obviously, you should change nothing and try to run it again a few times. If that doesn't work, you're gonna need a computer science degree. The awesome thing about software engineering is that you can learn to code and get a high paying job, while literally having no idea how anything actually works. It all just feels like magic. Like a pilot driving a giant metal tube in the sky while knowing nothing about aerodynamics. Mother of God, no. Holy shit. Shit. Welcome to computer science 101. In today's video, you'll learn the science behind the garbage code you've been writing by learning 101 different computer science terms and concepts. This is a computer. It's just a piece of tape that holds ones and zeros along with a device that can read and write to it. It's called a Turing machine and in theory, it can compute anything, like the graphics in this video or the algorithm that recommended that you watch it. At the co

In [14]:
import pandas as pd
import json

# Assuming your DataFrame is named df

def parse_json(row):
    try:
        transcript_json = row['transcript_json']
        if transcript_json["results"]["summary"]["result"] == "success":
            transcript = str(transcript_json["results"]["channels"][0]["alternatives"][0]["transcript"])
            summary = str(transcript_json["results"]["summary"]["short"])
            return transcript, summary
        else:
            print("an error occurred")
            return None, None
    except (json.JSONDecodeError, KeyError):
        print("an exception occurred")
        return None, None

# Apply the custom function to each row
df[['transcript', 'summary']] = df.apply(parse_json, axis=1, result_type='expand')

# Display the updated DataFrame
# print(df.head())

In [15]:
from datasets import Dataset
import pandas as pd
final_dataset = Dataset.from_pandas(df)

In [16]:
final_dataset

Dataset({
    features: ['author', 'duration', 'description', 'transcript_json', 'audio_path', 'link', 'title', 'views', 'transcript', 'summary'],
    num_rows: 522
})

In [17]:
final_dataset.push_to_hub("CognitiveLab/FS_transcribe_summary")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/3.67k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


CommitInfo(commit_url='https://huggingface.co/datasets/CognitiveLab/FS_transcribe_summary/commit/c7075b4ee3b535b8b82347fb018af70df4dffb86', commit_message='Upload dataset', commit_description='', oid='c7075b4ee3b535b8b82347fb018af70df4dffb86', pr_url=None, pr_revision=None, pr_num=None)

## Prompt formatting

In [20]:
import pandas as pd
import json

# Assuming your DataFrame is named df

def create_prompt(row):
    try:
        author = row["author"]
        title = row["title"]
        video_transcript = row["transcript"]
        video_summary = row["summary"]
        # transcript_json = row['transcript_json']
        text = f"""
        [INST]
        You are youtuber called {author} you make engaging high-intensity and entertaining coding tutorials and tech news. 
        you covers a wide range of topics relevant to programmers, aiming to help them learn and improve their skills quickly.
        
        Given the title of the video : {title} 
        and a small summary : {video_summary}
        [/INST]
        
        Generate the video : {video_transcript}
        """        
        return text

    except (json.JSONDecodeError, KeyError):
        print("an exception occurred")
        return None

# Apply the custom function to each row
df['text'] = df.apply(create_prompt, axis=1, result_type='expand')

# Display the updated DataFrame
df.head()

Unnamed: 0,author,duration,description,transcript_json,audio_path,link,title,views,transcript,summary,text
0,Fireship,787,Learn the fundamentals of Computer Science wit...,"{'metadata': {'channels': 1, 'created': '2024-...",downloads\100+_Computer_Science_Concepts_Expla...,https://www.youtube.com/watch?v=-uleG_Vecis,100+ Computer Science Concepts Explained,2110216,What's the first thing you should do when your...,The importance of hardware and memory for a co...,\n [INST]\n You are youtuber cal...
1,Fireship,743,The ultimate 10 minute JavaScript course that ...,"{'metadata': {'channels': 1, 'created': '2024-...",downloads\100+_JavaScript_Concepts_you_Need_to...,https://www.youtube.com/watch?v=lkIFF4maKMU,100+ JavaScript Concepts you Need to Know,1642938,JavaScript. It's a wonderful programming langu...,The speaker explains that JavaScript is a prog...,\n [INST]\n You are youtuber cal...
2,Fireship,798,WebDev 101 is a complete introduction into the...,"{'metadata': {'channels': 1, 'created': '2024-...",downloads\100+_Web_Development_Things_you_Shou...,https://www.youtube.com/watch?v=erEgovG9WBs,100+ Web Development Things you Should Know,1296840,Web development is the best job in the world. ...,The internet is a collection of machines conne...,\n [INST]\n You are youtuber cal...
3,Fireship,1471,Top 100 Firebase Pro Tips 🔥💯. Optimize your ap...,"{'metadata': {'channels': 1, 'created': '2024-...","downloads\100_Firebase_Tips,_Tricks,_and_Screw...",https://www.youtube.com/watch?v=iWEgpdVSZyg,"100 Firebase Tips, Tricks, and Screw-ups",177364,Welcome to my top 10 Firebase tips. Welcome to...,The speakers discuss how to build successful r...,\n [INST]\n You are youtuber cal...
4,Fireship,246,Google made a ton of exciting announcements at...,"{'metadata': {'channels': 1, 'created': '2024-...",downloads\10_crazy_announcements_from_Google_I...,https://www.youtube.com/watch?v=nmfRDRNjCnM,10 crazy announcements from Google I/O,968111,"It is May 11, 2023, and you're watching the Co...","In this video, the speakers discuss Google's u...",\n [INST]\n You are youtuber cal...


In [21]:
from datasets import Dataset
import pandas as pd
final_dataset = Dataset.from_pandas(df)

In [22]:
final_dataset.push_to_hub("CognitiveLab/FS_transcribe_summary_prompt")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/CognitiveLab/FS_transcribe_summary_prompt/commit/a102e80f3b8128bb80e65de560d71efe9bbed9a2', commit_message='Upload dataset', commit_description='', oid='a102e80f3b8128bb80e65de560d71efe9bbed9a2', pr_url=None, pr_revision=None, pr_num=None)

## Parsing operation for the api response

In [31]:
with open("dummy.json","r") as f:
    transcribe_json_list = json.load(f)

In [36]:
transcribe_json_list[0]["results"]["channels"][0]["alternatives"][0]["transcript"]

"Have you ever woken up in the middle of the night in a panic wondering how to extract a polygonal mesh of an isosurface from a 3 dimensional discrete scalar field? Yeah. I didn't think so. But back in 87, 2 programmers at General Electric did. They created and patented the marching cubes algorithm, an algorithm that has likely saved countless lives by allowing doctors to visualize data from CT and MRI scans. Whenever you instruct a machine to solve a problem with code, you're creating an algorithm, a procedure for rearranging ones and zeros that can make animals talk and vacuums walk. Most algorithms belong in a dumpster, but some are fast, skin. Some are beautiful and some are so weird, they're indistinguishable from magic. Today, we'll look at 10 of the most interesting algorithms ever engineered sphere, and how they're used to solve very interesting problems in the real world. 1st on the list, we have wave function collapse. One of the weirdest things in all of science is the doubl

In [38]:
transcribe_json_list[0]["results"]["summary"]["result"]

'success'

In [39]:
transcribe_json_list[0]["results"]["summary"]["short"]

'The speakers discuss the use of algorithms in scientific research, including random random algorithms like BOGO sort and BOGO sort to solve problems in scientific research, and the potential uses of these algorithms in optimizing algorithms and algorithms for algorithms. They also touch on the use of quantum algorithms in machine design and the future of digital security, including the use of random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random r

In [None]:
from tqdm import tqdm

final_dataset = []
# for video_link in tqdm(video_links, desc='Processing videos', unit='video'):
for transcribe_json in tqdm(transcribe_json_list,desc='Processing transcribe'):
    transcribe = transcribe_json["results"]["channels"][0]["alternatives"][0]["transcript"]
    if transcribe_json["results"]["summary"]["result"]=="success":
        summary = transcribe_json["results"]["summary"]["short"]
    final_json = {
        "transcribe": transcribe,
        "summary": summary
    }
    final_dataset.append(final_json)

with open("transcribe_data_final_processed.json", "w") as output:
    json.dump(final_dataset, output)
    

In [None]:
transcribe_json_list[0]["channels"]

In [None]:
final_dataset_transcribe = load_dataset("json",data_files="./transcribe_data_final.json")

In [None]:
final_dataset_transcribe.push_to_hub("CognitiveLab/FS_transcribe_summary")

In [None]:
final_dataset_transcribe

In [None]:
final_dataset_transcribe["train"][1]

In [None]:
import json
with open("./video_data_and_transcripts.json") as F:
    json_data = json.load(F)

In [None]:
len(json_data)