In [5]:
import os
import re
import pandas as pd
import yt_dlp
from moviepy.editor import VideoFileClip
import speech_recognition as sr
from pydub import AudioSegment
from concurrent.futures import ThreadPoolExecutor  # More efficient for I/O
import gc  # Garbage collection for memory management
def download_video(url_output_path):
    """Download a single video with yt-dlp.

    Args:
        url_output_path: A ``(url, output_path)`` pair. The pair is passed as
            one argument so the function can be used with ``map``-style
            helpers. ``output_path`` is handed to yt-dlp as its ``outtmpl``
            output template.
    """
    source_url, destination = url_output_path
    options = {
        'format': 'best',
        'outtmpl': destination,
    }
    with yt_dlp.YoutubeDL(options) as downloader:
        downloader.download([source_url])

def extract_audio(video_path_audio_path):
    """Extract the audio track of a video file and write it to disk.

    Args:
        video_path_audio_path: A ``(video_path, audio_path)`` pair; the audio
            of ``video_path`` is written to ``audio_path``.

    Raises:
        ValueError: If the video has no audio track.
        Exception: Whatever MoviePy raises while opening the video or writing
            the audio file is propagated unchanged.
    """
    video_path, audio_path = video_path_audio_path

    # Open the clip *before* entering the try block: if opening fails there is
    # nothing to close, and the original error must not be masked by a
    # NameError on an unbound variable in the cleanup code.
    video = VideoFileClip(video_path)
    try:
        audio = video.audio
        if audio is None:
            raise ValueError(f"Video has no audio track: {video_path}")
        try:
            audio.write_audiofile(audio_path)
        finally:
            audio.close()
    finally:
        # Always release the video reader, even when audio extraction fails.
        video.close()

def split_audio(audio_path, chunk_length_ms, output_folder):
    """Cut a WAV file into pieces and export each piece as its own WAV file.

    Args:
        audio_path: Path of the WAV file to load.
        chunk_length_ms: Step used to slice the loaded audio; with pydub this
            is the length of each piece in milliseconds.
        output_folder: Directory that receives the ``chunk_<i>.wav`` files.

    Returns:
        The list of exported chunk file paths, in slice order.
    """
    recording = AudioSegment.from_wav(audio_path)
    exported_paths = []
    # A stepped slice of the segment yields the successive pieces.
    pieces = recording[::chunk_length_ms]
    for i, piece in enumerate(pieces):
        target = os.path.join(output_folder, f"chunk_{i}.wav")
        piece.export(target, format="wav")
        exported_paths.append(target)
    return exported_paths

def recognize_audio_chunk(chunk_info):
    """Transcribe one audio chunk with the Google recognizer backend.

    Args:
        chunk_info: A ``(chunk, recognizer)`` pair, where ``chunk`` is the
            path of the audio file and ``recognizer`` is the
            ``speech_recognition`` recognizer used to read and transcribe it.

    Returns:
        The recognized text, or ``""`` when the audio could not be understood
        or the recognition request failed (both cases are reported via print).
    """
    chunk, recognizer = chunk_info

    # Load the whole chunk into memory; the file is closed before recognition.
    with sr.AudioFile(chunk) as source:
        recording = recognizer.record(source)

    transcript = ""
    try:
        transcript = recognizer.recognize_google(recording)
    except sr.UnknownValueError:
        print(f"Could not understand audio in chunk {chunk}")
    except sr.RequestError as e:
        print(f"Could not request results from Google Speech Recognition service for chunk {chunk}; {e}")
    else:
        print(f"Chunk {chunk} processed")
    finally:
        # Drop the reference to the raw audio as soon as it is no longer needed.
        del recording
    return transcript

def process_video(video_info):
    """Download one video, transcribe its audio and save the transcript.

    The video is downloaded, its audio extracted and split into chunks, the
    chunks are transcribed in parallel threads, and the joined text is written
    to ``transcript_<episode>.txt`` inside the output folder.

    Args:
        video_info: A tuple ``(index, video_url, output_folder,
            chunk_length_ms, recognizer)``. ``index`` is used to name the
            temporary video/audio files; ``recognizer`` is shared by all
            chunk transcription threads.

    Returns:
        None. Any error is reported with ``print`` and swallowed so that a
        failing video does not abort a batch run.
    """
    index, video_url, output_folder, chunk_length_ms, recognizer = video_info
    video_path = os.path.join(output_folder, f"temp_video_{index}.mp4")
    audio_path = os.path.join(output_folder, f"temp_audio_{index}.wav")
    chunks = []
    try:
        # Decide the transcript name up front so a URL without an
        # "episode-<n>" part cannot throw away a finished transcription;
        # fall back to the row index in that case.
        episode_match = re.search(r"episode-(\d+)", video_url)
        episode_number = episode_match.group(1) if episode_match else f"video_{index}"
        transcript_path = os.path.join(output_folder, f"transcript_{episode_number}.txt")

        download_video((video_url, video_path))
        extract_audio((video_path, audio_path))

        chunks = split_audio(audio_path, chunk_length_ms, output_folder)
        chunk_infos = [(chunk, recognizer) for chunk in chunks]

        # executor.map preserves input order, so the texts stay in sequence.
        with ThreadPoolExecutor() as executor:
            texts = list(executor.map(recognize_audio_chunk, chunk_infos))

        # Explicit UTF-8 so non-ASCII transcript text cannot fail on platforms
        # whose default encoding is narrower.
        with open(transcript_path, 'w', encoding='utf-8') as f:
            f.write(" ".join(texts))
    except Exception as e:
        print(f"Error processing video {video_url}: {e}")
    finally:
        # Remove temporary files on success *and* failure; some of them may
        # never have been created, which is fine.
        for temp_file in [video_path, audio_path, *chunks]:
            try:
                os.remove(temp_file)
            except OSError:
                pass
        gc.collect()  # Explicitly trigger garbage collection

def process_videos_from_excel(excel_path, output_folder="Bannon", chunk_length_ms=60000):
    """Transcribe every video listed in an Excel sheet.

    Args:
        excel_path: Path of the spreadsheet; it must have an ``href`` column
            whose values are appended to ``https://rumble.com``.
        output_folder: Directory for temporary files and transcripts; it is
            created when it does not exist yet.
        chunk_length_ms: Chunk length forwarded to ``process_video``.

    Returns:
        None. When the spreadsheet is missing, an error is printed and the
        function returns without processing anything.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    try:
        # Rows containing any missing value are discarded.
        links = pd.read_excel(excel_path).dropna()
        print(links.head(5))
    except FileNotFoundError:
        print(f"Error: The file '{excel_path}' does not exist.")
        return

    # A single recognizer instance is shared by all videos.
    shared_recognizer = sr.Recognizer()
    for row_label, row in links.iterrows():
        page_url = f"https://rumble.com{row['href']}"
        process_video(
            (row_label, page_url, output_folder, chunk_length_ms, shared_recognizer)
        )


if __name__ == "__main__":
    # Script entry point: transcribe every video listed in the spreadsheet
    # below, using the default output folder and chunk length.
    excel_path = 'bannon2.xlsx'  # Update this path if necessary
    process_videos_from_excel(excel_path)


                                                href
0  /vkrpze-episode-1149-will-the-us-stand-up-like...
1  /vkejpt-episode-1125-the-marxist-assault-on-th...
2  /vjb4ml-episode-1064-ccp-100-a-century-of-deat...
3  /vhfa9h-episode-966-war-room-special-rise-of-t...
4  /vh8tyt-episode-955-the-indispensables-untold-...
[Rumble] Extracting URL: https://rumble.com/vkrpze-episode-1149-will-the-us-stand-up-like-the-french.html
[Rumble] vkrpze-episode-1149-will-the-us-stand-up-like-the-french.html: Downloading webpage
[RumbleEmbed] Extracting URL: https://rumble.com/embed/vi5ju4
[RumbleEmbed] vi5ju4: Downloading JSON metadata
[info] vi5ju4: Downloading 1 format(s): mp4-720p
[download] Destination: Bannon\temp_video_0.mp4
[download] 100% of  687.06MiB in 00:00:20 at 33.17MiB/s    
MoviePy - Writing audio in Bannon\temp_audio_0.wav


                                                                        

MoviePy - Done.
Chunk chunk_0.wav processed
Chunk chunk_6.wav processed
Chunk chunk_10.wav processed
Chunk chunk_12.wav processed
Chunk chunk_8.wav processed
Chunk chunk_14.wav processed
Chunk chunk_2.wav processed
Chunk chunk_4.wav processed
Chunk chunk_15.wav processed
Chunk chunk_9.wav processed
Chunk chunk_7.wav processed
Chunk chunk_1.wav processed
Chunk chunk_11.wav processed
Chunk chunk_3.wav processed
Chunk chunk_13.wav processed
Chunk chunk_16.wav processed
Chunk chunk_5.wav processed
Chunk chunk_18.wav processed
Chunk chunk_20.wav processed
Chunk chunk_22.wav processed
Chunk chunk_24.wav processed
Chunk chunk_30.wav processed
Chunk chunk_28.wav processed
Chunk chunk_17.wav processed
Chunk chunk_26.wav processed
Chunk chunk_21.wav processed
Chunk chunk_19.wav processed
Chunk chunk_32.wav processed
Chunk chunk_25.wav processed
Chunk chunk_27.wav processed
Chunk chunk_23.wav processed
Chunk chunk_29.wav processed
Chunk chunk_31.wav processed
Chunk chunk_34.wav processed
Chunk ch