In [2]:
import os
import re
import pandas as pd
import yt_dlp
from moviepy.editor import VideoFileClip
import speech_recognition as sr
from pydub import AudioSegment
from concurrent.futures import ThreadPoolExecutor  # More efficient for I/O
import gc  # Garbage collection for memory management

def download_video(url_output_path):
    """Download a single video with yt-dlp.

    Args:
        url_output_path: A ``(url, output_path)`` pair. ``url`` is the page
            to download from and ``output_path`` is handed to yt-dlp as its
            ``outtmpl`` (output filename template).

    Returns:
        None. The video is written to disk by yt-dlp.
    """
    source_url, destination = url_output_path
    # 'best' asks yt-dlp for the best single pre-merged format.
    options = {'format': 'best', 'outtmpl': destination}
    with yt_dlp.YoutubeDL(options) as downloader:
        downloader.download([source_url])

def extract_audio(video_path_audio_path):
    """Extract the audio track of a video file and write it to disk.

    Args:
        video_path_audio_path: A ``(video_path, audio_path)`` pair. The audio
            of ``video_path`` is written to ``audio_path``.

    Raises:
        ValueError: If the video has no audio track.
        Exception: Whatever ``VideoFileClip`` or ``write_audiofile`` raise is
            propagated unchanged.
    """
    video_path, audio_path = video_path_audio_path

    # Open the clip outside the try block: if opening fails there is nothing
    # to close, and the original error must not be masked by the cleanup.
    video = VideoFileClip(video_path)
    audio = None
    try:
        audio = video.audio
        if audio is None:
            raise ValueError(f"No audio track found in {video_path}")
        audio.write_audiofile(audio_path)
    finally:
        # Only close what was actually acquired.
        if audio is not None:
            audio.close()
        video.close()

def split_audio(audio_path, chunk_length_ms, output_folder):
    """Split a WAV file into consecutive chunk files.

    The audio is sliced with a step of ``chunk_length_ms`` and every
    resulting segment is exported as ``chunk_<i>.wav`` inside
    ``output_folder``, numbered from 0 in playback order.

    Args:
        audio_path: Path of the WAV file to split.
        chunk_length_ms: Slice step passed to the audio segment
            (per the name, a length in milliseconds).
        output_folder: Directory that receives the chunk files.

    Returns:
        list[str]: Paths of the exported chunk files, in order.
    """
    recording = AudioSegment.from_wav(audio_path)

    def _export(number, segment):
        # Write one segment to disk and hand back where it went.
        target = os.path.join(output_folder, f"chunk_{number}.wav")
        segment.export(target, format="wav")
        return target

    return [
        _export(number, segment)
        for number, segment in enumerate(recording[::chunk_length_ms])
    ]

def recognize_audio_chunk(chunk_info):
    """Transcribe one audio chunk with Google Speech Recognition.

    The chunk file is deleted afterwards, whether or not transcription
    succeeded.

    Args:
        chunk_info: A ``(chunk, recognizer)`` pair, where ``chunk`` is the
            path of the chunk file and ``recognizer`` is the
            ``sr.Recognizer`` used to read and transcribe it.

    Returns:
        str: The recognized text, or ``""`` when the audio could not be
        understood or the recognition request failed.
    """
    chunk, recognizer = chunk_info
    text = ""
    try:
        # Reading the file is inside the try so the chunk is still cleaned up
        # when it cannot be opened or decoded.
        with sr.AudioFile(chunk) as source:
            audio_data = recognizer.record(source)
        text = recognizer.recognize_google(audio_data)
        print(f"Chunk {chunk} processed")
    except sr.UnknownValueError:
        print(f"Could not understand audio in chunk {chunk}")
    except sr.RequestError as e:
        print(f"Could not request results from Google Speech Recognition service for chunk {chunk}; {e}")
    finally:
        # Cleanup is best-effort: a failed delete must not throw away text
        # that was already recognized or abort the rest of the video.
        try:
            os.remove(chunk)  # Delete chunk after processing
        except OSError as e:
            print(f"Could not delete chunk {chunk}; {e}")
    return text

def process_video(video_info):
    """Download one video, transcribe it, and write the transcript to disk.

    Steps: download the video, extract its audio, split the audio into
    chunks, transcribe the chunks in a thread pool, and join the texts into
    ``transcript_<episode>.txt`` inside the output folder. Temporary video
    and audio files are removed afterwards, even when a step fails.

    Args:
        video_info: A tuple ``(index, video_url, output_folder,
            chunk_length_ms, recognizer)``. ``index`` is used to name the
            temporary files; the other items are forwarded to the helper
            functions.

    Returns:
        None. Errors are reported with ``print`` rather than raised, so one
        failing video does not stop a batch run.
    """
    index, video_url, output_folder, chunk_length_ms, recognizer = video_info
    temp_paths = []
    try:
        video_path = os.path.join(output_folder, f"temp_video_{index}.mp4")
        audio_path = os.path.join(output_folder, f"temp_audio_{index}.wav")
        temp_paths = [video_path, audio_path]

        # Resolve the transcript name before the expensive work so that a URL
        # without an "episode-<n>" part cannot discard a finished transcript.
        episode_match = re.search(r"episode-(\d+)", video_url)
        episode_number = episode_match.group(1) if episode_match else f"video_{index}"
        transcript_path = os.path.join(output_folder, f"transcript_{episode_number}.txt")

        download_video((video_url, video_path))
        extract_audio((video_path, audio_path))

        chunks = split_audio(audio_path, chunk_length_ms, output_folder)
        chunk_infos = [(chunk, recognizer) for chunk in chunks]

        # executor.map keeps the input order, so the texts stay in
        # playback order even though chunks finish out of order.
        with ThreadPoolExecutor() as executor:
            texts = list(executor.map(recognize_audio_chunk, chunk_infos))

        # Explicit UTF-8 so non-ASCII transcript text does not depend on the
        # platform's default encoding.
        with open(transcript_path, 'w', encoding='utf-8') as f:
            f.write(" ".join(texts))
    except Exception as e:
        print(f"Error processing video {video_url}: {e}")
    finally:
        # Always remove the large temporary files, including after a failure.
        for temp_path in temp_paths:
            try:
                os.remove(temp_path)
            except FileNotFoundError:
                pass  # The step that creates this file never ran.
            except OSError as e:
                print(f"Could not delete temporary file {temp_path}: {e}")
        gc.collect()  # Explicitly trigger garbage collection

def process_videos_from_excel(excel_path, output_folder="Bannon", chunk_length_ms=60000):
    """Transcribe every video listed in an Excel sheet.

    The sheet must have an ``href`` column holding paths relative to
    ``https://rumble.com``. Each row is processed one after another with
    ``process_video``.

    Args:
        excel_path: Path of the Excel file to read.
        output_folder: Directory for temporary files and transcripts; it is
            created if it does not exist.
        chunk_length_ms: Chunk length forwarded to the audio splitter.

    Returns:
        None. A missing file or a missing ``href`` column is reported with
        ``print`` and ends the run early.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    try:
        df = pd.read_excel(excel_path)
    except FileNotFoundError:
        print(f"Error: The file '{excel_path}' does not exist.")
        return

    if "href" not in df.columns:
        print(f"Error: The file '{excel_path}' has no 'href' column.")
        return

    # Only a missing link makes a row unusable; blanks in unrelated columns
    # must not drop the video.
    df = df.dropna(subset=["href"])
    print(df.head(5))

    recognizer = sr.Recognizer()
    for index, row in df.iterrows():
        video_info = (index, f"https://rumble.com{row['href']}", output_folder, chunk_length_ms, recognizer)
        process_video(video_info)

if __name__ == "__main__":
    # Spreadsheet listing the videos to transcribe; update this path if necessary.
    excel_path = "bannon2.xlsx"
    process_videos_from_excel(excel_path=excel_path)


                                                href
0  /v15yaux-episode-1879-breggin-roguski-on-the-s...
1  /v157l45-episode-1865-the-nyc-governor-race-co...
2  /v14i3k7-episode-1850-wapo-attacks-2000-mules-...
3  /vz70p1-episode-1748-biden-approves-instant-as...
4  /vxvdfv-episode-1721-pfizer-documents-yield-sh...
[Rumble] Extracting URL: https://rumble.com/v15yaux-episode-1879-breggin-roguski-on-the-selling-of-american-sovereignty-to-who.html
[Rumble] v15yaux-episode-1879-breggin-roguski-on-the-selling-of-american-sovereignty-to-who.html: Downloading webpage
[RumbleEmbed] Extracting URL: https://rumble.com/embed/v13c4wv
[RumbleEmbed] v13c4wv: Downloading JSON metadata
[info] v13c4wv: Downloading 1 format(s): mp4-720p
[download] Destination: Bannon\temp_video_0.mp4
[download] 100% of  690.55MiB in 00:00:16 at 41.30MiB/s     
MoviePy - Writing audio in Bannon\temp_audio_0.wav


                                                                        

MoviePy - Done.
Chunk Bannon\chunk_0.wav processed
Chunk Bannon\chunk_8.wav processed
Chunk Bannon\chunk_6.wav processed
Could not understand audio in chunk Bannon\chunk_7.wav
Chunk Bannon\chunk_4.wav processed
Chunk Bannon\chunk_11.wav processed
Chunk Bannon\chunk_9.wav processed
Chunk Bannon\chunk_3.wav processed
Chunk Bannon\chunk_1.wav processed
Chunk Bannon\chunk_12.wav processed
Could not understand audio in chunk Bannon\chunk_5.wav
Chunk Bannon\chunk_10.wav processed
Chunk Bannon\chunk_2.wav processed
Error processing video https://rumble.com/v15yaux-episode-1879-breggin-roguski-on-the-selling-of-american-sovereignty-to-who.html: remove: path should be string, bytes or os.PathLike, not AudioData
[Rumble] Extracting URL: https://rumble.com/v157l45-episode-1865-the-nyc-governor-race-continues-the-looming-food-and-energy-cr.html
[Rumble] v157l45-episode-1865-the-nyc-governor-race-continues-the-looming-food-and-energy-cr.html: Downloading webpage
[RumbleEmbed] Extracting URL: https:

KeyboardInterrupt: 