In [1]:
import os
import glob
import pandas as pd
import yt_dlp
from moviepy.editor import VideoFileClip
import speech_recognition as sr
from pydub import AudioSegment

def download_video(url, output_path="downloaded_video.mp4"):
    """Download a single video with yt-dlp.

    Args:
        url: Address of the page/video handed to yt-dlp.
        output_path: Value used for yt-dlp's ``outtmpl`` (output template)
            option, i.e. where the downloaded file is written.

    Returns:
        None. The result of ``YoutubeDL.download`` is discarded.
    """
    # 'best' is passed through unchanged as yt-dlp's format selector.
    options = dict(outtmpl=output_path, format='best')
    with yt_dlp.YoutubeDL(options) as downloader:
        downloader.download([url])

def extract_audio(video_path, audio_path="audio.wav"):
    """Write the audio track of a video file to ``audio_path``.

    Args:
        video_path: Path of the video file opened with ``VideoFileClip``.
        audio_path: Destination passed to ``write_audiofile``.

    Raises:
        ValueError: If the clip exposes no audio track (``video.audio`` is
            ``None``).
        Exception: Anything raised while opening the clip or writing the audio
            propagates unchanged.
    """
    # Open the clip *outside* the try block: if this fails there is nothing to
    # close, and the original error must not be replaced by an unbound-name
    # error raised from the cleanup code.
    video = VideoFileClip(video_path)
    try:
        audio = video.audio
        if audio is None:
            raise ValueError(f"No audio track found in '{video_path}'.")
        try:
            audio.write_audiofile(audio_path)
        finally:
            audio.close()  # Close the audio object to release the file
    finally:
        # Runs even when the audio step fails, so the video handle never leaks.
        video.close()  # Close the video object to release the file

def split_audio(audio_path, chunk_length_ms=60000):
    """Cut a WAV file into consecutive pieces and export each as its own WAV.

    Pieces are exported as ``chunk_0.wav``, ``chunk_1.wav``, ... using those
    bare relative names.

    Args:
        audio_path: WAV file loaded with ``AudioSegment.from_wav``.
        chunk_length_ms: Slice width used when cutting the recording; the
            final piece may be shorter.

    Returns:
        The exported file names, in order.
    """
    recording = AudioSegment.from_wav(audio_path)
    chunk_files = []
    starts = range(0, len(recording), chunk_length_ms)
    for number, start_ms in enumerate(starts):
        filename = f"chunk_{number}.wav"
        piece = recording[start_ms:start_ms + chunk_length_ms]
        piece.export(filename, format="wav")
        chunk_files.append(filename)
    return chunk_files

def recognize_audio_chunks(chunks):
    """Transcribe a sequence of audio files and join the recognised text.

    Each file is read with ``sr.AudioFile`` and sent to
    ``Recognizer.recognize_google``. A chunk that raises
    ``sr.UnknownValueError`` or ``sr.RequestError`` is reported on stdout and
    skipped, so one bad chunk does not abort the whole transcript.

    Args:
        chunks: Iterable of audio file paths, in playback order.

    Returns:
        The recognised texts separated by single spaces, with surrounding
        whitespace stripped (an empty string if nothing was recognised).
    """
    recognizer = sr.Recognizer()
    pieces = []
    for position, chunk_file in enumerate(chunks):
        with sr.AudioFile(chunk_file) as source:
            recording = recognizer.record(source)
            try:
                pieces.append(recognizer.recognize_google(recording) + " ")
            except (sr.UnknownValueError, sr.RequestError) as error:
                print(f"Error processing chunk {position}: {error}")
            else:
                print("Chunk processsed ",position)
    return "".join(pieces).strip()

def cleanup_files(patterns):
    """Delete every file matched by the given glob patterns.

    Each pattern is expanded with ``glob.glob``; a pattern that matches
    nothing is skipped. Every removed path is echoed to stdout.

    Args:
        patterns: Iterable of glob patterns (plain file paths also work).
    """
    # The generator is lazy, so each pattern is expanded only after the files
    # of the previous pattern have been removed.
    matches = (path for pattern in patterns for path in glob.glob(pattern))
    for file in matches:
        os.remove(file)
        print(f"Deleted: {file}")

def process_videos_from_excel(excel_path, output_folder="Bannon"):
    """Download, transcribe and clean up every video listed in a spreadsheet.

    For each row, the ``href`` column is appended to ``https://rumble.com`` to
    build the video URL. The video is downloaded, its audio is extracted and
    split into chunks, the chunks are transcribed, and the transcript is
    written to ``<output_folder>/transcript_<index+1>.txt``.

    Args:
        excel_path: Spreadsheet read with ``pandas.read_excel``; it must have
            an ``href`` column.
        output_folder: Directory for the transcripts; created if missing.

    Returns:
        None. If ``excel_path`` does not exist, an error is printed and the
        function returns without processing anything.

    Raises:
        Exception: Errors from download, extraction, splitting or writing
            propagate to the caller after that row's temporary files have
            been removed.
    """
    os.makedirs(output_folder, exist_ok=True)

    try:
        df = pd.read_excel(excel_path)
    except FileNotFoundError:
        print(f"Error: The file '{excel_path}' does not exist.")
        return

    for index, row in df.iterrows():
        video_url = f"https://rumble.com{row['href']}"
        video_path = f"downloaded_video{index}.mp4"
        audio_path = f"audio{index}.wav"
        transcript_path = os.path.join(output_folder, f"transcript_{index+1}.txt")

        chunks = []
        try:
            download_video(video_url, video_path)
            extract_audio(video_path, audio_path)
            chunks = split_audio(audio_path)
            transcript = recognize_audio_chunks(chunks)

            # Explicit encoding: the recognised text is not guaranteed to fit
            # the platform's default code page.
            with open(transcript_path, 'w', encoding='utf-8') as f:
                f.write(transcript)
        finally:
            # Remove exactly the chunk files split_audio reported. They are
            # written to the working directory, not to output_folder, so a
            # glob under output_folder never matched them. Doing this in
            # `finally` also stops a failed row from leaving the downloaded
            # video and WAV on disk.
            cleanup_files([video_path, audio_path] + list(chunks))

# Entry point: run the whole pipeline when this code is executed directly
# (the guard is false when the code is imported as a module).
if __name__ == "__main__":
    # Spreadsheet path, relative to the current working directory; it is
    # passed straight to process_videos_from_excel.
    excel_path = 'bannon.xlsx'  # Update this path if necessary
    process_videos_from_excel(excel_path)


[Rumble] Extracting URL: https://rumble.com/v4kvbc9-livestream-joe-kent-for-congress-joekentforcongress.com.html
[Rumble] v4kvbc9-livestream-joe-kent-for-congress-joekentforcongress.com.html: Downloading webpage
[RumbleEmbed] Extracting URL: https://rumble.com/embed/v4ia339
[RumbleEmbed] v4ia339: Downloading JSON metadata
[info] v4ia339: Downloading 1 format(s): mp4-1080p-1
[download] Destination: downloaded_video0.mp4
[download] 100% of  755.58MiB in 00:00:21 at 34.55MiB/s    
MoviePy - Writing audio in audio0.wav


                                                                        

MoviePy - Done.
Chunk processses chunk {i}
Chunk processses chunk {i}


In [1]:
pip install moviepy

Collecting moviepyNote: you may need to restart the kernel to use updated packages.

  Downloading moviepy-1.0.3.tar.gz (388 kB)
     ---------------------------------------- 0.0/388.3 kB ? eta -:--:--
     ------- ------------------------------- 71.7/388.3 kB 3.8 MB/s eta 0:00:01
     ----------------------- -------------- 235.5/388.3 kB 2.9 MB/s eta 0:00:01
     -------------------------------------- 388.3/388.3 kB 3.4 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting decorator<5.0,>=4.0.2 (from moviepy)
  Downloading decorator-4.4.2-py2.py3-none-any.whl.metadata (4.2 kB)
Colle