In [6]:
import os
import re
import pandas as pd
import yt_dlp
from moviepy.editor import VideoFileClip
import speech_recognition as sr
from pydub import AudioSegment
from multiprocessing.dummy import Pool as ThreadPool
# Module-level counter of videos processed successfully. process_video() does
# `stop+=1` after a video completes, and process_videos_from_excel() ends its
# run when the counter equals 10.
# NOTE(review): the increment only takes effect if process_video() declares
# `global stop`; otherwise the assignment targets a local name.
stop=0
def download_video(url_output_path):
    """Download a single video with yt-dlp.

    Args:
        url_output_path: ``(url, output_path)`` pair. ``output_path`` is handed
            to yt-dlp as its ``outtmpl`` option and the ``best`` format is
            requested.
    """
    source_url, destination = url_output_path
    options = dict(outtmpl=destination, format='best')
    with yt_dlp.YoutubeDL(options) as downloader:
        downloader.download([source_url])

def extract_audio(video_path_audio_path):
    """Write the audio track of a video file to ``audio_path``.

    Args:
        video_path_audio_path: ``(video_path, audio_path)`` pair.

    Raises:
        ValueError: if the video has no audio track.
        Exception: whatever ``VideoFileClip`` / ``write_audiofile`` raise is
            propagated unchanged.
    """
    video_path, audio_path = video_path_audio_path

    # Open the clip *before* entering the try block: if opening fails there is
    # nothing to close, and the original error must not be masked by a
    # NameError raised from the cleanup code.
    video = VideoFileClip(video_path)
    try:
        audio = video.audio
        if audio is None:
            raise ValueError(f"No audio track found in '{video_path}'")
        try:
            audio.write_audiofile(audio_path)
        finally:
            audio.close()
    finally:
        video.close()

def split_audio(audio_path_chunk_length):
    """Cut a WAV file into consecutive fixed-length pieces.

    Each piece is exported as ``chunk_<n>.wav`` (a bare file name, so it is
    written relative to the current working directory), numbered from 0.

    Args:
        audio_path_chunk_length: ``(audio_path, chunk_length_ms)`` pair.

    Returns:
        The list of exported chunk file names, in playback order.
    """
    audio_path, chunk_length_ms = audio_path_chunk_length
    recording = AudioSegment.from_wav(audio_path)

    exported = []
    offsets = range(0, len(recording), chunk_length_ms)
    for number, start in enumerate(offsets):
        name = f"chunk_{number}.wav"
        piece = recording[start:start + chunk_length_ms]
        piece.export(name, format="wav")
        exported.append(name)
    return exported

def recognize_audio_chunk(chunk_info):
    """Transcribe one audio chunk with the Google recognizer.

    This is deliberately best-effort: any failure is reported on stdout and
    yields an empty string, so one bad chunk does not abort the transcript.

    Args:
        chunk_info: ``(chunk, recognizer)`` pair, where ``chunk`` is the path
            of a WAV file and ``recognizer`` is the object whose ``record``
            and ``recognize_google`` methods are called.

    Returns:
        The recognized text, or ``""`` if the chunk could not be processed.
    """
    chunk, recognizer = chunk_info
    try:
        with sr.AudioFile(chunk) as source:
            audio_data = recognizer.record(source)
        # The audio is fully read at this point, so the file is closed before
        # the recognition call is made.
        text = recognizer.recognize_google(audio_data)
    except Exception as e:
        # Some exceptions stringify to an empty message, so include the
        # exception type to keep the log line meaningful.
        print(f"Error processing chunk {chunk}: {type(e).__name__}: {e}")
        return ""
    print(f"Chunk {chunk} processed")
    return text

def process_video(video_info):
    """Download one video, transcribe it, and write the transcript to disk.

    Steps: download the video, extract its audio, split the audio into chunks,
    transcribe the chunks in a thread pool, and write the joined text to
    ``transcript_<episode>.txt`` inside ``output_folder``. On success the
    module-level ``stop`` counter is incremented. Any failure is reported on
    stdout and swallowed, so the caller can continue with the next video.

    Args:
        video_info: ``(index, video_url, output_folder, chunk_length_ms,
            recognizer)`` tuple. ``index`` is used to name the temporary
            files; ``recognizer`` is passed to ``recognize_audio_chunk``.
    """
    # Without this declaration `stop += 1` targets a local name and raises
    # UnboundLocalError, which the broad except below would swallow.
    global stop

    index, video_url, output_folder, chunk_length_ms, recognizer = video_info
    video_path = os.path.join(output_folder, f"temp_video_{index}.mp4")
    audio_path = os.path.join(output_folder, f"temp_audio_{index}.wav")
    chunks = []
    try:
        # Work out the transcript name up front so that a URL without an
        # "episode-<n>" part cannot discard a finished transcription; such
        # URLs fall back to the row index.
        match = re.search(r"episode-(\d+)", video_url)
        episode_number = match.group(1) if match else f"index-{index}"
        transcript_path = os.path.join(output_folder, f"transcript_{episode_number}.txt")

        download_video((video_url, video_path))
        extract_audio((video_path, audio_path))

        chunks = split_audio((audio_path, chunk_length_ms))

        chunk_infos = [(chunk, recognizer) for chunk in chunks]
        with ThreadPool() as pool:
            # pool.map returns results in input order, so the text stays in
            # playback order even though chunks finish out of order.
            texts = pool.map(recognize_audio_chunk, chunk_infos)

        # Explicit UTF-8 so non-ASCII text cannot fail on platforms whose
        # default encoding is narrower.
        with open(transcript_path, 'w', encoding='utf-8') as f:
            f.write(" ".join(texts))

        stop += 1
    except Exception as e:
        print("Problem with url:", e)
    finally:
        # Remove temporary files whether or not processing succeeded.
        for path in [video_path, audio_path, *chunks]:
            try:
                os.remove(path)
            except FileNotFoundError:
                pass  # never created, or already gone
            except OSError as e:
                print(f"Could not remove temporary file {path}: {e}")


def process_videos_from_excel(excel_path, output_folder="Bannon", chunk_length_ms=60000,
                              batch_size=10, remaining_csv_path='bannon2.csv'):
    """Transcribe a batch of videos listed in a CSV file.

    Rows are processed in order with ``process_video``. Once ``batch_size``
    videos have succeeded during this call, the rows that succeeded are
    removed and the remaining rows are written to ``remaining_csv_path``, then
    the function returns. If the batch size is never reached, no CSV is
    written.

    Args:
        excel_path: path of the input file. Despite the name it is read with
            ``pd.read_csv``. It must contain an ``href`` column, which is
            appended to ``https://rumble.com``.
        output_folder: directory for transcripts and temporary files; created
            if missing.
        chunk_length_ms: chunk length forwarded to ``process_video``.
        batch_size: number of successful videos after which the run stops.
        remaining_csv_path: where the not-yet-processed rows are saved.

    Returns:
        None. A missing input file is reported on stdout.
    """
    os.makedirs(output_folder, exist_ok=True)

    try:
        df = pd.read_csv(excel_path)
    except FileNotFoundError:
        print(f"Error: The file '{excel_path}' does not exist.")
        return
    df = df.dropna()
    print(df.head(5))

    recognizer = sr.Recognizer()

    # `stop` is a module-level counter that is never reset, so measure
    # progress relative to its value at the start of this call.
    successes_at_start = stop
    completed = []  # index labels of rows transcribed during this call

    for index, row in df.iterrows():
        successes_before = stop
        video_info = (index, f"https://rumble.com{row['href']}", output_folder, chunk_length_ms, recognizer)
        process_video(video_info)
        if stop > successes_before:
            completed.append(index)

        if stop - successes_at_start >= batch_size:
            # Drop exactly the rows that succeeded; failed rows stay in the
            # file so that a later run can retry them.
            remaining = df.drop(index=completed)
            remaining.to_csv(remaining_csv_path, encoding='utf-8', index=False)
            return




# Script entry point: run one batch over the videos listed in the CSV below.
if __name__ == "__main__":
    # Input CSV with an 'href' column; update this path if necessary. The same
    # file name is the one process_videos_from_excel() writes the remaining
    # rows to once a batch completes.
    excel_path = 'bannon2.csv'
    process_videos_from_excel(excel_path)

                                                href
0  /v28tnw0-episode-2503-the-twitter-hearings-beg...
1                         /v28kzrm-episode-2498.html
2  /v27qyxw-episode-2480-the-created-crisis-of-th...
3                         /v27enzs-episode-2476.html
4  /v24ruk6-episode-2431-what-did-the-administrat...
[Rumble] Extracting URL: https://rumble.com/v28tnw0-episode-2503-the-twitter-hearings-began-in-congress.html
[Rumble] v28tnw0-episode-2503-the-twitter-hearings-began-in-congress.html: Downloading webpage
[RumbleEmbed] Extracting URL: https://rumble.com/embed/v267qiu
[RumbleEmbed] v267qiu: Downloading JSON metadata
[info] v267qiu: Downloading 1 format(s): mp4-720p
[download] Destination: Bannon\temp_video_0.mp4
[download] 100% of  690.49MiB in 00:01:46 at 6.51MiB/s     
MoviePy - Writing audio in Bannon\temp_audio_0.wav


                                                                        

MoviePy - Done.
Chunk chunk_0.wav processed
Chunk chunk_14.wav processed
Chunk chunk_12.wav processed
Chunk chunk_10.wav processed
Chunk chunk_4.wav processed
Chunk chunk_8.wav processed
Chunk chunk_6.wav processed
Chunk chunk_2.wav processed
Chunk chunk_1.wav processed
Chunk chunk_13.wav processed
Chunk chunk_11.wav processed
Chunk chunk_15.wav processed
Chunk chunk_5.wav processed
Chunk chunk_9.wav processed
Chunk chunk_16.wav processed
Chunk chunk_7.wav processed
Chunk chunk_3.wav processed
Chunk chunk_24.wav processed
Chunk chunk_20.wav processed
Chunk chunk_22.wav processed
Chunk chunk_18.wav processed
Chunk chunk_26.wav processed
Chunk chunk_25.wav processed
Chunk chunk_28.wav processed
Chunk chunk_23.wav processed
Chunk chunk_17.wav processed
Error processing chunk chunk_27.wav: 
Chunk chunk_21.wav processed
Chunk chunk_30.wav processed
Error processing chunk chunk_36.wav: 
Chunk chunk_32.wav processed
Chunk chunk_19.wav processed
Chunk chunk_34.wav processed
Chunk chunk_29.wav 

                                                                        

MoviePy - Done.
Chunk chunk_0.wav processed
Chunk chunk_14.wav processed
Chunk chunk_4.wav processed
Chunk chunk_6.wav processed
Chunk chunk_2.wav processed
Chunk chunk_12.wav processed
Chunk chunk_8.wav processed
Chunk chunk_10.wav processed
Error processing chunk chunk_15.wav: 
Chunk chunk_1.wav processed
Chunk chunk_7.wav processed
Chunk chunk_3.wav processed
Chunk chunk_5.wav processed
Chunk chunk_9.wav processed
Chunk chunk_13.wav processed
Chunk chunk_16.wav processed
Chunk chunk_11.wav processed
Error processing chunk chunk_22.wav: 
Chunk chunk_18.wav processed
Chunk chunk_20.wav processed
Chunk chunk_24.wav processed
Chunk chunk_30.wav processed
Chunk chunk_26.wav processed
Chunk chunk_17.wav processed
Chunk chunk_28.wav processed
Chunk chunk_23.wav processed
Chunk chunk_19.wav processed
Chunk chunk_21.wav processed
Chunk chunk_25.wav processed
Chunk chunk_31.wav processed
Chunk chunk_32.wav processed
Chunk chunk_29.wav processed
Chunk chunk_34.wav processed
Chunk chunk_36.wav 

                                                                        

MoviePy - Done.
Chunk chunk_8.wav processed
Chunk chunk_0.wav processed
Chunk chunk_14.wav processed
Chunk chunk_9.wav processed
Chunk chunk_12.wav processed
Chunk chunk_4.wav processed
Chunk chunk_6.wav processed
Chunk chunk_10.wav processed
Chunk chunk_2.wav processed
Chunk chunk_1.wav processed
Chunk chunk_15.wav processed
Error processing chunk chunk_7.wav: 
Chunk chunk_16.wav processed
Chunk chunk_13.wav processed
Chunk chunk_5.wav processed
Error processing chunk chunk_11.wav: 
Chunk chunk_3.wav processed
Chunk chunk_20.wav processed
Chunk chunk_18.wav processed
Chunk chunk_24.wav processed
Chunk chunk_28.wav processed
Chunk chunk_17.wav processed
Chunk chunk_26.wav processed
Chunk chunk_22.wav processed
Chunk chunk_30.wav processed
Error processing chunk chunk_25.wav: 
Chunk chunk_21.wav processed
Error processing chunk chunk_19.wav: 
Chunk chunk_29.wav processed
Chunk chunk_32.wav processed
Chunk chunk_23.wav processed
Error processing chunk chunk_38.wav: 
Chunk chunk_27.wav pr

                                                                        

MoviePy - Done.
Chunk chunk_0.wav processed
Chunk chunk_2.wav processed
Chunk chunk_4.wav processed
Chunk chunk_14.wav processed
Chunk chunk_12.wav processed
Chunk chunk_1.wav processed
Chunk chunk_10.wav processed
Chunk chunk_6.wav processed
Chunk chunk_8.wav processed
Chunk chunk_15.wav processed
Chunk chunk_3.wav processed
Chunk chunk_13.wav processed
Chunk chunk_9.wav processed
Chunk chunk_7.wav processed
Chunk chunk_5.wav processed
Chunk chunk_11.wav processed
Chunk chunk_16.wav processed
Chunk chunk_22.wav processed
Chunk chunk_26.wav processed
Chunk chunk_18.wav processed
Chunk chunk_20.wav processed
Chunk chunk_17.wav processed
Chunk chunk_30.wav processed
Chunk chunk_28.wav processed
Chunk chunk_24.wav processed
Chunk chunk_23.wav processed
Chunk chunk_21.wav processed
Chunk chunk_27.wav processed
Chunk chunk_19.wav processed
Chunk chunk_29.wav processed
Chunk chunk_31.wav processed
Chunk chunk_32.wav processed
Chunk chunk_38.wav processed
Chunk chunk_36.wav processed
Chunk ch

                                                                        

MoviePy - Done.
Chunk chunk_6.wav processed
Chunk chunk_0.wav processed
Error processing chunk chunk_8.wav: 
Chunk chunk_2.wav processed
Chunk chunk_10.wav processed
Error processing chunk chunk_12.wav: 
Chunk chunk_4.wav processed
Chunk chunk_14.wav processed
Chunk chunk_7.wav processed
Chunk chunk_1.wav processed
Chunk chunk_9.wav processed
Chunk chunk_3.wav processed
Error processing chunk chunk_5.wav: 
Chunk chunk_11.wav processed
Chunk chunk_15.wav processed
Chunk chunk_13.wav processed
Chunk chunk_18.wav processed
Chunk chunk_26.wav processed
Chunk chunk_16.wav processed
Chunk chunk_28.wav processed
Chunk chunk_20.wav processed
Chunk chunk_22.wav processed
Chunk chunk_24.wav processed
Chunk chunk_27.wav processed
Chunk chunk_19.wav processed
Chunk chunk_30.wav processed
Chunk chunk_21.wav processed
Chunk chunk_17.wav processed
Chunk chunk_29.wav processed
Chunk chunk_23.wav processed
Chunk chunk_25.wav processed
