In [None]:
import os
import re
import pandas as pd
import yt_dlp
from moviepy.editor import VideoFileClip
from pydub import AudioSegment
from pocketsphinx import AudioFile

def download_video(url_output_path):
    """Download a single video with yt-dlp, reporting failures instead of raising.

    Args:
        url_output_path: A ``(url, output_path)`` pair. ``output_path`` is
            passed to yt-dlp as its ``outtmpl`` option.

    Returns:
        None. Any exception raised while downloading is printed and swallowed.
    """
    source_url, destination = url_output_path
    options = {
        'outtmpl': destination,
        'format': 'best',
    }
    try:
        with yt_dlp.YoutubeDL(options) as downloader:
            downloader.download([source_url])
    except Exception as error:
        print(f"Error downloading video {source_url}: {error}")

def extract_audio(video_path_audio_path):
    """Write the audio track of a video file to a separate audio file.

    Failures are printed rather than raised. Once the video has been opened,
    the clips are closed on every path, including when the video has no audio
    track or when writing the audio file fails.

    Args:
        video_path_audio_path: A ``(video_path, audio_path)`` pair: the video
            to read and the audio file to write.

    Returns:
        None.
    """
    video_path, audio_path = video_path_audio_path
    try:
        video = VideoFileClip(video_path)
        audio = None
        try:
            audio = video.audio
            if audio is None:
                # Fail with a readable message instead of an AttributeError
                # on None further down.
                raise ValueError("video has no audio track")
            audio.write_audiofile(audio_path)
        finally:
            # Close in the same order as before, but now also on failure.
            if audio is not None:
                audio.close()
            video.close()
    except Exception as e:
        print(f"Error extracting audio from {video_path}: {e}")

def split_audio(audio_path_chunk_length):
    """Split a WAV file into fixed-length chunks written as ``chunk_<n>.wav``.

    Chunk files are written relative to the current working directory. If
    anything fails, the error is printed, any chunk files produced so far are
    removed (the caller only receives ``[]`` and could not clean them up
    itself), and an empty list is returned.

    Args:
        audio_path_chunk_length: An ``(audio_path, chunk_length_ms)`` pair:
            the WAV file to split and the length of each chunk in
            milliseconds (must be positive).

    Returns:
        The list of chunk file names in order, or ``[]`` on failure.
    """
    audio_path, chunk_length_ms = audio_path_chunk_length
    chunks = []
    try:
        if chunk_length_ms <= 0:
            # A zero step makes range() raise and a negative one silently
            # yields nothing; report both the same way.
            raise ValueError(f"chunk length must be positive, got {chunk_length_ms}")
        audio = AudioSegment.from_wav(audio_path)
        for chunk_index, start_ms in enumerate(range(0, len(audio), chunk_length_ms)):
            chunk_name = f"chunk_{chunk_index}.wav"
            # Record the name before exporting so that a half-written file is
            # also covered by the cleanup below.
            chunks.append(chunk_name)
            audio[start_ms:start_ms + chunk_length_ms].export(chunk_name, format="wav")
        return chunks
    except Exception as e:
        print(f"Error splitting audio {audio_path}: {e}")
        for chunk_name in chunks:
            try:
                os.remove(chunk_name)
            except OSError:
                # Best-effort cleanup: the file may never have been created.
                pass
        return []

def recognize_audio_chunk(chunk_path):
    """Transcribe one audio chunk with PocketSphinx.

    Args:
        chunk_path: Path of the audio file handed to ``AudioFile``.

    Returns:
        The hypotheses of all recognized phrases joined by single spaces with
        surrounding whitespace stripped, or ``""`` if any error occurs (the
        error is printed).
    """
    try:
        phrases = AudioFile(audio_file=chunk_path)
        hypotheses = [phrase.hypothesis() for phrase in phrases]
        transcript = " ".join(hypotheses).strip()
        print(f"Chunk {chunk_path} processed")
        return transcript
    except Exception as error:
        print(f"Error processing chunk {chunk_path}: {error}")
        return ""

def process_video(video_info):
    """Download one video, transcribe its audio, and write the transcript.

    The transcript is written to ``transcript_<episode>.txt`` inside the
    output folder, where ``<episode>`` is the number following ``episode-`` in
    the URL. Temporary video, audio, and chunk files are removed afterwards,
    whether or not processing succeeded. Errors are printed, not raised.

    Args:
        video_info: An ``(index, video_url, output_folder, chunk_length_ms)``
            tuple. ``index`` is only used to name the temporary files.

    Returns:
        None.
    """
    index, video_url, output_folder, chunk_length_ms = video_info

    # Resolve the transcript name before doing any expensive work: a URL
    # without an episode number can never produce a transcript file.
    match = re.search(r"episode-(\d+)", video_url)
    if match is None:
        print(f"Problem with URL {video_url}: no episode number found in URL")
        return
    episode_number = match.group(1)

    video_path = os.path.join(output_folder, f"temp_video_{index}.mp4")
    audio_path = os.path.join(output_folder, f"temp_audio_{index}.wav")
    transcript_path = os.path.join(output_folder, f"transcript_{episode_number}.txt")
    chunks = []

    try:
        download_video((video_url, video_path))
        extract_audio((video_path, audio_path))

        chunks = split_audio((audio_path, chunk_length_ms))
        texts = [recognize_audio_chunk(chunk) for chunk in chunks]

        with open(transcript_path, 'w', encoding='utf-8') as f:
            f.write(" ".join(text for text in texts if text))
    except Exception as e:
        print(f"Problem with URL {video_url}: {e}")
    finally:
        # Remove every temporary file independently so that one missing file
        # (e.g. after a failed download) does not leave the others behind.
        for temp_file in [video_path, audio_path, *chunks]:
            try:
                os.remove(temp_file)
            except FileNotFoundError:
                pass
            except OSError as e:
                print(f"Could not remove temporary file {temp_file}: {e}")

def process_videos_from_excel(excel_path, output_folder="Bannon", chunk_length_ms=10000,
                              batch_size=10, remaining_csv_path='bannon2.csv'):
    """Process the next batch of videos listed in a CSV file and save the rest.

    Despite the name, the input is read with ``pandas.read_csv``. Rows with
    any missing value are dropped, the first ``batch_size`` remaining rows are
    processed, and the unprocessed rows are written to ``remaining_csv_path``
    so that a later run continues where this one stopped. Progress is saved
    even when fewer than ``batch_size`` rows were left; otherwise the final
    partial batch would be reprocessed on every run.

    Args:
        excel_path: Path of the CSV file; it must have an ``href`` column
            holding the path part of each rumble.com video URL.
        output_folder: Folder for transcripts and temporary files; created if
            it does not exist.
        chunk_length_ms: Length in milliseconds of the audio chunks passed on
            to ``process_video``.
        batch_size: Maximum number of videos to process per call.
        remaining_csv_path: CSV file that receives the rows not yet processed.

    Returns:
        None.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    os.makedirs(output_folder, exist_ok=True)

    try:
        df = pd.read_csv(excel_path)
    except FileNotFoundError:
        print(f"Error: The file '{excel_path}' does not exist.")
        return
    df = df.dropna()
    print(df.head(5))

    batch = df.head(batch_size)
    if batch.empty:
        # Nothing left to process, so there is no progress to record.
        return

    for index, row in batch.iterrows():
        video_info = (index, f"https://rumble.com{row['href']}", output_folder, chunk_length_ms)
        process_video(video_info)

    # Rows are processed in order, so everything after the batch is pending.
    df.iloc[len(batch):].to_csv(remaining_csv_path, encoding='utf-8', index=False)

if __name__ == "__main__":
    # Script entry point: run the pipeline on the CSV of video links.
    excel_path = 'bannon2.csv'  # Update this path if necessary
    process_videos_from_excel(excel_path=excel_path)
