In [None]:
!pip install torch torchvision torchaudio transformers opencv-python-headless moviepy pillow reportlab yt-dlp google-api-python-client nbformat nbconvert psutil

In [None]:
import yt_dlp
import torch
import librosa
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, pipeline
import os
import psutil


In [None]:
def download_audio_from_youtube(link, output_path):
    """Fetch the audio of ``link`` with yt-dlp and request WAV extraction.

    Args:
        link: URL handed to yt-dlp as the only download target.
        output_path: Value passed to yt-dlp as its ``outtmpl`` output template.

    Returns:
        None. A confirmation line naming ``output_path`` is printed once the
        download call returns.
    """
    # Post-processing step asking FFmpeg to extract the audio track as WAV.
    # NOTE(review): the final file name is decided by yt-dlp; it presumably
    # gains a ".wav" suffix relative to ``output_path`` - confirm.
    wav_extraction = {
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'wav',
        'preferredquality': '192',
    }
    download_options = {
        'format': 'bestaudio/best',
        'outtmpl': output_path,
        'quiet': True,
        'postprocessors': [wav_extraction],
    }
    targets = [link]
    with yt_dlp.YoutubeDL(download_options) as downloader:
        downloader.download(targets)
    print(f"Audio downloaded and saved at {output_path}")


In [None]:
def print_memory_usage():
    """Print the resident set size of the current process, scaled by 1024**2."""
    current_pid = os.getpid()
    rss_bytes = psutil.Process(current_pid).memory_info().rss
    rss_scaled = rss_bytes / (1024 * 1024)
    print(f"Memory usage: {rss_scaled:.2f} MB")


In [None]:
def _min_samples_for_one_frame(model):
    """Return the fewest input samples for which the conv stack yields one frame.

    Walks ``model.config.conv_kernel`` / ``model.config.conv_stride`` backwards
    from a single output frame (no padding assumed - TODO confirm for models
    other than the one loaded in this notebook).
    """
    needed = 1
    for kernel, stride in zip(reversed(model.config.conv_kernel),
                              reversed(model.config.conv_stride)):
        needed = (needed - 1) * stride + kernel
    return needed


def transcribe_audio_with_huggingface(audio_path, batch_size=30):
    """Transcribe an audio file with the ``facebook/wav2vec2-base-960h`` model.

    The audio is loaded at 16 kHz and processed in consecutive chunks so that
    only one chunk is pushed through the model at a time. Memory usage is
    printed before/after loading the model and around every chunk.

    Args:
        audio_path: Path of the audio file, passed to ``librosa.load``.
        batch_size: Length of each chunk in seconds of audio (may be
            fractional). Defaults to 30.

    Returns:
        The chunk transcriptions joined with single spaces, stripped of
        leading/trailing whitespace. Empty string if nothing was transcribed.

    Raises:
        ValueError: If ``batch_size`` does not cover at least one sample.
    """
    sample_rate = 16000
    samples_per_chunk = int(batch_size * sample_rate)
    # Validate before the expensive model load; a zero step would also make
    # range() fail further down with a less helpful message.
    if samples_per_chunk < 1:
        raise ValueError("batch_size must be a positive number of seconds")

    print_memory_usage()
    print("Loading the model...")
    processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
    model.eval()
    print_memory_usage()

    print("Processing audio...")
    audio, rate = librosa.load(audio_path, sr=sample_rate)
    min_samples = _min_samples_for_one_frame(model)
    pieces = []

    for start in range(0, len(audio), samples_per_chunk):
        batch_audio = audio[start:start + samples_per_chunk]
        # A tail shorter than the feature extractor's receptive field cannot
        # produce a single frame and would make the forward pass fail.
        if len(batch_audio) < min_samples:
            continue
        input_values = processor(batch_audio, sampling_rate=rate, return_tensors="pt").input_values
        print_memory_usage()

        print("Transcribing audio batch...")
        # Inference only: without no_grad() every chunk would keep an autograd
        # graph alive and inflate memory for no benefit.
        with torch.no_grad():
            logits = model(input_values).logits
        # argmax over the last (vocabulary) axis is always a valid index into
        # that axis, so no range check on the ids is needed.
        predicted_ids = torch.argmax(logits, dim=-1)
        pieces.append(processor.decode(predicted_ids[0]))

        # Drop per-chunk data before the next iteration allocates its own.
        del batch_audio, input_values, logits, predicted_ids
        print_memory_usage()

    return " ".join(pieces).strip()


In [None]:
def generate_text_file_from_transcription(transcription, output_text_path):
    """Save ``transcription`` to ``output_text_path`` under a report heading.

    Args:
        transcription: Text to store after the heading.
        output_text_path: Destination file; it is opened in write mode, so any
            existing content is replaced.

    Returns:
        None. Progress messages are printed before and after writing.
    """
    print("Generating text file...")
    # Heading (followed by a blank line) first, then the transcript verbatim.
    report_parts = ["Audio Transcription Report\n\n", transcription]
    with open(output_text_path, 'w') as report:
        report.writelines(report_parts)
    print(f"Text file saved at {output_text_path}")


In [None]:
def extract_video_metadata(link):
    """Look up the title and description of a video without downloading it.

    Args:
        link: URL passed to yt-dlp's ``extract_info`` with ``download=False``.

    Returns:
        A ``(title, description)`` tuple of strings. A placeholder string is
        substituted whenever a field is missing, ``None`` or empty, so callers
        can safely concatenate the values.
    """
    ydl_opts = {'quiet': True}
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(link, download=False)
    info = info or {}
    # ``dict.get(key, default)`` only falls back when the key is absent; a key
    # that is present with a None value would leak None to the caller, so use
    # ``or`` to cover missing, None and empty alike.
    title = info.get("title") or "No title available"
    description = info.get("description") or "No description available"
    return title, description


In [None]:
def read_transcription_file(transcription_path):
    """Return the entire text content of the file at ``transcription_path``."""
    with open(transcription_path, 'r') as source:
        return source.read()


In [None]:
# Summarization pipelines keyed by model name, so each model is loaded once
# per kernel session instead of once per call.
_SUMMARIZER_CACHE = {}


def _get_summarizer(model_name="facebook/bart-large-cnn"):
    """Return the summarization pipeline for ``model_name``, building it on first use."""
    if model_name not in _SUMMARIZER_CACHE:
        _SUMMARIZER_CACHE[model_name] = pipeline("summarization", model=model_name)
    return _SUMMARIZER_CACHE[model_name]


def summarize_text_chunk(chunk, title, description):
    """Summarize one piece of transcript, using the video metadata as context.

    Args:
        chunk: Transcript text to summarize.
        title: Video title, prepended to the model input.
        description: Video description, prepended to the model input.

    Returns:
        The ``summary_text`` of the first result returned by the pipeline.
    """
    summarizer = _get_summarizer()
    text = "Title: " + title + " Description: " + description + " transcript chunk: " + chunk
    # truncation=True is a safety net: input longer than the model accepts is
    # cut at the end instead of making the call fail. Because the chunk comes
    # last, a very long description can crowd it out.
    summary = summarizer(
        text,
        max_length=150,
        min_length=50,
        do_sample=False,
        truncation=True,
    )
    return summary[0]['summary_text']


In [None]:
def generate_text_file_from_summary(summary, output_text_path):
    """Write ``summary`` to ``output_text_path`` as a bulleted report.

    The summary is split on ``'. '`` and each non-empty piece becomes one
    ``- `` bullet line. A closing period is added only when the piece does not
    already end in ``.``, ``!`` or ``?``.

    Args:
        summary: Summary text to format.
        output_text_path: Destination file; overwritten, written as UTF-8.

    Returns:
        None. Progress messages are printed before and after writing.
    """
    print("Generating summary text file...")
    # Explicit UTF-8 so non-ASCII characters in the summary do not depend on
    # the platform's default encoding.
    with open(output_text_path, 'w', encoding='utf-8') as file:
        file.write("Summary Report\n\n")
        for point in summary.split('. '):
            point = point.strip()
            if not point:
                # Empty summary or stray separators: no "- ." bullets.
                continue
            # The final sentence keeps its own period after the split, so only
            # add one where it is actually missing.
            if not point.endswith(('.', '!', '?')):
                point += '.'
            file.write(f"- {point}\n")
    print(f"Summary text file saved at {output_text_path}")


In [None]:
def process_video(link, audio_path, audio_file_path_creation, text_path, summary_text_path):
    """Run the whole pipeline: download, transcribe, summarize, write reports.

    Args:
        link: URL of the video to process.
        audio_path: Path of the audio file handed to the transcription step.
        audio_file_path_creation: Output template handed to the download step.
        text_path: Where the transcription report is written.
        summary_text_path: Where the summary report is written.

    Returns:
        None. The results are the two report files.
    """
    download_audio_from_youtube(link, audio_file_path_creation)
    transcription = transcribe_audio_with_huggingface(audio_path)
    generate_text_file_from_transcription(transcription, text_path)
    title, description = extract_video_metadata(link)

    # Summarize the in-memory transcript: the report file on disk also holds
    # the report heading, which is not part of the spoken content.
    chunk_chars = 1000
    remaining = transcription

    # Reduce in rounds: summarize fixed-size pieces, join the results, and
    # repeat until the text fits in one piece. Summarizing all chunk summaries
    # in a single call can exceed the summarizer's input limit on long videos.
    while len(remaining) > chunk_chars:
        pieces = [remaining[i:i + chunk_chars] for i in range(0, len(remaining), chunk_chars)]
        condensed = " ".join(summarize_text_chunk(piece, title, description) for piece in pieces)
        if len(condensed) >= len(remaining):
            # The round did not shrink the text; stop instead of looping forever.
            break
        remaining = condensed

    final_summary = summarize_text_chunk(remaining, title, description)
    generate_text_file_from_summary(final_summary.strip(), summary_text_path)


In [None]:
# Make sure the output folder exists before anything is written into it.
os.makedirs("./processed", exist_ok=True)

# Input video and output locations for this run.
video_link = "https://www.youtube.com/watch?v=0oGJTQCy4cQ"
# NOTE(review): the download step receives the extension-less path while the
# transcription step reads the ".wav" one; presumably the WAV post-processor
# appends the suffix - confirm the two stay in sync.
audio_file_path_creation = "./processed/audio_extracted"
audio_file_path = "./processed/audio_extracted.wav"
text_file_path = "./processed/audio_transcription.txt"
summary_text_path = "./processed/summary_report.txt"

process_video(
    link=video_link,
    audio_path=audio_file_path,
    audio_file_path_creation=audio_file_path_creation,
    text_path=text_file_path,
    summary_text_path=summary_text_path,
)
