In [None]:
# %pip installs into the running kernel's environment; librosa is added because the import cell uses it.
%pip install torch torchvision torchaudio transformers opencv-python-headless moviepy pillow reportlab yt-dlp google-api-python-client nbformat nbconvert psutil librosa

In [None]:
import yt_dlp
import torch
import librosa
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, pipeline
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
import os
import psutil


In [None]:
def download_audio_from_youtube(link, output_path):
    """Download the best available audio of a video and extract it as WAV.

    Args:
        link: URL of the video handed to yt-dlp.
        output_path: Value passed to yt-dlp as its ``outtmpl`` output template.

    Returns nothing; prints a confirmation message once yt-dlp has finished.
    """
    # Post-processing step: let FFmpeg extract the audio stream as WAV.
    extract_to_wav = {
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'wav',
        'preferredquality': '192',
    }
    options = dict(
        format='bestaudio/best',
        outtmpl=output_path,
        quiet=True,
        postprocessors=[extract_to_wav],
    )
    with yt_dlp.YoutubeDL(options) as downloader:
        downloader.download([link])
    print(f"Audio downloaded and saved at {output_path}")


In [None]:
def print_memory_usage():
    """Print the resident set size (RSS) of the current process in megabytes."""
    bytes_per_mb = 1024 ** 2
    current_process = psutil.Process(os.getpid())
    rss_mb = current_process.memory_info().rss / bytes_per_mb
    print(f"Memory usage: {rss_mb:.2f} MB")


In [None]:
def transcribe_audio_with_huggingface(audio_path, batch_size=30):
    """Transcribe an audio file with the ``facebook/wav2vec2-base-960h`` CTC model.

    The audio is loaded at 16 kHz and decoded in consecutive chunks so that
    only one chunk's activations are held in memory at a time. Process memory
    usage is printed around the main steps.

    Args:
        audio_path: Path of the audio file handed to ``librosa.load``.
        batch_size: Length of each chunk in seconds (despite the name, it is
            not a number of samples or items). Must be large enough to give
            the model a usable chunk.

    Returns:
        The chunk transcriptions joined by single spaces, with surrounding
        whitespace stripped.

    Raises:
        ValueError: If ``batch_size`` yields chunks too short for the model.
    """
    target_rate = 16000
    # The wav2vec 2.0 convolutional front end has a receptive field of 400
    # samples (25 ms at 16 kHz); shorter input makes the conv stack fail.
    min_chunk_samples = 400

    # Validate before the expensive model load so bad arguments fail fast.
    chunk_samples = int(batch_size * target_rate)
    if chunk_samples < min_chunk_samples:
        raise ValueError(
            f"batch_size must cover at least {min_chunk_samples} samples "
            f"at {target_rate} Hz, got {batch_size!r} seconds"
        )

    print_memory_usage()
    print("Loading the model...")
    processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
    print_memory_usage()

    print("Processing audio...")
    audio, rate = librosa.load(audio_path, sr=target_rate)
    pieces = []

    # Inference only: without no_grad every forward pass keeps autograd state
    # alive, which is exactly the memory this chunked loop tries to save.
    with torch.no_grad():
        for start in range(0, len(audio), chunk_samples):
            chunk = audio[start:start + chunk_samples]
            if len(chunk) < min_chunk_samples:
                # Only the final chunk can be this short; it holds under
                # 25 ms of audio and would crash the model, so skip it.
                continue
            input_values = processor(chunk, sampling_rate=rate, return_tensors="pt").input_values
            print_memory_usage()

            print("Transcribing audio batch...")
            logits = model(input_values).logits
            predicted_ids = torch.argmax(logits, dim=-1)
            pieces.append(processor.decode(predicted_ids[0]))

            # Drop per-chunk tensors before the next iteration allocates new ones.
            del input_values, logits, predicted_ids
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            print_memory_usage()

    return " ".join(pieces).strip()


In [None]:
def generate_text_file_from_transcription(transcription, output_text_path):
    """Save the transcription as a plain-text report.

    The file at ``output_text_path`` is overwritten with a fixed heading,
    a blank line, and then ``transcription``.
    """
    print("Generating text file...")
    report_parts = ("Audio Transcription Report\n\n", transcription)
    with open(output_text_path, 'w') as report:
        report.writelines(report_parts)
    print(f"Text file saved at {output_text_path}")


In [None]:
def generate_pdf_from_text(transcription, output_pdf_path):
    """Render the transcription as a letter-size PDF report.

    The text is split into sentences on ``'. '``; each sentence starts on a
    new line and is word-wrapped to the printable width so long sentences
    (or a transcript with no punctuation at all) no longer run off the page.

    Args:
        transcription: Text to render.
        output_pdf_path: Destination path of the PDF file.
    """
    # Local import: the notebook's import cell only brings in `canvas` and `letter`.
    from reportlab.lib.utils import simpleSplit

    print("Generating PDF...")
    font_name, font_size = "Helvetica", 12
    left_margin, top_y, bottom_y, line_height = 100, 750, 50, 20
    page_width = letter[0]
    # Mirror the left margin on the right-hand side.
    max_width = page_width - 2 * left_margin

    c = canvas.Canvas(output_pdf_path, pagesize=letter)
    c.setFont(font_name, font_size)
    c.drawString(left_margin, top_y, "Audio Transcription Report")
    y_position = top_y - 30

    for sentence in transcription.split('. '):
        sentence = sentence.strip()
        if not sentence:
            # Empty input or doubled separators would otherwise print a lone ".".
            continue
        if not sentence.endswith(('.', '!', '?')):
            # The split consumed the period; restore it without doubling one
            # that is still there (e.g. on the final sentence).
            sentence += '.'
        for line in simpleSplit(sentence, font_name, font_size, max_width):
            if y_position < bottom_y:
                c.showPage()
                # showPage resets the graphics state, so the font is set again.
                c.setFont(font_name, font_size)
                y_position = top_y
            c.drawString(left_margin, y_position, line)
            y_position -= line_height
    c.save()
    print(f"PDF saved at {output_pdf_path}")


In [None]:
def extract_video_metadata(link):
    """Fetch a video's title and description with yt-dlp, without downloading it.

    Args:
        link: URL of the video.

    Returns:
        A ``(title, description)`` tuple of strings. A placeholder string is
        substituted when a field is missing or ``None``.
    """
    ydl_opts = {'quiet': True}
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(link, download=False) or {}

    # `dict.get` only applies its default when the key is absent. A key that is
    # present with value None would hand None to callers that concatenate the
    # results as strings, so None is replaced explicitly.
    title = info.get("title")
    if title is None:
        title = "No title available"
    description = info.get("description")
    if description is None:
        description = "No description available"
    return title, description


In [None]:
def read_transcription_file(transcription_path):
    """Return the entire contents of the text file at ``transcription_path``."""
    with open(transcription_path, 'r') as source:
        return source.read()


In [None]:
def summarize_text(transcription):
    """Summarize text with the ``facebook/bart-large-cnn`` summarization pipeline.

    Args:
        transcription: Text to summarize.

    Returns:
        The generated summary, or an empty string when the input is empty or
        whitespace only (the model is not loaded in that case).

    Note:
        Input longer than the model's maximum input length is truncated, so
        only the beginning of a long transcript contributes to the summary.
    """
    if not transcription or not transcription.strip():
        return ""

    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    summary = summarizer(
        transcription,
        max_length=150,
        min_length=50,
        do_sample=False,
        # Without truncation a full transcript exceeds the model's input
        # limit and the forward pass fails with an index error.
        truncation=True,
    )
    return summary[0]['summary_text']


In [None]:
def generate_pdf_from_summary(summary, output_pdf_path):
    """Render the summary as a letter-size PDF report.

    The text is split into sentences on ``'. '``; each sentence starts on a
    new line and is word-wrapped to the printable width so long sentences do
    not run off the page.

    Args:
        summary: Summary text to render.
        output_pdf_path: Destination path of the PDF file.
    """
    # Local import: the notebook's import cell only brings in `canvas` and `letter`.
    from reportlab.lib.utils import simpleSplit

    font_name, font_size = "Helvetica", 12
    left_margin, top_y, bottom_y, line_height = 100, 750, 50, 20
    page_width = letter[0]
    # Mirror the left margin on the right-hand side.
    max_width = page_width - 2 * left_margin

    c = canvas.Canvas(output_pdf_path, pagesize=letter)
    c.setFont(font_name, font_size)
    c.drawString(left_margin, top_y, "Summary Report")
    y_position = top_y - 30

    for sentence in summary.split('. '):
        sentence = sentence.strip()
        if not sentence:
            # Empty input or doubled separators would otherwise print a lone ".".
            continue
        if not sentence.endswith(('.', '!', '?')):
            # The split consumed the period; restore it without doubling one
            # that is still there (e.g. on the final sentence).
            sentence += '.'
        for line in simpleSplit(sentence, font_name, font_size, max_width):
            if y_position < bottom_y:
                c.showPage()
                # showPage resets the graphics state, so the font is set again.
                c.setFont(font_name, font_size)
                y_position = top_y
            c.drawString(left_margin, y_position, line)
            y_position -= line_height
    c.save()
    print(f"PDF saved at {output_pdf_path}")


In [None]:
def process_video(link, audio_path, text_path, pdf_path, summary_pdf_path):
    """Run the whole pipeline for one video.

    Steps, in order: download the audio, transcribe it, write the transcript
    to a text file and a PDF, fetch the video's title and description, then
    summarize title + description + transcript into a second PDF.

    Args:
        link: URL of the video.
        audio_path: Where the downloaded audio is stored.
        text_path: Where the plain-text transcript is written.
        pdf_path: Where the transcript PDF is written.
        summary_pdf_path: Where the summary PDF is written.
    """
    # Stage 1: audio -> transcript.
    download_audio_from_youtube(link, audio_path)
    spoken_text = transcribe_audio_with_huggingface(audio_path)

    # Stage 2: persist the transcript as text and as PDF.
    generate_text_file_from_transcription(spoken_text, text_path)
    generate_pdf_from_text(spoken_text, pdf_path)

    # Stage 3: summarize metadata plus the transcript as read back from disk.
    video_title, video_description = extract_video_metadata(link)
    saved_text = read_transcription_file(text_path)
    summary_input = " ".join((video_title, video_description, saved_text))
    generate_pdf_from_summary(summarize_text(summary_input), summary_pdf_path)


In [None]:
# Example run: every artifact of the pipeline is written under ./processed.
video_link = "https://www.youtube.com/watch?v=lGgIhxYuSHM"
audio_file_path = "./processed/audio_extracted.wav"
text_file_path = "./processed/audio_transcription.txt"
# NOTE: despite its file name, this path receives the full-transcription PDF
# (it is passed as `pdf_path`); the summary goes to `summary_pdf_path` below.
pdf_file_path = "./processed/video_summary_report.pdf"
summary_pdf_path = "./processed/summary_report.pdf"
# Create the output directory up front; exist_ok makes re-running the cell safe.
os.makedirs("./processed", exist_ok=True)
process_video(video_link, audio_file_path, text_file_path, pdf_file_path, summary_pdf_path)
