<a href="https://colab.research.google.com/github/Chandruts25/GoogleColab/blob/main/YoutubeVideoToPdf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install opencv-python-headless
!pip install scikit-image
!pip install fpdf
!pip install yt-dlp

import sys
from PIL import ImageFile
sys.modules['ImageFile'] = ImageFile
import cv2
import os
import tempfile
import re
from fpdf import FPDF
from PIL import Image
import yt_dlp
from skimage.metrics import structural_similarity as ssim
from scipy.spatial import distance
from google.colab import files

def download_video(url, filename, max_retries=3):
    """Download ``url`` with yt-dlp, retrying when a download error occurs.

    Args:
        url: Address of the video, handed to yt-dlp unchanged.
        filename: Output template passed to yt-dlp as ``outtmpl``.
        max_retries: Number of attempts made before giving up.

    Returns:
        ``filename``, as soon as one attempt finishes without a
        ``DownloadError``.

    Raises:
        Exception: When every attempt failed, or ``max_retries`` permits
            no attempt at all.
    """
    options = {'outtmpl': filename, 'format': 'best'}
    failed_attempts = 0
    while failed_attempts < max_retries:
        try:
            with yt_dlp.YoutubeDL(options) as downloader:
                downloader.download([url])
        except yt_dlp.utils.DownloadError as error:
            # Count the failure first so the message shows the 1-based attempt number.
            failed_attempts += 1
            print(f"Error downloading video: {error}. Retrying... (Attempt {failed_attempts}/{max_retries})")
        else:
            return filename
    raise Exception("Failed to download video after multiple attempts.")

def get_video_id(url):
    """Extract a video identifier from ``url``.

    The patterns are tried in order and the first capture group of the first
    pattern that matches anywhere in ``url`` is returned.

    Args:
        url: The URL string to inspect.

    Returns:
        The captured identifier, or ``None`` when no pattern matches.
    """
    candidate_patterns = (
        r'(?:v=|\/)([0-9A-Za-z_-]{11}).*',
        r'(?:embed\/|v\/|youtu.be\/)([0-9A-Za-z_-]{11})',
        r'(?:watch\?v=|watch\?.+&v=)([0-9A-Za-z_-]{11})',
        r'(?:live\/|live\?.+&v=)([0-9A-Za-z_-]{11})',  # Live stream links
        r"shorts\/(\w+)",  # Shorts links
    )
    # Generator keeps the search lazy: later patterns are only tried if needed.
    hits = (re.search(expression, url) for expression in candidate_patterns)
    return next((hit.group(1) for hit in hits if hit), None)

def get_playlist_videos(playlist_url):
    """Return the URLs of the videos listed in a playlist.

    The playlist is queried with yt-dlp in flat mode (``extract_flat``), with
    at most 1000 entries requested and nothing downloaded.

    Args:
        playlist_url: URL of the playlist to list.

    Returns:
        A list with the ``'url'`` value of every usable playlist entry.

    Raises:
        ValueError: If yt-dlp returned no information for the playlist.
    """
    ydl_opts = {
        'ignoreerrors': True,
        'playlistend': 1000,  # Maximum number of videos to fetch
        'extract_flat': True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        playlist_info = ydl.extract_info(playlist_url, download=False)

    # Because 'ignoreerrors' is enabled, a failed extraction is reported by
    # yt-dlp rather than raised, and the result can then be None. Fail with a
    # clear message instead of "'NoneType' object is not subscriptable".
    if not playlist_info:
        raise ValueError(f"Could not retrieve playlist information for: {playlist_url}")

    entries = playlist_info.get('entries') or []
    # Skip placeholders for entries that could not be resolved (None) and
    # entries without a URL, so one bad item does not abort the whole listing.
    return [entry['url'] for entry in entries if entry and entry.get('url')]

def extract_unique_frames(video_file, output_folder, n=3, ssim_threshold=0.8):
    """Write one PNG per visually distinct segment of a video.

    Every ``n``-th frame is converted to a 128x72 grayscale thumbnail and
    compared with the previously sampled thumbnail using SSIM. The first
    sampled frame is always written. After that, whenever the similarity
    drops below ``ssim_threshold`` the most recent frame seen *before* the
    change is written, provided more than one second's worth of frames has
    passed since the previous change. The frame still pending when the video
    ends is written under the same rule, so the final segment is not lost.

    Args:
        video_file: Path of the video, opened with ``cv2.VideoCapture``.
        output_folder: Existing directory that receives the PNG files, named
            ``frame<frame_number>_<seconds>.png``.
        n: Sampling step; only frames whose index is a multiple of ``n`` are
            compared.
        ssim_threshold: Similarity below which two sampled frames count as
            different.

    Returns:
        A list of ``(frame_number, seconds)`` tuples, one per PNG written, in
        the order the files were written.
    """
    cap = cv2.VideoCapture(video_file)
    timestamps = []

    try:
        # Keep the fractional rate (e.g. 29.97). Truncating it to an int makes
        # the timestamps drift and turns rates below 1 fps into a division by
        # zero.
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:  # Also true for NaN.
            print("Warning: could not read the video frame rate; assuming 30 fps for timestamps.")
            fps = 30.0

        def save_frame(image, at_frame):
            # Write the image and record the matching (frame, seconds) pair.
            seconds = int(at_frame / fps)
            frame_path = os.path.join(output_folder, f'frame{at_frame:04d}_{seconds}.png')
            cv2.imwrite(frame_path, image)
            timestamps.append((at_frame, seconds))

        last_frame = None  # Thumbnail of the previously sampled frame.
        saved_frame = None  # Latest full frame of the current segment, not yet written.
        frame_number = 0
        # Frame index of the first frame or of the most recent detected change;
        # used to ignore changes that follow each other too quickly.
        last_saved_frame_number = -1

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            if frame_number % n == 0:
                gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                gray_frame = cv2.resize(gray_frame, (128, 72))

                if last_frame is None:
                    # Very first sampled frame: always keep it.
                    save_frame(frame, frame_number)
                    last_saved_frame_number = frame_number
                else:
                    # NOTE(review): data_range is 0 for a perfectly uniform
                    # thumbnail; confirm how ssim behaves in that case.
                    similarity = ssim(gray_frame, last_frame, data_range=gray_frame.max() - gray_frame.min())

                    if similarity < ssim_threshold:
                        # The picture changed: flush the last frame of the
                        # segment that just ended, labelled with the frame
                        # number at which the change was detected.
                        if saved_frame is not None and frame_number - last_saved_frame_number > fps:
                            save_frame(saved_frame, frame_number)
                        last_saved_frame_number = frame_number

                    saved_frame = frame

                last_frame = gray_frame

            frame_number += 1

        # End of video: the final segment never sees a following change, so
        # flush its pending frame here using the same minimum-duration rule.
        if saved_frame is not None and frame_number - last_saved_frame_number > fps:
            save_frame(saved_frame, frame_number)
    finally:
        # Release the capture even if decoding or writing raised.
        cap.release()

    return timestamps

def convert_frames_to_pdf(input_folder, output_file, timestamps):
    """Build a landscape PDF with one page per frame image and a timestamp label.

    Files in ``input_folder`` are ordered by the frame number embedded in
    their name (``frame<number>_...``) and paired positionally with
    ``timestamps``. Each image is stretched to fill the whole page and an
    ``HH:MM:SS`` label is printed near the top-left corner, in white when the
    sampled corner of the image is dark and in black otherwise.

    Args:
        input_folder: Directory containing only the frame images.
        output_file: Path of the PDF to write.
        timestamps: Sequence of ``(frame_number, seconds)`` tuples in the same
            order as the sorted frame files; ``seconds`` must be an int.
    """
    frame_files = sorted(os.listdir(input_folder), key=lambda x: int(x.split('_')[0].split('frame')[-1]))
    pdf = FPDF("L")
    pdf.set_auto_page_break(0)

    # Position of the label on the page; the same numbers are used as pixel
    # coordinates of the image region sampled for the contrast decision.
    x, y, width, height = 5, 5, 60, 15

    for frame_file, (_frame_number, timestamp_seconds) in zip(frame_files, timestamps):
        frame_path = os.path.join(input_folder, frame_file)

        # Open the image only for the brightness probe and close it right
        # away, so file handles do not pile up across pages.
        with Image.open(frame_path) as image:
            region = image.crop((x, y, x + width, y + height)).convert("L")
            # Shrinking the region to one pixel approximates its mean brightness.
            mean_pixel_value = region.resize((1, 1)).getpixel((0, 0))

        pdf.add_page()
        pdf.image(frame_path, x=0, y=0, w=pdf.w, h=pdf.h)

        timestamp = f"{timestamp_seconds // 3600:02d}:{(timestamp_seconds % 3600) // 60:02d}:{timestamp_seconds % 60:02d}"

        if mean_pixel_value < 64:
            pdf.set_text_color(255, 255, 255)
        else:
            pdf.set_text_color(0, 0, 0)

        pdf.set_xy(x, y)
        pdf.set_font("Arial", size=12)
        pdf.cell(0, 0, timestamp)

    pdf.output(output_file)

def get_video_title(url):
    """Return the video's title in a form usable as a file name.

    The title is looked up with yt-dlp without downloading the video. The
    characters ``/ \\ : * ? < > | "`` are each replaced by ``-`` and leading or
    trailing dots are removed.

    Args:
        url: URL of the video.

    Returns:
        The sanitized title, or ``'untitled_video'`` when no title could be
        obtained or nothing is left after sanitizing.
    """
    ydl_opts = {
        'skip_download': True,
        'ignoreerrors': True
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        video_info = ydl.extract_info(url, download=False)

    # Because 'ignoreerrors' is enabled the lookup can come back as None
    # instead of raising; treat that, and a missing title, as "no title".
    raw_title = (video_info or {}).get('title') or ''

    # One translation pass replaces every character that is unsafe in file names.
    unsafe_chars = str.maketrans({char: '-' for char in '/\\:*?<>|"'})
    title = raw_title.translate(unsafe_chars).strip('.')

    # An empty name would produce a bare ".pdf" file, so fall back to a fixed one.
    return title or 'untitled_video'


def _process_video(video_url):
    """Download one video, convert its distinct frames to a PDF and offer it for download.

    The temporary ``video.mp4`` file is removed afterwards whether or not a
    step failed; any exception is left for the caller to report.
    """
    video_path = "video.mp4"
    try:
        video_file = download_video(video_url, video_path)
        video_title = get_video_title(video_url)
        output_pdf_name = f"{video_title}.pdf"

        # Frames only need to live until the PDF has been assembled.
        with tempfile.TemporaryDirectory() as temp_folder:
            timestamps = extract_unique_frames(video_file, temp_folder)
            convert_frames_to_pdf(temp_folder, output_pdf_name, timestamps)

        files.download(output_pdf_name)
    finally:  # Ensure cleanup even if errors occur
        if os.path.exists(video_path):
            os.remove(video_path)


def main(url):
    """Convert a single video, or every video of a playlist, into PDF files.

    A URL from which ``get_video_id`` can extract an id is handled as one
    video (this includes Shorts and live links). Otherwise a URL containing
    ``list=`` is handled as a playlist and each of its videos is converted in
    turn. Anything else is reported as invalid. Errors are printed rather
    than raised; inside a playlist a failing video does not stop the
    remaining ones.

    Args:
        url: Video or playlist URL.
    """
    if get_video_id(url):  # It's a single video URL (including Shorts and live)
        try:
            _process_video(url)
        except Exception as e:  # Report the failure instead of crashing the notebook cell
            print(f"An error occurred processing the video: {e}")

    elif "list=" in url:  # It's a playlist URL
        try:
            video_urls = get_playlist_videos(url)
        except Exception as e:  # Errors specific to listing the playlist
            print(f"An error occurred processing the playlist: {e}")
        else:
            for video_url in video_urls:
                try:  # Handle per-video errors within the playlist
                    _process_video(video_url)
                except Exception as e:
                    print(f"An error occurred processing a video in the playlist: {e}")

    else:
        print("Invalid YouTube URL or playlist link.")

# Entry point: convert the example URL below when this code is executed directly.
if __name__ == "__main__":
    main("https://youtu.be/BxG85bGo_kU?si=wGN3vBfYmMs7GBcZ")  # Replace with the URL of the video or playlist

[youtube] Extracting URL: https://youtu.be/BxG85bGo_kU?si=wGN3vBfYmMs7GBcZ
[youtube] BxG85bGo_kU: Downloading webpage
[youtube] BxG85bGo_kU: Downloading tv client config
[youtube] BxG85bGo_kU: Downloading player 753b1819-main
[youtube] BxG85bGo_kU: Downloading tv player API JSON
[youtube] BxG85bGo_kU: Downloading ios player API JSON
[youtube] BxG85bGo_kU: Downloading m3u8 information
[info] BxG85bGo_kU: Downloading 1 format(s): 18
[download] Destination: video.mp4
[download] 100% of   25.53MiB in 00:00:07 at 3.42MiB/s   
[youtube] Extracting URL: https://youtu.be/BxG85bGo_kU?si=wGN3vBfYmMs7GBcZ
[youtube] BxG85bGo_kU: Downloading webpage
[youtube] BxG85bGo_kU: Downloading tv client config
[youtube] BxG85bGo_kU: Downloading player 753b1819-main
[youtube] BxG85bGo_kU: Downloading tv player API JSON
[youtube] BxG85bGo_kU: Downloading ios player API JSON
[youtube] BxG85bGo_kU: Downloading m3u8 information


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>