<a href="https://colab.research.google.com/github/CodeofRahul/-YouTube-Study-Notes-Automator/blob/main/02_Yt2Pdf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. This is working good"

In [4]:
# @title 🚀 YouTube to PDF (Simplified Folders)
import os
import shutil
import cv2
import yt_dlp
import numpy as np
from fpdf import FPDF
from tqdm.notebook import tqdm
from PIL import Image

# --- User Settings ---
MIN_INTERVAL_SECONDS = 5  # Capture a slide every 5 seconds (if changed)
SENSITIVITY = 15          # Lower = captures more minor changes

# --- Setup Folders ---
PDF_FOLDER = "Final_PDF_Notes"
RESOURCE_FOLDER = "Captured_Resources"

for folder in [PDF_FOLDER, RESOURCE_FOLDER]:
    if not os.path.exists(folder):
        os.makedirs(folder)

def clean_name(text):
    return "".join([c for c in text if c.isalnum() or c in (' ', '_')]).strip()

def process_video(url):
    # 1. Download Video
    print(f"\n🌐 Accessing: {url}")
    ydl_opts = {
        'format': 'best[height<=720]',
        'outtmpl': f'{RESOURCE_FOLDER}/temp_video.mp4',
        'overwrites': True,
        'quiet': True
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=True)
        video_title = clean_name(info.get('title', 'Study_Notes'))

    # 2. Extract Slides
    video_path = f'{RESOURCE_FOLDER}/temp_video.mp4'
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS) or 30
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    last_frame_gray = None
    saved_count = 0

    print(f"📸 Extracting slides from: {video_title}")
    pbar = tqdm(total=total_frames, desc="Progress")

    current_frame = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret: break

        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

        should_save = False
        if last_frame_gray is None:
            should_save = True
        else:
            # Compare current frame with the last saved one
            diff = cv2.absdiff(gray, last_frame_gray)
            change_percent = (np.count_nonzero(diff > 25) * 100) / (gray.shape[0] * gray.shape[1])
            if change_percent > SENSITIVITY:
                should_save = True

        if should_save:
            img_path = f"{RESOURCE_FOLDER}/slide_{saved_count:03d}.jpg"
            cv2.imwrite(img_path, frame)
            last_frame_gray = gray
            saved_count += 1

            # Jump forward to avoid "duplicate" shots of the same slide
            current_frame += int(fps * MIN_INTERVAL_SECONDS)
        else:
            current_frame += int(fps) # Check every 1 second if no change

        cap.set(cv2.CAP_PROP_POS_FRAMES, current_frame)
        pbar.update(int(fps * MIN_INTERVAL_SECONDS))

    cap.release()
    pbar.close()

    # 3. Generate PDF
    if saved_count > 0:
        print(f"📄 Creating PDF from {saved_count} slides...")
        images = sorted([f for f in os.listdir(RESOURCE_FOLDER) if f.startswith("slide_")])

        # Get dimensions from first image
        with Image.open(f"{RESOURCE_FOLDER}/{images[0]}") as img:
            w, h = img.size

        pdf = FPDF(unit="pt", format=[w, h])
        for img_name in images:
            pdf.add_page()
            pdf.image(f"{RESOURCE_FOLDER}/{img_name}", 0, 0, w, h)

        pdf_filename = f"{PDF_FOLDER}/{video_title}.pdf"
        pdf.output(pdf_filename, "F")
        print(f"✅ SUCCESS: {pdf_filename} created!")
    else:
        print("❌ Failed to detect any slides.")

    # 4. Clean up resources for next video
    for f in os.listdir(RESOURCE_FOLDER):
        os.remove(os.path.join(RESOURCE_FOLDER, f))

# --- Run ---
input_url = input("Enter YouTube Link: ")
if "playlist" in input_url:
    with yt_dlp.YoutubeDL({'extract_flat': True, 'quiet': True}) as ydl:
        playlist_info = ydl.extract_info(input_url, download=False)
        for entry in playlist_info['entries']:
            process_video(entry['url'])
else:
    process_video(input_url)

Enter YouTube Link: https://youtu.be/HVvN4T6AzSk?si=bH5Rw63lue9dY1-v

🌐 Accessing: https://youtu.be/HVvN4T6AzSk?si=bH5Rw63lue9dY1-v




📸 Extracting slides from: भरत एव इसक पडस दश 1  BLOCK STATISTICAL OFFICER  BSO EXAM परपरशन


Progress:   0%|          | 0/235733 [00:00<?, ?it/s]

📄 Creating PDF from 153 slides...
✅ SUCCESS: Final_PDF_Notes/भरत एव इसक पडस दश 1  BLOCK STATISTICAL OFFICER  BSO EXAM परपरशन.pdf created!


# **2. This is Best**

In [12]:
# @title 🚀 YouTube to PDF (Anti-Duplicate & Super Fast)
import os, shutil, cv2, yt_dlp, numpy as np
from fpdf import FPDF
from tqdm.notebook import tqdm
from PIL import Image

# --- Advanced Tuning ---
JUMP_SECONDS = 15     # Check every 15 seconds (Standard for lectures)
SIMILARITY_LIMIT = 0.95 # Higher = stricter. 0.95 means 95% similar images are ignored.
DOWNLOAD_RES = "480"

PDF_FOLDER = "Final_PDF_Notes"
RESOURCE_FOLDER = "Captured_Resources"
for folder in [PDF_FOLDER, RESOURCE_FOLDER]:
    if not os.path.exists(folder): os.makedirs(folder)

def get_similarity(img1, img2):
    """Calculates how similar two slides are (0.0 to 1.0)"""
    # Resize to very small for a 'global' comparison
    img1 = cv2.resize(img1, (128, 128))
    img2 = cv2.resize(img2, (128, 128))
    # Compute Template Matching or Histogram
    res = cv2.matchTemplate(img1, img2, cv2.TM_CCOEFF_NORMED)
    return res[0][0]

def process_video_smart(url):
    print(f"\n🌐 Downloading Video...")
    ydl_opts = {
        'format': f'best[height<={DOWNLOAD_RES}]',
        'outtmpl': f'{RESOURCE_FOLDER}/temp_video.mp4',
        'overwrites': True, 'quiet': True
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=True)
        video_title = "".join([c for c in info.get('title', 'Notes') if c.isalnum() or c==' ']).strip()

    cap = cv2.VideoCapture(f'{RESOURCE_FOLDER}/temp_video.mp4')
    fps = cap.get(cv2.CAP_PROP_FPS) or 30
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    last_saved_gray = None
    saved_count = 0

    print(f"🎬 Processing: {video_title}")
    pbar = tqdm(total=total_frames)

    curr_frame = 0
    while curr_frame < total_frames:
        cap.set(cv2.CAP_PROP_POS_FRAMES, curr_frame)
        ret, frame = cap.read()
        if not ret: break

        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

        is_different = False
        if last_saved_gray is None:
            is_different = True
        else:
            similarity = get_similarity(gray, last_saved_gray)
            # If similarity is less than 95%, it's a new slide
            if similarity < SIMILARITY_LIMIT:
                is_different = True

        if is_different:
            cv2.imwrite(f"{RESOURCE_FOLDER}/slide_{saved_count:03d}.jpg", frame)
            last_saved_gray = gray
            saved_count += 1

        curr_frame += int(fps * JUMP_SECONDS)
        pbar.update(int(fps * JUMP_SECONDS))

    cap.release()
    pbar.close()

    if saved_count > 0:
        images = sorted([f for f in os.listdir(RESOURCE_FOLDER) if f.startswith("slide_")])
        with Image.open(f"{RESOURCE_FOLDER}/{images[0]}") as img:
            w, h = img.size

        pdf = FPDF(unit="pt", format=[w, h])
        for img_name in images:
            pdf.add_page()
            pdf.image(f"{RESOURCE_FOLDER}/{img_name}", 0, 0, w, h)

        pdf_path = f"{PDF_FOLDER}/{video_title}.pdf"
        pdf.output(pdf_path, "F")
        print(f"✅ FINISHED! Total Pages: {saved_count}")
        print(f"📂 Location: {pdf_path}")

    # Cleanup resources
    for f in os.listdir(RESOURCE_FOLDER): os.remove(os.path.join(RESOURCE_FOLDER, f))

# --- Run ---
input_url = input("Enter YouTube Link: ")
process_video_smart(input_url)

Enter YouTube Link: https://youtu.be/W3Mk7EbFASI?si=kGr4fJph896zPyrm

🌐 Downloading Video...




🎬 Processing: भरत एव इसक पडस दश 2   BSSC INTER LEVEL EXAM DATE  BLOCK STATISTICAL OFFICER  EXAM


  0%|          | 0/103194 [00:00<?, ?it/s]

✅ FINISHED! Total Pages: 112
📂 Location: Final_PDF_Notes/भरत एव इसक पडस दश 2   BSSC INTER LEVEL EXAM DATE  BLOCK STATISTICAL OFFICER  EXAM.pdf


# upload pdf and delete unwanted pages (Pdf cleaning)

In [13]:
# @title ✂️ PDF Page Manager (With Select/Deselect All)
import os
from google.colab import files
from IPython.display import display, clear_output
import ipywidgets as widgets

# 1. --- Install Dependencies ---
print("Preparing tools...")
!apt-get install -y poppler-utils > /dev/null
!pip install -q pdf2image
from pdf2image import convert_from_path
from PIL import Image

# 2. --- File Upload ---
uploaded = files.upload()
if not uploaded:
    print("No file uploaded.")
else:
    pdf_filename = list(uploaded.keys())[0]

    # 3. --- Convert PDF ---
    print("Generating preview... please wait.")
    preview_pages = convert_from_path(pdf_filename, dpi=40) # Even lower DPI for speed
    page_status = [True] * len(preview_pages)
    button_widgets = []

    # 4. --- Toggle Logic ---
    def update_button(btn, index):
        if page_status[index]:
            btn.description = f"KEEP P{index+1}"
            btn.button_style = 'success'
        else:
            btn.description = f"REMOVE P{index+1}"
            btn.button_style = 'danger'

    def on_toggle_click(b):
        idx = b.index_ref
        page_status[idx] = not page_status[idx]
        update_button(b, idx)

    # 5. --- Bulk Action Buttons ---
    def select_all(b):
        for i in range(len(page_status)):
            page_status[i] = True
            update_button(button_widgets[i], i)

    def deselect_all(b):
        for i in range(len(page_status)):
            page_status[i] = False
            update_button(button_widgets[i], i)

    btn_all = widgets.Button(description="Select All (Green)", button_style='info')
    btn_none = widgets.Button(description="Deselect All (Red)", button_style='warning')
    btn_all.on_click(select_all)
    btn_none.on_click(deselect_all)

    # 6. --- Build Gallery ---
    items = []
    for i, page in enumerate(preview_pages):
        img_out = widgets.Output()
        with img_out:
            display(page.resize((180, int(180 * page.height / page.width))))

        btn = widgets.Button(description=f"KEEP P{i+1}", button_style='success', layout=widgets.Layout(width='180px'))
        btn.index_ref = i
        btn.on_click(on_toggle_click)
        button_widgets.append(btn)

        box = widgets.VBox([img_out, btn], layout=widgets.Layout(border='1px solid #ddd', margin='5px', align_items='center'))
        items.append(box)

    grid = widgets.GridBox(items, layout=widgets.Layout(grid_template_columns="repeat(auto-fill, minmax(200px, 1fr))"))

    print("\n✅ Use Bulk Actions or click individual buttons:")
    display(widgets.HBox([btn_all, btn_none]))
    display(grid)

    # 7. --- Final Save ---
    final_btn = widgets.Button(description="DOWNLOAD CLEANED PDF", button_style='primary', layout=widgets.Layout(width='300px', height='50px', margin='20px'))
    status_out = widgets.Output()

    def download_final(b):
        with status_out:
            clear_output()
            print("⏳ Creating high-quality PDF... Please wait.")
            high_res = convert_from_path(pdf_filename, dpi=150)
            final_list = [high_res[i] for i, keep in enumerate(page_status) if keep]

            if not final_list:
                print("❌ No pages selected!")
                return

            out_name = f"Cleaned_{pdf_filename}"
            final_list[0].save(out_name, save_all=True, append_images=final_list[1:])
            print(f"✅ Success! Saved {len(final_list)} pages.")
            files.download(out_name)

    final_btn.on_click(download_final)
    display(final_btn, status_out)

Preparing tools...


Saving भरत एव इसक पडस दश 2   BSSC INTER LEVEL EXAM DATE  BLOCK STATISTICAL OFFICER  EXAM.pdf to भरत एव इसक पडस दश 2   BSSC INTER LEVEL EXAM DATE  BLOCK STATISTICAL OFFICER  EXAM.pdf
Generating preview... please wait.

✅ Use Bulk Actions or click individual buttons:


HBox(children=(Button(button_style='info', description='Select All (Green)', style=ButtonStyle()), Button(butt…

GridBox(children=(VBox(children=(Output(), Button(button_style='success', description='KEEP P1', layout=Layout…

Button(button_style='primary', description='DOWNLOAD CLEANED PDF', layout=Layout(height='50px', margin='20px',…

Output()