# Download videos

In [35]:
from pytubefix import YouTube
from pytubefix.cli import on_progress

In [4]:
# Lecture videos to process: each course label ("cours N") maps to the
# watch URL built from that lecture's YouTube video id.
video_urls = {
    f'cours {number}': f'https://www.youtube.com/watch?v={video_id}'
    for number, video_id in (
        (3, 'WEB4N4xbAhI'),
        (5, '-tdC1Y4Wo2Y'),
        (7, 't90r_MMmHdQ'),
        (9, 'ufLx_7YafmM'),
        (11, '-yqPZQYYwiI'),
    )
}

In [None]:
# Download the 720p stream of every lecture into ./videos as
# video_<course name with underscores>.mp4
for cours_name, url in video_urls.items():
    print(f'Starting download of {cours_name}')
    yt = YouTube(url, use_oauth=True, on_progress_callback=on_progress)

    # .first() returns None when the video offers no 720p stream; skip that
    # video with an explicit message instead of failing with AttributeError
    # and aborting the remaining downloads.
    stream = yt.streams.filter(res="720p").first()
    if stream is None:
        print(f'No 720p stream available for {cours_name}, skipping')
        continue

    stream.download(output_path='./videos', filename=f'video_{cours_name.replace(" ", "_")}.mp4')
    print(f'\nVideo of {cours_name} downloaded')


# Take screenshots of the relevant part of videos


In [1]:
import numpy as np
import cv2
import os

In [2]:
def video_processing(cap, interval, x, y, w_frame, h_frame, output_dir=None):
    """Save a cropped screenshot of one frame out of every `interval` frames.

    Frames are read sequentially from `cap`. Frame 0 and every following
    multiple of `interval` is cropped to rows `y:h_frame` and columns
    `x:w_frame`, then written as `<output_dir>/screenshot_<n>.jpg`, with `n`
    counting up from 0.

    Args:
        cap: Opened capture object exposing `isOpened()` and `read()`
            (a `cv2.VideoCapture` in this notebook).
        interval: Number of frames between two screenshots; must be > 0.
        x: First column kept by the crop.
        y: First row kept by the crop.
        w_frame: Column at which the crop stops (exclusive).
        h_frame: Row at which the crop stops (exclusive).
        output_dir: Existing folder receiving the screenshots. When omitted,
            falls back to `./screenshots/<cours_name with underscores>`, read
            from the notebook-level `cours_name` variable (legacy behaviour).

    Returns:
        The number of screenshots successfully written.

    Raises:
        ValueError: If `cap` is opened and `interval` is not positive.
    """
    if not cap.isOpened():
        return 0

    if interval <= 0:
        # e.g. the FPS could not be read and the caller computed int(0 * 30)
        raise ValueError(f'interval must be a positive number of frames, got {interval}')

    if output_dir is None:
        # Legacy fallback: depend on the global set by the calling loop
        output_dir = f'./screenshots/{cours_name.replace(" ", "_")}'

    img_counter = 0
    saved_counter = 0
    frame_counter = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            # End of the video (or read failure)
            break

        # Check if the frame counter is a multiple of the interval
        if frame_counter % interval == 0:
            print(f'Frame {frame_counter} captured')
            # Crop the frame
            crop_frame = frame[y:h_frame, x:w_frame]  # row, column

            # Save the cropped image; imwrite reports failure by returning
            # False rather than raising
            image_path = f'{output_dir}/screenshot_{img_counter}.jpg'
            if cv2.imwrite(image_path, crop_frame):
                saved_counter += 1
            else:
                print(f'Could not write {image_path}')
            img_counter += 1

        frame_counter += 1

    return saved_counter

In [None]:
# Take one cropped screenshot every 30 seconds of each downloaded video.
# NOTE: the loop variable must stay named `cours_name`: video_processing reads
# it as a global to build the output path when no output folder is passed.
for cours_name in video_urls:
    print(f'Starting screenshot of {cours_name}')

    # Get video filename
    folder_name = cours_name.replace(" ", "_")
    filename = f'video_{folder_name}.mp4'
    filepath = f'./videos/{filename}'

    # Open video
    cap = cv2.VideoCapture(filepath)
    try:
        # A missing or unreadable file does not raise; it only leaves the
        # capture closed, which would silently produce no screenshots
        if not cap.isOpened():
            print(f'Could not open {filepath}, skipping {cours_name}')
            continue

        # Original video characteristics
        w_frame, h_frame = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        interval = int(fps * 30)  # Get a frame every 30 seconds
        if interval <= 0:
            # FPS unavailable: the frame interval cannot be computed
            print(f'Could not read the FPS of {filepath}, skipping {cours_name}')
            continue

        # Cropping values: keep columns x..width and rows y..height
        x, y = 250, 695  # column/width, row/height

        # Create screenshots folder
        os.makedirs(f'./screenshots/{folder_name}', exist_ok=True)

        # Video processing
        video_processing(cap, interval, x, y, w_frame, h_frame)
    finally:
        # Cleanup, even when processing fails
        cap.release()
        cv2.destroyAllWindows()

    print(f'Screenshot of {cours_name} done')

# Remove images without text

In [8]:
import cv2
import pytesseract
import os
import pandas as pd

In [55]:
def detect_text_in_image(image_path):
    """Return the text Tesseract finds in an image, stripped of whitespace.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        The detected text with leading/trailing whitespace removed; an empty
        string when nothing is detected.

    Raises:
        ValueError: If the image cannot be read (missing or unsupported file).
    """
    # Read the image; imread signals failure by returning None, not by raising
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f'Could not read image: {image_path}')

    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Use Tesseract to detect text
    text = pytesseract.image_to_string(gray)
    return text.strip()

In [10]:
# One screenshot folder per processed course
screenshots_dirs = [f'./screenshots/cours_{number}' for number in (3, 5, 7, 9, 11)]

In [None]:
# Move every screenshot in which no text is detected into an "unrelevant"
# subfolder of its course folder. Failures on a single image are reported
# and do not interrupt the run.
for screenshots_dir in screenshots_dirs:
    print(f'Removing unrelevant images from {screenshots_dir}')
    unrelevant_dir = f'{screenshots_dir}/unrelevant'
    os.makedirs(unrelevant_dir, exist_ok=True)

    image_files = [
        f for f in os.listdir(screenshots_dir)
        if f.lower().endswith(('.png', '.jpg', '.jpeg'))
    ]
    for image_file in image_files:
        image_path = f'{screenshots_dir}/{image_file}'
        try:
            detected_text = detect_text_in_image(image_path)
            if detected_text:
                # Text found: the image stays where it is
                continue
            print(f'No text detected in {image_file}')
            # Move image to unrelevant folder
            os.rename(image_path, f'{unrelevant_dir}/{image_file}')
        except Exception as e:
            print(f'Error processing {image_file}: {e}')
    print(f'Unrelevant images removed from {screenshots_dir}')


# Flag duplicate images

We read the text in each image and compare it with the text of the following images: when two texts are at least 90% similar, the later image is flagged as a duplicate. We make sure never to flag two distinct images as duplicates.<br>
We move the non-duplicate images to "relevant_screenshots". <br>
We do the final sorting manually.


In [270]:
import os
import difflib
import pytesseract
from PIL import Image, ImageOps, ImageFilter, ImageEnhance
import matplotlib.pyplot as plt
import cv2
import numpy as np

In [271]:
def preprocess_and_ocr(image_path, lang='eng', factor=4, contrast=5.0):
    """Enhance an image for OCR and return the single line of text it contains.

    Pipeline: enlarge -> sharpen -> boost contrast -> grayscale -> invert ->
    morphological opening -> Tesseract in single-line mode (PSM 7).

    Args:
        image_path: Path of the image file to read.
        lang: Tesseract language code.
        factor: Integer enlargement factor applied to both dimensions.
        contrast: Contrast enhancement factor passed to Pillow
            (1.0 leaves the image unchanged).

    Returns:
        The recognised text, stripped of leading/trailing whitespace.
    """
    # Open the image with Pillow; the context manager closes the underlying
    # file as soon as the enlarged copy has been produced
    with Image.open(image_path) as img:
        # Image dimensions
        width, height = img.size

        # Enlarge with high-quality interpolation
        resized_img = img.resize((width * factor, height * factor), Image.LANCZOS)

    # Apply a sharpening filter
    resized_img = resized_img.filter(ImageFilter.SHARPEN)

    # Increase contrast
    resized_img = ImageEnhance.Contrast(resized_img).enhance(contrast)

    # Convert to grayscale
    gray_img = resized_img.convert("L")

    # Always invert the gray levels
    # NOTE(review): presumably the source shows light text on a dark
    # background, so this yields dark text on a light one; confirm
    gray_img = ImageOps.invert(gray_img)

    # Convert Pillow -> NumPy for OpenCV
    gray_np = np.array(gray_img)

    # Apply morphological "opening" operation with a 2x2 rectangular kernel
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    opened_np = cv2.morphologyEx(gray_np, cv2.MORPH_OPEN, kernel)

    # Convert back NumPy -> Pillow for Tesseract
    opened_img = Image.fromarray(opened_np)

    # Perform OCR in "single line" mode (PSM 7)
    text = pytesseract.image_to_string(
        opened_img,
        config='--psm 7',
        lang=lang
    )

    return text.strip()


In [272]:
def are_texts_similar(text1, text2, threshold=0.8):
    """Tell whether two strings are at least `threshold` similar.

    The similarity is the `difflib.SequenceMatcher` ratio, a float in [0, 1].
    """
    matcher = difflib.SequenceMatcher(isjunk=None, a=text1, b=text2)
    similarity = matcher.ratio()
    return similarity >= threshold

def handle_duplicate_files(folder_path, similarity_threshold):
    """Flag consecutive screenshots that show the same text as duplicates.

    Images of `folder_path` are visited in numeric order (the number between
    the first underscore and the extension, e.g. `screenshot_12.jpg`). The
    first image is the reference; each following image whose OCR text is
    similar to the reference text is renamed `<name>_duplicate<ext>`. The
    first image that differs becomes the new reference.

    Every image is OCR'd once, and files already carrying the `_duplicate`
    suffix are ignored, so the function can safely be run again on a folder.

    Args:
        folder_path: Folder containing the screenshots.
        similarity_threshold: Minimum similarity ratio (0..1) for two texts
            to count as duplicates.
    """
    # Retrieve all image files not yet flagged as duplicates
    image_files = [
        f for f in os.listdir(folder_path)
        if f.lower().endswith(('.png', '.jpg', '.jpeg'))
        and not os.path.splitext(f)[0].endswith('_duplicate')
    ]

    # Sort files by number
    image_files.sort(key=lambda x: int(x.split('_')[1].split('.')[0]))

    reference_file = None
    reference_text = None
    for filename in image_files:
        file_path = os.path.join(folder_path, filename)
        text = preprocess_and_ocr(file_path)

        if reference_file is None:
            # The first image has nothing to be compared with
            reference_file, reference_text = filename, text
            continue

        # Compare the texts
        if are_texts_similar(reference_text, text, similarity_threshold):
            print(f"[DUPLICATE] {reference_file} is a duplicate of {filename}")
            # Rename the later file to mark it as a duplicate
            base, ext = os.path.splitext(filename)
            os.rename(file_path, os.path.join(folder_path, f"{base}_duplicate{ext}"))
        else:
            print(f"[NOT DUPLICATE] {reference_file} is not a duplicate of {filename}")
            # The differing image becomes the reference for what follows
            reference_file, reference_text = filename, text

    print("End of processing.")


In [274]:
# Flag duplicates in every course folder; texts that are at least 90%
# similar count as the same screenshot.
screenshots_folders = [f"./screenshots/cours_{number}" for number in (3, 5, 7, 9, 11)]
for folder in screenshots_folders:
    handle_duplicate_files(folder, 0.9)

[DUPLICATE] screenshot_5.jpg est un doublon de screenshot_6.jpg
[NOT DUPLICATE] screenshot_5.jpg n'est pas un doublon de screenshot_7.jpg
[DUPLICATE] screenshot_7.jpg est un doublon de screenshot_8.jpg
[DUPLICATE] screenshot_7.jpg est un doublon de screenshot_9.jpg
[DUPLICATE] screenshot_7.jpg est un doublon de screenshot_10.jpg
[NOT DUPLICATE] screenshot_7.jpg n'est pas un doublon de screenshot_12.jpg
[DUPLICATE] screenshot_12.jpg est un doublon de screenshot_13.jpg
[DUPLICATE] screenshot_12.jpg est un doublon de screenshot_14.jpg
[NOT DUPLICATE] screenshot_12.jpg n'est pas un doublon de screenshot_18.jpg
[NOT DUPLICATE] screenshot_18.jpg n'est pas un doublon de screenshot_19.jpg
[NOT DUPLICATE] screenshot_19.jpg n'est pas un doublon de screenshot_20.jpg
[NOT DUPLICATE] screenshot_20.jpg n'est pas un doublon de screenshot_21.jpg
[NOT DUPLICATE] screenshot_21.jpg n'est pas un doublon de screenshot_22.jpg
[NOT DUPLICATE] screenshot_22.jpg n'est pas un doublon de screenshot_23.jpg
[DUPLI

# Move non-duplicate images to relevant_screenshots

In [None]:
import shutil

In [275]:
screenshots_folders = ["./screenshots/cours_3", "./screenshots/cours_5", "./screenshots/cours_7", "./screenshots/cours_9", "./screenshots/cours_11"]

os.makedirs('./relevant_screenshots', exist_ok=True)

# Move every image that is not flagged "_duplicate" into
# ./relevant_screenshots/<course folder name>
for folder in screenshots_folders:
    target_dir = f'./relevant_screenshots/{folder.split("/")[-1]}'
    os.makedirs(target_dir, exist_ok=True)
    for image in os.listdir(folder):
        base, ext = os.path.splitext(image)
        # Same image extensions as the other cells; skip flagged duplicates
        if ext.lower() not in ('.png', '.jpg', '.jpeg') or base.endswith('_duplicate'):
            continue

        # shutil.move raises when the destination file already exists (e.g.
        # on a second run); report and keep going instead
        target_path = os.path.join(target_dir, image)
        if os.path.exists(target_path):
            print(f'{target_path} already exists, skipping {image}')
            continue

        shutil.move(os.path.join(folder, image), target_path)