# Download video

In [None]:
%pip install pytubefix

In [35]:
from pytubefix import YouTube
from pytubefix.cli import on_progress

In [36]:
# Lecture label -> YouTube watch URL. The insertion order is the order in
# which the lectures are iterated by the cells below.
video_urls = dict([
    ('cours 3', 'https://www.youtube.com/watch?v=WEB4N4xbAhI'),
    ('cours 5', 'https://www.youtube.com/watch?v=-tdC1Y4Wo2Y'),
    ('cours 7', 'https://www.youtube.com/watch?v=t90r_MMmHdQ'),
    ('cours 9', 'https://www.youtube.com/watch?v=ufLx_7YafmM'),
    ('cours 11', 'https://www.youtube.com/watch?v=-yqPZQYYwiI'),
])

In [None]:
# Download the 720p stream of every lecture listed in `video_urls` into ./videos.
for cours_name, url in video_urls.items():
    print(f'Starting download of {cours_name}')
    yt = YouTube(url, use_oauth=True, on_progress_callback=on_progress)
    stream = yt.streams.filter(res="720p").first()
    if stream is None:
        # .first() yields None when no stream matches the filter; skip this
        # lecture instead of crashing the whole batch on `None.download`.
        print(f'No 720p stream available for {cours_name}, skipping')
        continue
    # The file name (spaces replaced by underscores) is what the screenshot
    # step later looks for under ./videos.
    stream.download(output_path='./videos', filename=f'video_{cours_name.replace(" ", "_")}.mp4')
    print(f'\nVideo of {cours_name} downloaded')


# Crop relevant part of the video


In [None]:
%pip install opencv-python

In [75]:
import numpy as np
import cv2
import os

In [76]:
def video_processing(cap, interval, x, y, w_frame, h_frame, output_dir=None):
    """Save a cropped screenshot of every ``interval``-th frame of a video.

    Frames are read sequentially from ``cap`` until a read fails. Every frame
    whose index is a multiple of ``interval`` (frame 0 included) is cropped to
    ``frame[y:h_frame, x:w_frame]`` and written to
    ``<output_dir>/screenshot_<n>.jpg``, where ``n`` counts the captured
    frames starting at 0.

    Args:
        cap: Capture object exposing ``isOpened()`` and ``read()``
            (e.g. a ``cv2.VideoCapture``).
        interval: Number of frames between two screenshots. Values below 1
            are treated as 1.
        x: First column kept by the crop.
        y: First row kept by the crop.
        w_frame: Column at which the crop stops (exclusive).
        h_frame: Row at which the crop stops (exclusive).
        output_dir: Folder receiving the images. When omitted, the folder is
            derived from the notebook-level ``cours_name`` variable, which is
            the original behaviour of this function.

    Returns:
        None. The images are written to disk as a side effect.
    """
    if output_dir is None:
        # Backward-compatible fallback: the original implementation read the
        # global `cours_name` set by the calling loop.
        output_dir = f'./screenshots/{cours_name.replace(" ", "_")}'

    # An interval of 0 would raise ZeroDivisionError in the modulo below.
    interval = max(1, int(interval))

    img_counter = 0
    frame_counter = 0  # Number of frames read so far
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            # End of stream (or read error): nothing more to process.
            break

        # Only keep frames whose index is a multiple of the interval.
        if frame_counter % interval == 0:
            print(f'Frame {frame_counter} captured')
            # Crop the frame; numpy indexing is [rows, columns].
            crop_frame = frame[y:h_frame, x:w_frame]

            # Save the cropped image; imwrite reports failure through its
            # return value, so surface it instead of failing silently.
            target_path = f'{output_dir}/screenshot_{img_counter}.jpg'
            if not cv2.imwrite(target_path, crop_frame):
                print(f'Could not write {target_path}')
            img_counter += 1

        frame_counter += 1

In [77]:
# Take a cropped screenshot of every downloaded lecture at a fixed time step.
SECONDS_BETWEEN_SCREENSHOTS = 30
CROP_X, CROP_Y = 250, 700  # column/width, row/height of the crop's top-left corner

for cours_name, url in video_urls.items():
    print(f'Starting screenshot of {cours_name}')

    # Get video filename (same naming scheme as the download step)
    filename = f'video_{cours_name.replace(" ", "_")}.mp4'
    filepath = f'./videos/{filename}'

    # Open video
    cap = cv2.VideoCapture(filepath)
    try:
        if not cap.isOpened():
            # Missing or unreadable file: report it instead of silently
            # producing no screenshots and printing "done".
            print(f'Could not open {filepath}, skipping {cours_name}')
            continue

        # Original video characteristics
        w_frame, h_frame = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        # One frame every SECONDS_BETWEEN_SCREENSHOTS seconds; clamp to 1 so a
        # reported fps of 0 cannot cause a modulo-by-zero downstream.
        interval = max(1, int(fps * SECONDS_BETWEEN_SCREENSHOTS))

        # Cropping values
        x, y = CROP_X, CROP_Y
        if x >= w_frame or y >= h_frame:
            # The crop would be empty and could not be written as an image.
            print(f'Crop origin ({x}, {y}) lies outside the {w_frame}x{h_frame} frame, skipping {cours_name}')
            continue

        # Create screenshots folder
        os.makedirs(f'./screenshots/{cours_name.replace(" ", "_")}', exist_ok=True)

        # Video processing
        video_processing(cap, interval, x, y, w_frame, h_frame)
    finally:
        # Cleanup: always release the capture, even on error or skip. No
        # display window is opened in this cell, so nothing else to tear down.
        cap.release()

    print(f'Screenshot of {cours_name} done')

Starting screenshot of cours 3
Frame 0 captured
Frame 750 captured
Frame 1500 captured
Frame 2250 captured
Frame 3000 captured
Frame 3750 captured
Frame 4500 captured
Frame 5250 captured
Frame 6000 captured
Frame 6750 captured
Frame 7500 captured
Frame 8250 captured
Frame 9000 captured
Frame 9750 captured
Frame 10500 captured
Frame 11250 captured
Frame 12000 captured
Frame 12750 captured
Frame 13500 captured
Frame 14250 captured
Frame 15000 captured
Frame 15750 captured
Frame 16500 captured
Frame 17250 captured
Frame 18000 captured
Frame 18750 captured
Frame 19500 captured
Frame 20250 captured
Frame 21000 captured
Frame 21750 captured
Frame 22500 captured
Frame 23250 captured
Frame 24000 captured
Frame 24750 captured
Frame 25500 captured
Frame 26250 captured
Frame 27000 captured
Frame 27750 captured
Frame 28500 captured
Frame 29250 captured
Frame 30000 captured
Frame 30750 captured
Frame 31500 captured
Frame 32250 captured
Frame 33000 captured
Frame 33750 captured
Frame 34500 captured


# Remove images without text



In [None]:
%pip install opencv-python pytesseract pandas


In [68]:
import cv2
import pytesseract
import os
import pandas as pd

In [55]:
def detect_text_in_image(image_path):
    """Run Tesseract OCR on the grayscale version of an image.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        The text returned by ``pytesseract.image_to_string`` with leading and
        trailing whitespace stripped (possibly an empty string).

    Raises:
        ValueError: If OpenCV cannot read the file at ``image_path``.
    """
    # Read the image
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread returns None for a missing/unreadable file instead of
        # raising; fail with a clear message rather than inside cvtColor.
        raise ValueError(f'Could not read image: {image_path}')
    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Use Tesseract to detect text
    text = pytesseract.image_to_string(gray)
    return text.strip()

In [None]:
# Folders holding the cropped screenshots, one per lecture.
screenshots_dirs = [
    './screenshots/cours_3',
    './screenshots/cours_5',
    './screenshots/cours_7',
    './screenshots/cours_9',
    './screenshots/cours_11',
]

In [None]:
# Move every screenshot in which OCR finds no text into an "unrelevant"
# subfolder of its lecture directory. Errors on a single image are reported
# and do not stop the run.
for screenshots_dir in screenshots_dirs:
    print(f'Removing unrelevant images from {screenshots_dir}')
    os.makedirs(f'{screenshots_dir}/unrelevant', exist_ok=True)

    # Keep only image files; the "unrelevant" folder itself has no extension.
    image_files = []
    for entry in os.listdir(screenshots_dir):
        if entry.lower().endswith(('.png', '.jpg', '.jpeg')):
            image_files.append(entry)

    for image_file in image_files:
        image_path = f'{screenshots_dir}/{image_file}'
        try:
            detected_text = detect_text_in_image(image_path)
            if detected_text:
                # Text found: the screenshot stays where it is.
                continue
            print(f'No text detected in {image_file}')
            # Move image to unrelevant folder
            os.rename(image_path, f'{screenshots_dir}/unrelevant/{image_file}')
        except Exception as e:
            print(f'Error processing {image_file}: {e}')
    print(f'Unrelevant images removed from {screenshots_dir}')


# Read text from images

In [71]:
# Rebinds `screenshots_dirs`: the text-reading step below runs on this single
# lecture folder only.
screenshots_dirs = [
    './screenshots/cours_3',
]

In [73]:
def read_text_from_image(image_path):
    """Run Tesseract OCR on an image as loaded by OpenCV (no grayscale step).

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        The text returned by ``pytesseract.image_to_string`` with leading and
        trailing whitespace stripped (possibly an empty string).

    Raises:
        ValueError: If OpenCV cannot read the file at ``image_path``.
    """
    # Read the image
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread returns None for a missing/unreadable file instead of
        # raising; fail here with a clear message rather than inside Tesseract.
        raise ValueError(f'Could not read image: {image_path}')
    # Use Tesseract to detect text
    text = pytesseract.image_to_string(img)
    return text.strip()

In [None]:
# OCR every screenshot of each folder in `screenshots_dirs`, write one CSV per
# folder (filename, text) and keep all rows in `df`.
# NOTE(review): `read_text_from_image` is defined for this section but never
# called; this loop uses the grayscale `detect_text_in_image` helper. Confirm
# which one is intended.
all_records = []

for screenshots_dir in screenshots_dirs:
    print(f'Reading text from {screenshots_dir}')

    # Sorted so the row order does not depend on the file system's listing order.
    image_files = sorted(f for f in os.listdir(screenshots_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg')))

    # Collect plain records and build the frame once: growing a DataFrame with
    # pd.concat inside the loop copies all previous rows on every iteration.
    dir_records = []
    for image_file in image_files:
        try:
            detected_text = detect_text_in_image(f'{screenshots_dir}/{image_file}')
            print(f'Text detected in {image_file}: {detected_text}')
        except Exception as e:
            print(f'Error processing {image_file}: {str(e)}')
            # Keep a row for the failed image so it stays visible in the CSV.
            detected_text = None
        dir_records.append({'filename': image_file, 'text': detected_text})

    print(f'Text read from {screenshots_dir}')

    # The CSV named after this folder only contains this folder's rows
    # (previously it also contained the rows of every folder processed before).
    dir_name = os.path.basename(os.path.normpath(screenshots_dir))
    pd.DataFrame(dir_records, columns=['filename', 'text']).to_csv(f'./text_from_images_{dir_name}.csv', index=False)

    all_records.extend(dir_records)

# Same cumulative content as before: one row per image across all folders.
df = pd.DataFrame(all_records, columns=['filename', 'text'])