# PaddleOCR Text Extraction from Videos (Google Colab)

This notebook runs PaddleOCR (with GPU support when available) on the first frame of every video in a GitHub repository, saves the extracted text to a CSV file, and prints a log line for each video processed.

## 1. Install Dependencies
Install PaddleOCR, OpenCV, and all required packages. Ensure GPU support is enabled for PaddleOCR.

In [None]:
# Install PaddleOCR and dependencies (with GPU support)
# Use the latest compatible paddlepaddle-gpu version available for your Python and CUDA version
!pip install -q "paddlepaddle-gpu==2.6.2" "paddleocr==3.3.1" "opencv-python-headless" "numpy>=1.23,<2.1"
import os
import paddle
# Check GPU availability
print('PaddlePaddle GPU available:', paddle.is_compiled_with_cuda())
if paddle.is_compiled_with_cuda():
    print('Paddle device:', paddle.device.get_device())
else:
    print('GPU not available or PaddlePaddle not installed with GPU support.')

## 2. Clone GitHub Repository and Set Up Paths
Clone the repository containing the videos and set up the paths for processing and output.

In [None]:
# Clone the repository (if not already cloned)
import os
if not os.path.exists('text.extraction.paddle.ocr'):
    !git clone https://github.com/DexterMorganAlpha/text.extraction.paddle.ocr.git

# Set up paths
REPO_PATH = 'text.extraction.paddle.ocr'
VIDEOS_PATH = os.path.join(REPO_PATH, 'VIDEOS')
OUTPUT_CSV = os.path.join(REPO_PATH, 'extracted_texts.csv')

# Ensure the output directory exists
os.makedirs(REPO_PATH, exist_ok=True)

## 3. Import Required Libraries
Import all necessary libraries for video processing, OCR, and CSV handling.

In [None]:
import os
import cv2
import csv
import re
import numpy as np
import logging
from paddleocr import PaddleOCR

## 4. Initialize PaddleOCR with GPU Support
Create the PaddleOCR object for English text. No device argument is passed, so the choice between GPU and CPU is left to PaddleOCR.

In [None]:
# Build the shared OCR engine that the helper functions use.
# Only the language and the angle-classifier switch are set explicitly.
# Per the original author's note, use_gpu / gpu / use_textline_orientation are
# deliberately NOT passed for PaddleOCR 3.3.1 + paddlepaddle-gpu 2.6.2; GPU vs CPU
# selection is left to PaddleOCR's own detection of the installed paddlepaddle build.
ocr = PaddleOCR(lang='en', use_angle_cls=True)

## 5. Define Helper Functions
Define functions for extracting text from video frames, filtering, saving to CSV, and sorting the CSV file.

In [None]:
def get_reel_number(filename):
    """Return the integer that follows ``Video_`` in *filename*.

    Args:
        filename: File name (or path) to inspect.

    Returns:
        The digits after the first ``Video_`` occurrence as an int, or None
        when the name contains no ``Video_<digits>`` sequence.
    """
    found = re.search(r"Video_(\d+)", filename)
    if found is None:
        return None
    return int(found.group(1))

def sort_csv(csv_file_path):
    """Rewrite a results CSV with its rows ordered by the ``Reel Number`` column.

    Rows are sorted numerically by reel number. Rows whose reel number is blank
    or not an integer (for example, a video whose name had no number, saved as
    an empty cell) are kept and placed after the numeric rows in their original
    relative order, instead of aborting the whole sort.

    Args:
        csv_file_path: Path of the CSV file to sort in place.

    Returns:
        None. Prints a message and leaves the file system untouched when the
        file does not exist.
    """
    if not os.path.exists(csv_file_path):
        print(f"CSV file {csv_file_path} does not exist. Skipping sorting.")
        return

    def reel_sort_key(row):
        # (0, n) for numeric reel numbers, (1, 0) otherwise, so blanks sort last;
        # sorted() is stable, which preserves their original order.
        try:
            return (0, int(row.get('Reel Number') or ''))
        except (TypeError, ValueError):
            return (1, 0)

    with open(csv_file_path, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        # Reuse the header found in the file; fall back to the standard
        # two-column header when the file is completely empty.
        fieldnames = reader.fieldnames or ['Reel Number', 'Text']
        sorted_data = sorted(reader, key=reel_sort_key)
    with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(sorted_data)

def save_text_to_csv(csv_file_path, reel_number, extracted_text):
    """Append one ``(reel_number, extracted_text)`` row to the results CSV.

    The ``Reel Number, Text`` header is written first whenever the file does
    not exist yet or exists but is empty, so the file always starts with a
    header row.

    Args:
        csv_file_path: Path of the CSV file to append to (created if missing).
        reel_number: Value for the ``Reel Number`` column.
        extracted_text: Value for the ``Text`` column.
    """
    # An existing zero-byte file still needs the header; checking existence
    # alone would leave it header-less.
    needs_header = (
        not os.path.exists(csv_file_path)
        or os.path.getsize(csv_file_path) == 0
    )
    with open(csv_file_path, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        if needs_header:
            writer.writerow(['Reel Number', 'Text'])
        writer.writerow([reel_number, extracted_text])
    # Report only after the file has been closed, i.e. the row is flushed.
    print(f"Saved text for reel {reel_number} to {csv_file_path}")

def extract_text_from_white_area(frame):
    """Run OCR on a frame and return the confidently recognised text.

    The region passed to the OCR engine currently spans the whole frame.
    Recognised strings whose score is above 0.5 are joined with single spaces.

    Args:
        frame: Image array exposing ``.shape`` and 2-D slicing
            (presumably a frame returned by OpenCV - confirm with the caller).

    Returns:
        The kept text as one stripped string; empty when nothing was kept.
    """
    height, width = frame.shape[:2]
    # Full-frame region: rows 0..height, columns 0..width.
    region = frame[0:height, 0:width]
    predictions = ocr.predict(region)
    if not (predictions and len(predictions) > 0):
        return ""

    kept = []
    for prediction in predictions:
        # Results may be plain dicts or indexable objects carrying attributes.
        if isinstance(prediction, dict):
            texts = prediction.get('rec_texts', [])
            scores = prediction.get('rec_scores', [])
        elif hasattr(prediction, '__getitem__'):
            texts = getattr(prediction, 'rec_texts', [])
            scores = getattr(prediction, 'rec_scores', [])
        else:
            continue
        for text, score in zip(texts, scores):
            if score > 0.5:
                kept.append(text)
    return " ".join(kept).strip()

def filter_text(complete_text, unnecessary_words=None, unnecessary_patterns=None, remove_before=None, remove_after=None, fallback_text=""):
    """Clean OCR text and return it, or *fallback_text* when nothing is left.

    Steps, in order:
      1. Remove every character that is neither a word character nor whitespace.
      2. Drop whole words listed in *unnecessary_words* (exact, case-sensitive).
         This re-joins the words with single spaces, so line breaks are
         collapsed before step 3 whenever this option is used.
      3. Drop lines matching any regular expression in *unnecessary_patterns*.
      4. For the first keyword in *remove_before* found in the text, discard
         everything before it.
      5. For the first keyword in *remove_after* found in the text, discard
         everything after it.

    Keywords in steps 4 and 5 are matched literally (regex metacharacters are
    escaped), as whole words, ignoring case.

    Args:
        complete_text: Raw text; None is treated as an empty string.
        unnecessary_words: Optional collection of words to remove.
        unnecessary_patterns: Optional iterable of regex patterns.
        remove_before: Optional iterable of keywords (see step 4).
        remove_after: Optional iterable of keywords (see step 5).
        fallback_text: Returned (stripped) when the result is blank.

    Returns:
        The filtered, stripped text, or the stripped fallback text.
    """
    # A missing OCR result is handled like empty text so the fallback applies.
    filtered_text = re.sub(r'[^\w\s]', '', complete_text or "")
    if unnecessary_words:
        words = filtered_text.split()
        filtered_text = " ".join(word for word in words if word not in unnecessary_words)
    if unnecessary_patterns:
        lines = filtered_text.splitlines()
        filtered_text = "\n".join(
            line for line in lines
            if not any(re.search(pattern, line) for pattern in unnecessary_patterns)
        )
    if remove_before:
        for word in remove_before:
            # re.escape: a keyword such as "(" must not be read as regex syntax.
            match = re.search(rf'\b{re.escape(word)}\b', filtered_text, re.IGNORECASE)
            if match:
                filtered_text = filtered_text[match.start():]
                break
    if remove_after:
        for word in remove_after:
            match = re.search(rf'\b{re.escape(word)}\b', filtered_text, re.IGNORECASE)
            if match:
                filtered_text = filtered_text[:match.end()]
                break
    if not filtered_text.strip():
        filtered_text = fallback_text
    return filtered_text.strip()

## 6. Process All Videos and Save Results
Iterate through the video files, run OCR on the first frame of each one, log the reel number and extracted text, and save the results to the CSV.

In [None]:
# Remove existing CSV if present so every run starts from a clean file
if os.path.exists(OUTPUT_CSV):
    os.remove(OUTPUT_CSV)

# Post-processing options handed to filter_text for every video.
unnecessary_words = ["W", "WA"]
unnecessary_patterns = []
remove_before = []
remove_after = []
# Text stored when filtering leaves nothing usable.
custom_text = "If only there is a page dedicated"

video_files = [f for f in os.listdir(VIDEOS_PATH) if f.endswith('.mp4') and f.startswith('Video')]
# Files without a parsable reel number sort first (key 0).
video_files.sort(key=lambda x: get_reel_number(x) or 0)

for filename in video_files:
    reel_number = get_reel_number(filename)
    input_video_path = os.path.join(VIDEOS_PATH, filename)
    cap = cv2.VideoCapture(input_video_path)
    try:
        if not cap.isOpened():
            print(f"Failed to open video file: {input_video_path}")
            continue
        # Only the first frame is used for OCR.
        ret, frame = cap.read()
    finally:
        # Release the capture on every path, including when a later step raises.
        cap.release()
    if not ret:
        print(f"Failed to read the first frame of video: {input_video_path}")
        continue
    white_area_text = extract_text_from_white_area(frame)
    filtered_text = filter_text(white_area_text, unnecessary_words, unnecessary_patterns, remove_before, remove_after, custom_text)
    print(f"Reel {reel_number}: {filtered_text}")
    save_text_to_csv(OUTPUT_CSV, reel_number, filtered_text)

sort_csv(OUTPUT_CSV)

## 7. Display Extracted Texts from CSV
Read and display the extracted texts for verification.

In [None]:
import pandas as pd

# Show the saved results for verification, or report that no CSV was produced.
if not os.path.exists(OUTPUT_CSV):
    print(f"CSV file {OUTPUT_CSV} not found.")
else:
    df = pd.read_csv(OUTPUT_CSV)
    display(df)