To use `pytesseract`:  
1. It works only on images, so first convert every PDF page to an image using `pdf2image`.
2. `pytesseract` is just a wrapper; it needs Tesseract itself to be installed on your system, e.g. with `brew install tesseract` on macOS.
3. 

In [None]:
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import os
import csv
import glob
import re
from pathlib import Path
from tqdm import tqdm

def extract_text_from_scanned_pdf(pdf_path, tesseract_cmd=None):
    """OCR every page of a scanned PDF and return the combined text.

    Args:
        pdf_path: Path of the PDF, passed to ``pdf2image.convert_from_path``.
        tesseract_cmd: Optional path to the tesseract executable. When truthy
            it is assigned to ``pytesseract.pytesseract.tesseract_cmd``, a
            module-level setting that stays in effect after this call.

    Returns:
        A single string in which each page's OCR output is preceded by a
        ``--- Page N ---`` header line (N starts at 1).
    """
    if tesseract_cmd:
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    # Tesseract works on images, so render the PDF pages first.
    page_images = convert_from_path(pdf_path)

    # Build the result in one join instead of growing a string page by page.
    return "".join(
        f"\n--- Page {page_number} ---\n{pytesseract.image_to_string(page_image)}"
        for page_number, page_image in enumerate(page_images, start=1)
    )

# Sample scanned PDF for trying extract_text_from_scanned_pdf on a single file
# (only referenced by the commented-out demo below).
pdf_file = 'data/first_year/chemistry/Mid_Sem/13-14 chem3.pdf'
# If tesseract is not in PATH, specify its location, e.g. '/usr/local/bin/tesseract'.
# NOTE(review): the value below looks like the Homebrew install location on
# Apple Silicon Macs — adjust for your machine.
tesseract_path = "/opt/homebrew/bin/tesseract"

# Uncomment to OCR the sample file and print the raw text:
# extracted_text = extract_text_from_scanned_pdf(pdf_file, tesseract_cmd=tesseract_path)
# print(extracted_text)

# Helper to extract year from filename or folder
def extract_academic_year(name):
    """Return the first ``NN-NN`` style year range found in *name*.

    The pattern accepts 2 to 4 digits on each side of the hyphen, so both
    ``13-14`` and ``2013-2014`` are recognised. Returns ``''`` when nothing
    matches.
    """
    found = re.search(r'(\d{2,4}-\d{2,4})', name)
    if found is None:
        return ''
    return found.group(1)

# Helper to extract type of examination from path
def extract_exam_type(path):
    """Classify a path as a mid-term or end-term paper.

    Matching is a case-insensitive substring test, so ``Mid_Sem``, ``mid`` and
    ``MID_SEM`` are all recognised (the previous check only covered the
    ``Mid``/``mid`` and ``End``/``end`` spellings). ``mid`` is tested first, so
    a path containing both markers is reported as a mid-term.

    Args:
        path: File or folder path to inspect.

    Returns:
        ``'Mid Term'``, ``'End Term'``, or ``''`` when neither marker occurs.
    """
    # Lower-case once so every capitalisation of the marker is caught.
    lowered = path.lower()
    if 'mid' in lowered:
        return 'Mid Term'
    if 'end' in lowered:
        return 'End Term'
    return ''

# Helper to extract course and year from path
def extract_course_and_year(path):
    """Derive the course label and year-of-study label from a path.

    Each path segment (split on ``os.sep``) is lower-cased and checked for the
    markers below. When several segments match, the last matching segment
    wins; within one segment the first marker listed wins.

    Returns:
        A ``(course, year)`` tuple; either element is ``''`` if no segment
        contained a matching marker.
    """
    # Order matters: it mirrors the precedence of the checks within a segment.
    course_markers = (('btech', 'B.Tech'), ('bsc', 'B.Sc'))
    year_markers = (
        ('first_year', '1st'),
        ('second_year', '2nd'),
        ('third_year', '3rd'),
        ('fourth_year', '4th'),
    )

    course = ''
    year = ''
    for segment in path.split(os.sep):
        lowered = segment.lower()
        # Keep the previous value when this segment carries no marker.
        course = next((label for marker, label in course_markers if marker in lowered), course)
        year = next((label for marker, label in year_markers if marker in lowered), year)
    return course, year

# Helper to extract subject from path
def extract_subject(path):
    """Return the first path segment that names a subject.

    A segment (split on ``os.sep``) qualifies when its lower-cased form
    contains ``math``, ``chem`` or ``phys``. The segment is returned exactly as
    it appears in the path; ``''`` is returned when no segment qualifies.
    """
    subject_markers = ('math', 'chem', 'phys')
    for segment in path.split(os.sep):
        lowered = segment.lower()
        if any(marker in lowered for marker in subject_markers):
            return segment
    return ''

# Column order of the output CSV; every row appended to output_rows must
# follow this order.
csv_columns = [
    'unique_id', 'academic_year', 'exam_type', 'course', 'year', 'subject', 'topic', 'subtopic', 'question',
]

# Root folder that is searched recursively for PDF files.
data_dir = 'data'
# glob returns matches in arbitrary, filesystem-dependent order. Sort them so
# the unique_id given to each question is reproducible from run to run.
pdf_files = sorted(glob.glob(f'{data_dir}/**/*.pdf', recursive=True))

# Accumulated CSV rows and the next unique_id to hand out.
output_rows = []
unique_id = 1

for pdf_path in tqdm(pdf_files, desc='Processing PDFs'):
    # All metadata is derived from the file's path, not from its contents.
    academic_year = extract_academic_year(pdf_path)
    exam_type = extract_exam_type(pdf_path)
    course, year = extract_course_and_year(pdf_path)
    subject = extract_subject(pdf_path)
    topic = ''  # To be filled later
    subtopic = ''  # To be filled later

    # OCR the PDF. This is best-effort: a file that fails is reported and
    # skipped so one bad scan does not abort the whole batch.
    try:
        text = extract_text_from_scanned_pdf(pdf_path, tesseract_cmd=tesseract_path)
    except Exception as e:
        print(f'Error processing {pdf_path}: {e}')
        continue

    # Split text into questions (naive split by '?', can be improved).
    # Every fragment except the last one was terminated by a '?' in the OCR
    # text, so only those get the '?' restored. The trailing fragment is kept
    # (nothing is dropped) but is not given a '?' it never had.
    *terminated, trailing = text.split('?')
    questions = [fragment.strip() + '?' for fragment in terminated if fragment.strip()]
    trailing = trailing.strip()
    if trailing:
        questions.append(trailing)

    for question in questions:
        output_rows.append([unique_id, academic_year, exam_type, course, year, subject, topic, subtopic, question])
        unique_id += 1

# Persist the header followed by every collected row. newline='' lets the csv
# module control line endings itself.
with open('questions_extracted.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows([csv_columns, *output_rows])
print(f'Extraction complete. {len(output_rows)} questions written to questions_extracted.csv')

Processing PDFs: 100%|██████████| 121/121 [11:24<00:00,  5.66s/it]

Extraction complete. 597 questions written to questions_extracted.csv



