In [73]:
from PyPDF2 import PdfReader
import fitz  
import os
import shutil
import re
import pandas as pd
import cv2
import numpy as np


def read(file_path, start, end, image_folder):
    """Extract page text and embedded images from a slice of a PDF.

    Pages are addressed by zero-based index over the half-open range
    ``[start, end)``. Text is extracted with PyPDF2; images are extracted
    with PyMuPDF and written to ``image_folder`` as
    ``page_<index + 1>_<n>.png``, where ``n`` is the image's position in
    the page's image list.

    ``image_folder`` is deleted and recreated on every call that passes the
    page-count check, so it only ever holds the images of the latest run.

    Args:
        file_path: Path to the PDF file.
        start: Zero-based index of the first page to read.
        end: Zero-based index one past the last page to read.
        image_folder: Directory that receives the extracted PNG files.

    Returns:
        A list holding the extracted text of each page, in page order, or
        ``None`` (after printing a message) when ``end`` exceeds the number
        of pages in the document.
    """
    with open(file_path, 'rb') as file:
        pdf_reader = PdfReader(file)
        num_pages = len(pdf_reader.pages)
        if end > num_pages:
            print("end number outside of total pages")
            return None

        # Start from an empty folder so stale images from a previous run
        # can never be mistaken for output of this one.
        if os.path.exists(image_folder):
            shutil.rmtree(image_folder)
        os.makedirs(image_folder)

        result = []

        # Open the PyMuPDF document once for the whole page range and make
        # sure it is closed again, instead of re-opening (and leaking) it
        # for every page.
        image_doc = fitz.open(file_path)
        try:
            for page_num in range(start, end):
                # Text extraction
                result.append(pdf_reader.pages[page_num].extract_text())

                # Image extraction
                for i, img in enumerate(image_doc.get_page_images(page_num)):
                    xref = img[0]
                    # Convert to RGB before saving as PNG.
                    pix = fitz.Pixmap(fitz.csRGB, fitz.Pixmap(image_doc, xref))
                    pix.save(os.path.join(
                        image_folder, f"page_{page_num + 1}_{i}.png"))
        finally:
            image_doc.close()

    return result

def process(textbook, start_page, end_page):
    """Pair figure captions with extracted images for one textbook.

    The pages ``[start_page, end_page)`` (zero-based indices, as understood
    by ``read``) are scanned for captions of the form ``Fig. <n>.<m> ...``.
    The n-th caption found on a page is matched with the n-th image that
    ``read`` extracted from that page.

    Outputs, all below ``results/<book name>/``:
        * ``images/<figure number>.png`` - the matched images (spaces in
          the figure number are replaced by underscores),
        * ``output.jsonl`` - one record per caption with the keys ``text``
          and ``file_path``,
        * ``missing.txt`` - figure numbers for which no image was found.

    The scratch folder ``temp`` is removed when processing is finished.

    Args:
        textbook: Path of the PDF; the name without ``.pdf`` is used as the
            book name in the output paths.
        start_page: Zero-based index of the first page to process.
        end_page: Zero-based index one past the last page to process.

    Raises:
        ValueError: If ``read`` could not read the requested page range.
    """
    def remove_pdf_extension(filename):
        """Return ``filename`` without a trailing ``.pdf`` (any case)."""
        if filename.lower().endswith(".pdf"):
            return os.path.splitext(filename)[0]
        return filename

    book_name = remove_pdf_extension(textbook)
    contents = read(textbook, start_page, end_page, "temp")
    if contents is None:
        # read() has already printed the reason; fail with a clear message
        # rather than with a TypeError further down.
        raise ValueError(
            f"could not read pages {start_page}-{end_page} of {textbook}")

    # The dot after "Fig" is escaped so that it only matches a literal ".".
    caption_pattern = r'(Fig\. \d+\.\d+ [\s\S]*?)(?=\n[A-Z0-9]|\Z)'
    number_pattern = r'Fig\. \d+\.\d+'

    data = []
    figures_by_page = {}
    for i, page_text in enumerate(contents):
        # Collapse each caption onto one line and undo "- " hyphenation
        # left over from line breaks.
        captions = [
            ' '.join(s.splitlines()).strip().replace("- ", "")
            for s in re.findall(caption_pattern, page_text)
        ]
        if not captions:
            continue

        # read() names images "page_<index + 1>_<n>.png" and contents[i]
        # belongs to page index start_page + i, so this is the page number
        # used in the image file names.
        page_number = start_page + i + 1

        page_figures = []
        for caption in captions:
            figure_number = re.search(number_pattern, caption).group(0)
            figure_desc = re.sub(number_pattern + r'\s*', '', caption)

            # Must match the name the image is saved under below.
            image_path = ("images/" + figure_number + ".png").replace(" ", "_")
            data.append([figure_desc, image_path])
            page_figures.append(figure_number)
        figures_by_page[page_number] = page_figures

    images_dir = os.path.join("results", book_name, "images")
    if os.path.exists(images_dir):
        shutil.rmtree(images_dir)
    os.makedirs(images_dir)

    missing = []
    for page, figs in figures_by_page.items():
        for i, fig in enumerate(figs):
            source = os.path.join("temp", f"page_{page}_{i}.png")
            if os.path.exists(source):
                target = os.path.join(
                    images_dir, fig.replace(" ", "_") + ".png")
                os.rename(source, target)
            else:
                missing.append(fig)

    shutil.rmtree("temp")

    # Passing the column names to the constructor also works when no
    # figure was found and ``data`` is empty.
    df = pd.DataFrame(data, columns=["text", "file_path"])
    df.to_json(os.path.join("results", book_name, "output.jsonl"),
               orient='records', lines=True)

    with open(os.path.join("results", book_name, "missing.txt"), "w") as file:
        file.writelines(f"{item}\n" for item in missing)


def process_no_list(textbook, start_page, end_page):
    """Crop large regions out of every image extracted from a textbook.

    ``read`` first dumps the embedded images of pages
    ``[start_page, end_page)`` into the folder ``temp``. Each of those
    images is then segmented with OpenCV, and every detected region wider
    and taller than 100 pixels is saved to
    ``results/<book name>/<image name>/extracted_image_<n>.png``.

    Args:
        textbook: Path of the PDF; the name without ``.pdf`` is used as the
            book name in the output paths.
        start_page: Zero-based index of the first page to process.
        end_page: Zero-based index one past the last page to process.

    Raises:
        ValueError: If ``read`` could not read the requested page range.
    """
    def extract_images_from_page(image_path, output_folder):
        """Save the large regions of one image; return how many were saved."""
        os.makedirs(output_folder, exist_ok=True)

        img = cv2.imread(image_path)
        if img is None:
            print(f"Could not read the image file: {image_path}")
            return 0

        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # Inverted adaptive threshold, then a 5x5 dilation before contour
        # detection.
        thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                       cv2.THRESH_BINARY_INV, 11, 2)
        kernel = np.ones((5, 5), np.uint8)
        dilated = cv2.dilate(thresh, kernel, iterations=1)
        contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        image_count = 0
        for contour in contours:
            x, y, w, h = cv2.boundingRect(contour)

            # Skip regions that are not larger than 100 px in both directions.
            if w > 100 and h > 100:
                cropped_image = img[y:y+h, x:x+w]
                image_filename = os.path.join(output_folder, f"extracted_image_{image_count}.png")
                cv2.imwrite(image_filename, cropped_image)
                image_count += 1

        return image_count

    def process_directory(input_directory, output_directory):
        """Run the region extraction on every image file in a directory."""
        for filename in os.listdir(input_directory):
            if filename.lower().endswith(('.png', '.jpg', '.jpeg')):
                file_path = os.path.join(input_directory, filename)
                # One output sub-folder per source image, named after it.
                specific_output_folder = os.path.join(output_directory, os.path.splitext(filename)[0])
                os.makedirs(specific_output_folder, exist_ok=True)
                extract_images_from_page(file_path, specific_output_folder)

    def remove_pdf_extension(filename):
        """Return ``filename`` without a trailing ``.pdf`` (any case)."""
        if filename.lower().endswith(".pdf"):
            return os.path.splitext(filename)[0]
        return filename

    book_name = remove_pdf_extension(textbook)

    # read() is called for its side effect of filling "temp" with the page
    # images. It returns None without touching "temp" when the page range
    # is invalid, in which case "temp" is missing or holds stale images.
    if read(textbook, start_page, end_page, "temp") is None:
        raise ValueError(
            f"could not read pages {start_page}-{end_page} of {textbook}")

    process_directory("temp", "results/" + book_name)

    


In [74]:
# NOTE: "Sternberg.pdf" is disabled here; it was flagged as still needing a
# fix for missing output.
textbook = "Histology.pdf"

# Extract the embedded images of the selected page range and crop their
# large regions into results/<book name>/.
process_no_list(textbook, start_page=1, end_page=100)

In [None]:
import os
import glob

# Every PDF in the working directory is treated as a textbook; only the bare
# file name is passed on to process().
current_directory = os.getcwd()
pdf_files = glob.glob(os.path.join(current_directory, "*.pdf"))
textbooks = list(map(os.path.basename, pdf_files))

# Textbooks whose processing raised; reported once the batch is finished so
# that a single bad file does not stop the run.
failed_textbooks = []

for textbook in textbooks:
    try:
        print(f"Processing {textbook}...")
        process(textbook, 1, 170)
    except Exception as exc:
        print(f"Failed to process {textbook}: {exc}")
        failed_textbooks.append(textbook)

if failed_textbooks:
    print("\nThe following textbooks could not be processed:")
    print("\n".join(failed_textbooks))
