In [20]:
from PyPDF2 import PdfReader
import fitz  
import os
import shutil
import re
import pandas as pd


# Read the text of PDF pages start..end-1 (0-based; end is exclusive) and save every image on those pages into image_folder, which is wiped and recreated first.

def read(file_path, start, end, image_folder):
    """Extract page text and embedded images from a PDF.

    Args:
        file_path: Path to the PDF file.
        start: 0-based index of the first page to process (inclusive).
        end: 0-based index at which to stop (exclusive). Must not exceed the
            number of pages in the PDF.
        image_folder: Directory that receives the extracted images. It is
            deleted and recreated on every successful call, so any previous
            contents are lost.

    Returns:
        A list holding the extracted text of each processed page, in page
        order, or None (after printing a message) when ``end`` is larger
        than the page count.

    Images are written as ``page_{N}_{i}.png`` where N is the 1-based page
    number (``page_num + 1``) and i is the image's position in that page's
    image list.
    """
    result = []

    with open(file_path, 'rb') as file:
        pdf_reader = PdfReader(file)
        num_pages = len(pdf_reader.pages)
        if end > num_pages:
            print("end number outside of total pages")
            return None

        # Start from an empty folder so stale images from a previous run
        # cannot be mistaken for this run's output.
        if os.path.exists(image_folder):
            shutil.rmtree(image_folder)
        os.makedirs(image_folder)

        # Open the PyMuPDF document once for the whole run (instead of once
        # per page) and make sure it is closed even if extraction fails.
        image_doc = fitz.open(file_path)
        try:
            for page_num in range(start, end):
                # Text extraction (PyPDF2)
                page = pdf_reader.pages[page_num]
                result.append(page.extract_text())

                # Image extraction (PyMuPDF)
                for i, img in enumerate(image_doc.get_page_images(page_num)):
                    xref = img[0]
                    pix = fitz.Pixmap(image_doc, xref)
                    # Convert to RGB before saving as PNG.
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                    image_name = "page_{}_{}.png".format(page_num + 1, i)
                    pix.save(os.path.join(image_folder, image_name))
        finally:
            image_doc.close()

    return result

In [21]:
# Source PDF and the slice of it to process. read() takes 0-based page
# indices with an exclusive end, so (1, 200) covers indices 1 through 199.
textbook = "Sternberg.pdf"
contents = read(textbook, 1, 200, "images")

# read() signals an out-of-range end page by printing a message and returning
# None; stop here with a clear error instead of failing in a later cell.
if contents is None:
    raise ValueError(
        "read() returned no contents for {!r}; check the requested page range".format(textbook)
    )

In [22]:
# Collect figure captions from the extracted page texts.
#   data: one [caption_text, output_image_path] row per figure found
#   d:    page number -> figure labels (e.g. "Figure 2.3.") in order of appearance

# A caption starts at "Figure <n>.<m>. " and runs until the next line that
# begins with an uppercase letter or digit, or until the end of the page text.
FIGURE_BLOCK_RE = re.compile(r'(Figure \d+\.\d+\. [\s\S]*?)(?=\n[A-Z0-9]|\Z)')
FIGURE_LABEL_RE = re.compile(r'Figure \d+\.\d+\.')

# contents[0] is the page at 0-based index 1 (the `start` passed to read()),
# and read() names image files with 1-based page numbers, hence 2.
FIRST_PAGE_NUMBER = 2

data = []
d = {}
for page_number, page_text in enumerate(contents, start=FIRST_PAGE_NUMBER):
    figures = FIGURE_BLOCK_RE.findall(page_text)
    # Collapse each caption onto one line and undo "- " hyphenation breaks.
    figures = [' '.join(s.splitlines()).strip().replace("- ", "") for s in figures]

    if not figures:
        continue

    labels = []
    for figure in figures:
        figure_number = FIGURE_LABEL_RE.search(figure).group(0)
        # Strip only the leading label; later "Figure x.y." mentions inside
        # the caption are part of its text and must be kept.
        figure_desc = re.sub(r'Figure \d+\.\d+\.\s*', '', figure, count=1)
        # The label ends with ".", so appending "png" yields e.g. "Figure_2.3.png".
        data.append([figure_desc, ("outputs/" + figure_number + "png").replace(" ", "_")])
        labels.append(figure_number)

    d[page_number] = labels

In [23]:
# Publish figure images under their figure label. The i-th figure label found
# on a page is paired with the i-th image extracted from that page.
if os.path.exists("outputs"):
    shutil.rmtree("outputs")
os.makedirs("outputs")

# Figure labels for which no extracted image exists at the expected position.
missing = []
for page, figs in d.items():
    for i, fig in enumerate(figs):
        src = "images/page_{}_{}.png".format(page, i)
        # The label ends with ".", so this yields e.g. "outputs/Figure_2.3.png".
        dst = "outputs/{}png".format(fig.replace(" ", "_"))
        if os.path.exists(src):
            # Copy rather than move: "outputs" is wiped at the top of this
            # cell, so moving the files would make a re-run of this cell
            # lose every image and report all figures as missing.
            shutil.copyfile(src, dst)
        else:
            missing.append(fig)

In [24]:
# Figures whose image could not be located on the expected page.
print(missing)

# Name the columns in the constructor: assigning df.columns afterwards raises
# a length-mismatch ValueError when `data` is empty (the frame has 0 columns).
df = pd.DataFrame(data, columns=["text", "file_path"])

# One JSON object per line: {"text": <caption>, "file_path": <image path>}.
df.to_json('output.jsonl', orient='records', lines=True)

['Figure 2.3.', 'Figure 2.5.', 'Figure 2.4.', 'Figure 2.6.']
