In [1]:
import requests
import pdf2image 
import numpy as np
from PIL import Image
import pytesseract
import re
from IPython.display import display
from tqdm import tqdm
import pandas as pd
import os
import json
import matplotlib.pyplot as plt
import unidecode
import io
import PyPDF2
import bs4 as bs
import os
import string
from itertools import product

In [2]:
# Regex over text in which line breaks appear as the literal two characters
# backslash + "n". It matches a "(" entry followed by four integers (captured
# as group 1), one or more intermediate five-field entries (captured together
# as group 2), and a closing ")" entry followed by four integers.
# NOTE(review): this looks like Tesseract box-file output with escaped
# newlines — confirm.
# NOTE(review): the name `pattern` is rebound to a different, compiled regex in
# cell In [142], and `compiled_pattern` is not referenced anywhere else in the
# visible notebook — confirm whether this cell is still needed.
pattern = r" 0\\n\( (\d+ \d+ \d+ \d+) ((?:0\\n\d+ \d+ \d+ \d+ \d+ )+)0\\n\) \d+ \d+ \d+ \d+"
compiled_pattern = re.compile(pattern)

def get_num_pages(pdf):
    """Count the pages of a PDF held in memory.

    `pdf` is any object whose `.content` attribute holds the raw PDF bytes
    (the notebook passes a `requests` response). The bytes are wrapped in an
    in-memory buffer and handed to `PyPDF2.PdfReader`; the length of its
    `pages` collection is returned.
    """
    reader = PyPDF2.PdfReader(io.BytesIO(pdf.content))
    return len(reader.pages)

In [142]:
# Formula label: a parenthesised integer such as "(12)", captured including
# the parentheses. NOTE: this rebinds `pattern`, which cell In [2] defined as
# a different (string) regex.
pattern = re.compile(r"(\(\d+\))")
# Four space-separated integers followed by ";". extract_formulas applies it
# to a matched tag's "title" attribute and reads the four integers as
# x_0, y_0, x_1, y_1.
pattern2 = re.compile(r"(\d+ \d+ \d+ \d+);")
# Section heading: a decimal number such as "1.2", then (lazily) up to five
# whitespace-separated tokens, then one of the listed keywords. The whole
# heading is one capturing group, so re.split keeps the headings in its output.
# NOTE(review): re.MULTILINE has no effect here (the pattern contains no ^ or $).
pattern_3 = re.compile(r"(\d+\.\d+(?:\s+[^ \n]+){0,5}?\s+(?:Theorem|Definition|Definitions|Proposition|Corollary|Remark|Lemma))", re.MULTILINE)

def get_text(soup):
    """Return the text of every `ocrx_word` span under `soup`, space-joined.

    Words are taken in the order `find_all` yields them. An element with no
    such spans produces the empty string.
    """
    word_spans = soup.find_all('span', class_='ocrx_word')
    return ' '.join(span.get_text() for span in word_spans)

def get_split(text, initial = 'None'):
    """Split page text into chunks keyed by the section heading they follow.

    `text` is cut at every match of the module-level `pattern_3`. Text that
    appears before the first heading is filed under `initial` (the heading
    carried over from the previous page, or the string 'None').

    Returns
    -------
    (text_chunks, current_key)
        `text_chunks` maps heading -> concatenated text found under it.
        `current_key` is the heading in effect at the end of `text`, i.e. the
        value to pass as `initial` for the next page. It is always a key of
        `text_chunks`, because re.split always yields a (possibly empty)
        trailing text piece after the last heading.
    """
    text_chunks = {}
    current_key = initial
    # pattern_3 has a capturing group, so re.split returns the headings
    # interleaved with the text between them.
    for piece in re.split(pattern_3, text):
        if pattern_3.match(piece):
            current_key = piece  # a heading: following text belongs to it
        else:
            text_chunks[current_key] = text_chunks.get(current_key, '') + piece
    # Return the heading actually in effect, not the last key *inserted* into
    # the dict: the two differ when an earlier heading shows up again later
    # on the same page.
    return text_chunks, current_key

def get_key(text_chunks, match):
    """Return the first key of `text_chunks` whose value contains `match`.

    Keys are checked in the dict's iteration order using the `in` operator on
    each value. If no value contains `match`, the result is None.
    """
    hits = (heading for heading, body in text_chunks.items() if match in body)
    return next(hits, None)
        
def custom_selector(tag):
    """BeautifulSoup filter selecting hOCR words that look like formula labels.

    A tag is accepted when all of the following hold:
      * it is exactly a `bs.element.Tag` named "span" with class "ocrx_word";
      * both its previous and next siblings are the string "\\n";
      * its string starts with a match of the module-level `pattern`;
      * the first tag after it in the document has this tag as its nearest
        preceding tag (the original "first child" sanity check).

    Returns True or False; intended to be passed to `soup.find_all`.
    """
    if type(tag) is not bs.element.Tag:
        return False
    # A span without any class attribute yields None from .get(); treat it as
    # "no classes" instead of raising TypeError on the membership test.
    classes = tag.get("class") or []
    if tag.name != "span" or "ocrx_word" not in classes:
        return False
    # previous_sibling is the supported spelling of the deprecated
    # previousSibling alias.
    if tag.previous_sibling != '\n' or tag.next_sibling != '\n':
        return False
    if not re.match(pattern, str(tag.string)):
        return False
    # Cheap checks are done first; only now walk the tree. find_next() and
    # find_previous() return the same element as find_all_next()[0] and
    # find_all_previous()[0] without materialising the whole rest of the page.
    following = tag.find_next()
    if following is None:
        return False
    return following.find_previous() == tag

def fn(x):
    """Binarisation threshold: 255 for values of at least 127.5, otherwise 0."""
    if x >= 127.5:
        return 255
    return 0

def extract_formulas(image_, chapter, initial, output_dir='real_and_complex', margin=120):
    """OCR one page and save a cropped strip for every numbered formula on it.

    The page is binarised with `fn`, passed to Tesseract for hOCR output and
    split into sections with `get_split`. Every word accepted by
    `custom_selector` is treated as a formula label; a full-width horizontal
    strip around the label is cut from the original (non-binarised) image and
    written to ``{output_dir}/{section}_{chapter}_{formula_num}.jpg``.

    Parameters
    ----------
    image_ : PIL image of the page.
    chapter : value embedded in the output file name.
    initial : section heading carried over from the previous page (the string
        'None' when there is none); text before the first heading on this page
        is attributed to it.
    output_dir : directory for the crops; created if missing. The default
        keeps the original hard-coded location.
    margin : number of pixel rows kept above and below the label's bounding
        box (originally hard-coded to 120).

    Returns
    -------
    The section heading in effect at the end of this page, to be passed as
    `initial` for the next page.
    """
    image = image_.convert('L').point(fn, mode='1')
    page_width = image.size[0]
    hocr = pytesseract.image_to_pdf_or_hocr(image, extension='hocr')
    soup = bs.BeautifulSoup(hocr, 'html.parser')
    # Keep the page-level carry-over in its own variable. The original reused
    # `initial` for the per-formula section as well, so the value handed to the
    # next page was whatever the last formula resolved to (possibly None).
    text_chunks, last_heading = get_split(get_text(soup), initial)
    im = np.asarray(image_)
    os.makedirs(output_dir, exist_ok=True)

    prev = last_heading
    for match in soup.find_all(custom_selector):
        bbox = pattern2.search(match.get("title") or "")
        if bbox is None:
            continue  # no bounding box in the title attribute: nothing to crop
        x_0, y_0, _, y_1 = (int(v) for v in bbox.group(1).split(' '))
        if x_0 < page_width / 4:  # extra check to make sure it doesn't misclassify an i for a 1
            continue

        match_text = get_text(match.find_parent('p'))
        section = get_key(text_chunks, match_text)
        # get_key returns None (not the string 'None') when the paragraph is
        # not found in any chunk. In both "unknown" cases fall back to the
        # previously resolved section. The original computed this fallback and
        # then unconditionally overwrote it with a second get_key() call.
        if section in (None, 'None') and prev != 'None':
            section = prev
        prev = section

        formula_num = int(''.join(c for c in str(match.string) if c.isdigit()))
        # Clamp the top edge: a negative start index would wrap around to the
        # bottom of the page instead of stopping at the first row.
        top = max(0, y_0 - margin)
        final_image = Image.fromarray(im[top:y_1 + margin, :])
        fname = f'{output_dir}/{section}_{chapter}_{formula_num}.jpg'
        if os.path.isfile(fname):  # multiple formulas named the same in the same chapter
            continue
        final_image.save(fname)
    return last_heading

In [4]:
# Download locations of the scanned textbooks, keyed by a short book name.
books = dict(
    principles_of_mathematical_analysis='https://download.tuxfamily.org/openmathdep/analysis_real/Principles_of_Mathematical_Analysis-Rudin.pdf',
    real_and_complex_analysis='https://perso.telecom-paristech.fr/decreuse/_downloads/c22155fef582344beb326c1f44f437d2/rudin.pdf',
    functional_analysis='https://www.mymathscloud.com/api/download/modules/University/Textbooks/functional-analysis/Functional%20Analysis%20Rudin.pdf?id=48928539',
)

In [5]:
url = books['real_and_complex_analysis']
# .content is read in full below, so streaming buys nothing. Fail early on an
# HTTP error instead of handing an error page to the PDF parser.
pdf = requests.get(url, timeout=60)
pdf.raise_for_status()

# Rasterise the PDF in up to 10 batches so progress can be shown.
# pdf2image's first_page/last_page are 1-based and inclusive; the previous
# 0-based range asked for a page 0 and never requested the final page.
page_numbers = np.arange(1, get_num_pages(pdf) + 1)
page_batches = [(int(batch[0]), int(batch[-1]))
                for batch in np.array_split(page_numbers, 10)
                if len(batch)]  # array_split yields empty batches for PDFs under 10 pages

# images[i] is the rendering of PDF page i + 1.
images = []
for first_page, last_page in tqdm(page_batches):
    images.extend(pdf2image.convert_from_bytes(pdf.content, first_page=first_page,
                    last_page=last_page, thread_count=10, grayscale=True, dpi = 500))

100%|█████████████████████████████████████████████████████████████████████████| 10/10 [00:50<00:00,  5.03s/it]


In [143]:
"""chapters = {1:(1,21), 2: (24,43), 3: (47,78), 4:(83,98), 5:(103, 114),\
6:(120,138), 7:(143, 165), 8:(172,196), 9:(204, 239), 10:(245, 288), 11:(300,332)}
chapters = {k:(v[0]+9, v[1]+9) for k,v in chapters.items()}
"""
# Inclusive (first, last) page numbers of each chapter before the offset is
# applied.
chapter_page_ranges = {1:(5,31), 2: (33,57), 3: (61,71), 4:(76,92),
            5:(95, 112), 6:(116,132), 7:(135, 156), 8:(160,174), 9:(178, 193), 10:(196, 227),
            11:(231,249), 12: (253, 264), 13: (266, 276), 14: (278, 293), 15: (298, 315),
           16: (319, 332), 17: (335, 352), 18: (356, 369), 19: (371, 383), 20: (386, 394)}
# Shift added to both ends of every range; the shifted numbers are what the
# driver loop uses to index `images`.
# NOTE(review): presumably the count of front-matter pages in this scan — confirm.
PAGE_OFFSET = 14
chapters = {chapter: (first + PAGE_OFFSET, last + PAGE_OFFSET)
            for chapter, (first, last) in chapter_page_ranges.items()}


In [None]:
# Run formula extraction over every page of every chapter, reusing a single
# progress bar that is reset at the start of each chapter.
with tqdm(position = 0, leave = True) as pbar:
    for chapter, (start, end) in chapters.items():
        # Section headings are not carried across chapter boundaries.
        initial = 'None'
        pbar.reset(total = end - start + 1)
        pbar.set_description(f"Processing chapter {chapter}")
        for page in range(start, end+1):
            # The heading returned for one page is the starting heading for
            # the next page of the same chapter.
            initial = extract_formulas(images[page], chapter, initial)
            pbar.update(1)
        

Processing chapter 1:  67%|██████████████████████████████████                 | 18/27 [00:41<00:18,  2.07s/it]