In [144]:
!pip install PyPDF2
!pip install pymupdf Pillow pytesseract



# File Reading: #

In [1]:
from PyPDF2 import PdfReader
import numpy as np
import re

def header_filter(text):
    if "\n" not in text:
        return text
    return text[text.find("\n", 1):]

def number_reference_filter(text):
    regex = r'\s*\(.*?\)'
    return re.sub(regex, '', text)

def page_number_filter(text):
    lines = text.strip().split('\n')
    if re.match(r'^\d+$', lines[-1]):
        lines = lines[:-1]
    
    return '\n'.join(lines)

def dot_filter(text):
    return text.replace("• ", "")


def multiple_dots_commas_filter(text):
    pattern = r'([.,]){2,}'
    result = re.sub(pattern, '', text)
    return result

def read(file_path, start, end):
    result = []
    
    with open(file_path, 'rb') as file:
        pdf_reader = PdfReader(file)
        num_pages = len(pdf_reader.pages)

        if end > num_pages:
            print("end number outside of total pages")
            return None
        
        for page_num in range(start, end):
            page = pdf_reader.pages[page_num]
            text = page.extract_text()

            text = header_filter(text)
            text = dot_filter(text)
            text = number_reference_filter(text)
            text = page_number_filter(text)
            text = multiple_dots_commas_filter(text)
            
            
            result.append(text)
            
    return np.array(result)


# Text Processing: #

In [2]:
import numpy as np
    
def line_filter(text):
    return text.replace("-\n", "").replace("\n", " ")


def figure_filter(text):
    pattern = r'\(?FIGURES? \d+(\.\d+)*(, \d+(\.\d+)*)*\)?'
    text = re.sub(pattern, '', text)
    text = re.sub(r'\s{2,}', ' ', text).strip()

    pattern = r'\(?Figures? \d+(\.\d+)*(, \d+(\.\d+)*)*\)?'
    result = re.sub(pattern, '', text)
    result = re.sub(r'\s{2,}', ' ', result).strip()
    return result

In [3]:
def convert(dictionary, texts):

    sections = []
    chapters = []
    
    current_section = ""
    for page_content in texts:
        for line in page_content.split("\n"):
            if line.replace(" ", "")[0:6] == "FIGURE":
                continue

            
            if line in dictionary.keys():
                current_section = figure_filter(current_section)
                sections.append(current_section)
    
                if type(dictionary[line]) == dict:
                    chapters = dictionary[line].keys()
                else:
                    chapters = []
                    
                current_section = ""
                continue
    
            if line in chapters:
                continue
    
            if line and line[-1] == "-":
                current_section += line
                
            else:
                current_section += line + " "
    
    
    sections.append(current_section)
    sections.pop(0)
    return sections


# Final Pipeline #

In [8]:
textbook = "Histology for Pathologists 4.pdf"
chapters = [3, 31, 67, 85, 107, 131, 179, 211, 233, 261, 295, 343, 375, 399, 433, 461, 477, 505, 541, 563, 585, 605, 633, 647, 673, 697, 709, 733, 759, 777, 819, 835, 849, 891, 971, 987, 1003, 1027, 1045, 1059, 1071, 1119, 1149, 1185, 1209, 1231, 1255, 1277]

import json

with open('TOC.json', 'r') as json_file:
    sections = json.load(json_file)

In [9]:
results = []
for i in range(len(sections)):
    starting_page = chapters[i] + 26
    ending_page = chapters[i+1] + 26

    texts = read(textbook, starting_page - 1, ending_page - 1)
    results.append(convert(sections[i], texts))



# File Output #

In [10]:
def write_to_file(file_path, text):
    with open(file_path, 'w') as file:
        file.write(text)


In [12]:
for i in range(len(results)):
    for j in range(len(results[i])):
        print(i, j)
        write_to_file("outputs/Chapter_" + str(i+1) + "_" + list(sections[i].keys())[j] + ".txt", results[i][j])


0 0
0 1
0 2
0 3
0 4
0 5
0 6
0 7
1 0
1 1
1 2
1 3
1 4
1 5
1 6
1 7
1 8
1 9
1 10
1 11
2 0
2 1
2 2
2 3
2 4
2 5
2 6
2 7
2 8
3 0
3 1
4 0
4 1
4 2
4 3
4 4
4 5
5 0
5 1
5 2
5 3
5 4
5 5
6 0
6 1
6 2
6 3
6 4
6 5
6 6
6 7
6 8
6 9
7 0
7 1
7 2
7 3
7 4
7 5
7 6
7 7
7 8
7 9
7 10
7 11
8 0
9 0
9 1
9 2
9 3
9 4
10 0
10 1
10 2
10 3
10 4
10 5
11 0
11 1
11 2
11 3
11 4
12 0
12 1
12 2
12 3
12 4
12 5
12 6
12 7
12 8
12 9
12 10
12 11
12 12
13 0
13 1
13 2
13 3
13 4
13 5
13 6
13 7


IndexError: list index out of range