In [1]:
import pymupdf
from tqdm import tqdm
import pandas as pd
import os
from pdf2image import convert_from_path
import pytesseract

In [2]:
def list_pdf_files(directory):
    """Return the names of the PDF files located directly in ``directory``.

    The extension check is case-insensitive, so ``.pdf`` and ``.PDF`` both
    match. Only regular files are returned (a sub-directory whose name happens
    to end in ``.pdf`` is skipped), and the result is sorted so the processing
    order is the same on every run and platform (``os.listdir`` /
    ``os.scandir`` return entries in arbitrary order).

    Parameters
    ----------
    directory : str or os.PathLike
        Directory to scan. The scan is not recursive.

    Returns
    -------
    list of str
        Bare file names (without the directory part), sorted.

    Raises
    ------
    OSError
        If ``directory`` does not exist or cannot be read.
    """
    with os.scandir(directory) as entries:
        return sorted(
            entry.name
            for entry in entries
            if entry.is_file() and entry.name.lower().endswith('.pdf')
        )

# Configuration: input and output paths

In [3]:
# Paths used by the cells below. All three are placeholders and must be filled in before running.
csv_path = ""   # TODO: Path to the csv file with the module handbook metadata (read with sep=";" and latin1 encoding)
dir_path = ""   # TODO: Path to the directory containing the pdf files
out_dir = ""    # TODO: Path to the directory the extracted text files are written to

# Get pdf names and load reference table

In [4]:
# Read the ';'-separated, latin1-encoded reference table and discard every
# row that has a missing value in any column.
reference_table = pd.read_csv(csv_path, encoding="latin1", sep=";").dropna()
reference_table

Unnamed: 0,identifier,study_program,university,type,location,study_form,degree,fields,options,website
0,2,Informatik,Rheinisch-Westfälische Technische Hochschule A...,Universitäten,Aachen / Nordrhein-Westfalen,Vollzeitstudium,Master,Informatik,"Theoretische Informatik,Software und Kommunika...",http://www.rwth-aachen.de/go/id/bcfg/?#aaaaaaa...
1,1,Computational Engineering Science,Rheinisch-Westfälische Technische Hochschule A...,Universitäten,Aachen / Nordrhein-Westfalen,Vollzeitstudium,Bachelor,"Angewandte Mathematik, Informatik, Maschinenbau","Numerik und Simulation,Programmierung und Algo...",http://www.rwth-aachen.de/go/id/bixo/?#aaaaaaa...
2,4,Computational Engineering Science,Rheinisch-Westfälische Technische Hochschule A...,Universitäten,Aachen / Nordrhein-Westfalen,Vollzeitstudium,Master,"Angewandte Mathematik, Informatik, Maschinenbau","Verfahrenstechnik,Numerik und Simulationstechn...",http://www.rwth-aachen.de/cms/root/Studium/Vor...
3,3,Automatisierungstechnik,Rheinisch-Westfälische Technische Hochschule A...,Universitäten,Aachen / Nordrhein-Westfalen,Vollzeitstudium,Master,"Automatisierungstechnik, Fertigungstechnik","Prozesstechnik,Fahrzeugtechnik,Medizintechnik,...",http://www.rwth-aachen.de/go/id/bjtg


In [5]:
pdfs_all = list_pdf_files(dir_path)

# Identifiers listed in the csv, converted to strings once so that every
# membership test below is a cheap set lookup instead of rebuilding the array.
known_identifiers = set(reference_table['identifier'].astype(str))

# File names without their extension. os.path.splitext strips only the final
# extension and works for any casing (".pdf" as well as ".PDF"), which matches
# the case-insensitive filter in list_pdf_files.
pdfs_numbers = [os.path.splitext(name)[0] for name in pdfs_all]
# pdfs_numbers

# use only files, that are present in the csv; the original file name is kept
# as-is so that it still points to a file that actually exists on disk
pdfs = [
    name
    for name, number in zip(pdfs_all, pdfs_numbers)
    if number in known_identifiers
]

## (Optional) Create subfolders for files

In [None]:
#for pdf_file in tqdm(pdfs, desc="Creating dirs"):
#    pdf_dir = pdf_file.replace(".pdf", "")
#    os.makedirs(f"{out_dir}/{pdf_dir}", exist_ok=True)

# Extract content per page (and optionally save it to subfolder)
Also identify which pages are empty and which documents could not be loaded.

In [6]:
# Files that could not be opened or read, and pages without extractable text
# (recorded as "<filename>_page_<1-based page number>").
unopenable_files = []
empty_pages = []

for filename in tqdm(pdfs):
    path = os.path.join(dir_path, filename)
    try:
        with pymupdf.open(path) as document:
            for page_num, page in enumerate(document):
                page_text = page.get_text()
                # A page that yields only whitespace has no usable text either,
                # so it is reported as empty as well.
                if not page_text.strip():
                    empty_pages.append(f"{filename}_page_{page_num+1}")

                #else:
                #    with open(f"{out_dir}/{filename.replace('.pdf', '')}/page_{page_num+1}.txt", "w", encoding="utf-8") as text_file:
                #        text_file.write(page_text)
    except Exception as e:
        # Best effort: remember the file and continue with the next one, but
        # report the reason instead of swallowing the error silently.
        unopenable_files.append(filename)
        tqdm.write(f"Could not process {filename}: {e!r}")

100%|██████████| 4/4 [00:07<00:00,  1.78s/it]


## Analyze empty pages

In [7]:
# Display the recorded "<filename>_page_<n>" entries of pages without extractable text
empty_pages

[]

## Use Pytesseract to extract text from empty pages

In [None]:
pytesseract.pytesseract.tesseract_cmd = ""  # TODO: Path to the tesseract executable
filePath = ""   # TODO: File to convert (.pdf)
outPath = ""    # TODO: Output path
fileNr = ""   # TODO: Number of Modulhandbook

# Render the pdf to images (the loop below treats each element as one page).
doc = convert_from_path(filePath)

# OCR page by page. tqdm wraps the page sequence directly (not an enumerate
# object), so it can determine the total and show a real progress bar.
# NOTE(review): no `lang` is passed to tesseract, so its default language is
# used; the handbooks look German - confirm whether lang="deu" is needed.
page_texts = [
    pytesseract.image_to_string(page_data, config="--psm 3")
    for page_data in tqdm(doc, desc="Pages")
]

# Same layout as before: the text of every page is preceded by a blank line.
content = "".join("\n\n" + txt for txt in page_texts)

# os.path.join keeps an empty outPath relative to the working directory
# instead of resolving to the filesystem root ("/<fileNr>.txt").
with open(os.path.join(outPath, f"{fileNr}.txt"), "w", encoding="utf-8") as file:
    file.write(content)

# Extract text per Modulhandbook

In [8]:
# Write one text file per handbook: the text of all pages, each page followed
# by a blank line, stored as "<out_dir>/<file name without extension>.txt".
for filename in tqdm(pdfs, desc="Extracting Handbooks"):
    path = os.path.join(dir_path, filename)
    try:
        with pymupdf.open(path) as document:
            content = "".join(page.get_text() + "\n\n" for page in document)
    except Exception as e:
        # One unreadable pdf must not abort the whole batch: report it and
        # continue with the next file. No output file is written for it.
        tqdm.write(f"Skipping {filename}: {e!r}")
        continue

    # splitext removes only the trailing extension, whatever its casing.
    stem = os.path.splitext(filename)[0]
    with open(os.path.join(out_dir, f"{stem}.txt"), "w", encoding="utf-8") as text_file:
        text_file.write(content)

Extracting Handbooks: 100%|██████████| 4/4 [00:09<00:00,  2.37s/it]
