In [13]:
! pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.7.1-py3-none-any.whl (39 kB)
Collecting pdfminer.six==20220524
  Downloading pdfminer.six-20220524-py3-none-any.whl (5.6 MB)
     ---------------------------------------- 5.6/5.6 MB 4.7 MB/s eta 0:00:00
Collecting Wand>=0.6.7
  Downloading Wand-0.6.7-py2.py3-none-any.whl (139 kB)
     -------------------------------------- 139.2/139.2 kB 2.7 MB/s eta 0:00:00
Installing collected packages: Wand, pdfminer.six, pdfplumber
Successfully installed Wand-0.6.7 pdfminer.six-20220524 pdfplumber-0.7.1


In [35]:

# Requires Python 3.6 or higher due to f-strings

# Import libraries
import platform
from tempfile import TemporaryDirectory
from pathlib import Path

import pytesseract
from pdf2image import convert_from_path
from PIL import Image

if platform.system() == "Windows":
  # We may need to do some additional downloading and setup...
  # Windows needs a PyTesseract Download
  # https://github.com/UB-Mannheim/tesseract/wiki/Downloading-Tesseract-OCR-Engine

  pytesseract.pytesseract.tesseract_cmd = (
    r"C:\Program Files\Tesseract-OCR\tesseract.exe"
  )

  # Windows also needs poppler_exe
  path_to_poppler_exe = Path(r"C:\Users\Alexa\OneDrive\Desktop\Infoling\poppler-22.04.0\Library\bin")
  
  # Put our output files in a sane place...
  out_directory = Path(r"~\Desktop").expanduser()
else:
  out_directory = Path("~").expanduser()  

# Path of the Input pdf
PDF_file = Path(r"Question-Answering-System_f_ur_politische_Wahlprogramme_am_Beispielder_Bundestagswahl_2021.pdf")


# Store all the pages of the PDF in a variable
image_file_list = []

text_file = out_directory / Path("out_text.txt")

def main(pdf_path=None, output_path="finalhope.txt", dpi=500):
  """Run OCR over every page of a PDF and append the text to a file.

  The PDF is rendered to one image per page, each image is stored as a JPEG
  in a temporary directory, and pytesseract is run on each JPEG in page
  order. The temporary directory and its images are removed on return.

  Args:
    pdf_path: PDF to process. Defaults to the module-level ``PDF_file``.
    output_path: Text file the recognized text is appended to. The file is
      opened in append mode, so repeated calls accumulate text.
    dpi: Resolution passed to ``convert_from_path`` when rendering pages.

  Side effects:
    The module-level ``image_file_list`` is emptied and then refilled with
    the JPEG paths created by this call.
  """
  if pdf_path is None:
    pdf_path = PDF_file

  # Forget paths from a previous call: they point into a temporary directory
  # that no longer exists and would make Image.open() fail below.
  image_file_list.clear()

  with TemporaryDirectory() as tempdir:
    # Part 1: convert the PDF into one image per page.
    if platform.system() == "Windows":
      # On Windows the poppler binaries have to be located explicitly.
      pdf_pages = convert_from_path(
        pdf_path, dpi, poppler_path=path_to_poppler_exe
      )
    else:
      pdf_pages = convert_from_path(pdf_path, dpi)

    for page_number, page in enumerate(pdf_pages, start=1):
      # page_001.jpg, page_002.jpg, ... Joining with Path keeps the file
      # inside tempdir on every platform; a literal backslash is only a
      # path separator on Windows.
      filename = str(Path(tempdir) / f"page_{page_number:03}.jpg")
      page.save(filename, "JPEG")
      image_file_list.append(filename)

    # Part 2: recognize the text on each page image.
    # UTF-8 matches the encoding used for the other text file written by
    # this notebook and can represent any character the OCR produces.
    with open(output_path, "a", encoding="utf-8") as output_file:
      for image_file in image_file_list:
        # Close each image as soon as it has been processed so no file
        # handle is left open when the temporary directory is deleted.
        with Image.open(image_file) as page_image:
          text = pytesseract.image_to_string(page_image)

        # Optional post-processing hook: words hyphenated across a line
        # break could be re-joined with text.replace("-\n", "").
        output_file.write(text)
  




In [40]:
import pdfplumber as pdfp
import sys
# Minimum number of extracted characters for the PDF to be treated as having
# a usable embedded text layer; anything shorter goes to the OCR fallback.
MIN_TEXT_CHARS = 500

with pdfp.open('erziehungswissenschaft_ba_ab_ws1617.pdf') as pdf:
    # extract_text() may return None for a page without extractable text
    # (older pdfplumber releases), so substitute an empty string instead of
    # letting the concatenation raise TypeError.
    pdfToString = "".join(page.extract_text() or "" for page in pdf.pages)

# Number of characters extracted, not the in-memory size of the str object.
size = len(pdfToString)

if size > MIN_TEXT_CHARS:
    with open("finalhope2.txt", "w", encoding="utf-8") as file:
        file.write("%s = %s\n" % ("input_dictionary", pdfToString))

    # Read the file back so `contents` holds exactly what was stored.
    with open('finalhope2.txt', 'r', encoding="utf-8") as f:
        contents = f.read()
else:
    # Too little embedded text: fall back to OCR.
    # NOTE(review): main() processes the module-level PDF_file, which is a
    # different PDF from the one opened above, and `contents` is not set on
    # this path -- confirm this is intended.
    main()

