In [1]:

# Requires Python 3.6 or higher due to f-strings

# Import libraries
import os
import platform
from pathlib import Path
from tempfile import TemporaryDirectory

import pytesseract
from pdf2image import convert_from_path
from PIL import Image

# Platform-specific configuration.
#
# The two Windows locations below can be overridden without editing the
# notebook by setting the environment variables TESSERACT_CMD and
# POPPLER_PATH; the previous hard-coded values remain the defaults.
if platform.system() == "Windows":
    # Windows needs a separate Tesseract installation:
    # https://github.com/UB-Mannheim/tesseract/wiki/Downloading-Tesseract-OCR-Engine
    pytesseract.pytesseract.tesseract_cmd = os.environ.get(
        "TESSERACT_CMD",
        r"C:\Program Files\Tesseract-OCR\tesseract.exe",
    )

    # Windows also needs the directory holding the poppler executables.
    path_to_poppler_exe = Path(
        os.environ.get(
            "POPPLER_PATH",
            r"C:\Users\Alexa\OneDrive\Desktop\UE\Infoling-2\poppler-22.04.0\Library\bin",
        )
    )

    # Put our output files in a sane place...
    out_directory = Path(r"~\Desktop").expanduser()
else:
    # On other platforms pdf2image is called without an explicit poppler
    # path; the name is still defined so it exists on every platform.
    path_to_poppler_exe = None
    out_directory = Path("~").expanduser()



def ocr(pdf, targetdir, targetfilename):
    """Run OCR over every page of a PDF and write the recognised text to a file.

    Each page is rendered to a JPEG inside a temporary directory, passed to
    ``pytesseract.image_to_string`` and the returned text is written, page
    after page, to ``targetdir/targetfilename`` (UTF-8). The output file is
    opened in write mode, so an existing file of that name is overwritten.

    Args:
        pdf: Path of the PDF file to read.
        targetdir: Directory in which the text file is created.
        targetfilename: Name of the text file to write.
    """
    render_dpi = 500  # resolution used when rasterising the PDF pages

    # The temporary directory (and the page images in it) is removed when
    # this block exits.
    with TemporaryDirectory() as tempdir:
        # Part 1: convert the PDF pages to images.
        if platform.system() == "Windows":
            pdf_pages = convert_from_path(
                pdf, render_dpi, poppler_path=path_to_poppler_exe
            )
        else:
            pdf_pages = convert_from_path(pdf, render_dpi)

        image_file_list = []
        for page_enumeration, page in enumerate(pdf_pages, start=1):
            # page_001.jpg, page_002.jpg, ...
            # Build the path with pathlib instead of a literal backslash:
            # "\p" is an invalid escape sequence and a backslash is not a
            # path separator outside Windows, so the images would not end
            # up inside the temporary directory there.
            filename = str(Path(tempdir) / f"page_{page_enumeration:03}.jpg")
            page.save(filename, "JPEG")
            image_file_list.append(filename)

        # Part 2: recognise the text in each image and collect it in one file.
        target_path = Path(targetdir) / targetfilename
        with open(target_path, "w", encoding="utf-8") as output_file:
            for image_file in image_file_list:
                # Close each image after use; an image file that is still
                # open can prevent the temporary directory from being
                # deleted on Windows.
                with Image.open(image_file) as page_image:
                    text = pytesseract.image_to_string(page_image)

                # Optional post-processing hook: words hyphenated across a
                # line break could be re-joined here, e.g.
                # text = text.replace("-\n", "")

                output_file.write(text)
  




In [7]:
import os
import fnmatch
from collections import defaultdict

def getpdfspaths(default_path="C:/Users/Alexa/OneDrive/Desktop/UE/Infoling-2/txt"):
    """Return the paths of all PDF files stored below ``default_path``.

    The directory layout walked is
    ``default_path/<subject folder>/<folder with "pdf" in its name>/<file>``.
    Only regular files whose name ends in ``.pdf`` (case-insensitive) are
    returned, so stray non-PDF entries in a pdf folder are not handed on.

    Args:
        default_path: Root directory containing one folder per subject.
            Defaults to the location used so far.

    Returns:
        A list of path strings. Components are joined with "/", and the
        listing at each level is sorted so the order is the same on every
        run (``os.listdir`` itself returns entries in arbitrary order).
    """
    pdfabspath = []
    for item in sorted(os.listdir(default_path)):  # subject folder
        subject_dir = default_path + "/" + item
        if not os.path.isdir(subject_dir):
            continue
        for element in sorted(os.listdir(subject_dir)):
            pdf_dir = subject_dir + "/" + element
            # Only descend into folders whose name marks them as pdf folders.
            if "pdf" not in element or not os.path.isdir(pdf_dir):
                continue
            for elem in sorted(os.listdir(pdf_dir)):
                candidate = pdf_dir + "/" + elem
                if elem.lower().endswith(".pdf") and os.path.isfile(candidate):
                    pdfabspath.append(candidate)
    return pdfabspath
   

In [11]:
import pdfplumber as pdfp
import sys

# Convert every collected PDF to a text file stored two directory levels
# above the PDF (i.e. next to the folder that contains it). The embedded
# text layer is read with pdfplumber first; if that yields too little text,
# the file is passed to ocr() instead.

# Minimum number of extracted characters for the text layer to be used.
# (Measured with len(); sys.getsizeof() reports the memory footprint of the
# str object, which is not the amount of text.)
MIN_EXTRACTED_CHARS = 500

# Index of the first file to process. A non-zero value skips that many
# entries at the start of the list; set to 0 to process every file.
RESUME_FROM_INDEX = 198

pdf_path = getpdfspaths()
pdf_path = pdf_path[RESUME_FROM_INDEX:]
for file_number, abspath in enumerate(pdf_path, start=1):
    print(f"scanning file {file_number} of {len(pdf_path)}")

    with pdfp.open(abspath) as pdf:
        # `or ""` guards against extract_text() returning None for a page
        # without a text layer.
        pdfToString = "".join(page.extract_text() or "" for page in pdf.pages)

    path_parts = abspath.split("/")
    abspath_target = "/".join(path_parts[:-2])
    # Swap only the extension (also handles ".PDF" and names that contain
    # ".pdf" elsewhere).
    file_name = os.path.splitext(path_parts[-1])[0] + ".txt"

    if len(pdfToString) > MIN_EXTRACTED_CHARS:
        print(abspath_target + "/" + file_name)
        with open(abspath_target + "/" + file_name, "w", encoding="utf-8") as file:
            # NOTE(review): the "input_dictionary = " prefix ends up in every
            # text file; confirm it is wanted.
            file.write("%s = %s\n" % ("input_dictionary", pdfToString))
    else:
        print("Using OCR")
        ocr(abspath, abspath_target, file_name)
		



scanning file 1 of 88
C:/Users/Alexa/OneDrive/Desktop/UE/Infoling-2/txt/molekulare-medizin-bsc/molekulare-medizin-bsc-mo.txt
scanning file 2 of 88
C:/Users/Alexa/OneDrive/Desktop/UE/Infoling-2/txt/molekulare-medizin-bsc/molekularmedizin-bsc-po.txt
scanning file 3 of 88
C:/Users/Alexa/OneDrive/Desktop/UE/Infoling-2/txt/musik-grundschule/lehramt-po.txt
scanning file 4 of 88
C:/Users/Alexa/OneDrive/Desktop/UE/Infoling-2/txt/musik-grundschule/musik-grundschule-mo-dritteldidaktik.txt
scanning file 5 of 88
C:/Users/Alexa/OneDrive/Desktop/UE/Infoling-2/txt/musik-grundschule/musik-grundschule-mo.txt
scanning file 6 of 88
C:/Users/Alexa/OneDrive/Desktop/UE/Infoling-2/txt/musik-grundschule/musik-lehramt-po.txt
scanning file 7 of 88
C:/Users/Alexa/OneDrive/Desktop/UE/Infoling-2/txt/musik-gymnasium/lehramt-po.txt
scanning file 8 of 88
C:/Users/Alexa/OneDrive/Desktop/UE/Infoling-2/txt/musik-gymnasium/mathematik-gymnasium-mo.txt
scanning file 9 of 88
C:/Users/Alexa/OneDrive/Desktop/UE/Infoling-2/txt

In [37]:
import shutil

# Copy the text files of every subject folder into one flat folder.
default_path = "C:/Users/Alexa/OneDrive/Desktop/UE/Infoling-2/txt"
ALL_TXT_DIRNAME = "1. all txt"
dst_path = default_path + "/" + ALL_TXT_DIRNAME

# Without an existing destination directory shutil.copy would create a
# single file called "1. all txt" instead of copying into a folder.
os.makedirs(dst_path, exist_ok=True)

for item in os.listdir(default_path):  # subject folder
    # The destination lives inside default_path; skip it so that a re-run
    # does not try to copy files onto themselves (shutil.SameFileError).
    if item == ALL_TXT_DIRNAME:
        continue
    if not os.path.isdir(default_path + "/" + item):
        continue
    for element in os.listdir(default_path + "/" + item):
        src_path = default_path + "/" + item + "/" + element
        # Copy regular .txt files only; a plain substring test would also
        # match directories and unrelated file names containing "txt".
        if not (element.lower().endswith(".txt") and os.path.isfile(src_path)):
            continue
        # NOTE: files are copied under their own name, so a file with the
        # same name in two subject folders ends up as one file in dst_path
        # (the later copy overwrites the earlier one).
        shutil.copy(src_path, dst_path)
        print(default_path + "/" + item + "/" + element + ":wurde kopiert!")

C:/Users/Alexa/OneDrive/Desktop/UE/Infoling-2/txt/amerikanistik-ba/amerikanistik-ba-mo.txt:wurde kopiert!
C:/Users/Alexa/OneDrive/Desktop/UE/Infoling-2/txt/amerikanistik-ba/amerikanistik-ba-po.txt:wurde kopiert!
C:/Users/Alexa/OneDrive/Desktop/UE/Infoling-2/txt/amerikanistik-ba/amerikanistik-ba.txt:wurde kopiert!
C:/Users/Alexa/OneDrive/Desktop/UE/Infoling-2/txt/anglistik-ba/anglistik-ba-mo.txt:wurde kopiert!
C:/Users/Alexa/OneDrive/Desktop/UE/Infoling-2/txt/anglistik-ba/anglistik-ba-po.txt:wurde kopiert!
C:/Users/Alexa/OneDrive/Desktop/UE/Infoling-2/txt/anglistik-ba/anglistik-ba.txt:wurde kopiert!
C:/Users/Alexa/OneDrive/Desktop/UE/Infoling-2/txt/archaeologie-ba/archaeologie-ba-mo.txt:wurde kopiert!
C:/Users/Alexa/OneDrive/Desktop/UE/Infoling-2/txt/archaeologie-ba/archaeologie-ba-po.txt:wurde kopiert!
C:/Users/Alexa/OneDrive/Desktop/UE/Infoling-2/txt/archaeologie-ba/archaeologie-ba.txt:wurde kopiert!
C:/Users/Alexa/OneDrive/Desktop/UE/Infoling-2/txt/betriebswirtschaftslehre-ba/betrieb

In [39]:
from haystack.utils import print_answers
from haystack.nodes import PreProcessor, TextConverter
from haystack.nodes import ElasticsearchRetriever
from haystack.pipelines import ExtractiveQAPipeline
from haystack.utils import clean_wiki_text, convert_files_to_docs, fetch_archive_from_http, print_answers
from haystack.nodes import FARMReader, TransformersReader


# Load every text file from the flat folder into haystack documents.
ALL_TXT_DIR = "C:/Users/Alexa/OneDrive/Desktop/UE/Infoling-2/txt/1. all txt"
all_txt = convert_files_to_docs(dir_path=ALL_TXT_DIR)

# Report only how many documents were loaded; printing the whole list would
# dump the full text of every file into the cell output.
print(f"{len(all_txt)} documents loaded from {ALL_TXT_DIR}")


INFO - haystack.utils.preprocessing -  Converting C:\Users\Alexa\OneDrive\Desktop\UE\Infoling-2\txt\1. all txt\amerikanistik-ba-mo.txt
INFO - haystack.utils.preprocessing -  Converting C:\Users\Alexa\OneDrive\Desktop\UE\Infoling-2\txt\1. all txt\amerikanistik-ba-po.txt
INFO - haystack.utils.preprocessing -  Converting C:\Users\Alexa\OneDrive\Desktop\UE\Infoling-2\txt\1. all txt\amerikanistik-ba.txt
INFO - haystack.utils.preprocessing -  Converting C:\Users\Alexa\OneDrive\Desktop\UE\Infoling-2\txt\1. all txt\anglistik-ba-mo.txt
INFO - haystack.utils.preprocessing -  Converting C:\Users\Alexa\OneDrive\Desktop\UE\Infoling-2\txt\1. all txt\anglistik-ba-po.txt
INFO - haystack.utils.preprocessing -  Converting C:\Users\Alexa\OneDrive\Desktop\UE\Infoling-2\txt\1. all txt\anglistik-ba.txt
INFO - haystack.utils.preprocessing -  Converting C:\Users\Alexa\OneDrive\Desktop\UE\Infoling-2\txt\1. all txt\archaeologie-ba-mo.txt
INFO - haystack.utils.preprocessing -  Converting C:\Users\Alexa\OneDrive\

KeyboardInterrupt: 