### DEPENDENCIES:

• pdf2image  
    `converts each page in a pdf file into a jpeg image`

• pytesseract  
    `converts the jpeg images into text`

In [None]:
%%capture
!sudo apt-get install tesseract-ocr
!sudo apt-get install poppler-utils --fix-missing
!pip install pytesseract
!pip install pdf2image

In [None]:
from typing import List, Tuple
import pytesseract 
from pdf2image import convert_from_path 
import os

from google.colab import drive
drive.mount('/content/drive')

• This code uses Google Drive to host the files. If you use a different file structure, set the path variables `PDF_DIR` and `TXT_DIR` to your own locations.

In [3]:
# Directory containing the source PDFs (inside the mounted Google Drive)
PDF_DIR: str = "./drive/MyDrive/PDFs"
# Directory where the extracted .txt files are written (inside the mounted
# Google Drive)
TXT_DIR: str = "./drive/MyDrive/PDFtxts"
# The list of PDF file names is built from PDF_DIR in the next cell.

In [4]:
# Names of all entries in the PDF directory (bare names, without PDF_DIR)
pdf_files = os.listdir(PDF_DIR)
# Example of how to construct the full path to a PDF from its file name
PDF_DIR + '/' + pdf_files[0]

'./drive/MyDrive/PDFs/140630293-Jose-Javier-Linares-Valencia-A059-170-269-BIA-Aug-23-2012.pdf'

In [5]:
def convert_pdf_to_text(pdf_file: str, pdf_dir=PDF_DIR) -> str:
    '''
    OCR a single PDF and return all of its text as one string.

    The file `pdf_file` inside `pdf_dir` is rendered to one image per page
    with pdf2image, every page image is run through Tesseract, and the
    per-page results are concatenated in page order with no separator.
    '''
    # Full path of the PDF on disk
    pdf_path = pdf_dir + '/' + pdf_file
    # Render the document: one image per page
    page_images = convert_from_path(pdf_path)
    # OCR each page image, preserving page order
    ocr_chunks: List[str] = list(map(pytesseract.image_to_string, page_images))
    # The rendered pages are not needed once the text has been extracted
    del page_images

    return ''.join(ocr_chunks)


def pdf_text_filename(pdf_file: str) -> str:
    '''
    Return the name of the .txt file that corresponds to `pdf_file`.

    A trailing ".pdf" extension (matched case-insensitively) is replaced
    with ".txt". If the name has no ".pdf" extension, ".txt" is appended
    instead of blindly overwriting the last three characters, so names
    such as "notes" or "scan.jpeg" are not mangled.
    '''
    pdf_ext = ".pdf"
    if pdf_file.lower().endswith(pdf_ext):
        # Drop the extension itself, keep everything before it untouched
        return pdf_file[:-len(pdf_ext)] + ".txt"
    return pdf_file + ".txt"

In [6]:
def convert_and_save_all(pdf_files: List[str],
                         pdf_dir=PDF_DIR,
                         txt_dir=TXT_DIR) -> None:
    '''
    Convert every not-yet-converted PDF in `pdf_files` into a text file.

    Entries of `pdf_files` whose name does not end in "pdf" are ignored.
    A PDF is skipped when a file with its .txt name already exists in
    `txt_dir`, so an interrupted run can be resumed by calling this
    function again.

    Args:
        pdf_files: File names (not full paths) of files inside `pdf_dir`.
        pdf_dir: Directory containing the PDF files.
        txt_dir: Directory the .txt files are written to; it is listed
            up front, so it must already exist.

    Progress is printed before the first conversion and then after every
    50 converted files.
    '''
    # A set makes the "already converted?" check O(1) per file
    existing_txt = set(os.listdir(txt_dir))

    # List of (*.pdf, *.txt)
    translated_file_names: List[Tuple[str, str]] = [
        (pdf, pdf_text_filename(pdf)) for pdf in pdf_files
        if pdf.endswith("pdf")
    ]
    # Filtered list of (*.pdf, *.txt) for unconverted PDFs
    not_converted: List[Tuple[str, str]] = [
        pair for pair in translated_file_names
        if pair[1] not in existing_txt
    ]
    print(
        f"{ len(not_converted) } "
        f"out of {len(translated_file_names)} PDF files to convert."
    )

    num_to_convert = len(not_converted)

    for num_converted, (pdf, txt_fname) in enumerate(not_converted):
        # Print progress
        if num_converted % 50 == 0:
            print(f"{ round(num_converted/num_to_convert*100) }% converted...")

        # Run the OCR before opening the output file, so a failed
        # conversion does not leave an empty .txt behind that would be
        # mistaken for a finished conversion on the next run.
        text = convert_pdf_to_text(pdf, pdf_dir)
        # The context manager closes the file even if the write fails;
        # explicit UTF-8 keeps non-ASCII OCR output from depending on the
        # platform's default encoding.
        with open(txt_dir + '/' + txt_fname, 'w', encoding='utf-8') as out_file:
            out_file.write(text)

    print("Finished!")

In [None]:
# Re-list the PDF directory and convert every PDF that does not yet have a
# matching .txt file in TXT_DIR (default directories are used).
pdf_files: List[str] = os.listdir(PDF_DIR)
convert_and_save_all(pdf_files)

886 out of 1502 PDF files to convert.
0% converted...
