## Imports

In [42]:
import os
import pytesseract

from fpdf import FPDF

## Functions

In [31]:
def export_to_text_file(
    strings_list: list[str],
    relative_dir_path: str,
    file_name: str,
) -> None:
    """
    Exports a list of strings to a text file, with each string written on a new line.

    Args:
        strings_list (list[str]): A list of strings to be written to the file.
        relative_dir_path (str): Relative path to the directory where you want to
            export the .txt file to.
        filename (str): The name of the file (including the path if necessary) 
            where the list will be saved.

    Returns:
        None: This function does not return a value, it writes the strings to a file.

    Example:
        >>> export_to_textfile(["Hello", "World"], "output.txt")
        # This will create a file 'output.txt' with "Hello" and "World" on separate lines.
    """
    with open(file_name, 'w') as file:
        for string in strings_list:
            relative_file_path = "/".join([relative_dir_path, string])
            file.write(relative_file_path + '\n')
    print(f"List successfully written to {relative_file_path}")

In [32]:
def export_long_string_to_file(
    long_string: str, 
    relative_dir_path: str,
    file_name: str
) -> None:
    """
    Exports a long string to a text file.

    Args:
        long_string (str): The long string to be written to the file.
        filename (str): The name of the file (including the path if necessary) where the string will be saved.

    Returns:
        None: This function does not return a value, it writes the string to the file.

    Example:
        >>> export_long_string_to_file("This is a very long string...", "output.txt")
        # This will create a file 'output.txt' with the content of the long string.
    """
    relative_file_path = "/".join([relative_dir_path, file_name])
    with open(relative_file_path, 'w') as file:
        file.write(long_string)
    print(f"String successfully written to {file_name}")

In [60]:
def text_to_pdf(
    relative_dir_path: str,
    input_file_name: str,
    output_file_name: str
) -> None:
    """
    Converts a text document to a PDF document.

    Args:
        input_file (str): The path to the input text file.
        output_file (str): The path to the output PDF file.

    Returns:
        None: This function does not return a value. It writes the PDF to the specified output file.
    
    Raises:
        FileNotFoundError: If the input file is not found.
        IOError: If there's an error writing to the output file.
    """
    try:
        input_file_path = "/".join([relative_dir_path, input_file_name])
        output_file_path = "/".join([relative_dir_path, output_file_name])
                                   
        # Open and read the input text file
        with open(input_file_path, 'r', encoding='iso-8859-1') as file:
            text = file.readlines()

        # Create an FPDF object
        pdf = FPDF()
        pdf.set_auto_page_break(auto=True, margin=15)
        pdf.add_page()

        # Set font: Arial, size 12
        pdf.set_font('Arial', size=12)

        # Add each line of text to the PDF
        for line in text:
            pdf.multi_cell(0, 10, line)

        # Output the PDF to the specified file
        pdf.output(output_file_path)
        print(f"PDF successfully created: {output_file_path}")
    
    except FileNotFoundError:
        raise FileNotFoundError(f"The input file {input_file_path} was not found.")
    
    except IOError as e:
        raise IOError(f"Error writing the output file {output_file_path}: {e}")


## Setup

In [40]:
relative_dir_path = "data/institutions_of_the_european_union_images"
allowed_file_extensions = ["jpg"]
list_of_images_file_name = "list_of_image_files.txt"
output_txt_file_name = "all_text.txt"
output_pdf_file_name = "all_text.pdf"

In [34]:
relative_file_paths = [
    x for x in os.listdir(relative_dir_path)
    if x.split('.')[-1] in allowed_file_extensions
]

In [35]:
export_to_text_file(
    strings_list=relative_file_paths,
    relative_dir_path=relative_dir_path,
    file_name=list_of_images_file_name
)

List successfully written to data/institutions_of_the_european_union_images/H17-8.jpg


## Convert to images to text and export to .txt file

In [36]:
export_long_string_to_file(
    long_string = pytesseract.image_to_string("/".join([relative_dir_path, list_of_images_file_name])),
    relative_dir_path=relative_dir_path,
    file_name=output_txt_file_name
)

String successfully written to all_text.txt


## Convert output .txt file to .pdf file

In [61]:
text_to_pdf(
    relative_dir_path=relative_dir_path,
    input_file_name=output_txt_file_name,
    output_file_name=output_pdf_file_name
)

PDF successfully created: data/institutions_of_the_european_union_images/all_text.pdf
