In [None]:
# Install system dependencies
# Tesseract OCR engine is installed first.
!sudo apt-get install tesseract-ocr

# Install Python libraries
# pytesseract is the Python wrapper for Tesseract.
# pdf2image will be used to convert PDF pages into images.
# poppler-utils is a dependency required by pdf2image.
!pip install pytesseract pdf2image
!sudo apt-get install poppler-utils

In [None]:
from google.colab import files
import io

print("Please upload a PDF file to test with Tesseract OCR.")
# Opens Colab's upload widget; the result is indexed by filename below to
# obtain the content of the uploaded file.
uploaded = files.upload()

# Store the uploaded filename and content for the next cell
if uploaded:
    # Only the first uploaded file is used; any additional files are ignored.
    pdf_file_name = next(iter(uploaded))
    pdf_file_content = uploaded[pdf_file_name]
    print(f"\nSuccessfully uploaded '{pdf_file_name}'")
else:
    # Forget any file kept from an earlier run of this cell. The conversion
    # cell only checks whether `pdf_file_content` exists, so leaving the old
    # value behind would make it silently convert the previous upload.
    globals().pop("pdf_file_name", None)
    globals().pop("pdf_file_content", None)
    print("\nNo file was uploaded. Please run the cell again to upload.")

In [None]:
from pdf2image import convert_from_bytes

# Reset the result before doing anything else: if the conversion below fails
# or is skipped, the later cells must see "no images" rather than the pages of
# a PDF converted during an earlier run of this cell.
images = []

if 'pdf_file_content' in locals():
    print("Converting PDF pages to images...")
    # convert_from_bytes takes the PDF content and returns a list of images;
    # each item is a PIL (Pillow) Image object representing one page.
    # dpi=300 is used for higher quality, which improves OCR accuracy.
    try:
        images = convert_from_bytes(pdf_file_content, dpi=300)
        print(f"Successfully converted {len(images)} pages into images.")
    except Exception as e:
        # Best-effort: report the problem and leave `images` empty so the
        # display and OCR cells print their "no images" message.
        print(f"An error occurred during PDF to image conversion: {e}")
        print("The file might be corrupted or not a valid PDF.")
else:
    print("PDF file not found. Please upload a file in the previous cell.")

In [None]:
import matplotlib.pyplot as plt

# Preview every converted page so the input to the OCR step can be inspected.
if 'images' not in locals() or not images:
    # Nothing to show: the conversion cell has not produced any pages.
    print("No images found to display. Please run the previous cells successfully.")
else:
    print("--- Displaying Converted Images ---")

    # Report the page count before rendering.
    num_pages = len(images)
    print(f"Found {num_pages} page(s) to display.")

    # One figure per page, titled with its 1-based page number.
    for i, image in enumerate(images):
        plt.figure(figsize=(10, 15))  # Figure size in inches; adjust as needed
        plt.imshow(image)
        plt.title(f"Page {i + 1}")
        plt.axis('off')  # No axes: the page is a picture, not a chart
        plt.show()

    print("\n--- Image Display Complete ---")

In [9]:
import pytesseract

if 'images' in locals() and images:
    print("\n--- Starting OCR Process ---")
    # Text of every page, in page order, kept so the OCR result can be reused
    # by later cells without running OCR again. A page whose OCR failed is
    # stored as None so list positions still line up with page numbers.
    page_texts = []

    # Loop through each image (page) that was converted
    for i, image in enumerate(images):
        print(f"\n\n--- EXTRACTED TEXT FROM PAGE {i + 1} ---")

        # Use pytesseract's image_to_string function to perform OCR on the image.
        # 'lang="eng"' specifies that we expect the text to be in English.
        try:
            extracted_text = pytesseract.image_to_string(image, lang="eng")
            page_texts.append(extracted_text)
            print(extracted_text)
        except Exception as e:
            # Best-effort per page: record the failure and continue with the
            # remaining pages instead of aborting the whole run.
            page_texts.append(None)
            print(f"An error occurred during OCR on page {i + 1}: {e}")

    print("\n\n--- OCR Process Complete ---")
else:
    print("No images found to process. Please ensure the previous cells ran successfully.")


--- Starting OCR Process ---


--- EXTRACTED TEXT FROM PAGE 1 ---
21GNH101J- PHILOSOPHY OF ENGINEERING
UNIT-1

¢ Engineering is the discipline and profession of applying technical and

scientific knowledge and utilizing natural laws and physical resources in
order to design and implement materials, structures, machines, devices,
systems, and processes that safely realize a desired objective and meet
specified criteria.

The American Engineers' Council for Professional Development (ECPD,
the predecessor of ABET) has defined engineering as follows:

“The creative application of scientific principles to design or develop
structures, machines, apparatus, or manufacturing processes, or works
utilizing them singly or in combination;

or to construct or operate the same with full cognizance of their design;
or to forecast their behavior under specific operating conditions; all as
respects an intended function, economics of operation and safety to life
and property.”



--- EXTRACTED TEXT FR

KeyboardInterrupt: 