In [None]:
# Force a clean reinstall of google-cloud-vision: remove any existing copy
# (-y skips pip's confirmation prompt), install it again, then print the
# metadata of the installed package.
!pip uninstall google-cloud-vision -y

!pip install google-cloud-vision
!pip show google-cloud-vision

In [None]:
import json
import re
import io
import os
from google.cloud import vision_v1
from google.cloud.vision_v1 import types

#extract text using google cloud vision
def detect_text_ML(pdf_path, destination_path):
    """Run Google Cloud Vision document OCR on a local PDF and save the text.

    The PDF bytes are sent inline in one synchronous ``batch_annotate_files``
    request. For each file in the response, the text of *every* returned page
    is joined with newlines, printed, and finally written to
    ``destination_path``.

    Args:
        pdf_path: Path of the local PDF file to read.
        destination_path: Path of the UTF-8 text file to write. A falsy value
            (``None`` or ``""``) skips writing and only prints the text.

    Raises:
        RuntimeError: If the service reports an error for the submitted file.

    NOTE(review): the synchronous file endpoint only processes a small number
    of pages per request (believed to be five) -- confirm against the current
    Cloud Vision documentation before using this on long documents.
    """
    # Fall back to the bundled service-account key only when credentials have
    # not already been configured in the environment.
    os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', 'optimal-via.json')

    # Supported mime_types are: 'application/pdf' and 'image/tiff'
    mime_type = 'application/pdf'

    client = vision_v1.ImageAnnotatorClient()

    with open(pdf_path, 'rb') as pdf_file:
        content = pdf_file.read()

    # Describe the input document and the OCR feature to run on it.
    input_config = types.InputConfig(mime_type=mime_type, content=content)
    # The 2.x client exposes the proto field ``type`` as ``type_`` because the
    # plain name collides with the Python builtin.
    feature = types.Feature(type_=types.Feature.Type.DOCUMENT_TEXT_DETECTION)

    # One AnnotateFileRequest per document, wrapped in the batch request that
    # batch_annotate_files expects.
    file_request = types.AnnotateFileRequest(
        input_config=input_config, features=[feature])
    batch_request = types.BatchAnnotateFilesRequest(requests=[file_request])
    batch_response = client.batch_annotate_files(request=batch_request)

    file_texts = []
    for file_response in batch_response.responses:
        if file_response.error.message:
            raise RuntimeError(
                f"Cloud Vision failed to annotate {pdf_path}: "
                f"{file_response.error.message}")
        # Each file response holds one image response per processed page;
        # collect all of them instead of only the first page.
        text = "\n".join(
            page_response.full_text_annotation.text
            for page_response in file_response.responses)
        print(text)
        file_texts.append(text)

    if destination_path:
        with open(destination_path, 'w', encoding='utf-8') as out_file:
            out_file.write("\n".join(file_texts))
# Run the Cloud Vision OCR on the sample transcript PDF.
detect_text_ML("Transcript.pdf","Transcript.txt")

In [None]:
!pip uninstall kraken
!pip install kraken


In [None]:
import kraken
from kraken import binarization
from kraken.lib import models
from kraken import rpred
from kraken.pageseg import segment
from PIL import Image
import pdf2image
import io 

#extract text using kraken
def detecttxt_kraken(filepath):
    """OCR every page of a PDF with kraken and print the results.

    Each page is rendered to a PIL image with pdf2image, saved to the working
    directory as ``transcript-<index>.png``, segmented into line boxes, and
    passed to kraken's recognizer. For every page the line boxes are printed
    first, followed by each recognition record.

    Args:
        filepath: Path of the PDF file to process.

    NOTE(review): ``convert("1")`` uses PIL's default bilevel conversion
    (dithered), which may hurt segmentation; kraken's ``binarization.nlbin``
    is the likely alternative -- confirm before switching.
    """
    # Use pdf2image to convert the PDF file to a list of PIL images
    pages = pdf2image.convert_from_path(filepath)
    if not pages:
        return

    # The recognition model and text direction do not depend on the page, so
    # set them up once instead of reloading the model on every iteration.
    rec_model_path = 'en_best.mlmodel'
    model = models.load_any(rec_model_path)
    text_direction = 'horizontal-lr'

    for i, page in enumerate(pages):
        # Keep a PNG copy of the page on disk.
        page.save(f"transcript-{i}.png", "PNG")

        # Work on the in-memory page directly; reopening the PNG just written
        # would leave an extra file handle open for every page.
        # Segment the page into individual text lines (bounding boxes).
        lines = segment(page.convert("1"))
        bounds = {'boxes': lines['boxes'], 'text_direction': text_direction}

        # Recognize the text inside each line box.
        pred_it = rpred.rpred(model, page, bounds, bidi_reordering=False)

        print(lines['boxes'])
        # Print the recognized text
        for record in pred_it:
            print(record)

# Run the kraken OCR on the sample transcript PDF.
detecttxt_kraken("Transcript.pdf")

In [None]:
!pip install easyocr
!pip install pdf2image
!sudo apt-get install poppler-utils

In [None]:
import easyocr
import pdf2image
from PIL import Image
import numpy as np

#extract text using easyocr
def detecttxt_easyocr(filepath):
    """Print the text EasyOCR finds on every page of a PDF.

    The PDF is rendered to PIL images with pdf2image and each page is saved
    in the working directory as ``images-<index>.png``. An English EasyOCR
    reader is then created and run on each saved PNG in page order, printing
    the text component of every detection.

    Args:
        filepath: Path of the PDF file to process.
    """
    # Render the PDF and write one PNG per page, remembering the file names.
    page_images = pdf2image.convert_from_path(filepath)
    image_paths = []
    for index, page_image in enumerate(page_images):
        image_path = f"images-{index}.png"
        page_image.save(image_path, "PNG")
        image_paths.append(image_path)

    # The reader is created only after all pages have been written to disk.
    ocr_reader = easyocr.Reader(['en'])

    # Each detection is indexed at position 1 to obtain its recognized text.
    for image_path in image_paths:
        for detection in ocr_reader.readtext(image_path):
            print(detection[1])

# Run the EasyOCR extraction on the sample transcript PDF.
detecttxt_easyocr("Transcript.pdf")

In [None]:
!pip install pytesseract
!apt-get install tesseract-ocr


In [None]:
import pytesseract
import cv2
import numpy as np
import pdf2image
from PIL import Image

def _stretch_contrast(gray_img):
    """Linearly rescale a grayscale image so its values span 0-255.

    A uniform image (minimum equals maximum) has no contrast to stretch and
    would otherwise cause a division by zero, so it is returned unchanged.
    """
    min_val, max_val, _, _ = cv2.minMaxLoc(gray_img)
    if max_val == min_val:
        return gray_img
    return np.uint8((gray_img - min_val) * (255 / (max_val - min_val)))


def extracttxt_pytesseract(pdf_path):
    """OCR every page of a PDF with Tesseract and write the text to a file.

    Each page is rendered with pdf2image, converted to grayscale,
    contrast-stretched to the full 0-255 range, and passed to
    ``pytesseract.image_to_string``. The text of all pages is written, in
    page order, to a UTF-8 file next to the PDF with the extension replaced
    by ``.txt``.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        The path of the text file that was written.
    """
    import os  # local import: this cell's import block does not provide os

    # Replace only the final extension so that dots elsewhere in the path
    # (e.g. "./scan.v2.pdf") or a missing extension are handled correctly.
    txt_file = os.path.splitext(pdf_path)[0] + ".txt"
    # Use pdf2image to convert the PDF file to a list of PIL images
    pages = pdf2image.convert_from_path(pdf_path)

    # Explicit UTF-8 so non-ASCII OCR output does not depend on the
    # platform's default encoding.
    with open(txt_file, 'w', encoding='utf-8') as file:
        for page in pages:
            # pdf2image pages are RGB; convert straight to grayscale.
            gray_img = cv2.cvtColor(np.array(page), cv2.COLOR_RGB2GRAY)
            # Improve contrast before recognition.
            stretched_img = _stretch_contrast(gray_img)
            text = pytesseract.image_to_string(stretched_img)
            file.write(text)
    return txt_file

# Run the Tesseract extraction on the sample transcript PDF.
extracttxt_pytesseract("Transcript.pdf")