In [6]:
# Extracting text from images

from PIL import Image
import pytesseract

import os

# Default install location of the Tesseract binary on Windows. Only override
# pytesseract's command when that file actually exists, so the cell also runs
# on machines where `tesseract` is reachable through PATH instead.
TESSERACT_EXE = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
if os.path.isfile(TESSERACT_EXE):
    pytesseract.pytesseract.tesseract_cmd = TESSERACT_EXE

# Open the image in a context manager so the file handle is released as soon
# as the grayscale copy has been created.
with Image.open('documents/sample.jpg') as source_img:
    img = source_img.convert('L')

text = pytesseract.image_to_string(img)

# Remove form-feed characters ("\x0c") and surrounding whitespace before display.
print(text.replace("\x0c", "").strip())

A Simple PDF File This is (inde:
a small demonstration .pdf file -

for use in the

just
Virtual Mechanics

tutorials. More text. And more text.

And more text.

And more text. And more

text. And more text. And more text.

And more text.
text. And more
more text. And
text. And more
more text. And
text. And more
And more text.
text. And more

And more text.
nace 2

And more text. And more
text. Boring, zzzzz. And
more text. And more
text. And more text. And
more text. And more
text. And more text.
And more text. And more
text. And more text.
Even more. Continued on


In [None]:
# Image Captioning using BLIP

from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
from PIL import Image

# Load the pretrained BLIP captioning processor (pre/post-processing) and model.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Feed the captioner an RGB image: converting to grayscale ('L') first throws
# away colour information before the model ever sees the picture.
# The context manager closes the underlying file once the RGB copy exists.
with Image.open("documents/sample.jpg") as source_image:
    image = source_image.convert('RGB')

inputs = processor(images=image, return_tensors="pt") #type: ignore (pylance error; code works fine)

# Generate caption token ids, then decode the first sequence into plain text.
outputs = model.generate(**inputs) #type: ignore (pylance error; code works fine)
caption = processor.decode(outputs[0], skip_special_tokens=True) #type: ignore (pylance error; code works fine)
print("Generated Caption:", caption)

Generated Caption: a simple text editor


In [15]:
# Combined Text Extraction and Image Captioning (Tesseract + BLIP)
from PIL import Image
import pytesseract
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
from PIL import Image


import os

# Only point pytesseract at the Windows install path when it is really there;
# otherwise keep pytesseract's default so `tesseract` is looked up on PATH.
TESSERACT_EXE = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
if os.path.isfile(TESSERACT_EXE):
    pytesseract.pytesseract.tesseract_cmd = TESSERACT_EXE

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Open the file once and derive one copy per consumer:
#   - grayscale for Tesseract OCR
#   - RGB for BLIP, so the captioner is not fed a colour-stripped image
# The context manager closes the file handle after both copies are made.
with Image.open('documents/sample.jpg') as source_img:
    img = source_img.convert('L')
    caption_img = source_img.convert('RGB')

inputs = processor(images=caption_img, return_tensors="pt") #type: ignore (pylance error; code works fine)

text = pytesseract.image_to_string(img)
outputs = model.generate(**inputs) #type: ignore (pylance error; code works fine)
caption = processor.decode(outputs[0], skip_special_tokens=True) #type: ignore (pylance error; code works fine)


# Final output: cleaned OCR text plus the generated caption.
response = {
    "Text": text.replace("\x0c", "").strip(),
    "Caption": caption,
}

print(response)

{'Text': 'A Simple PDF File This is (inde:\na small demonstration .pdf file -\n\nfor use in the\n\njust\nVirtual Mechanics\n\ntutorials. More text. And more text.\n\nAnd more text.\n\nAnd more text. And more\n\ntext. And more text. And more text.\n\nAnd more text.\ntext. And more\nmore text. And\ntext. And more\nmore text. And\ntext. And more\nAnd more text.\ntext. And more\n\nAnd more text.\nnace 2\n\nAnd more text. And more\ntext. Boring, zzzzz. And\nmore text. And more\ntext. And more text. And\nmore text. And more\ntext. And more text.\nAnd more text. And more\ntext. And more text.\nEven more. Continued on', 'Caption': 'a simple text editor'}


In [4]:
# Extracting texts from PDFs (PyMuPDF)

import fitz

# Collect the text of every page, one list entry per page.
# The document is opened in a context manager so the PDF file is closed after
# reading. A comprehension is used so no loop variable (previously `text`)
# leaks into the notebook namespace and overwrites values from earlier cells.
with fitz.open('documents/sample.pdf') as doc:
    pdf_response = [page.get_text() for page in doc]

print(pdf_response)

['A Simple PDF File \nThis is a small demonstration .pdf file \njust for use in the Virtual Mechanics tutorials. More text. And more  \ntext. And more text. And more text. And more text. \nAnd more text. And more text. And more text. And more text. And more.  \ntext. And more text. Boring, zzzzz. And more text. And more text. And \nmore text. And more text. And more text. And more text. And more text.  \nAnd more text. And more text. \nAnd more text. And more text. And more text. And more text. And more  \ntext. And more text. And more text. Even more. Continued on page 2 ... \nSimple PDF File 2 \n...continued from page 1. Yet more text. And more text. And more text.  \nAnd more text. And more text. And more text. And more text. And more  \ntext. Oh, how boring typing this stuff. But not as boring as watching  \npaint dry. And more text. And more text. And more text. And more text.  \nBoring. More, a little more text. The end, and just as well. \n']


In [6]:
# Extracting tables from PDFs (PyMuPDF, using Fitz)

import fitz

# Each entry is one extracted table (a list of rows, each row a list of cells).
pdf_table_response = []

# The context manager closes the PDF once all pages have been scanned.
with fitz.open('documents/sample-tables.pdf') as doc:
    for page in doc:
        tabs = page.find_tables()
        # Extract every table detected on the page, not just the first one,
        # so pages holding several tables are not silently truncated.
        # Pages without tables contribute nothing.
        pdf_table_response.extend(table.extract() for table in tabs.tables)


print(pdf_table_response)


[[['Column header (TH)', 'Column header (TH)', 'Column header (TH)'], ['Row header (TH)', 'Data cell (TD)', 'Data cell (TD)'], ['Row header(TH)', 'Data cell (TD)', 'Data cell (TD)']], [['Role', 'Actor'], ['Main character', 'Daniel Radcliffe'], ['Sidekick 1', 'Rupert Grint'], ['Sidekick 2', 'Emma Watson'], ['Lovable ogre', 'Robbie Coltrane'], ['Professor', 'Maggie Smith'], ['Headmaster', 'Richard Harris']], [['Non-current assets', '2010', '2009', '2008'], ['Property', '345', '445', '222'], ['Investment', '567', '654', '423'], ['Intangibles', '423', '123', '453']], [['', '2011', None, '2010 restated', None], ['General income', '', '250,000', '', '200,000'], ['Increase in value, WIP', '', '15,000', '', '30,000'], ['', '', '265,000', '', '230,000'], ['Administrative costs', None, None, None, None], ['Staff costs', '(200,000)', '', '(150,000)', ''], ['Early departures', '(10,000)', '', '(20,000)', ''], ['Other', '(25,000)', '', '(10,000)', ''], ['Depreciation', '(10,000)', '', '(10,000)', '

In [None]:
# Extracting text from audio files (MP3, WAV, etc.) using SpeechRecognition, plus Pydub for splitting large audio files

import speech_recognition as sr

# Recording to transcribe.
filename = 'documents/sample.wav'

r = sr.Recognizer()

# The file only needs to stay open while its contents are read into memory.
with sr.AudioFile(filename) as source:
    audio_data = r.record(source)

# Run recognition on the captured audio and show the transcript.
text = r.recognize_google(audio_data) #type: ignore (pylance error; code works fine)
print(text)

I believe you are just talking nonsense


In [None]:
# Continued code for audio extraction, but for larger audio files, taken from the documentation (https://thepythoncode.com/article/using-speech-recognition-to-convert-speech-to-text-python)

# importing libraries 
import speech_recognition as sr 
import os 
from pydub import AudioSegment
from pydub.silence import split_on_silence

# Module-level recognizer instance; transcribe_audio() reads it as a global.
r = sr.Recognizer()

def transcribe_audio(path):
    """Transcribe one audio file and return the recognized text.

    The whole file at ``path`` is read with the module-level recognizer ``r``
    and the recording is passed to ``r.recognize_google``. Exceptions raised
    by the recognizer are not caught here.
    """
    with sr.AudioFile(path) as audio_file:
        recording = r.record(audio_file)
    return r.recognize_google(recording) #type: ignore (pylance error; code works fine)


def get_large_audio_transcription_on_silence(
    path,
    min_silence_len=500,
    silence_offset=14,
    keep_silence=500,
    folder_name="audio-chunks",
):
    """Split a large audio file on silence and transcribe it chunk by chunk.

    The audio is cut with ``pydub.silence.split_on_silence``, every chunk is
    exported as a WAV file into ``folder_name`` and transcribed with
    ``transcribe_audio``. The defaults reproduce the previously hard-coded
    values, so existing calls that pass only ``path`` behave as before.

    Args:
        path: Audio file to load with ``AudioSegment.from_file``.
        min_silence_len: Passed to ``split_on_silence`` as ``min_silence_len``
            (split where silence is this many milliseconds or more).
            Experiment with this value for your target audio file.
        silence_offset: Subtracted from the clip's ``dBFS`` to obtain the
            ``silence_thresh`` passed to ``split_on_silence``.
        keep_silence: Passed to ``split_on_silence`` as ``keep_silence``
            (default 500, i.e. half a second rather than a full second).
        folder_name: Directory that receives the exported ``chunk<i>.wav``
            files. It is created when missing; the files are not removed.

    Returns:
        The concatenated transcript. Each recognized chunk contributes its
        capitalized text followed by ``". "``. Chunks the recognizer cannot
        understand are skipped. Returns ``""`` when nothing is recognized.
    """
    sound = AudioSegment.from_file(path)
    # Split the audio wherever silence lasts min_silence_len milliseconds or more.
    chunks = split_on_silence(
        sound,
        min_silence_len=min_silence_len,
        # Threshold is relative to the loudness of this particular clip.
        silence_thresh=sound.dBFS - silence_offset,
        keep_silence=keep_silence,
    )
    # Create the chunk directory if needed; exist_ok avoids the separate
    # check-then-create step.
    os.makedirs(folder_name, exist_ok=True)
    transcripts = []
    for i, audio_chunk in enumerate(chunks, start=1):
        # Export each chunk to disk because transcribe_audio() works on file paths.
        chunk_filename = os.path.join(folder_name, f"chunk{i}.wav")
        audio_chunk.export(chunk_filename, format="wav")
        try:
            text = transcribe_audio(chunk_filename)
        except sr.UnknownValueError as e:
            # Unintelligible chunk: report it and continue with the next one.
            print("Error:", str(e))
        else:
            text = f"{text.capitalize()}. "
            print(chunk_filename, ":", text)
            transcripts.append(text)
    # Join once at the end instead of growing a string inside the loop.
    return "".join(transcripts)