In [3]:

!pip install PyMuPDF pdfplumber pytesseract transformers sentencepiece Pillow
 
import pickle
import fitz  # PyMuPDF for extracting text from PDF
import pdfplumber  # For alternative PDF text extraction
from PIL import Image
import pytesseract  # For OCR in images
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# Pegasus checkpoint used for summarization. The tokenizer and the model are
# created once at module level and shared by summarize_text().
model_name = "google/pegasus-xsum"
tokenizer = PegasusTokenizer.from_pretrained(
    model_name,
    local_files_only=False,
)
model = PegasusForConditionalGeneration.from_pretrained(
    model_name,
    local_files_only=False,
)

# Function to extract text from PDF using PyMuPDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, extracted with PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The text of all pages concatenated in page order. The result is an
        empty string when no page yields any text.
    """
    # Open the document as a context manager so it is closed even when
    # extracting a page raises; the original left the document open.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

# Alternative function to extract text from PDF using pdfplumber
def extract_text_from_pdf_alternative(pdf_path):
    """Return the text of every page of a PDF, extracted with pdfplumber.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The text of all pages concatenated in page order. Pages for which
        pdfplumber reports no text contribute nothing.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may return None for a page without a text layer
        # (seen in some pdfplumber releases); coalesce to "" so the
        # concatenation cannot fail with a TypeError.
        return "".join(page.extract_text() or "" for page in pdf.pages)

# Function to extract text from images (JPG/PNG) using OCR
def extract_text_from_image(image_path):
    """Run Tesseract OCR on an image file and return the recognized text.

    Args:
        image_path: Path of the image file (the caller passes JPG/PNG files).

    Returns:
        The string produced by pytesseract.image_to_string for the image.
    """
    # Image.open keeps the underlying file open until the image is closed;
    # the context manager releases it as soon as OCR has finished.
    with Image.open(image_path) as img:
        return pytesseract.image_to_string(img)

# Function to summarize text using Pegasus model
def summarize_text(text):
    """Summarize text with the module-level Pegasus tokenizer and model.

    Args:
        text: The text to summarize.

    Returns:
        The decoded summary string, or an empty string when text is empty
        or contains only whitespace.
    """
    # There is nothing to summarize in blank input; generating from it would
    # only produce text unrelated to any document.
    if not text or not text.strip():
        return ""

    # Never truncate to more tokens than the loaded checkpoint has position
    # embeddings for. The previous fixed 1024 can exceed that size
    # (google/pegasus-xsum is configured with 512) and fail on long inputs.
    max_tokens = min(1024, getattr(model.config, "max_position_embeddings", 1024))

    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_tokens,
        padding="longest",
    )
    summary_ids = model.generate(**inputs)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Function to save the extracted text using pickle
def save_text_to_pickle(text, pickle_filename):
    """Serialize text with pickle into the file at pickle_filename.

    The file is opened in binary write mode, so existing content is replaced.
    """
    with open(pickle_filename, "wb") as sink:
        pickle.Pickler(sink).dump(text)

# Function to load text from pickle
def load_text_from_pickle(pickle_filename):
    """Read the file at pickle_filename and return the unpickled object.

    NOTE: unpickling can execute arbitrary code, so only load files that
    were written by this program.
    """
    with open(pickle_filename, "rb") as source:
        return pickle.load(source)

# Dynamic function to handle different file formats and summarize
def process_and_summarize(file_path, pickle_filename=None):
    """Extract the text of a PDF or image file and return its summary.

    Args:
        file_path: Path of the input file. The type is chosen from the
            extension, case-insensitively: ".pdf", ".jpg", ".jpeg" or ".png".
        pickle_filename: Optional path; when given, the extracted text is
            pickled there before summarization.

    Returns:
        The summary produced by summarize_text for the extracted text.

    Raises:
        ValueError: If file_path does not end in a supported extension.
            Previously such files were silently treated as empty text.
    """
    lowered = file_path.lower()
    if lowered.endswith(".pdf"):
        text = extract_text_from_pdf(file_path)
        # Whitespace-only output counts as a failed extraction too, so the
        # alternative extractor also gets a chance in that case.
        if not text.strip():
            text = extract_text_from_pdf_alternative(file_path)
    elif lowered.endswith((".jpg", ".jpeg", ".png")):
        text = extract_text_from_image(file_path)
    else:
        raise ValueError(f"Unsupported file type: {file_path!r}")

    if pickle_filename:
        # Keep the raw extracted text so it can be reloaded later.
        save_text_to_pickle(text, pickle_filename)

    return summarize_text(text)

# Example usage: point file_path at the document to summarize.
file_path = "your_file.pdf"  # Change this to the file you want to summarize
pickle_filename = "extracted_text.pkl"  # File to store the extracted text

# Extract the text, store it in the pickle file, and summarize it.
summary = process_and_summarize(file_path, pickle_filename)
print("Summary:", summary, sep="\n")

# Read the stored text back from the pickle file and show it.
loaded_text = load_text_from_pickle(pickle_filename)
print("Loaded Text from Pickle:", loaded_text, sep="\n")
 

^C


ModuleNotFoundError: No module named 'pdfplumber'

Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
     ---------------------------------------- 0.0/42.5 kB ? eta -:--:--
     ---------------------------- ----------- 30.7/42.5 kB 1.3 MB/s eta 0:00:01
     ---------------------------- ----------- 30.7/42.5 kB 1.3 MB/s eta 0:00:01
     ---------------------------- ----------- 30.7/42.5 kB 1.3 MB/s eta 0:00:01
     -------------------------------------- 42.5/42.5 kB 258.0 kB/s eta 0:00:00
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-win_amd64.whl.metadata (48 kB)
     ---------------------------------------- 0.0/48.2 kB ? eta -:--:--
     ---------------------------------------- 0.0/48.2 kB ? eta -:--:--
     ----------------------------------------

ERROR: Exception:
Traceback (most recent call last):
  File "C:\Users\divya.DIVYAM\AppData\Local\Programs\Python\Python311\Lib\site-packages\pip\_vendor\urllib3\response.py", line 438, in _error_catcher
    yield
  File "C:\Users\divya.DIVYAM\AppData\Local\Programs\Python\Python311\Lib\site-packages\pip\_vendor\urllib3\response.py", line 561, in read
    data = self._fp_read(amt) if not fp_closed else b""
           ^^^^^^^^^^^^^^^^^^
  File "C:\Users\divya.DIVYAM\AppData\Local\Programs\Python\Python311\Lib\site-packages\pip\_vendor\urllib3\response.py", line 527, in _fp_read
    return self._fp.read(amt) if amt is not None else self._fp.read()
           ^^^^^^^^^^^^^^^^^^
  File "C:\Users\divya.DIVYAM\AppData\Local\Programs\Python\Python311\Lib\site-packages\pip\_vendor\cachecontrol\filewrapper.py", line 98, in read
    data: bytes = self.__fp.read(amt)
                  ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\divya.DIVYAM\AppData\Local\Programs\Python\Python311\Lib\http\client.py", lin