In [None]:
pip install pymupdf
pip install einops flash_attn
pip install python-docx

In [None]:
import os
import fitz  
from PIL import Image, ImageDraw, ImageFont
import io
from io import BytesIO
import pytesseract  
import torch 
import requests  
from transformers import Blip2Processor, Blip2ForConditionalGeneration, PegasusForConditionalGeneration, PegasusTokenizer, AutoProcessor, AutoModelForCausalLM 
from docx import Document 
from docx.shared import Inches  
import tempfile  
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import BartForConditionalGeneration, BartTokenizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import BartTokenizer
import numpy as np 
import pandas as pd 
# Fetch the NLTK resources used below: sentence/word tokenizer models and the
# English stop-word list. Newer NLTK releases load the tokenizer from the
# 'punkt_tab' resource instead of the pickled 'punkt' one, so both are
# requested to keep sent_tokenize/word_tokenize working across versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')


In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Load the Florence-2 processor and model used for image captioning.
# NOTE(review): trust_remote_code=True lets transformers execute Python code
# shipped with the checkpoint repository; keep it only for trusted sources.
processor = AutoProcessor.from_pretrained(
    "microsoft/Florence-2-large",
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Florence-2-large",
    trust_remote_code=True,
)
model = model.to(device)

In [None]:
def run_example(image, task_prompt, text_input=None):
    """Run a single Florence-2 task on ``image`` and return its parsed result.

    Args:
        image: Image object exposing ``width`` and ``height`` (the callers in
            this notebook pass RGB PIL images).
        task_prompt: Task token sent to the model, e.g.
            ``"<MORE_DETAILED_CAPTION>"``.
        text_input: Optional extra text appended directly to ``task_prompt``.

    Returns:
        The entry of the post-processed answer stored under ``task_prompt``.
    """
    prompt = task_prompt if text_input is None else task_prompt + text_input

    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3,
    )
    # Special tokens are kept because the post-processing step parses them.
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image.width, image.height),
    )
    # Look the result up by the task that was actually requested instead of a
    # hard-coded "<MORE_DETAILED_CAPTION>" key, which raised KeyError for any
    # other task prompt.
    return parsed_answer[task_prompt]

In [None]:
def pdf_doc(pdf_document):
    """Extract per-page text from a PDF, appending a caption for each image.

    Args:
        pdf_document: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        A list with one string per page: the page text followed by an
        ``Image N`` / ``Caption`` section for every image listed on that page.
    """
    prompt = "<MORE_DETAILED_CAPTION>"
    page_text_list = []

    # The context manager closes the document (and its file handle) even when
    # image decoding or captioning raises part-way through.
    with fitz.open(pdf_document) as doc:
        for page_number in range(len(doc)):
            page = doc.load_page(page_number)
            page_text_combined = page.get_text()

            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]  # first field of the image entry is its xref
                image_bytes = doc.extract_image(xref)["image"]
                image = Image.open(io.BytesIO(image_bytes)).convert('RGB')

                generated_caption = run_example(image, prompt)
                page_text_combined += (
                    f"\n\nImage {img_index + 1}:\nCaption: {generated_caption}\n"
                )

            page_text_list.append(page_text_combined)

    return page_text_list

In [None]:
def docx_doc(docx_document):
    """Extract paragraph text from a DOCX file, captioning embedded pictures.

    Args:
        docx_document: Path (or file-like object) passed to ``Document``.

    Returns:
        A list of strings: each paragraph's text, followed by one
        ``"Caption: ...\\n"`` entry for every embedded picture found in that
        paragraph's runs.
    """
    embed_attr = "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
    prompt = "<MORE_DETAILED_CAPTION>"

    doc = Document(docx_document)
    doc_text_list = []
    for para in doc.paragraphs:
        doc_text_list.append(para.text)

        for run in para.runs:
            # Cheap textual pre-check: only runs that contain a picture fill.
            if "pic:blipFill" not in run.element.xml:
                continue

            # A run may hold several pictures; caption every one of them
            # rather than only the first.
            for blip in run.element.xpath(".//a:blip"):
                rel_id = blip.get(embed_attr)
                if rel_id is None:
                    # No r:embed relationship (e.g. a linked, not embedded,
                    # picture): there are no bytes in the package to caption.
                    continue

                image_bytes = doc.part.related_parts[rel_id].blob
                image = Image.open(io.BytesIO(image_bytes)).convert('RGB')
                generated_caption = run_example(image, prompt)
                doc_text_list.append(f"Caption: {generated_caption}\n")
    return doc_text_list

    



In [None]:
def determine_file_type(file_path):
    """Classify ``file_path`` by its extension (case-insensitive).

    Returns ``'docx'`` for ``.docx`` files, ``'pdf'`` for ``.pdf`` files and
    ``'unknown'`` for everything else.
    """
    extension = os.path.splitext(file_path)[1].lower()
    known_types = {'.docx': 'docx', '.pdf': 'pdf'}
    return known_types.get(extension, 'unknown')

In [None]:
def get_text(path):
    """Return the extracted text pieces of the document at ``path``.

    PDF files are handled by ``pdf_doc`` and DOCX files by ``docx_doc``. For
    any other file type a message is printed and an empty list is returned.
    """
    kind = determine_file_type(path)
    if kind == 'pdf':
        return pdf_doc(path)
    if kind == 'docx':
        return docx_doc(path)

    print("invalid file type")
    return []

In [None]:
def preprocess_text(text):
    """Lower-case ``text`` and keep only alphabetic, non-stop-word tokens.

    The text is tokenised with ``word_tokenize``; tokens that are not purely
    alphabetic or that appear in NLTK's English stop-word list are dropped.
    The surviving tokens are returned joined by single spaces.
    """
    candidates = word_tokenize(text.lower())
    english_stops = set(stopwords.words('english'))

    kept = []
    for token in candidates:
        if not token.isalpha():
            continue
        if token in english_stops:
            continue
        kept.append(token)
    return ' '.join(kept)


In [None]:
def process_text_in_sentence_chunks(path, max_chunk_size):
    """Split a document's text into sentence-aligned chunks.

    The text returned by ``get_text(path)`` is joined, newlines are replaced
    by spaces, and the result is split into sentences with
    ``nltk.sent_tokenize``. Consecutive sentences are packed into a chunk
    while the chunk, including the joining spaces, stays within
    ``max_chunk_size`` characters.

    Args:
        path: Path of the document to read.
        max_chunk_size: Target maximum chunk length in characters. A single
            sentence longer than this is still emitted as its own chunk.

    Returns:
        A list of non-empty chunk strings, in document order.
    """
    text = '\n'.join(get_text(path)).replace("\n", " ")
    sentences = nltk.sent_tokenize(text)

    text_list = []
    current_chunk = ""
    for sentence in sentences:
        # Measure the chunk as it would actually look after appending, so the
        # separating space always counts toward the limit.
        candidate = f"{current_chunk} {sentence}" if current_chunk else sentence
        if len(candidate) <= max_chunk_size:
            current_chunk = candidate
            continue

        # Flush only a non-empty chunk: when the very first sentence is
        # already over the limit there is nothing to flush yet, and emitting
        # "" would hand an empty string to the summarisers.
        if current_chunk.strip():
            text_list.append(current_chunk.strip())
        current_chunk = sentence

    if current_chunk.strip():
        text_list.append(current_chunk.strip())
    return text_list

# Models

In [None]:
# Load the T5 summarisation checkpoint and its tokenizer, then place the
# model on the GPU when one is available (CPU otherwise).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = 't5-large'
tokenizer1 = T5Tokenizer.from_pretrained(model_name)
model1 = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

def generate_summary_t5(text, max_length=60):
    """Summarise ``text`` with the T5 model loaded as ``model1``.

    Args:
        text: Input text; surrounding whitespace is stripped and newlines are
            replaced by spaces before the ``"summarize: "`` prefix is added.
        max_length: ``max_length`` passed to ``model1.generate``.

    Returns:
        The decoded summary with special tokens removed.
    """
    flattened = text.strip().replace("\n", " ")

    # The encoded prompt is truncated to 512 tokens before generation.
    encoded = tokenizer1.encode(
        "summarize: " + flattened,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    ).to(device)

    generated = model1.generate(
        encoded,
        max_length=max_length,
        min_length=5,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer1.decode(generated[0], skip_special_tokens=True)

In [None]:
# Load the BART CNN summarisation checkpoint and its tokenizer, then place
# the model on the GPU when one is available (CPU otherwise).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "facebook/bart-large-cnn"
tokenizer2 = BartTokenizer.from_pretrained(model_name)
model2 = BartForConditionalGeneration.from_pretrained(model_name).to(device)


def generate_summary_bart(input_text):
    """Summarise ``input_text`` with the BART model loaded as ``model2``.

    Args:
        input_text: Text to summarise; the tokenised input is truncated to
            1024 tokens.

    Returns:
        The decoded summary with special tokens removed.
    """
    # Send the inputs to the same device the model was moved to. The previous
    # hard-coded "cuda" crashed on machines without a GPU, where ``device``
    # resolves to the CPU.
    inputs = tokenizer2(
        input_text,
        return_tensors="pt",
        max_length=1024,
        truncation=True,
    ).to(device)
    summary_ids = model2.generate(
        inputs['input_ids'],
        max_length=80,
        min_length=5,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer2.decode(summary_ids[0], skip_special_tokens=True)


In [None]:
# Load the Pegasus summarisation checkpoint and its tokenizer.
model_name = "google/pegasus-large"
tokenizer3 = PegasusTokenizer.from_pretrained(model_name)
model3 = PegasusForConditionalGeneration.from_pretrained(model_name)

# Place the model on the GPU when one is available, as the T5 and BART cells
# do; previously Pegasus was left on the CPU regardless of the hardware.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model3.to(device)


def generate_summary_pegasus(input_text, max_input_length=1024, max_summary_length=60, min_summary_length=15):
    """Summarise ``input_text`` with the Pegasus model loaded as ``model3``.

    Args:
        input_text: Text to summarise.
        max_input_length: Token limit the tokenised input is truncated to.
        max_summary_length: ``max_length`` passed to ``model3.generate``.
        min_summary_length: ``min_length`` passed to ``model3.generate``.

    Returns:
        The decoded summary with special tokens removed.
    """
    # Inputs must live on the same device as the model.
    inputs = tokenizer3(
        input_text,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    ).to(device)
    summary_ids = model3.generate(
        inputs['input_ids'],
        max_length=max_summary_length,
        min_length=min_summary_length,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True,
    )
    return tokenizer3.decode(summary_ids[0], skip_special_tokens=True)

In [None]:
# Document to summarise (a Kaggle dataset input path; change for other environments).
path = "/kaggle/input/bsebse/BSE102_Lecture 3_Chemical basis of life.pdf"


In [None]:
# T5 run: chunk the document with a 512-character target size and print one
# summary per chunk.
input_text = process_text_in_sentence_chunks(path, 512)
for text in input_text:
    summary = generate_summary_t5(text)
    print(summary)

In [None]:
# BART run: chunk the document with a 900-character target size and print one
# summary per chunk.
input_text = process_text_in_sentence_chunks(path, 900)
for text in input_text:
    summary = generate_summary_bart(text)
    print(summary)

In [None]:
# Pegasus run: chunk the document with a 512-character target size and print
# one summary per chunk.
input_text = process_text_in_sentence_chunks(path, 512)
for text in input_text:
    summary = generate_summary_pegasus(text)
    print(summary)