In [1]:
import fitz  # PyMuPDF
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        pdf_path: Location of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        One string holding each page's ``get_text()`` output in page order,
        with nothing inserted between pages. Empty if the document has no
        pages.
    """
    # The context manager closes the document (and its underlying file
    # handle) even when extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text() for page in doc)

def chunk_text(text, max_chunk_size=512):
    """Split text into chunks of whitespace-separated words.

    Words are packed greedily: a word joins the current chunk while the
    chunk's character length (words plus single separating spaces) stays
    within ``max_chunk_size``; otherwise the chunk is closed and the word
    starts a new one.

    Args:
        text: The text to split. Runs of whitespace are collapsed because
            words are obtained with ``str.split()``.
        max_chunk_size: Maximum chunk length in characters (not tokens).

    Returns:
        A list of non-empty chunk strings. A single word longer than
        ``max_chunk_size`` is kept whole as its own chunk, so that chunk
        exceeds the limit. Returns ``[]`` for empty or whitespace-only text.
    """
    chunks = []
    current_words = []
    current_len = 0  # character length of " ".join(current_words)

    for word in text.split():
        # A separating space is only needed once the chunk has content.
        needed = len(word) + (1 if current_words else 0)
        # Only flush a chunk that actually holds words: flushing an empty
        # one would emit a "" chunk when the very first word is oversized.
        if current_words and current_len + needed > max_chunk_size:
            chunks.append(" ".join(current_words))
            current_words = [word]
            current_len = len(word)
        else:
            current_words.append(word)
            current_len += needed

    if current_words:
        chunks.append(" ".join(current_words))
    return chunks

def _summarize(text, tokenizer, model, device, max_length, min_length):
    """Run one beam-search "summarize: " pass over text and decode the result."""
    # The prompt is truncated to 512 tokens, so anything beyond that is ignored.
    inputs = tokenizer.encode(
        "summarize: " + text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    ).to(device)
    outputs = model.generate(
        inputs,
        max_length=max_length,
        min_length=min_length,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


def generate_title(text):
    """Generate a short title for text by summarising it in two stages.

    The text is split with ``chunk_text``, each chunk is summarised with the
    ``t5-base`` model, and the joined chunk summaries are summarised once
    more with a much tighter length limit (``min_length=5``,
    ``max_length=15``) to produce the title.

    Args:
        text: The document text to title.

    Returns:
        The generated title string, or ``""`` when ``text`` is empty or
        contains only whitespace (there is nothing to summarise, and the
        model would otherwise invent a title from the bare prompt).
    """
    # Bail out before the expensive model load when there is no content.
    if not text or not text.strip():
        return ""

    # Check if CUDA is available and use it if possible
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_name = "t5-base"
    # NOTE: tokenizer and model are loaded on every call.
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

    # Stage 1: summarise each chunk independently.
    summaries = [
        _summarize(chunk, tokenizer, model, device, max_length=150, min_length=30)
        for chunk in chunk_text(text)
    ]

    # Stage 2: condense the combined summaries into a title-length output.
    # The combined text is subject to the same 512-token truncation, so for
    # long documents only the leading summaries influence the title.
    full_summary = " ".join(summaries)
    return _summarize(full_summary, tokenizer, model, device, max_length=15, min_length=5)


In [3]:

def main(pdf_path):
    """Extract the text of the PDF at pdf_path and return a generated title.

    Feeds the output of ``extract_text_from_pdf`` directly into
    ``generate_title`` and returns whatever that call produces.
    """
    return generate_title(extract_text_from_pdf(pdf_path))

if __name__ == "__main__":
    import os

    # The PDF location can be overridden through the PDF_PATH environment
    # variable; when it is unset, the original hardcoded path is used.
    pdf_path = os.environ.get("PDF_PATH", "/home/arunav/Downloads/jeff103.pdf")
    title = main(pdf_path)
    print("Generated Title:", title)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Generated Title: boy bent his head and ran away to hole under ledge where
