In [3]:
import fitz  # PyMuPDF
from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration
import torch
from summa import keywords, summarizer

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Location of the PDF file; passed straight to ``fitz.open``.

    Returns:
        One string made of each page's ``get_text()`` result, joined with
        no separator between pages.
    """
    # Open through the context manager so the document handle is released
    # even when text extraction raises; previously it was never closed.
    with fitz.open(pdf_path) as doc:
        # A single join avoids rebuilding the string once per page.
        return "".join(page.get_text() for page in doc)

def extract_keywords_and_sentences(text):
    """Extract keywords and a summary from ``text`` using ``summa``.

    Args:
        text: The document text to analyse.

    Returns:
        A ``(key_words, important_sentences)`` tuple. ``key_words`` is the
        list of non-empty lines of the ``keywords.keywords(text, words=10)``
        result; ``important_sentences`` is whatever
        ``summarizer.summarize(text, words=100)`` returns.
    """
    raw_keywords = keywords.keywords(text, words=10)
    # Splitting an empty string gives [''], which would show up as a bogus
    # empty keyword; keep only the non-empty entries.
    key_words = [entry for entry in raw_keywords.split('\n') if entry]
    important_sentences = summarizer.summarize(text, words=100)
    return key_words, important_sentences

def generate_title(important_sentences):
    """Generate a short title from summary text with the ``t5-base`` model.

    The tokenizer and model are loaded on every call, the model is moved to
    the GPU when CUDA is available (CPU otherwise), and the text is fed to
    the model behind a ``"summarize: "`` prefix.

    Args:
        important_sentences: Text to condense; concatenated onto the prefix.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    checkpoint = "t5-base"
    # Run on the GPU when one is usable, otherwise fall back to the CPU.
    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    t5_tokenizer = T5Tokenizer.from_pretrained(checkpoint)
    t5_model = T5ForConditionalGeneration.from_pretrained(checkpoint)
    t5_model = t5_model.to(target_device)

    # Encoded prompt is truncated to at most 512 tokens.
    prompt = "summarize: " + important_sentences
    input_ids = t5_tokenizer.encode(
        prompt,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    input_ids = input_ids.to(target_device)

    # Beam search settings; generated length is bounded to 5..15 tokens.
    generation_options = {
        "max_length": 15,
        "min_length": 5,
        "length_penalty": 5.0,
        "num_beams": 4,
        "early_stopping": True,
    }
    generated = t5_model.generate(input_ids, **generation_options)

    return t5_tokenizer.decode(generated[0], skip_special_tokens=True)

def main(pdf_path):
    """Run the PDF-to-title pipeline and return the generated title.

    Reads the PDF's text, derives keywords and a summary from it, prints the
    keywords on one space-separated line, and then generates a title from
    the summary.

    Args:
        pdf_path: Path of the PDF to process.

    Returns:
        The title produced by ``generate_title``.
    """
    document_text = extract_text_from_pdf(pdf_path)
    found_keywords, summary = extract_keywords_and_sentences(document_text)

    joined_keywords = ' '.join(found_keywords)
    print(f"Keywords: {joined_keywords}")

    return generate_title(summary)

# Script entry point: process one hard-coded PDF and print its generated title.
if __name__ == "__main__":
    # Hard-coded input document; edit this path to process a different PDF.
    pdf_path = "/home/arunav/Downloads/jeff104.pdf"
    # main() prints the extracted keywords itself and returns the title.
    title = main(pdf_path)
    print("Generated Title:", title)


Keywords: anne diary diaries amanda write writing writes reprint ii frank iii word words make makes making


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Generated Title: compare what Anne writes to what she writes in her diary. compare
