In [1]:
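# # Install packages (uncomment on a fresh environment)
# !pip install langchain
# !pip install langchain_community
# !pip install langchain_ollama
# !pip install transformers
# !pip install torch
# !pip install num2words
# !pip install contractions
# !pip install pypdf
# !pip install spacy
# !python -m spacy download en_core_web_sm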

In [2]:
# Import Libraries
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import torch
import re
from num2words import num2words
import contractions
import spacy

# Ask spaCy to use a GPU if it can; this is called before the pipeline is loaded.
spacy.prefer_gpu()
# Load the small English pipeline.
# NOTE(review): `nlp` is only referenced by the commented-out POS-tagging code in
# `preprocessing_text`; confirm it is still needed.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Select the compute device: CUDA when PyTorch reports it as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [4]:
# Configuration shared by the cells below
name_model = "google/pegasus-large"  # checkpoint name given to from_pretrained for the model and tokenizer
chunk_size = 200  # chunk_size argument of RecursiveCharacterTextSplitter
chunk_overlap = 20  # chunk_overlap argument of RecursiveCharacterTextSplitter
max_length = 100  # max_length argument of model.generate
min_length = 50  # min_length argument of model.generate
num_beams = 3  # num_beams argument of model.generate
model_generate_name = "llama3.2:1b"  # model name given to OllamaLLM
temperature = 0.7  # temperature given to OllamaLLM

In [5]:
# Load the seq2seq checkpoint named by `name_model`, move it to `device`,
# and load the tokenizer from the same checkpoint.
# NOTE(review): loading prints a warning about newly initialized `embed_positions`
# weights; presumably benign for this checkpoint, but confirm before relying on the output.
model = AutoModelForSeq2SeqLM.from_pretrained(name_model).to(device)
tokenizer = AutoTokenizer.from_pretrained(name_model)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Read a PDF through LangChain's PyPDFLoader
def load_documents(file_path):
  """Load the PDF at `file_path` and return the loader's result.

  Args:
    file_path: path handed to `PyPDFLoader`.

  Returns:
    Whatever `PyPDFLoader(file_path).load()` returns; `preprocessing_text`
    iterates it and reads `.page_content` from each item.
  """
  return PyPDFLoader(file_path).load()

# Preprocessing text
def preprocessing_text(document):
  """Flatten the loaded pages into one cleaned string ready for chunking.

  Steps, in order:
    1. Join the `page_content` of every page with single spaces.
    2. Expand contractions ("don't" -> "do not") with `contractions.fix`.
    3. Spell out whitespace-delimited tokens that are plain integers via `num2words`.
    4. Remove every character that is neither a word character nor whitespace.

  Args:
    document: iterable of objects exposing a `page_content` string.

  Returns:
    The cleaned text as a single string.
  """
  text = " ".join(page.page_content for page in document)
  # Expand contractions, e.g. "don't" => "do not"
  expanded_text = contractions.fix(text)

  # Convert standalone integer tokens to words.
  # `isdecimal` is used rather than `isdigit`: `isdigit` is also true for characters such
  # as superscripts ("²"), which `int()` rejects with ValueError.
  # Hyphens produced by num2words ("twenty-one") are turned into spaces so the
  # punctuation removal below does not fuse the words into "twentyone".
  words = [
      num2words(int(word), lang = "en").replace("-", " ") if word.isdecimal() else word
      for word in expanded_text.split()
  ]
  expanded_text = " ".join(words)

  # Remove special characters (anything that is not a word character or whitespace)
  cleaned_text = re.sub(r"[^\w\s]", "", expanded_text)

  # # Using POS Tagging with spacy
  # doc = nlp(cleaned_text)
  # pos_tags = [(token.text, token.pos_) for token in doc]

  # # concat the pos tags with token
  # text_with_pos = " ".join([f"{word}_{pos}" for word, pos in pos_tags])

  return cleaned_text

# Split the text into chunks
def chunk_document(document):
  """Split `document` into chunks with a `RecursiveCharacterTextSplitter`.

  The splitter is configured from the module-level `chunk_size` and
  `chunk_overlap` values, read at call time.

  Args:
    document: the text passed to the splitter's `split_text`.

  Returns:
    The result of `split_text(document)`.
  """
  splitter = RecursiveCharacterTextSplitter(chunk_size = chunk_size, chunk_overlap = chunk_overlap)
  return splitter.split_text(document)

In [7]:
def summary_model(chunk):
  """Summarize one text, or a batch of texts, with the loaded seq2seq model.

  Args:
    chunk: a string, or a list of strings that are tokenized together as one
      padded batch.

  Returns:
    The list of strings produced by `tokenizer.batch_decode` for the generated
    sequences; an empty list when `chunk` is an empty list or tuple.
  """
  # An empty batch has nothing to tokenize or generate from.
  if isinstance(chunk, (list, tuple)) and len(chunk) == 0:
    return []

  inputs = tokenizer(
      text = chunk,
      truncation = True,
      padding = True,
      return_tensors = "pt"
  ).to(device)
  summary_process = model.generate(
      input_ids = inputs["input_ids"],
      # Padding is enabled above, so the mask must be forwarded; without it the
      # model attends to pad tokens when the batch holds texts of different lengths.
      attention_mask = inputs["attention_mask"],
      max_length = max_length,
      min_length = min_length,
      num_beams = num_beams,
      early_stopping = True
  )
  return tokenizer.batch_decode(summary_process, skip_special_tokens = True)

In [8]:
def summarization(chunks):
  """Summarize each chunk, then summarize the concatenation of those summaries.

  Args:
    chunks: iterable of texts; each one is passed to `summary_model` on its own.

  Returns:
    The list returned by `summary_model` for the joined per-chunk summaries, or
    an empty list when no chunk produced a non-blank summary.
  """
  partial_summaries = []
  for chunk in chunks:
    chunk_summary = summary_model(chunk)
    # Skip missing or blank results instead of inserting placeholder text: anything
    # appended here becomes input to the final summarization pass.
    if chunk_summary and chunk_summary[0].strip():
      partial_summaries.append(chunk_summary[0])

  # Nothing to condense; avoid asking the model to summarize an empty string.
  if not partial_summaries:
    return []

  return summary_model(" ".join(partial_summaries))

In [9]:
# Path of the PDF to summarize (relative path)
pdf_file = "The Chronicles of Aeloria.pdf"

In [10]:
# Load the PDF through `load_documents`
document = load_documents(pdf_file)

In [11]:
# Join the pages and clean the text (contractions, numbers, special characters)
cleaned_document = preprocessing_text(document = document)

In [12]:
# Split the cleaned text into chunks using the configured chunk_size / chunk_overlap
chunks = chunk_document(document = cleaned_document)

In [13]:
# Summarize every chunk in a single batched call; the result is a list of strings.
# NOTE(review): the whole list of chunks goes through the model at once; for a long
# document this may need to be split into smaller batches.
model_summary = summary_model(chunk = chunks)

In [14]:
# Condense the per-chunk summaries into one final summary (a list of strings).
# NOTE(review): `summarization` summarizes each item it receives before the final pass,
# and `model_summary` already holds summaries, so every chunk is summarized twice here.
# Confirm this is intended; `summarization(chunks = chunks)` would do a single pass per chunk.
summary = summarization(chunks = model_summary)
print(summary)

['The Chronicles of Aeloria The Lost Kingdom In the land of Aeloria where rolling hills met dense forests and towering mountains there existed a kingdom shrouded in mystery Once a thriving realm of Herodotus The Lost Kingdom In the land of Aeloria where rolling hills met dense forests and towering mountains there existed a kingdom shrouded in mystery Once a thriving realm of Herodotus a thriving realm of magic and wonder Aeloria had fallen into ruin centuries ago its people scattered and its secrets buried beneath the sands']


In [15]:
from langchain_ollama import OllamaLLM
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnableSequence
import time

In [28]:
class Gneration:
    """Rewrites a text with an Ollama-served LLM using a fixed rephrasing prompt.

    The class name is kept exactly as spelled because other cells instantiate it.
    """

    def __init__(self, documents):
        """Build the LLM client, the prompt template, and the prompt -> LLM chain.

        Args:
            documents: the text substituted for `{document}` in the prompt.
        """
        self.documents = documents
        self.llm_model = OllamaLLM(
            model = model_generate_name,
            temperature = temperature,
            streaming = True
        )
        self.prompt_template = PromptTemplate(
            input_variables = ["topic", "document", "word_limit"],
            template = (
                "Hello, could you please rephrase the following text about '{topic}' for better clarity and quality? "
                "Focus on the main points, use proper grammar and spelling, and ensure the language is clear and concise. "
                "Highlight all essential aspects. Please rewrite the following text in a professional and elegant manner. "
                "Present the content in a clear and structured format. Begin with an engaging opening sentence that provides context "
                "to the topic '{topic}'. Follow it with a logically ordered presentation of the main points. Use precise and scientific "
                "language, incorporating essential terms where appropriate. Conclude with a brief and impactful closing sentence. "
                "Ensure the final text appears polished and visually appealing, suitable for a professional audience. "
                "Please limit the response to exactly {word_limit} words. The text is as follows:\n\n{document}"
            )
        )
        # Piping the prompt into the model already yields a runnable sequence,
        # so no additional RunnableSequence(...) wrapper is needed.
        self.runnable_sequence = self.prompt_template | self.llm_model

    def interact(self, topic = None, word_limit = 100):
        """Run the chain, print the response as it arrives, and return it.

        Args:
            topic: value substituted for `{topic}` in the prompt.
            word_limit: value substituted for `{word_limit}` in the prompt.

        Returns:
            The complete response text (the concatenation of all streamed pieces).
        """
        inputs = {
            "document": self.documents,
            "topic": topic,
            "word_limit": word_limit
        }
        print("AI Summary : ", end = "", flush = True)
        # Stream from the chain so text is shown as it is produced, instead of waiting
        # for the full response and then replaying it character by character.
        pieces = []
        for piece in self.runnable_sequence.stream(inputs):
            print(piece, end = "", flush = True)
            pieces.append(piece)
        print()
        return "".join(pieces)

In [29]:
# Inputs for the rewriting prompt
topic = "The Chronicles of Aeloria: The Lost Kingdom"  # substituted for {topic}
# NOTE(review): this only takes effect if it is passed to `interact`, whose own default is 100.
word_limit = 120

In [30]:
# `summarization` returns a list of strings; join it so the prompt receives plain text
# rather than the Python repr of a list.
summary_text = summary if isinstance(summary, str) else " ".join(summary)
generate_summary = Gneration(documents = summary_text)
# Forward the configured word limit; otherwise `interact` falls back to its default of 100.
# `interact` prints the response itself, so the return value is not printed again here.
detailed_summary = generate_summary.interact(topic = topic, word_limit = word_limit)

AI Summary : **The Chronicles of Aeloria: The Lost Kingdom**

In the realm of Aeloria, where hills gave way to forests and mountains rose high, a kingdom once thrived under Herodotus' guidance. Aeloria, a land of majesty and wonder, was ravaged by time, leaving behind only whispers of its former glory. Once a bastion of magic and enchantment, the realm succumbed to darkness, with secrets buried beneath the sands that had witnessed its decline.

**Key Points:**

* The kingdom existed in Aeloria
* Rolling hills and dense forests characterize the landscape
* Towering mountains dominate the terrain
* The kingdom was once a thriving realm of magic and wonder
* It fell into ruin centuries ago

**Conclusion:**
Aeloria's downfall serves as a reminder of the fragile nature of even the most majestic realms.None
