# RAG SYSTEM FOR DOCUMENT SUMMARIZATION

### API KEY SETUP

In [1]:
# setting up all the api keys 
# i have stored my all of the keys in 'Keys' modul
import os
from Keys import LangChain_api, OpenAi_api
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_API_KEY'] = LangChain_api
os.environ['OPENAI_API_KEY'] = OpenAi_api

# Part 1: Data preprocessing
### pdfs featching

In [2]:
# loading the pdfs into a document obeject of langchain
from langchain_community.document_loaders import PyPDFLoader

docs = []
loader = PyPDFLoader("D:/Masters/Sem 3/Deep learning/data/United States v. Keysight.pdf")
docs = loader.load()
docs

[Document(metadata={'producer': 'iText® Core 7.2.3 (production version) ©2000-2022 iText Group NV, Government Publishing Office', 'creator': 'govinfo, U. S. Government Publishing Office', 'creationdate': '2025-06-12T03:45:31+00:00', 'moddate': '2025-06-11T23:46:53-04:00', 'source': 'D:/Masters/Sem 3/Deep learning/data/United States v. Keysight.pdf', 'total_pages': 81, 'page': 0, 'page_label': '1'}, page_content='24870 Federal Register / Vol. 90, No. 112 / Thursday, June 12, 2025 / Notices \nDEPARTMENT OF JUSTICE \nAntitrust Division \nUnited States v. Keysight \nTechnologies Inc., et al.; Proposed \nFinal Judgment and Competitive \nImpact Statement \nNotice is hereby given pursuant to the \nAntitrust Procedures and Penalties Act, \n15 U.S.C. 16(b)–(h), that a proposed \nFinal Judgment, Stipulation, and \nCompetitive Impact Statement have \nbeen filed with the United States \nDistrict Court for the District of \nColumbia in United States of America v. \nKeysight Technologies, Inc., et a

### Document splits

In [3]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# spliting each document in size of 2000 with overlap of 200
splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=2000, chunk_overlap=800)
splits = splitter.split_documents(docs)
for split in splits:
    print("-"*100)
    print(split)


USER_AGENT environment variable not set, consider setting it to identify your requests.


----------------------------------------------------------------------------------------------------
page_content='24870 Federal Register / Vol. 90, No. 112 / Thursday, June 12, 2025 / Notices 
DEPARTMENT OF JUSTICE 
Antitrust Division 
United States v. Keysight 
Technologies Inc., et al.; Proposed 
Final Judgment and Competitive 
Impact Statement 
Notice is hereby given pursuant to the 
Antitrust Procedures and Penalties Act, 
15 U.S.C. 16(b)–(h), that a proposed 
Final Judgment, Stipulation, and 
Competitive Impact Statement have 
been filed with the United States 
District Court for the District of 
Columbia in United States of America v. 
Keysight Technologies, Inc., et al., Civil 
Action No. 1:25–cv–01734–CJN. On 
June 2, 2025 the United States filed a 
Complaint alleging that Keysight’s 
proposed acquisition of Spirent 
Communications plc would violate 
Section 7 of the Clayton Act, 15 U.S.C. 
18. The proposed Final Judgment, filed 
at the same time as the Complaint, 
requires Ke

### Summarizing each splits 

In [4]:
import uuid

from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# this chain use each split and pass it to a gpt 3.5 and summarize the split
chain = (
    {"doc": lambda x: x.page_content}
    | ChatPromptTemplate.from_template("Summarize the following document:\n\n{doc}")
    | ChatOpenAI(model="gpt-3.5-turbo",max_retries=0)
    | StrOutputParser()
)

summaries = chain.batch(splits, {"max_concurrency": 5})

for s in summaries:
    print("-"*100)
    print(s)

----------------------------------------------------------------------------------------------------
The document announces a proposed Final Judgment related to the acquisition of Spirent Communications plc by Keysight Technologies Inc., which is alleged to violate the Clayton Act. The proposed Final Judgment requires Keysight and Spirent to divest certain assets and provide transitional services to support the divested businesses. The document invites public comment and provides information on how to submit comments. It also includes details about the nature of the action, the defendants, the proposed transaction, jurisdiction, and venue.
----------------------------------------------------------------------------------------------------
The document discusses the importance of specialized testing equipment in the communications industry for verifying the performance of networks and devices. It specifically focuses on high-speed ethernet testing equipment, network security testing equ

# Part 2: Data store 
###### This section store each of the summaries into a vector database with the document split id.
###### And the original split of which summaries are generated are stored in a storage layer.
###### At the time of retrival both the summaries and original document split will be retrived.

In [5]:
from langchain.storage import InMemoryByteStore
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.retrievers.multi_vector import MultiVectorRetriever

# The vectorstore to use to index the child chunks
vectorstore = Chroma(collection_name="summaries",
                     embedding_function=OpenAIEmbeddings())

# The storage layer for the parent documents
store = InMemoryByteStore()
id_key = "doc_id"

# The retriever
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    byte_store=store,
    id_key=id_key,
)
doc_ids = [str(uuid.uuid4()) for _ in splits]

# Docs linked to summaries
summary_docs = [
    Document(page_content=s, metadata={id_key: doc_ids[i]})
    for i, s in enumerate(summaries)
]

# Add
retriever.vectorstore.add_documents(summary_docs)
retriever.docstore.mset(list(zip(doc_ids, docs)))

  vectorstore = Chroma(collection_name="summaries",


# PART 3: LLM PROMPT

In [6]:
# This is the system prompt that set the behaviour and the output preferance of the LLM.
from langchain_core.prompts import PromptTemplate

prompt = PromptTemplate.from_template(
"""
You are an expert legal document summarizer tasked with creating a concise, accurate, and factually correct summary of a long legal document. The document may span hundreds of pages and has been split into smaller sections, with summaries of each section provided. Your task is to generate a comprehensive summary of the entire document based on the provided section summaries, ensuring no critical details are omitted and no information is fabricated. Follow these instructions:

1. **Input Context**: You will receive summaries of individual document sections retrieved from a vector database. These summaries represent key points from each section of the original legal document. If needed, you may reference the original document text from the document store for clarification or additional details.

2. **Summary Requirements**:
   - Produce a clear and concise summary of the entire document, capturing its main purpose, key arguments, critical legal points, and conclusions.
   - Ensure the summary is factually accurate and consistent with the provided section summaries.
   - Avoid introducing any information not explicitly supported by the section summaries or original document.
   - Highlight any critical legal terms, obligations, rights, or clauses that are central to the document’s intent.
   - If the section summaries contain conflicting or ambiguous information, flag these issues and suggest consulting the original document for clarification.

3. **Tone and Style**:
   - Use formal, professional language appropriate for legal contexts.
   - Be objective and neutral, avoiding speculative or interpretive language unless explicitly supported by the summaries.
   - Structure the summary with clear headings or sections (e.g., Purpose, Key Provisions, Conclusions) for readability.

4. **Output Length**: Aim for a summary length that suitable for the document, depending on the document’s complexity, unless otherwise specified. Ensure the summary is comprehensive but avoids unnecessary repetition.

5. **Factual Integrity**: Legal documents require precision. Cross-reference the provided section summaries to ensure no distortion of facts, and do not hallucinate or infer details beyond the given information.

**Input Summaries**:
{retrieved_summaries}

**Task**:
Generate a comprehensive summary of the legal document based on the provided section summaries. If any clarification is needed from the original document, indicate the specific section or topic requiring further review. Ensure the summary is structured, accurate, and suitable for a legal audience.

**Output Format**:
- **Document Overview**: Briefly state the document’s purpose and context.
- **Key Provisions**: Summarize critical legal points, clauses, or arguments.
- **Conclusions or Outcomes**: Highlight the document’s conclusions, decisions, or implications.
- **Notes (if applicable)**: Flag any ambiguities or areas needing further review.
"""
)

# PART 4: Main Pipeline
###### This is the main chain that executes all the operation one by one.
###### First, based on use question it will retrive the documents and summaries from the vectore store and doc store.
###### Then it will pass those recived summaries to the llm with our predefined prompt.
###### Lastly, llm will give the output based on the summaries.

In [7]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

llm = ChatOpenAI(model_name="gpt-4.1-mini", temperature=0.7)
rag_chain = (
    {"retrieved_summaries": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke("Summarize the entire document in 1000 words.")

'**Comprehensive Summary of United States v. Keysight Document**\n\n---\n\n### Document Overview\n\nThe document is a legal filing related to the case *United States v. Keysight*, published in the Federal Register (Vol. 90, No. 112, June 12, 2025). It appears to concern regulatory or enforcement actions involving Keysight Technologies, with the U.S. Government Publishing Office as the source. The document spans 81 pages and is part of official government records, reflecting proceedings, notices, or determinations issued by a federal agency or court. The context involves government oversight, compliance, or adjudication regarding Keysight’s conduct or products, under applicable federal statutes or administrative rules.\n\n---\n\n### Key Provisions\n\n- **Regulatory Framework**: The document operates within the framework established by federal law and regulations governing corporate compliance and enforcement actions. Specific statutes or regulatory provisions referenced are not detailed