In [None]:
# Installing the required libraries
#
# Every package below is pinned to an exact version except pydantic, which is
# upgraded to whatever release is newest at install time.
# NOTE(review): the unpinned pydantic upgrade means two runs of this cell can
# produce different environments - consider pinning it like the other packages.
# NOTE(review): chromadb and sentence-transformers are installed here but are not
# imported by the notebook's import cell. sentence-transformers is presumably
# needed by the HuggingFace embedding integration; confirm whether chromadb is
# still required.
%pip install --upgrade pydantic                              # Upgrade pydantic to avoid any typing issues
%pip install "langchain==0.1.16"                              # langchain - for document loading and processing
%pip install "chromadb==0.4.24"                               # chromadb - for vector storage
%pip install "sentence-transformers==2.5.1"                   # sentence-transformers - for embedding generation
%pip install "llama-index==0.10.18"                           # llama-index - core framework for LLMs
%pip install "llama-index-llms-groq==0.1.3"                   # llama-index-llms-groq - Groq integration for LLMs
%pip install "groq==0.4.2"                                    # groq - API client for interacting with Groq models
%pip install "llama-index-embeddings-huggingface==0.2.0"      # HuggingFace embeddings for Llama-Index

In [None]:
# Importing libraries

import json  # For saving output in JSON format
import os  # For reading configuration (API key, document path) from the environment

from langchain.document_loaders import TextLoader  # For loading text files as documents
from langchain.text_splitter import CharacterTextSplitter  # For splitting documents into smaller chunks
from llama_index.core import VectorStoreIndex, ServiceContext  # For working with vector stores and services
from llama_index.core import Document as LlamaDocument  # LlamaDocument class for document processing in Llama-Index
from llama_index.embeddings.huggingface import HuggingFaceEmbedding  # For HuggingFace-based embeddings
from llama_index.llms.groq import Groq  # For Groq-based LLM

In [None]:
# Loading the document and splitting it into smaller, manageable chunks

# Path of the document to summarize. Set the LEGAL_DOC_PATH environment variable
# to point the notebook at a different file without editing this cell; when it is
# unset, the original author's local path is used as the default.
file_path = os.environ.get(
    "LEGAL_DOC_PATH",
    "C:/Users/M.Salah/Desktop/project/legal_assistant/Legal_document.txt",
)

# Fail early with an actionable message instead of a loader-internal error.
if not os.path.isfile(file_path):
    raise FileNotFoundError(
        f"Legal document not found at {file_path!r}. "
        "Set the LEGAL_DOC_PATH environment variable to the document's location."
    )

loader = TextLoader(file_path)  # No encoding is passed, so the loader's default applies
documents = loader.load()  # List of LangChain documents read from the file

# chunk_size / chunk_overlap are measured in characters.
# NOTE(review): CharacterTextSplitter splits only on its separator (a blank line by
# default, per the LangChain docs), so a paragraph longer than chunk_size can yield
# a chunk above 1000 characters - confirm this is acceptable for the input.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
chunks = text_splitter.split_documents(documents)  # Chunked LangChain documents

In [None]:
# Initializing the Groq LLM model and the embedding model

# The Groq API key is read from the environment so that no credential is stored
# in the notebook (notebooks are routinely shared and committed to version control).
# NOTE(review): a literal key was previously hardcoded in this cell; treat that key
# as leaked and rotate it in the Groq console.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise RuntimeError(
        "GROQ_API_KEY is not set. Export it in the environment that launches "
        "Jupyter before running this cell."
    )

# Groq-hosted chat model used to generate the summary.
llm = Groq(
    model="llama3-8b-8192",
    api_key=GROQ_API_KEY,
)

# HuggingFace embedding model with the integration's default settings.
# NOTE(review): no model name is given, so the library default is used - consider
# naming the model explicitly so results do not change with the package version.
embed_model = HuggingFaceEmbedding()

In [None]:
# Prompt sent to the query engine to request the legal summary.
# It consists of a heading line, five numbered requirements, and a closing style
# instruction, each separated by a single newline.
SUMMARY_PROMPT = "\n".join(
    [
        "Generate a comprehensive legal summary of the document with:",
        "1. Clear identification of all parties involved",
        "2. Key legal arguments presented",
        "3. Critical judicial reasoning",
        "4. Final ruling and its implications",
        "5. Notable legal precedents cited",
        (
            "Maintain professional legal tone and avoid colloquial language, "
            "and ensure accurate representation of all material facts. "
            "Prioritize clarity and conciseness while preserving legal nuances."
        ),
    ]
)

In [None]:
# Setting up the service context and creating the vector store index for efficient document querying

# Wrap each LangChain chunk in a LlamaIndex document.
# Only the chunk's text (`page_content`) is indexed: calling str() on a LangChain
# document yields its printable form, which also contains the field names and the
# metadata, and that noise would otherwise be embedded and shown to the LLM.
llama_documents = [
    LlamaDocument(
        text=chunk.page_content,  # Raw text of the chunk
        metadata={"source": file_path},  # Keep the originating file path for reference
    )
    for chunk in chunks
]
# To index the whole file instead of the pre-split chunks, build the list from
# `documents` using `doc.page_content` and `doc.metadata` in the same way.

# Bundle the LLM, the embedding model and the chunk size used by LlamaIndex.
# NOTE(review): ServiceContext is deprecated in llama-index 0.10 in favour of the
# global `Settings` object - confirm against the migration guide before upgrading.
service_context = ServiceContext.from_defaults(
    llm=llm,  # Groq LLM
    embed_model=embed_model,  # HuggingFace embedding model
    chunk_size=800,  # Chunk size applied by LlamaIndex when it parses the documents
)

# Creating the vector store index from the processed Llama documents
legal_index = VectorStoreIndex.from_documents(
    documents=llama_documents,  # Using the LlamaDocument list as input
    show_progress=True,  # Display progress
    service_context=service_context,  # Using the defined service context
)

In [None]:
# Run the summarization prompt against the index

# Query engine settings:
#   similarity_top_k=3       - retrieve the 3 chunks most similar to the prompt
#   response_mode="compact"  - NOTE(review): presumably packs the retrieved chunks
#                              into as few LLM calls as possible rather than
#                              shortening the answer; confirm in the LlamaIndex docs
# NOTE(review): only the 3 retrieved chunks reach the LLM, so the summary may not
# cover the whole document - confirm this is intended.
query_engine = legal_index.as_query_engine(similarity_top_k=3, response_mode="compact")

# The response object is converted to text with str() when it is saved later.
summary_response = query_engine.query(SUMMARY_PROMPT)

In [None]:
# Persist the summary to disk as JSON

# Record holding the source document path and the summary text.
output_data = dict(
    document_path=file_path,
    summary=str(summary_response),
)

# Serialize with a 2-space indent and write the result to the working directory.
with open("legal_summary.json", "w") as summary_file:
    summary_file.write(json.dumps(output_data, indent=2))

print("Summary successfully saved to legal_summary.json")

In [None]:
# Load the saved summary back from disk and pretty-print it

with open("legal_summary.json", "r") as summary_file:
    data = json.loads(summary_file.read())  # Parsed contents of the JSON file

# Printing happens after the file is closed; the output is the same.
print(json.dumps(data, indent=2))
