PROCESS DOCUMENT

In [None]:
import os
import shutil
from utils.document_processor import DocumentProcessor
from chromadb import Client
from sentence_transformers import SentenceTransformer

# Directory that main(reset=True) deletes before re-indexing.
# NOTE(review): nothing visible in this file passes CHROMA_PATH to the Chroma
# client created below — confirm whether the client is meant to persist there.
CHROMA_PATH = "chroma"
# Directory that load_documents() scans for input files.
DATA_PATH = "./data"
# File-name suffixes accepted by load_documents(); names are lower-cased
# before the comparison, so the match is case-insensitive.
VALID_EXTENSIONS = ('.pdf', '.docx', '.txt')

# Module-level singletons shared by the functions below. They are created as
# soon as this cell runs.
# docs: receives raw file bytes plus the file name in load_documents().
docs = DocumentProcessor()
chroma_client = Client()
# Collection that add_to_chroma() reads existing ids from and writes new
# embeddings into.
collection = chroma_client.get_or_create_collection(name="document_vectors")
# Sentence-embedding model used by add_to_chroma() to encode chunk text.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

def load_documents():
    """Lazily process every supported file found directly inside DATA_PATH.

    A file is supported when its lower-cased name ends with one of
    VALID_EXTENSIONS. Files are visited in sorted name order so that the
    position of a document in the output is stable between runs; downstream
    code derives chunk ids from that position.

    Yields:
        Whatever ``docs.process_document(raw_bytes, filename)`` returns for
        each file. Other code in this file unpacks that value as a 4-item
        sequence whose first item is the content.

    Raises:
        OSError: if DATA_PATH cannot be listed or a file cannot be read.
    """
    # os.listdir() makes no ordering guarantee, so sort explicitly.
    for filename in sorted(os.listdir(DATA_PATH)):
        if not filename.lower().endswith(VALID_EXTENSIONS):
            continue
        filepath = os.path.join(DATA_PATH, filename)
        # A sub-directory may carry a matching suffix; open() would fail on it.
        if not os.path.isfile(filepath):
            continue
        with open(filepath, "rb") as f:
            yield docs.process_document(f.read(), filename)

def split_documents(extracted_docs, chunk_size=500):
    """Cut every page of every document into fixed-width text chunks.

    Args:
        extracted_docs: iterable of 4-item sequences whose first item is the
            document content: a sequence of page strings, or a falsy value
            (e.g. ``None``) for a document with no content. The remaining
            three items are ignored.
        chunk_size: maximum number of characters per chunk; must be positive.

    Yields:
        ``(chunk_id, chunk_text, doc_index, page_number)`` tuples, where
        ``chunk_id`` is ``"doc{doc_index}_page{page_number}_chunk{k}"`` and
        ``k`` is the position of the slice within its page. Slices that are
        empty after stripping whitespace are skipped but still consume their
        ``k``, so ids of the remaining chunks do not shift.

    Raises:
        ValueError: if ``chunk_size`` is not positive. Because this is a
            generator, the error surfaces on first iteration.
    """
    if chunk_size <= 0:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )

    for doc_index, (content, _, _, _) in enumerate(extracted_docs):
        for page_number, page_content in enumerate(content or []):
            # Tolerate pages with no extracted text (None or "") the same
            # way a document with no content is tolerated above.
            if not page_content:
                continue
            starts = range(0, len(page_content), chunk_size)
            for chunk_number, start in enumerate(starts):
                chunk = page_content[start:start + chunk_size].strip()
                if chunk:
                    yield (
                        f"doc{doc_index}_page{page_number}_chunk{chunk_number}",
                        chunk,
                        doc_index,
                        page_number,
                    )

def add_to_chroma(chunks):
    """Embed chunks that are not yet stored and add them to ``collection``.

    Args:
        chunks: iterable of ``(chunk_id, text, doc_index, page)`` tuples, the
            shape produced by ``split_documents``.

    Chunks whose id is already present in the collection, or that repeat an
    id seen earlier in ``chunks``, are skipped. For the rest, the embedding is
    stored together with ``{"doc_index": ..., "page": ...}`` metadata.

    NOTE(review): only embeddings and metadata are stored; the chunk text
    itself is not written to the collection — confirm that is intended.
    """
    seen_ids = set(collection.get(include=[]).get("ids", []))

    new_chunks = []
    for cid, text, doc_idx, page in chunks:
        if cid in seen_ids:
            continue
        # Also guards against the same id appearing twice in one call.
        seen_ids.add(cid)
        new_chunks.append((cid, text, doc_idx, page))

    if not new_chunks:
        return

    ids, texts, doc_indexes, pages = zip(*new_chunks)
    # Encode all texts in one call rather than invoking the model per chunk.
    embeddings = embedding_model.encode(list(texts)).tolist()
    metadatas = [
        {"doc_index": doc_idx, "page": page}
        for doc_idx, page in zip(doc_indexes, pages)
    ]
    collection.add(ids=list(ids), embeddings=embeddings, metadatas=metadatas)

def main(reset=False):
    """Index every document under DATA_PATH into the Chroma collection.

    Args:
        reset: when true, discard previously indexed data first: the
            CHROMA_PATH directory is removed if it exists, and every record
            currently held by the module-level ``collection`` is deleted.
    """
    if reset:
        if os.path.exists(CHROMA_PATH):
            shutil.rmtree(CHROMA_PATH)
        # The collection object was created when the module loaded and is
        # unaffected by removing the directory. Without clearing it,
        # add_to_chroma() would still treat every old id as existing and
        # skip it, so a "reset" run would re-index nothing.
        stale_ids = collection.get(include=[]).get("ids", [])
        if stale_ids:
            collection.delete(ids=list(stale_ids))

    add_to_chroma(split_documents(load_documents()))


OLLAMA MODEL

In [39]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM

In [40]:
# Model name handed to OllamaLLM when the LLM is constructed further down.
OLLAMA_MODEL = "llama3.2"
# NOTE(review): not referenced anywhere in the visible code (the Chroma
# collection created earlier is named "document_vectors"). "vectore" looks
# like a typo for "vector" — confirm before renaming, since this is a
# runtime value.
COLLECTION_NAME = "ollama_vectore_test"

TEST OLLAMA MODEL WITH GRAPHRAG

In [None]:
from IPython.display import display, Markdown
import networkx as nx
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class GraphRAG:
    """Similarity graph over a set of documents, used to summarize and rank them.

    Each document becomes a node keyed by its position in ``documents``.
    Nodes are linked when the cosine similarity of their TF-IDF vectors
    exceeds a threshold. Summaries produced by the language model are stored
    on the nodes, and PageRank over the graph decides which summaries are
    turned into insights.

    Args:
        documents: indexable sequence of document records. Each record is
            expected to be a sequence whose first item is the content: a
            list of pages (strings, or lists of fragments) or a single value.
        model: object exposing ``invoke(prompt)`` that can also be composed
            with ``prompt_template`` via the ``|`` operator.
        prompt_template: prompt with a ``document`` input variable.
    """

    def __init__(self, documents, model, prompt_template):
        self.documents = documents
        self.model = model
        self.prompt_template = prompt_template
        self.graph = nx.Graph()

    @staticmethod
    def _document_text(content):
        """Flatten a document's content into a single string.

        Lists are treated as pages joined by newlines; a page that is itself
        a list has its items joined by spaces. ``None`` content or pages
        contribute no text instead of the literal string "None".
        """
        if content is None:
            return ""
        if isinstance(content, list):
            return "\n".join(
                " ".join(map(str, page)) if isinstance(page, list) else str(page)
                for page in content
                if page is not None
            )
        return str(content)

    def create_document_embeddings(self):
        """Return the TF-IDF matrix of the documents, one row per document.

        Rows follow the order of ``self.documents``; an empty record yields an
        empty text so that row indexes stay aligned with document indexes.

        Raises:
            ValueError: raised by scikit-learn when the corpus contains no
                usable token at all (for example, no documents).
        """
        doc_texts = [
            self._document_text(doc[0]) if doc else ""
            for doc in self.documents
        ]
        return TfidfVectorizer().fit_transform(doc_texts)

    def build_document_graph(self, embeddings, similarity_threshold=0.2):
        """Add one node per document and an edge for each similar pair.

        Args:
            embeddings: matrix with one row per document, in document order.
            similarity_threshold: a pair is linked only when its cosine
                similarity is strictly greater than this value. The
                similarity is stored as the edge's ``weight``.
        """
        similarity_matrix = cosine_similarity(embeddings)
        doc_count = len(self.documents)

        for index, document in enumerate(self.documents):
            self.graph.add_node(index, text=document)

        # The matrix is symmetric, so only the upper triangle is visited.
        for i in range(doc_count):
            for j in range(i + 1, doc_count):
                score = similarity_matrix[i, j]
                if score > similarity_threshold:
                    self.graph.add_edge(i, j, weight=score)

    def summarize_documents(self):
        """Summarize each document with the model and store it on its node.

        Records that are empty, have fewer than four fields, or contain no
        non-whitespace text are skipped. The summary is stored under the
        node's ``summary`` attribute.
        """
        chain = self.prompt_template | self.model

        for doc_index, doc_data in enumerate(self.documents):
            if not doc_data or len(doc_data) < 4:
                continue

            full_text = self._document_text(doc_data[0])
            if not full_text.strip():
                continue

            summary = chain.invoke({"document": full_text})

            # add_node() updates the attributes of an existing node and
            # creates the node otherwise, so this also works when
            # build_document_graph() has not been called yet.
            self.graph.add_node(doc_index, summary=summary)

    def rank_documents(self):
        """Return ``(document_index, pagerank_score)`` pairs, best first."""
        pagerank = nx.pagerank(self.graph)
        return sorted(pagerank.items(), key=lambda item: item[1], reverse=True)

    def generate_insights(self, top_k=1):
        """Ask the model for insights on the highest-ranked summaries.

        Args:
            top_k: maximum number of insights to produce. Defaults to 1.

        Returns:
            List of model responses, in rank order. Documents without a
            summary are passed over rather than sent to the model with an
            empty prompt, so the list may be shorter than ``top_k``.
        """
        insights = []

        for idx, _score in self.rank_documents():
            if len(insights) >= top_k:
                break

            summary = self.graph.nodes[idx].get('summary', '')
            if not str(summary).strip():
                continue

            insight_prompt = f"""
            Based on the summary of highly rated documents, 
            provide important insights
            
            Summary: {summary}
            """

            insights.append(self.model.invoke(insight_prompt))

        return insights

# Summarization prompt; the "{document}" placeholder is filled with each
# document's full text by GraphRAG.summarize_documents().
template = """
You are a helpful assistant for text summarization. 
Only include information that is part of the document. 
Do not include your own opinion or analysis.

Document: 
"{document}"
Summary:
"""

# Prompt template and Ollama model handed to GraphRAG below.
prompt = ChatPromptTemplate.from_template(template)
model = OllamaLLM(model=OLLAMA_MODEL)

# load_documents() is a generator; materialize it because GraphRAG takes
# len() of the documents and iterates over them more than once.
documents = list(load_documents())

graph_rag = GraphRAG(documents, model, prompt)

# Build the similarity graph before summarizing: summaries are stored on the
# graph nodes, and insights are generated from those stored summaries.
embeddings = graph_rag.create_document_embeddings()
graph_rag.build_document_graph(embeddings)

graph_rag.summarize_documents() 

# Render every generated insight as Markdown in the notebook output.
insights = graph_rag.generate_insights()
for insight in insights:
    display(Markdown(insight))
