PROCESS DOCUMENT

In [1]:
import os
import shutil
from utils.document_processor import DocumentProcessor
from chromadb import Client
from sentence_transformers import SentenceTransformer

# Directory that main(reset=True) deletes.
# NOTE(review): Client() below is created without any reference to this path,
# so nothing visible in this notebook writes to it — confirm whether a
# persistent client was intended.
CHROMA_PATH = "chroma"
# Directory whose direct entries are scanned by load_documents().
DATA_PATH = "./data"
# File suffixes load_documents() accepts (compared against the lower-cased name).
VALID_EXTENSIONS = ('.pdf', '.docx', '.txt')

# Project-local extractor; load_documents() feeds it raw file bytes + filename.
docs = DocumentProcessor()
chroma_client = Client()
# Collection that add_to_chroma() reads existing IDs from and writes new vectors to.
collection = chroma_client.get_or_create_collection(name="document_vectors")
# Sentence-embedding model used by add_to_chroma() to vectorize chunk text.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

def load_documents():
    """Yield the processed form of every supported file found in DATA_PATH.

    Files are visited in sorted filename order. os.listdir() returns entries in
    arbitrary order, and split_documents() derives chunk IDs from each
    document's position, so a stable order keeps those IDs stable between runs.

    Yields:
        Whatever docs.process_document(raw_bytes, filename) returns. The rest of
        this notebook treats it as a sequence of at least four items whose first
        item is the extracted content.
    """
    for filename in sorted(os.listdir(DATA_PATH)):
        if not filename.lower().endswith(VALID_EXTENSIONS):
            continue
        filepath = os.path.join(DATA_PATH, filename)
        # A directory can carry a matching suffix; open() would fail on it.
        if not os.path.isfile(filepath):
            continue
        # Read inside the `with`, yield outside it, so the file handle is not
        # kept open while the generator is suspended between items.
        with open(filepath, "rb") as f:
            raw_bytes = f.read()
        yield docs.process_document(raw_bytes, filename)

def split_documents(extracted_docs, chunk_size=500):
    """Cut every page of every extracted document into fixed-width text chunks.

    Args:
        extracted_docs: iterable of 4-item records; only the first item (the
            per-page content, or a falsy value for "no content") is used.
        chunk_size: number of characters per slice before whitespace stripping.

    Yields:
        (chunk_id, chunk_text, doc_index, page_number) for each slice that is
        non-empty after stripping. chunk_id has the form
        "doc{d}_page{p}_chunk{k}" where k is the slice's ordinal on its page.
    """
    for doc_index, record in enumerate(extracted_docs):
        content, _, _, _ = record
        pages = content if content else []
        for page_number, page_content in enumerate(pages):
            slice_starts = range(0, len(page_content), chunk_size)
            for chunk_number, start in enumerate(slice_starts):
                chunk = page_content[start:start + chunk_size].strip()
                if not chunk:
                    # Whitespace-only slices are dropped, but still consume a
                    # chunk number so later IDs on the page do not shift.
                    continue
                chunk_id = f"doc{doc_index}_page{page_number}_chunk{chunk_number}"
                yield chunk_id, chunk, doc_index, page_number

def add_to_chroma(chunks):
    """Embed the chunks that are not yet in the collection and add them to it.

    Args:
        chunks: iterable of (chunk_id, text, doc_index, page_number) tuples, as
            produced by split_documents().

    Chunks whose ID is already stored are skipped. Each stored vector carries
    {"doc_index": ..., "page": ...} as metadata.
    """
    existing_ids = set(collection.get(include=[]).get("ids", []))

    # Keep the first occurrence of each unseen ID so the batch handed to
    # collection.add() never repeats an ID.
    pending = {}
    for cid, chunk, doc_idx, page in chunks:
        if cid in existing_ids or cid in pending:
            continue
        pending[cid] = (chunk, doc_idx, page)

    if not pending:
        return

    texts = [chunk for chunk, _, _ in pending.values()]
    # One batched encode() call instead of one model invocation per chunk.
    embeddings = embedding_model.encode(texts).tolist()
    metadatas = [
        {"doc_index": doc_idx, "page": page}
        for _, doc_idx, page in pending.values()
    ]
    collection.add(ids=list(pending), embeddings=embeddings, metadatas=metadatas)

def main(reset=False):
    """Load, chunk, embed and store every document found in DATA_PATH.

    Args:
        reset: when True, start from an empty vector store: remove the
            CHROMA_PATH directory if it exists and recreate the collection.
    """
    global collection

    if reset:
        if os.path.exists(CHROMA_PATH):
            shutil.rmtree(CHROMA_PATH)
        # chroma_client is created without reference to CHROMA_PATH, so removing
        # the directory alone leaves the live collection populated. Drop and
        # recreate it so add_to_chroma() really starts from scratch.
        collection_name = collection.name
        chroma_client.delete_collection(name=collection_name)
        collection = chroma_client.get_or_create_collection(name=collection_name)

    add_to_chroma(split_documents(load_documents()))


OLLAMA MODEL

In [2]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM

In [3]:
# Model name handed to OllamaLLM(model=...) in the cells below.
OLLAMA_MODEL = "llama3.2"
# NOTE(review): not referenced by any cell visible in this notebook, and
# "vectore" looks like a typo for "vector" — confirm before renaming or removing.
COLLECTION_NAME = "ollama_vectore_test"

TEST OLLAMA MODEL WITH GRAPHRAG v1

In [4]:
import networkx as nx

def create_document_graph(documents):
    """Build an undirected chain graph with one node per document.

    Args:
        documents: any iterable of documents (lists and generators both work).

    Returns:
        networkx.Graph whose nodes are the integer positions 0..n-1, each
        carrying the document in its "content" attribute, with an edge between
        every pair of consecutive positions.
    """
    # Materialize first: the edge range below needs a length, which lazy
    # iterables such as generators do not provide.
    documents = list(documents)

    G = nx.Graph()
    for i, doc in enumerate(documents):
        G.add_node(i, content=doc)

    # Link each document to its successor in input order.
    G.add_edges_from((i, i + 1) for i in range(len(documents) - 1))

    return G

def retrieve_relevant_documents(graph, query):
    """Return the "content" attribute of every node, in node order.

    Note: `query` is accepted but not used — no relevance filtering happens
    here, every document in the graph is returned.
    """
    contents = []
    for _, attributes in graph.nodes(data=True):
        contents.append(attributes['content'])
    return contents

In [8]:
from IPython.display import display, Markdown
import networkx as nx

# Prompt template for summarization; "{document}" is filled with the full
# document text at invoke time.
template = """
You are a helpful assistant for text summarization. 
Only include information that is part of the document. 
Do not include your own opinion or analysis.

Document: 
"{document}"
Summary:
"""

prompt = ChatPromptTemplate.from_template(template)
model = OllamaLLM(model=OLLAMA_MODEL)
# Pipeline: fill the prompt, then pass it to the Ollama model.
chain = prompt | model

def create_document_graph(documents):
    """Build an undirected chain graph with one node per document.

    Args:
        documents: any iterable of documents (lists and generators both work).

    Returns:
        networkx.Graph whose nodes are the integer positions 0..n-1, each
        carrying the document in its "content" attribute, with an edge between
        every pair of consecutive positions.
    """
    # Materialize first: the edge range below needs a length, which lazy
    # iterables such as generators do not provide.
    documents = list(documents)

    G = nx.Graph()
    for i, doc in enumerate(documents):
        G.add_node(i, content=doc)

    # Link each document to its successor in input order.
    G.add_edges_from((i, i + 1) for i in range(len(documents) - 1))

    return G

def retrieve_relevant_documents(graph, query):
    """Return the "content" attribute of every node, in node order.

    Note: `query` is accepted but not used — no relevance filtering happens
    here, every document in the graph is returned.
    """
    contents = []
    for _, attributes in graph.nodes(data=True):
        contents.append(attributes['content'])
    return contents

def summarize_documents(documents, query):
    """Summarize each usable document with the module-level `chain`.

    The documents are first placed in a chain graph and read back through
    retrieve_relevant_documents(); `query` is only forwarded to that function.

    Args:
        documents: extracted documents, each a sequence of at least four items
            whose first item is the content (a list of pages, or a single value).
        query: forwarded to retrieve_relevant_documents().

    Returns:
        List of summaries, one per document that had non-empty text. Documents
        that are falsy, shorter than four items, have None content, or flatten
        to whitespace only are skipped.
    """
    graph = create_document_graph(documents)
    relevant_documents = retrieve_relevant_documents(graph, query)

    summaries = []
    for doc_data in relevant_documents:
        if not doc_data or len(doc_data) < 4:
            continue

        # Index instead of a strict 4-way unpack: the guard above only demands
        # "at least four" items, so longer records must not raise.
        content = doc_data[0]
        if content is None:
            # str(None) would otherwise be summarized as the text "None".
            continue

        if isinstance(content, list):
            # One line per page; a page that is itself a list is space-joined.
            full_text = "\n".join(
                " ".join(map(str, page)) if isinstance(page, list) else str(page)
                for page in content
            )
        else:
            full_text = str(content)

        if not full_text.strip():
            continue

        summaries.append(chain.invoke({"document": full_text}))

    return summaries

# Materialize the generator so the documents can be measured and re-iterated.
documents = list(load_documents())
# NOTE: retrieve_relevant_documents() ignores its query argument, so this text
# does not influence which documents get summarized.
query = "What is discussed in this document?" 

summaries = summarize_documents(documents, query)

# Render each summary as Markdown in the cell output.
for summary in summaries:
    display(Markdown(summary))

  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)


This appears to be a collection of data about Indonesia, including geographical information, economic statistics, and cultural symbols. The data is presented in a structured format, with each piece of information enclosed in a set of curly brackets ({}).

Here is a summary of the main points:

* Geographical Information:
	+ Location: Asia Tenggara
	+ Borders: Malaysia, Filipina, Australia
	+ Highest Mountain: Puncak Jaya (4.884 mdpl)
	+ Longest River: Sungai Kapuas (±1.143 km)
* Economic Statistics:
	+ GDP: ±1,3 triliun USD (perkiraan 2024)
	+ Sector of Economy: Pertanian, Pertambangan, Industri, Pariwisata
	+ Population: Not explicitly stated
* Cultural Symbols:
	+ National Symbol: Garuda Pancasila
	+ National Anthem: Indonesia Raya
	+ National Flag: Merah Putih
	+ National Motto: Bhinneka Tunggal Ika
	+ National Mascot: Not explicitly stated

Overall, this data provides a comprehensive overview of Indonesia's geography, economy, and cultural identity.

TEST OLLAMA MODEL WITH GRAPHRAG v2

In [7]:
from IPython.display import display, Markdown
import networkx as nx
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class GraphRAG:
    """Similarity graph over extracted documents, plus LLM summaries and insights.

    Call order used in this notebook: create_document_embeddings() ->
    build_document_graph() -> summarize_documents() -> generate_insights().
    """

    def __init__(self, documents, model, prompt_template):
        """Store the inputs and start with an empty graph.

        Args:
            documents: sequence of extracted documents; each is expected to be a
                sequence of at least four items whose first item is the content
                (a list of pages, or a single value).
            model: object exposing invoke(); also composed with
                `prompt_template` via the `|` operator.
            prompt_template: prompt accepting a "document" input.
        """
        self.documents = documents
        self.model = model
        self.prompt_template = prompt_template
        self.graph = nx.Graph()

    @staticmethod
    def _document_text(doc_data):
        """Flatten one extracted document into a single string ("" if unusable)."""
        if not doc_data or len(doc_data) < 4:
            return ""
        content = doc_data[0]
        if content is None:
            # str(None) would otherwise leak the text "None" downstream.
            return ""
        if isinstance(content, list):
            # One line per page; a page that is itself a list is space-joined.
            return "\n".join(
                " ".join(map(str, page)) if isinstance(page, list) else str(page)
                for page in content
            )
        return str(content)

    def create_document_embeddings(self):
        """Return a TF-IDF matrix with one row per entry of self.documents.

        Unusable documents contribute an empty string so row i always
        corresponds to self.documents[i]. The text is flattened the same way as
        in summarize_documents().
        """
        vectorizer = TfidfVectorizer()
        doc_texts = [self._document_text(doc) for doc in self.documents]
        embeddings = vectorizer.fit_transform(doc_texts)
        return embeddings

    def build_document_graph(self, embeddings, similarity_threshold=0.2):
        """Add one node per document and connect sufficiently similar pairs.

        Args:
            embeddings: per-document vectors, row i belonging to document i.
            similarity_threshold: an edge (i, j) is added only when the cosine
                similarity of rows i and j is strictly greater than this value;
                the similarity is stored as the edge's "weight".
        """
        similarity_matrix = cosine_similarity(embeddings)
        doc_count = len(self.documents)

        # Every document gets a node, even if it ends up with no edges.
        for i in range(doc_count):
            self.graph.add_node(i, text=self.documents[i])

        # Upper triangle only: the graph is undirected.
        for i in range(doc_count):
            for j in range(i + 1, doc_count):
                if similarity_matrix[i, j] > similarity_threshold:
                    self.graph.add_edge(i, j, weight=similarity_matrix[i, j])

    def summarize_documents(self):
        """Summarize every usable document and record the result on its node.

        Returns:
            List of summaries in document order. Documents that are falsy,
            shorter than four items, have None content, or flatten to
            whitespace only are skipped.
        """
        summaries = []

        chain = self.prompt_template | self.model

        for doc_index, doc_data in enumerate(self.documents):
            full_text = self._document_text(doc_data)
            if not full_text.strip():
                continue

            summary = chain.invoke({"document": full_text})
            summaries.append(summary)

            # Tolerate being called before build_document_graph(): create the
            # node on demand instead of raising KeyError.
            if doc_index not in self.graph:
                self.graph.add_node(doc_index, text=doc_data)
            self.graph.nodes[doc_index]['summary'] = summary

        return summaries

    def rank_documents(self):
        """Return (node_index, pagerank_score) pairs, highest score first."""
        pagerank = nx.pagerank(self.graph)
        return sorted(pagerank.items(), key=lambda item: item[1], reverse=True)

    def generate_insights(self, top_n=3):
        """Ask the model for insights on the summaries of the top-ranked nodes.

        Args:
            top_n: how many of the highest-ranked nodes to consider.

        Returns:
            List of model responses, one per considered node that has a
            non-empty summary. Nodes without a summary are skipped rather than
            sent to the model with an empty prompt.
        """
        insights = []
        ranked_docs = self.rank_documents()

        for idx, _rank_score in ranked_docs[:top_n]:
            node_data = self.graph.nodes[idx]
            summary = node_data.get('summary', '')
            if not summary:
                continue

            insight_prompt = f"""
            Based on the summary of highly rated documents, 
            provide important insights:
            
            Summary: {summary}
            """

            insight = self.model.invoke(insight_prompt)
            insights.append(insight)

        return insights

# Prompt template for summarization; "{document}" is filled with the full
# document text at invoke time.
template = """
You are a helpful assistant for text summarization. 
Only include information that is part of the document. 
Do not include your own opinion or analysis.

Document: 
"{document}"
Summary:
"""

prompt = ChatPromptTemplate.from_template(template)
model = OllamaLLM(model=OLLAMA_MODEL)

# Materialize the generator: GraphRAG calls len() on and indexes into the documents.
documents = list(load_documents())

graph_rag = GraphRAG(documents, model, prompt)

# The graph must be built before summarizing: summarize_documents() writes each
# summary onto the node created by build_document_graph().
embeddings = graph_rag.create_document_embeddings()
graph_rag.build_document_graph(embeddings)

summaries = graph_rag.summarize_documents()

for summary in summaries:
    display(Markdown(summary))

# Insights are generated from the summaries stored on the top-ranked nodes.
insights = graph_rag.generate_insights()
print("\n--- Insights ---")
for insight in insights:
    display(Markdown(insight))

  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)


The provided data appears to be a collection of information about Indonesia, including its geography, economy, politics, and culture. Here are some key points that can be extracted from the data:

**Geography**

* Location: Asia Tenggara (Southeastern Asia)
* Borders: Malaysia and Filipina in the north, Australia in the south
* Highest point: Puncak Jaya (4,884 m dpl)

**Economy**

* GDP: ±1.3 trillion USD (perkiraan 2024)
* Main sectors: Pertanian (Agriculture), Pertambangan (Mining), Industri (Industry), and Pariwisata (Tourism)
* Currency: Not specified

**Politics**

* Government type: Not specified
* National symbol: Garuda Pancasila (National Emblem)
* National anthem: Indonesia Raya (Lagu Kebangsaan)
* National flag: Merah Putih (Red-White)

**Culture**

* Symbol of national unity: Bhinneka Tunggal Ika
* National motto: Not specified

**Data Umum Negara Indonesia**

* A collection of general information about the Indonesian government and its institutions.

Overall, the data provides a comprehensive overview of Indonesia's geography, economy, politics, culture, and other relevant aspects.


--- Insights ---


Here are some important insights that can be extracted from the summary:

1. **Location and Borders**: Indonesia is situated in Southeastern Asia, sharing borders with Malaysia and the Philippines to the north, and Australia to the south.

2. **Geography's Highest Point**: The country has a significant geographical feature, Puncak Jaya, which stands at 4,884 meters (dpl) above sea level.

3. **Economic Sectors**: The primary sectors that drive Indonesia's economy are agriculture, mining, industry, and tourism.

4. **Political Structure**: Although the government type is not specified in the provided data, it implies a complex structure since other national symbols such as the Garuda Pancasila emblem and the Indonesian national anthem, "Indonesia Raya," are mentioned.

5. **Cultural Significance**: The symbol of national unity for Indonesia is Bhinneka Tunggal Ika, which represents diversity within a united nation.

6. **National Identity**: The country's national motto or any other defining phrases were not provided in the summary, indicating that their significance might be more nuanced and less explicitly expressed in the data given.

7. **Economic Value**: Indonesia's GDP is significant, with an estimated value of ±1.3 trillion USD in 2024, making it a substantial contributor to regional economic activities.

8. **Cultural Heritage**: The mention of "Merah Putih" as the national flag further emphasizes Indonesia's strong cultural identity and emphasis on its rich symbols of unity and pride.

These insights highlight key aspects of Indonesia's geography, economy, politics, culture, and other relevant information that are crucial for understanding this Southeast Asian nation.

TEST OLLAMA MODEL WITH RAG v3

In [9]:
from IPython.display import display, Markdown
from langchain_core.prompts import PromptTemplate
from langchain_core.messages import HumanMessage

class SimpleSummarizer:
    """Summarize extracted documents one at a time with a prompt and a model."""

    def __init__(self, documents, model, prompt_template):
        """Store the inputs.

        Args:
            documents: sequence of extracted documents; each is expected to be a
                sequence of at least four items whose first item is the content
                (a list of pages, or a single value).
            model: object exposing invoke(prompt).
            prompt_template: object exposing format(document=...).
        """
        self.documents = documents
        self.model = model
        self.prompt_template = prompt_template

    def prepare_text(self, doc_data):
        """Flatten one extracted document into stripped text.

        Returns:
            The stripped text, or None when the document is falsy, has fewer
            than four items, has None content, or flattens to an empty string.
            (Whitespace-only text yields "", which callers also treat as "skip".)
        """
        if not doc_data or len(doc_data) < 4:
            return None

        # Index instead of a strict 4-way unpack: the guard above only demands
        # "at least four" items, so longer records must not raise.
        content = doc_data[0]
        if content is None:
            # str(None) would otherwise be summarized as the text "None".
            return None

        if isinstance(content, list):
            # One line per page; a page that is itself a list is space-joined.
            full_text = "\n".join(
                " ".join(map(str, page)) if isinstance(page, list) else str(page)
                for page in content
            )
        else:
            full_text = str(content)

        return full_text.strip() if full_text else None

    def direct_summarization(self):
        """Summarize each usable document independently.

        A failure on one document is reported with print() and does not stop
        the remaining documents from being processed.
        """
        summaries = []

        for doc_data in self.documents:
            full_text = self.prepare_text(doc_data)
            if not full_text:
                continue

            prompt = self.prompt_template.format(document=full_text)

            try:
                summary = self.model.invoke(prompt)
                summaries.append(summary)
            except Exception as e:
                # Deliberate best-effort: keep going with the other documents.
                print(f"Error summarizing document: {e}")

        return summaries

    def simple_multi_document_summarization(self):
        """Summarize a larger set of documents.

        Currently delegates to direct_summarization(), so the result is the same.
        """
        return self.direct_summarization()

    def summarize_documents(self):
        """Pick the summarization method from the number of documents.

        Fewer than five documents use direct_summarization(); otherwise
        simple_multi_document_summarization() is used. Both paths currently
        produce the same result.
        """
        if len(self.documents) < 5:
            return self.direct_summarization()
        return self.simple_multi_document_summarization()

def create_summarizer(documents, model):
    """Build a SimpleSummarizer wired to the notebook's summarization prompt.

    Args:
        documents: extracted documents to hand to the summarizer.
        model: model object the summarizer will invoke.

    Returns:
        SimpleSummarizer using a PromptTemplate with a "{document}" placeholder.
    """
    summary_prompt = PromptTemplate.from_template("""
    You are a helpful assistant for text summarization. 
    Only include information that is part of the document. 
    Do not include your own opinion or analysis.

    Document: 
    "{document}"
    Summary:
    """)
    return SimpleSummarizer(documents, model, summary_prompt)

# NOTE(review): the blanket `except Exception` below turns any failure (loading,
# model construction, rendering) into a single printed line, so the cell never
# shows a traceback — confirm this is intended.
try:
    # Materialize the generator: SimpleSummarizer calls len() on the documents.
    documents = list(load_documents())
    
    model = OllamaLLM(model=OLLAMA_MODEL)
    
    summarizer = create_summarizer(documents, model)
    
    summaries = summarizer.summarize_documents()
    
    # Render each summary as Markdown in the cell output.
    for summary in summaries:
        display(Markdown(summary))

except Exception as e:
    print(f"Error in summarization process: {e}")

  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)


The provided text appears to be a JSON-formatted list of data about Indonesia. Here is a summary of the information:

**General Information**

* Indonesia is located in Asia Tenggara (Southeastern Asia)
* The country has a total area of approximately 1,904,569 square kilometers
* The population is around 273 million people

**Economy**

* The GDP is estimated to be around $1.3 trillion USD (perkiraan 2024)
* The sector economy utama is agriculture, mining, industry, and tourism
* The country has a relatively high GDP per capita of approximately $5,000 USD

**Geography**

* Indonesia is bordered by Malaysia, Papua New Guinea, East Timor, Australia, and the Pacific Ocean
* The highest mountain peak is Puncak Jaya (4,884 meters above sea level)
* The longest river is Sungai Kapuas (approximately 1.143 kilometers long)

**National Symbols**

* The national symbol is the Garuda Pancasila (a five-legged bird of prey)
* The national anthem is "Indonesia Raya"
* The national flag is the Merah Putih (Red-White) flag
* The national motto is "Bhinneka Tunggal Ika" (Unity in Diversity)

**Other Information**

* The country has a total of 17,504 islands
* The official language is Indonesian
* The capital city is Jakarta

Note that this summary only includes the most relevant and concise information from the provided JSON data.

TEST OLLAMA MODEL WITHOUT RAG

In [None]:
from IPython.display import display, Markdown

# Prompt template for summarization; "{document}" is filled with the full
# document text at invoke time.
template = """
You are a helpful assistant for text summarization. 
Only include information that is part of the document. 
Do not include your own opinion or analysis.

Document: 
"{document}"
Summary:
"""

prompt = ChatPromptTemplate.from_template(template)
model = OllamaLLM(model=OLLAMA_MODEL)
# Pipeline: fill the prompt, then pass it to the Ollama model.
chain = prompt | model

def summarize_documents(documents):
    """Summarize each usable document with the module-level `chain`.

    Args:
        documents: iterable of extracted documents, each a sequence of at least
            four items whose first item is the content (a list of pages, or a
            single value).

    Returns:
        List of summaries, one per document that had non-empty text. Documents
        that are falsy, shorter than four items, have None content, or flatten
        to whitespace only are skipped.
    """
    summaries = []
    for doc_data in documents:
        if not doc_data or len(doc_data) < 4:
            continue

        # Index instead of a strict 4-way unpack: the guard above only demands
        # "at least four" items, so longer records must not raise.
        content = doc_data[0]
        if content is None:
            # str(None) would otherwise be summarized as the text "None".
            continue

        if isinstance(content, list):
            # One line per page; a page that is itself a list is space-joined.
            full_text = "\n".join(
                " ".join(map(str, page)) if isinstance(page, list) else str(page)
                for page in content
            )
        else:
            full_text = str(content)

        if not full_text.strip():
            continue

        summaries.append(chain.invoke({"document": full_text}))

    return summaries

# Materialize the generator so `documents` stays reusable after this cell.
documents = list(load_documents())
summaries = summarize_documents(documents)

# Render each summary as Markdown in the cell output.
for summary in summaries:
    display(Markdown(summary))

TEST OPENAI MODEL

In [None]:
from IPython.display import display, Markdown
from dotenv import load_dotenv
import os
import openai

# Called before the client is created.
# NOTE(review): presumably this is what supplies the OpenAI API key from a
# local .env file — confirm.
load_dotenv()

openai_client = openai.OpenAI()  # Initialize a new client

# Chat model requested by summarize_documents() below.
MODEL_NAME = "gpt-3.5-turbo"

def summarize_documents(documents):
    """Summarize each usable document through the OpenAI chat completions API.

    Args:
        documents: iterable of extracted documents, each a sequence of at least
            four items whose first item is the content (a list of pages, or a
            single value).

    Returns:
        List of summaries (the message content of the first choice of each
        response), one per document that had non-empty text. Skipped documents
        are reported in the cell output with their index.
    """
    summaries = []
    for doc_index, doc_data in enumerate(documents):
        if not doc_data or len(doc_data) < 4:
            display(Markdown(f"**Skipping document {doc_index} due to incomplete extraction.**"))
            continue

        # Index instead of a strict 4-way unpack: the guard above only demands
        # "at least four" items, so longer records must not raise.
        content = doc_data[0]

        if content is None:
            # str(None) would otherwise be sent to the API as the text "None".
            full_text = ""
        elif isinstance(content, list):
            # One line per page; a page that is itself a list is space-joined.
            full_text = "\n".join(
                " ".join(map(str, page)) if isinstance(page, list) else str(page)
                for page in content
            )
        else:
            full_text = str(content)

        if not full_text.strip():
            display(Markdown(f"**Skipping document {doc_index} due to empty content.**"))
            continue

        response = openai_client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": "You are a helpful assistant for text summarization. Only include information that is part of the document. Do not include your own opinion or analysis."},
                {"role": "user", "content": full_text}
            ]
        )

        summary = response.choices[0].message.content
        summaries.append(summary)

    return summaries

# Materialize the generator, as every other cell does: a bare generator is
# exhausted after one pass, which would leave `documents` empty for any later
# cell or re-run that reuses it.
documents = list(load_documents())
summaries = summarize_documents(documents)

# Render each summary as Markdown in the cell output.
for summary in summaries:
    display(Markdown(summary))
