PROCESS DOCUMENT

In [2]:
import os
import shutil
from utils.document_processor import DocumentProcessor
from chromadb import Client
from sentence_transformers import SentenceTransformer

# Directory that main(reset=True) deletes from disk.
# NOTE(review): Client() below is constructed without a path, so it is not
# obviously tied to this directory -- confirm whether on-disk persistence at
# CHROMA_PATH is actually intended.
CHROMA_PATH = "chroma"
# Directory scanned by load_documents() for input files.
DATA_PATH = "./data"
# Filename suffixes accepted by load_documents() (matched on the lower-cased name).
VALID_EXTENSIONS = ('.pdf', '.docx', '.txt')

# Shared objects used by the functions in this cell and by later cells.
docs = DocumentProcessor()
chroma_client = Client()
collection = chroma_client.get_or_create_collection(name="document_vectors")
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

def load_documents():
    """Yield the processed form of every supported file in ``DATA_PATH``.

    Files are visited in sorted filename order so that positions assigned
    downstream with ``enumerate`` (for example the ``doc{index}`` part of the
    chunk ids built by ``split_documents``) are stable between runs;
    ``os.listdir`` alone returns entries in arbitrary order.

    Yields:
        Whatever ``docs.process_document(raw_bytes, filename)`` returns for
        each regular file whose lower-cased name ends with one of
        ``VALID_EXTENSIONS``.
    """
    for filename in sorted(os.listdir(DATA_PATH)):
        if not filename.lower().endswith(VALID_EXTENSIONS):
            continue

        filepath = os.path.join(DATA_PATH, filename)
        # A directory whose name happens to end in ".pdf" etc. cannot be opened.
        if not os.path.isfile(filepath):
            continue

        # Read fully and close the handle before yielding, so the file is not
        # held open while the consumer is busy with the previous document.
        with open(filepath, "rb") as f:
            raw_bytes = f.read()

        yield docs.process_document(raw_bytes, filename)

def split_documents(extracted_docs, chunk_size=500):
    """Split processed documents into fixed-size text chunks.

    Args:
        extracted_docs: Iterable of sequences whose first element is the
            document content: a list of pages, a single string, or ``None``.
            Remaining elements are ignored.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Yields:
        ``(chunk_id, chunk_text, doc_index, page_number)`` tuples, where
        ``chunk_id`` is ``"doc{d}_page{p}_chunk{k}"``. Chunks that are empty
        after stripping whitespace are skipped, but ``k`` still reflects the
        chunk's position within the page.

    Raises:
        ValueError: If ``chunk_size`` is not positive (raised when iteration
            starts, because this is a generator).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    for doc_index, doc in enumerate(extracted_docs):
        if not doc:
            continue

        content = doc[0]
        # A bare string is one page; enumerating it directly would treat every
        # character as its own page.
        pages = [content] if isinstance(content, str) else (content or [])

        for page_number, page in enumerate(pages):
            # Mirror the flattening used by the summarizers in this notebook:
            # a page may itself be a list of fragments.
            if page is None:
                page_content = ""
            elif isinstance(page, (list, tuple)):
                page_content = " ".join(map(str, page))
            else:
                page_content = str(page)

            for start in range(0, len(page_content), chunk_size):
                chunk = page_content[start:start + chunk_size].strip()
                if chunk:
                    chunk_id = f"doc{doc_index}_page{page_number}_chunk{start // chunk_size}"
                    yield chunk_id, chunk, doc_index, page_number

def add_to_chroma(chunks):
    """Embed new chunks and add them to the module-level ``collection``.

    Args:
        chunks: Iterable of ``(chunk_id, chunk_text, doc_index, page_number)``
            tuples, as produced by ``split_documents``.

    Chunks whose id is already present in the collection, or that repeat an
    id seen earlier in the same call, are skipped. The chunk text is stored
    alongside its embedding so that a later query can return the text itself
    rather than only ids and metadata.
    """
    seen_ids = set(collection.get(include=[]).get("ids", []))

    ids, texts, metadatas = [], [], []
    for cid, text, doc_idx, page in chunks:
        if cid in seen_ids:
            continue
        seen_ids.add(cid)
        ids.append(cid)
        texts.append(text)
        metadatas.append({"doc_index": doc_idx, "page": page})

    if not ids:
        return

    # One batched encode call instead of one model invocation per chunk.
    embeddings = [vector.tolist() for vector in embedding_model.encode(texts)]

    collection.add(ids=ids, embeddings=embeddings, metadatas=metadatas, documents=texts)

def main(reset=False):
    """Load, chunk, embed and index every document under ``DATA_PATH``.

    Args:
        reset: When true, remove the ``CHROMA_PATH`` directory if it exists
            and also delete every entry currently held by the module-level
            ``collection``. Removing the directory alone does not empty the
            live collection, so previously indexed chunks would otherwise
            still be treated as existing and skipped by ``add_to_chroma``.
    """
    if reset:
        if os.path.exists(CHROMA_PATH):
            shutil.rmtree(CHROMA_PATH)

        stale_ids = collection.get(include=[]).get("ids", [])
        if stale_ids:
            collection.delete(ids=stale_ids)

    add_to_chroma(split_documents(load_documents()))


OLLAMA MODEL

In [3]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM

In [4]:
# Model name passed to OllamaLLM(model=...) in the cells below.
OLLAMA_MODEL = "llama3.2"
# NOTE(review): not referenced anywhere else in the visible notebook, and
# "vectore" looks like a typo for "vector" -- confirm before relying on it.
COLLECTION_NAME = "ollama_vectore_test"

TEST OLLAMA MODEL WITH GRAPHRAG v1

In [5]:
import networkx as nx

def create_document_graph(documents):
    """Build an undirected chain graph with one node per document.

    Node ``i`` stores ``documents[i]`` under the ``content`` attribute, and
    each node is linked to the node that follows it in the input order.
    """
    chain_graph = nx.Graph()

    for position, document in enumerate(documents):
        chain_graph.add_node(position, content=document)
        successor = position + 1
        # Every document except the last one is connected to its successor.
        if successor <= len(documents) - 1:
            chain_graph.add_edge(position, successor)

    return chain_graph

def retrieve_relevant_documents(graph, query):
    """Return the ``content`` attribute of every node in ``graph``.

    ``query`` is accepted but not consulted: no filtering or ranking is
    performed, so every stored document is returned in node order.
    """
    contents = []
    for node_id in graph.nodes:
        contents.append(graph.nodes[node_id]['content'])
    return contents

In [6]:
from IPython.display import display, Markdown
import networkx as nx

# Template for the summarization prompt; {document} is filled in per document.
template = """
You are a helpful assistant for text summarization. 
Only include information that is part of the document. 
Do not include your own opinion or analysis.

Document: 
"{document}"
Summary:
"""

# Compose prompt and model into one runnable; summarize_documents() below
# calls chain.invoke({"document": ...}).
prompt = ChatPromptTemplate.from_template(template)
model = OllamaLLM(model=OLLAMA_MODEL)
chain = prompt | model

def create_document_graph(documents):
    """Create an undirected graph that links each document to the next one.

    Each node is keyed by the document's position and carries the document
    itself in its ``content`` attribute.
    """
    doc_graph = nx.Graph()

    for idx, item in enumerate(documents):
        doc_graph.add_node(idx, content=item)

        has_next = idx < len(documents) - 1
        if has_next:
            doc_graph.add_edge(idx, idx + 1)

    return doc_graph

def retrieve_relevant_documents(graph, query):
    """Collect the ``content`` stored on each node of ``graph``, in node order.

    The ``query`` argument is currently ignored, so all documents are returned.
    """
    node_view = graph.nodes
    return list(map(lambda node_id: node_view[node_id]['content'], node_view))

def summarize_documents(documents, query):
    """Summarize each processed document with the module-level ``chain``.

    Args:
        documents: Sequence of processed documents; each is expected to be a
            sequence of at least four items whose first item is the content
            (a list of pages, or any other object that is stringified).
        query: Forwarded to ``retrieve_relevant_documents``.

    Returns:
        A list with one ``chain.invoke`` result per document that has
        non-blank text. Falsy entries, entries with fewer than four items and
        documents whose text is blank are skipped.
    """
    graph = create_document_graph(documents)
    relevant_documents = retrieve_relevant_documents(graph, query)

    summaries = []
    for doc_data in relevant_documents:
        if not doc_data or len(doc_data) < 4:
            continue

        # Index instead of 4-way unpacking: the guard above admits sequences
        # longer than four items, which a strict unpack would reject.
        content = doc_data[0]
        if isinstance(content, list):
            full_text = "\n".join(
                " ".join(map(str, page)) if isinstance(page, list) else str(page)
                for page in content
            )
        else:
            full_text = str(content)

        if not full_text.strip():
            continue

        summaries.append(chain.invoke({"document": full_text}))

    return summaries

# load_documents() is a generator; materialize it because
# create_document_graph() calls len() on its argument.
documents = list(load_documents())
# Forwarded to retrieve_relevant_documents(), which does not use it.
query = "What is discussed in this document?" 

summaries = summarize_documents(documents, query)

# Render each summary as Markdown in the cell output.
for summary in summaries:
    display(Markdown(summary))

  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)


This is a summary of a list of interesting facts about Canada:

* Canada has the longest running ballet company in North America - the Royal Winnipeg Ballet
* Canada is home to many cultural institutions and has a welcoming spirit
* It's the number 1 travel destination in the world, according to Travel + Leisure magazine (2017)
* Canada is the oldest ballet company in Canada, located in Manitoba
* The country has unspoiled landscapes, dynamic cities, and vibrant culture

Note: This summary only includes a few of the many interesting facts about Canada mentioned in the original text.

The provided text appears to be a summary or an extract from a Wikipedia article about Indonesia. The content includes various sections such as:

* Geography and geografi
* Economy and sector ekonomi utama
* Demographics and data umum negara indonesia
* Symbols of national identity, including the symbol nasional, lagu kebangsaan (national anthem), bendera (flag), lambang negara (state emblem), and semboyan (motto)
* National holidays and celebrations

There is no apparent logical flow or order to the content, and it seems to be a collection of related information about Indonesia rather than a cohesive narrative. The text also includes various formatting elements such as bold font, which suggests that it may be part of a larger document or webpage.

Without more context, it is difficult to provide a specific summary or interpretation of this content. However, the provided text appears to be a factual representation of information about Indonesia, likely from a reliable source such as Wikipedia.

TEST OLLAMA MODEL WITH GRAPHRAG v2

In [7]:
from IPython.display import display, Markdown
import networkx as nx
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class GraphRAG:
    """Similarity graph over processed documents, with LLM summaries and insights.

    Typical use: ``create_document_embeddings`` -> ``build_document_graph`` ->
    ``summarize_documents`` -> ``generate_insights``.

    Args:
        documents: Sequence of processed documents; each is a sequence whose
            first item is the content (a list of pages or another object).
        model: Object exposing ``invoke(prompt)``; also combined with
            ``prompt_template`` through the ``|`` operator.
        prompt_template: Prompt object that accepts a ``document`` variable.
    """

    def __init__(self, documents, model, prompt_template):
        self.documents = documents
        self.model = model
        self.prompt_template = prompt_template
        self.graph = nx.Graph()

    @staticmethod
    def _document_text(doc_data):
        """Flatten one processed document into a single string.

        List content is joined page by page with newlines; a page that is
        itself a list is joined with spaces. Any other content is stringified.
        """
        content = doc_data[0]
        if isinstance(content, list):
            return "\n".join(
                " ".join(map(str, page)) if isinstance(page, list) else str(page)
                for page in content
            )
        return str(content)

    def create_document_embeddings(self):
        """Return the TF-IDF matrix of all documents (one row per document)."""
        vectorizer = TfidfVectorizer()
        # Use the same flattening as summarize_documents so the similarity
        # graph is built from the text that is actually summarized.
        doc_texts = [self._document_text(doc) for doc in self.documents]
        return vectorizer.fit_transform(doc_texts)

    def build_document_graph(self, embeddings, similarity_threshold=0.2):
        """Add one node per document and connect sufficiently similar pairs.

        Args:
            embeddings: Matrix accepted by ``cosine_similarity`` with one row
                per document, in the same order as ``self.documents``.
            similarity_threshold: Pairs whose cosine similarity is strictly
                greater than this value get an edge weighted by the similarity.
        """
        similarity_matrix = cosine_similarity(embeddings)
        doc_count = len(self.documents)

        for i in range(doc_count):
            self.graph.add_node(i, text=self.documents[i])

        for i in range(doc_count):
            for j in range(i + 1, doc_count):
                if similarity_matrix[i, j] > similarity_threshold:
                    self.graph.add_edge(i, j, weight=similarity_matrix[i, j])

    def summarize_documents(self):
        """Summarize every usable document and store each summary on its node.

        Returns:
            List of summaries, one per document that has at least four items
            and non-blank text; other documents are skipped.
        """
        summaries = []

        chain = self.prompt_template | self.model

        for doc_index, doc_data in enumerate(self.documents):
            if not doc_data or len(doc_data) < 4:
                continue

            full_text = self._document_text(doc_data)
            if not full_text.strip():
                continue

            summary = chain.invoke({"document": full_text})
            summaries.append(summary)

            # Guard against summarizing before build_document_graph() has run,
            # in which case the node would not exist yet.
            if doc_index not in self.graph:
                self.graph.add_node(doc_index, text=doc_data)
            self.graph.nodes[doc_index]['summary'] = summary

        return summaries

    def rank_documents(self):
        """Return ``(node_index, pagerank_score)`` pairs, highest score first."""
        pagerank = nx.pagerank(self.graph)
        return sorted(pagerank.items(), key=lambda item: item[1], reverse=True)

    def generate_insights(self, top_k=3):
        """Ask the model for insights on the highest-ranked documents.

        Args:
            top_k: Number of top-ranked documents to consider (default 3).

        Returns:
            One model response per considered document that has a stored
            summary. Documents without a summary are skipped rather than
            prompting the model with empty input.
        """
        insights = []
        ranked_docs = self.rank_documents()

        for idx, rank_score in ranked_docs[:top_k]:
            node_data = self.graph.nodes[idx]
            summary = node_data.get('summary', '')
            if not summary:
                continue

            insight_prompt = f"""
            Based on the summary of highly rated documents, 
            provide important insights:
            
            Summary: {summary}
            """

            insight = self.model.invoke(insight_prompt)
            insights.append(insight)

        return insights

# Summarization prompt; {document} is filled in per document.
template = """
You are a helpful assistant for text summarization. 
Only include information that is part of the document. 
Do not include your own opinion or analysis.

Document: 
"{document}"
Summary:
"""

prompt = ChatPromptTemplate.from_template(template)
model = OllamaLLM(model=OLLAMA_MODEL)

# Materialize the generator: GraphRAG indexes and iterates the documents
# more than once.
documents = list(load_documents())

graph_rag = GraphRAG(documents, model, prompt)

# The graph must be built before summarizing, because summarize_documents()
# writes each summary onto an existing graph node.
embeddings = graph_rag.create_document_embeddings()
graph_rag.build_document_graph(embeddings)

summaries = graph_rag.summarize_documents()

for summary in summaries:
    display(Markdown(summary))

# Insights are generated from the summaries stored on the top-ranked nodes.
insights = graph_rag.generate_insights()
print("\n--- Insights ---")
for insight in insights:
    display(Markdown(insight))

  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)


The provided text appears to be a summary or an excerpt from an article about Canada. The content includes various interesting facts and highlights about the country, such as:

1. Canada's long history of ballet, with the Royal Winnipeg Ballet being one of the oldest in North America.
2. Canada being ranked as the number 1 travel destination in the world by Travel + Leisure magazine (2017).
3. The country's diverse landscapes, dynamic cities, and welcoming culture.
4. Canada's notable institutions, including its many museums, galleries, and cultural attractions.

Overall, the text suggests that Canada is a fascinating country with a rich history, vibrant culture, and plenty to offer visitors and residents alike.

The provided text appears to be a summary or a list of information about Indonesia. The content is not well-organized and lacks a clear structure, but it can be summarized as follows:

**Geography**: Indonesia is located in Asia Tenggara (Southeast Asia) and borders Malaysia and Australia to the north and south, respectively.

**Climate**: Indonesia has a tropical climate with temperatures ranging from 20°C to 30°C throughout the year.

**Economy**: Indonesia's economy is driven by agriculture, industry, and services. The country is one of the largest producers of palm oil, coffee, and cocoa in the world. The government aims to develop its tourism sector as well.

**Government**: Indonesia is a democratic republic with a president as head of state and government. The government has three branches: executive, legislative, and judicial.

**Culture**: Indonesian culture is influenced by its rich history, which includes Hinduism, Buddhism, Islam, Christianity, and traditional animist practices. The country celebrates many festivals throughout the year, including Independence Day on August 17th.

**National symbols**: Indonesia's national symbols include the Merah Putih (Red-White) flag, the Garuda Pancasila emblem, and the song "Indonesia Raya".

Overall, the text provides a brief overview of various aspects of Indonesia, but it could be improved with more organization and clarity.


--- Insights ---


Based on this summary, here are some important insights about Canada:

1. **Rich Cultural Heritage**: Canada has a long history of ballet, indicating its strong cultural roots and appreciation for the arts.

2. **Tourism and Recognition**: Being ranked as the number 1 travel destination in the world by Travel + Leisure magazine (2017) highlights Canada's unique attractions and appeal to tourists.

3. **Diverse Landscapes and Cities**: The country boasts diverse landscapes, dynamic cities, and a welcoming culture, suggesting a wide range of experiences for visitors and residents alike.

4. **Cultural Institutions**: The presence of numerous museums, galleries, and cultural attractions underscores Canada's commitment to its cultural heritage and its status as a hub for artistic expression.

5. **National Pride and Reputation**: These insights suggest that Canada is not only a beautiful country but also one with a strong national identity and reputation for excellence in various aspects of culture and tourism.

Based on the summary of highly rated documents about Indonesia, here are some important insights:

1. **Strategic Location**: Indonesia is strategically located in Southeast Asia, bordering Malaysia to the north and Australia to the south, making it an ideal location for trade and cultural exchange.

2. **Diverse Economy**: Indonesia's economy is driven by a diverse range of sectors, including agriculture (palm oil, coffee, cocoa), industry, and services, indicating its resilience and adaptability in the face of economic challenges.

3. **Tourism Potential**: With its rich culture and natural beauty, Indonesia has significant potential for tourism development, which could contribute to the country's economic growth and create jobs.

4. **Democratic System**: Indonesia is a democratic republic with a president as head of state and government, indicating a commitment to democratic principles and the rule of law.

5. **Cultural Diversity**: Indonesian culture is shaped by its rich history, which reflects the influence of various religions (Hinduism, Buddhism, Islam, Christianity) and traditional animist practices, making it one of the most culturally diverse countries in the world.

6. **National Identity**: Indonesia's national symbols, such as the Merah Putih flag, Garuda Pancasila emblem, and song "Indonesia Raya", reflect the country's strong sense of national identity and pride in its history and culture.

7. **Climate and Geography**: Indonesia's tropical climate and geography make it vulnerable to natural disasters, such as earthquakes and tsunamis, which can have significant impacts on the country's economy and population.

Overall, these insights suggest that Indonesia is a country with significant economic, cultural, and strategic importance in Southeast Asia, with a rich history and diverse culture that makes it an attractive destination for tourists and businesses alike.

TEST OLLAMA MODEL WITH RAG v3

In [8]:
from IPython.display import display, Markdown
from langchain_core.prompts import PromptTemplate
from langchain_core.messages import HumanMessage

class SimpleSummarizer:
    """Summarize processed documents one at a time with a prompt and a model.

    Args:
        documents: Sized collection of processed documents; each is a
            sequence of at least four items whose first item is the content.
        model: Object exposing ``invoke(prompt)``.
        prompt_template: Object exposing ``format(document=...)``.
    """

    # Document count at which summarize_documents() switches strategy.
    MULTI_DOC_THRESHOLD = 5

    def __init__(self, documents, model, prompt_template):
        self.documents = documents
        self.model = model
        self.prompt_template = prompt_template

    def prepare_text(self, doc_data):
        """Prepare document text for summarization.

        Returns:
            The flattened, stripped text, or ``None`` when ``doc_data`` is
            falsy, has fewer than four items, or contains only whitespace.
        """
        if not doc_data or len(doc_data) < 4:
            return None

        # Index instead of 4-way unpacking: the guard above admits sequences
        # longer than four items, which a strict unpack would reject.
        content = doc_data[0]
        if isinstance(content, list):
            full_text = "\n".join(
                " ".join(map(str, page)) if isinstance(page, list) else str(page)
                for page in content
            )
        else:
            full_text = str(content)

        # Normalize whitespace-only text to None so there is one "no text" value.
        return full_text.strip() or None

    def direct_summarization(self):
        """Summarize each document independently.

        A failure on one document is reported with ``print`` and does not
        stop the remaining documents from being processed.
        """
        summaries = []

        for doc_data in self.documents:
            full_text = self.prepare_text(doc_data)
            if not full_text:
                continue

            prompt = self.prompt_template.format(document=full_text)

            try:
                summary = self.model.invoke(prompt)
                summaries.append(summary)
            except Exception as e:
                print(f"Error summarizing document: {e}")

        return summaries

    def simple_multi_document_summarization(self):
        """Summarization for more documents.

        Currently delegates to ``direct_summarization``; it exists as a
        separate hook for a different strategy.
        """
        return self.direct_summarization()

    def summarize_documents(self):
        """Select summarization method based on number of documents."""
        if len(self.documents) < self.MULTI_DOC_THRESHOLD:
            return self.direct_summarization()
        return self.simple_multi_document_summarization()

def create_summarizer(documents, model):
    """Build a ``SimpleSummarizer`` wired to the standard summarization prompt.

    The prompt text is wrapped in a ``PromptTemplate`` with a single
    ``document`` placeholder.
    """
    summary_instructions = """
    You are a helpful assistant for text summarization. 
    Only include information that is part of the document. 
    Do not include your own opinion or analysis.

    Document: 
    "{document}"
    Summary:
    """

    return SimpleSummarizer(
        documents,
        model,
        PromptTemplate.from_template(summary_instructions),
    )

# End-to-end run: load documents, build the summarizer, render the summaries.
try:
    # Materialize the generator; SimpleSummarizer.summarize_documents()
    # calls len() on the documents.
    documents = list(load_documents())
    
    model = OllamaLLM(model=OLLAMA_MODEL)
    
    summarizer = create_summarizer(documents, model)
    
    summaries = summarizer.summarize_documents()
    
    for summary in summaries:
        display(Markdown(summary))

# Any failure above is reduced to a single printed message; no traceback is shown.
except Exception as e:
    print(f"Error in summarization process: {e}")

  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)


The text appears to be a summary of Canadian tourism attractions and facts. Here is a breakdown of the main points:

1. **Ballet**: Canada is home to the longest running ballet company in North America, the Royal Winnipeg Ballet.
2. **Travel**: Canada is the number 1 travel destination in the world, according to Travel + Leisure magazine (2017).
3. **Nature and culture**: Canada offers unspoiled landscapes, dynamic cities, cultural institutions, and a welcoming spirit.
4. **Ballet companies**: The Royal Winnipeg Ballet is the oldest ballet company in Canada.
5. **Travel magazines**: Lonely Planet singled out Canada for its unique attractions.

Overall, the text highlights Canada's rich cultural heritage, stunning natural beauty, and warm hospitality, making it an attractive destination for tourists.

The text appears to be a collection of data and information about Indonesia, including its geography, economy, government, culture, and national symbols. The format is a list of key-value pairs, where each pair contains a brief description or fact about Indonesia.

Here are some summaries of the main categories:

**Geography**

* Location: Asia Tenggara (Southeast Asia)
* Borders: Malaysia to the north, Australia to the south
* Highest point: Puncak Jaya (4,884 m)

**Economy**

* GDP (perkiraan 2024): ±1,3 triliun USD (estimated $13 billion)
* Sector of economy: Pertanian, Pertambangan, Industri, Pariwisata (Agriculture, Mining, Industry, Tourism)
* Main industries: Agriculture, Mining, Manufacturing

**Government**

* Data Umum Negara Indonesia (Common Data State of Indonesia)
* Symbolic national: Garuda Pancasila
* Flag: Merah Putih (Red and White)

**Culture**

* National song: Indonesia Raya
* National anthem: Indonesia Raya
* National symbol: Bhinneka Tunggal Ika (Unity in Diversity)
* Coat of arms: Garuda Pancasila

Note that this summary is based on a limited sample of the text, and there may be additional information or categories not included here.

TEST OLLAMA MODEL WITHOUT RAG

In [9]:
from IPython.display import display, Markdown

# Template for the summarization prompt; {document} is filled in per document.
template = """
You are a helpful assistant for text summarization. 
Only include information that is part of the document. 
Do not include your own opinion or analysis.

Document: 
"{document}"
Summary:
"""

# Compose prompt and model into one runnable; summarize_documents() below
# calls chain.invoke({"document": ...}).
prompt = ChatPromptTemplate.from_template(template)
model = OllamaLLM(model=OLLAMA_MODEL)
chain = prompt | model

def summarize_documents(documents):
    """Summarize each processed document with the module-level ``chain``.

    Args:
        documents: Iterable of processed documents; each is expected to be a
            sequence of at least four items whose first item is the content
            (a list of pages, or any other object that is stringified).

    Returns:
        A list with one ``chain.invoke`` result per document that has
        non-blank text. Falsy entries, entries with fewer than four items and
        documents whose text is blank are skipped.
    """
    summaries = []
    for doc_data in documents:
        if not doc_data or len(doc_data) < 4:
            continue

        # Index instead of 4-way unpacking: the guard above admits sequences
        # longer than four items, which a strict unpack would reject.
        content = doc_data[0]
        if isinstance(content, list):
            full_text = "\n".join(
                " ".join(map(str, page)) if isinstance(page, list) else str(page)
                for page in content
            )
        else:
            full_text = str(content)

        if not full_text.strip():
            continue

        summaries.append(chain.invoke({"document": full_text}))

    return summaries

# Baseline run: summarize every loaded document directly, with no graph step.
documents = list(load_documents())
summaries = summarize_documents(documents)

# Render each summary as Markdown in the cell output.
for summary in summaries:
    display(Markdown(summary))

  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)


This text is a summary of various Canadian attractions and facts. It includes:

* A ballet company in Winnipeg
* The Royal Winnipeg Ballet being the oldest ballet company in Canada
* Canada being the number 1 travel destination in the world, according to Travel + Leisure magazine (2017)
* Various cultural institutions and welcoming spirit

The text does not provide a detailed analysis or explanation of these points, but rather presents them as facts about Canada.

Based on the provided data, here is a summary of Indonesia:

**Geography**

* Located in Asia Tenggara
* Borders Malaysia and Filipina to the north, Australia to the south
* Gunung Tertinggi (Puncak Jaya) at 4.884 mdpl
* Sungai Terpanjang (Sungai Kapuas) at ±1.143 km

**Climate**

* No information provided in the data

**Economy**

* GDP: ±1,3 triliun USD (perkiraan 2024)
* Sektor Ekonomi Utama: Pertanian, Pertambangan, Industri, Pariwisata
* PDB: ±1,3 triliun USD (perkiraan 2024)

**Demographics**

* No information provided in the data

**Culture**

* Simbol Nasional: Garuda Pancasila
* Lagu Kebangsaan: Indonesia Raya
* Bendera: Merah Putih
* Semboyan: Bhinneka Tunggal Ika

**Government**

* Data Umum Negara Indonesia (National General Information)
* Symbol of Nationality: Garuda Pancasila

Note that some information is missing from the data, and this summary only includes the provided data.

TEST OPENAI MODEL

In [10]:
from IPython.display import display, Markdown
from dotenv import load_dotenv
import os
import openai

# Load variables from a .env file into the process environment.
load_dotenv()

# NOTE(review): the output recorded for this cell is
# "AttributeError: module 'openai' has no attribute 'OpenAI'", which suggests
# the installed openai package does not provide this client class -- confirm
# the installed version and upgrade if needed.
openai_client = openai.OpenAI()  # Initialize the new client

# Model name passed to chat.completions.create() below.
MODEL_NAME = "gpt-3.5-turbo"

def summarize_documents(documents):
    """Summarize each processed document with the OpenAI chat completions API.

    Args:
        documents: Iterable of processed documents; each is expected to be a
            sequence of at least four items whose first item is the content
            (a list of pages, or any other object that is stringified).

    Returns:
        A list of summary strings, one per document that was sent to the
        model and produced text. Skipped documents are reported in the cell
        output with a Markdown notice.
    """
    system_prompt = "You are a helpful assistant for text summarization. Only include information that is part of the document. Do not include your own opinion or analysis."

    summaries = []
    for doc_index, doc_data in enumerate(documents):
        if not doc_data or len(doc_data) < 4:
            display(Markdown(f"**Skipping document {doc_index} due to incomplete extraction.**"))
            continue

        # Index instead of 4-way unpacking: the guard above admits sequences
        # longer than four items, which a strict unpack would reject.
        content = doc_data[0]
        if isinstance(content, list):
            full_text = "\n".join(
                " ".join(map(str, page)) if isinstance(page, list) else str(page)
                for page in content
            )
        else:
            full_text = str(content)

        if not full_text.strip():
            display(Markdown(f"**Skipping document {doc_index} due to empty content.**"))
            continue

        response = openai_client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": full_text}
            ]
        )

        summary = response.choices[0].message.content
        # The API types message.content as optional; do not hand None to the
        # Markdown renderer downstream.
        if summary is None:
            display(Markdown(f"**Skipping document {doc_index} because the model returned no text.**"))
            continue
        summaries.append(summary)

    return summaries

# Materialize the generator, as the other cells do: a bare generator is
# exhausted after one pass, so `documents` would be empty if reused later.
documents = list(load_documents())
summaries = summarize_documents(documents)

# Render each summary as Markdown in the cell output.
for summary in summaries:
    display(Markdown(summary))


AttributeError: module 'openai' has no attribute 'OpenAI'