PROCESS DOCUMENT

In [2]:
import os
import json
from IPython.display import Markdown
from phi.agent import Agent
from phi.model.ollama import Ollama
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_core.documents import Document
from utils.document_processor import DocumentProcessor  

# Inisialisasi path dan model
DATA_PATH = "./data"
INDEX_PATH = "faiss_index"
CHUNKED_DATA_PATH = "./chunked_data"  
METADATA_PATH = "./metadata"  
OLLAMA_MODEL = "llama3.2"

# Pastikan folder tersedia
os.makedirs(CHUNKED_DATA_PATH, exist_ok=True)
os.makedirs(METADATA_PATH, exist_ok=True)

# Inisialisasi model Ollama
llm = Ollama(id=OLLAMA_MODEL)

# Inisialisasi embedding model
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Inisialisasi document processor
docs = DocumentProcessor()

# Inisialisasi Agent dengan model Ollama
agent = Agent(model=llm, show_tool_calls=True, markdown=True)

# Langkah 1: Proses dokumen dengan agentic chunking
extracted_docs = []
for filename in os.listdir(DATA_PATH):
    valid_extensions = ('.pdf', '.docx', '.txt')
    if not filename.lower().endswith(valid_extensions):
        continue

    filepath = os.path.join(DATA_PATH, filename)

    try:
        # Gunakan DocumentProcessor untuk membaca file
        with open(filepath, "rb") as f:
            document = f.read()
            result = docs.process_document(document, filename)

        if result is None:
            print(f"Gagal memproses {filename}, melewati file ini.")
            continue

        plain_text = result[3]  # Pastikan indeks benar

        print(f"[INFO] {filename} - Panjang teks sebelum pemrosesan: {len(plain_text)}")

        # **Gunakan agent untuk melakukan chunking berdasarkan pemahaman konteks**
        response = agent.run(
            f"Split the following document into meaningful and well-structured segments based on its content. "
            f"Ensure each segment is logically coherent and can be understood independently:\n\n{plain_text}",
            max_tokens=8000
        )

        # Ambil hasil chunking dari agent
        if isinstance(response, str):
            structured_text = response
        elif isinstance(response, dict):
            structured_text = response.get("text", "")
        else:
            structured_text = response.content if hasattr(response, "content") else str(response)

        structured_text = structured_text.strip()

        # Pisahkan chunk berdasarkan pemisah yang diberikan oleh agent (misalnya, menggunakan "\n\n###" sebagai pemisah)
        chunked_texts = structured_text.split("\n\n###")  

        # Simpan hasil chunking dalam list
        chunk_data = [{"chunk_id": i+1, "text": chunk.strip()} for i, chunk in enumerate(chunked_texts) if chunk.strip()]
        metadata = {"filename": filename, "total_chunks": len(chunk_data)}

        extracted_docs.extend([Document(page_content=chunk["text"], metadata={"chunk_id": chunk["chunk_id"], **metadata}) for chunk in chunk_data])

        # Simpan hasil chunking dalam file txt
        chunked_filepath = os.path.join(CHUNKED_DATA_PATH, f"chunked_{filename}.txt")
        with open(chunked_filepath, "w", encoding="utf-8") as chunked_file:
            for chunk in chunk_data:
                chunked_file.write(f"Chunk {chunk['chunk_id']}:\n")
                chunked_file.write(f"{chunk['text']}\n")
                chunked_file.write("\n---\n\n")  

        # Simpan metadata dalam file JSON
        metadata_filepath = os.path.join(METADATA_PATH, f"metadata_{filename}.json")
        with open(metadata_filepath, "w", encoding="utf-8") as metadata_file:
            json.dump(metadata, metadata_file, indent=4)

    except Exception as e:
        print(f"Error processing {filename}: {e}")

# Simpan ke FAISS secara lokal
vector_store = FAISS.from_documents(extracted_docs, embedding_model)
vector_store.save_local(INDEX_PATH)

print(f"Total chunks generated for {filename}: {len(chunk_data)}")
print("Proses chunking selesai. Hasilnya disimpan dalam folder 'chunked_data' dan metadata di 'metadata'.")


[INFO] Dokumen (1).pdf - Panjang teks sebelum pemrosesan: 3986
Total chunks generated for Dokumen (1).pdf: 1
Proses chunking selesai. Hasilnya disimpan dalam folder 'chunked_data' dan metadata di 'metadata'.


In [3]:
# Query dengan RAG
query = "Apa isi dokumen tentang topik X?"
query_embedding = embedding_model.embed_query(query)
retrieved_docs = vector_store.similarity_search_by_vector(query_embedding, k=3)

# Gabungkan dokumen hasil pencarian untuk input ke model
retrieved_text = "\n\n".join([doc.page_content for doc in retrieved_docs])

TEST OLLAMA MODEL

In [4]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM

In [5]:
OLLAMA_MODEL = "llama3.2"
COLLECTION_NAME = "ollama_vectore_test"

In [6]:
template = """
  Anda adalah asisten yang membantu dalam meringkas teks.  
  Hanya sertakan informasi yang ada dalam dokumen.  
  Jangan tambahkan opini atau analisis Anda sendiri.  

  Dokumen:  
  "{document}"  
  Ringkasan:  
"""

prompt = ChatPromptTemplate.from_template(template)

model = OllamaLLM(model=OLLAMA_MODEL)

chain = prompt | model

response = chain.invoke({"document": plain_text})
Markdown(response)


Berikut ringkasan dari dokumen tersebut:

- PT Pertamina (Persero) atau "PERTAMINA" didirikan sekitar tahun 1950-an melalui pendirian PT Eksploitasi Tambang Minyak Sumatera Utara.
- Pada 10 Desember 1957, nama perusahaan diubah menjadi PT Perusahaan Minyak Nasional (PERMINA).
- Pada 1 Juli 1961, PERMINA ditetapkan menjadi Perusahaan Negara dengan nama PN Pertambangan Minyak Nasional (Permina).
- Pada 20 Agustus 1968, PN Permina bergabung dengan PN Pertamin untuk membentuk PN Pertambangan Minyak dan Gas Bumi Negara (Pertamina).
- Pada tanggal 15 Desember 1971, nama PN Pertamina diubah menjadi Perusahaan Pertambangan Minyak dan Gas Bumi Negara.
- Pada tahun 2003, perusahaan ini berubah nama menjadi PT Pertamina (Persero) dan kemudian pada tahun 2017 menuntaskan akuisisi 72,65% saham perusahaan migas Prancis Maurel et Prom (M&P).
- Melalui kepemilikan saham mayoritas di M&P, PERTAMINA memiliki akses operasi di 12 negara yang tersebar di 4 benua.
- Pada tahun 2018, PT Pertamina Gas (Pertagas) bergabung dengan PT Perusahaan Gas Negara (PGN), memantapkan posisi PERTAMINA sebagai garda terdepan menjaga ketahanan energi nasional.
- Pada 2020-2021, PERTAMINA meluncurkan Roadmap pembentukan Holding Migas dengan pembentukan Subholding Gas, Upstream Subholding, Refinery and Petrochemical Subholding, Power & NRE Subholding, Commercial and Trading Subholding, dan Integrated Marine Logistics Subholding.

TEST OPENAI MODEL

In [7]:
from dotenv import load_dotenv
import os
import openai
load_dotenv()

openai.api_type = os.getenv("OPENAI_API_TYPE")
openai.api_base = os.getenv("OPENAI_API_BASE")
openai.api_version = os.getenv("OPENAI_API_VERSION")
openai.api_key = os.getenv("OPENAI_API_KEY")

model = "gpt-35-turbo"
response = openai.ChatCompletion.create(
  engine=model,
  messages=[
    {
      "role": "system", 
      "content": """
        You are a helpful assistant for text summarization.
        Only include information that is part of the document. 
        Do not include your own opinion or analysis.
      """
    },
    {
      "role": "user",
      "content": plain_text
    }
  ],
)

Markdown(response.choices[0].message.content)

PT Pertamina (Persero) is an energy company in Indonesia that was founded in the 1950s. Initially, it was named PT Eksploitasi Tambang Minyak Sumatera Utara and was assigned by the Indonesian government to manage oil fields in Sumatra. It changed its name to PT Perusahaan Minyak Nasional (PERMINA) in 1957 and became a state-owned enterprise (PN) on 1 July 1961, with the name PN Pertambangan Minyak Nasional (Permina). PN Permina later merged with PN Pertamin and became PN Pertambangan Minyak dan Gas Bumi Negara (Pertamina) in 1968. In 2003, Pertamina changed its name to PT Pertamina (Persero). Recently, Pertamina acquired a 72.65% stake in Maurel et Prom (M&P) in 2017 and completed the acquisition of a 51% stake in Pertamina Gas (Pertagas) in 2018, which further solidified its position as a leading energy company in Indonesia. In 2020, Pertamina formed six sub-holdings to focus on specific areas of its business, including upstream, gas, refinery and petrochemical, power and NRE, commercial and trading, and integrated marine logistics. The company's vision is to become a world-class national energy company.