PROCESS DOCUMENT

In [None]:
import os
import json
import re
from IPython.display import Markdown
from phi.agent import Agent
from phi.model.ollama import Ollama
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_core.documents import Document
from utils.document_processor import DocumentProcessor  

# Paths and model name used throughout this cell
DATA_PATH = "./data"  # folder scanned for input documents
INDEX_PATH = "faiss_index"  # where the FAISS index is saved
CHUNKED_DATA_PATH = "./chunked_data"  # one chunk dump (.txt) per input file
METADATA_PATH = "./metadata"  # one metadata JSON per input file
OLLAMA_MODEL = "llama3.2"

# Make sure the output folders exist before anything is written to them
os.makedirs(CHUNKED_DATA_PATH, exist_ok=True)
os.makedirs(METADATA_PATH, exist_ok=True)

# Ollama model wrapper handed to the agent below
llm = Ollama(id=OLLAMA_MODEL)

# Embedding model used to build the FAISS index and to embed queries
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Project-local document processor; its process_document() result is indexed
# at position 3 further down to obtain the plain text
docs = DocumentProcessor()

# Agent that is asked to segment each text window
agent = Agent(model=llm, show_tool_calls=True, markdown=True)

# Chunking parameters
CHUNK_SIZE = 1200  # Characters sent to the agent per call (raised for efficiency)
MIN_CHUNK_SIZE = 500  # Segments shorter than this are merged together
MAX_CHUNKS = 30  # Cap on agent calls per file and on chunks kept per file

# Accumulates one Document per chunk across all files; fed to FAISS at the end
extracted_docs = []

def clean_text(text):
    """Normalise extracted text to single-spaced ASCII.

    Every run of non-ASCII characters is replaced by a space, then every run
    of whitespace is collapsed to one space and the ends are trimmed.

    The non-ASCII replacement runs *before* the whitespace collapse: doing it
    afterwards re-introduces double spaces and leading/trailing blanks
    (e.g. ``"a é b"`` would become ``"a   b"``).

    Args:
        text: Raw text; ``None`` or an empty string yields ``""``.

    Returns:
        The cleaned string, without leading/trailing whitespace and without
        consecutive whitespace characters.
    """
    if not text:
        return ""
    ascii_only = re.sub(r'[^\x00-\x7F]+', ' ', text)
    return re.sub(r'\s+', ' ', ascii_only).strip()

def clean_agent_output(text):
    """Strip markdown decoration from agent output while keeping its content.

    Removed: ``###`` heading lines, ``***`` / ``---`` rule lines, inline
    ``--`` separators, ``*`` / ``**`` emphasis and bullet markers, and
    leading ``- `` bullet markers.

    Kept on purpose:
      * Hyphens inside words and numbers ("well-known", "2020-2021", "-5").
      * Line and paragraph breaks next to removed markers. The caller splits
        the result on blank lines, so markers must not swallow newlines.
      * Text that follows an inline ``***``; only lines consisting solely of
        asterisks are dropped as rules.

    Args:
        text: Raw agent output; ``None`` or an empty string yields ``""``.

    Returns:
        The cleaned text with at most one blank line between paragraphs and
        no leading/trailing whitespace.
    """
    if not text:
        return ""

    # Heading from "###" to end of line; \Z also catches a heading on the last line
    text = re.sub(r'\n?###[^\n]*(?:\n|\Z)', '\n', text)
    # Horizontal rule made only of asterisks (the rest of a content line is kept)
    text = re.sub(r'\n?[ \t]*\*{3,}[ \t]*(?:\n|\Z)', '\n', text)
    # Horizontal rule ---
    text = re.sub(r'\n?-{3,}\n?', '\n', text)
    # Inline "--" separator; only horizontal whitespace is consumed
    text = re.sub(r'[ \t]*-{2,}[ \t]*', ' ', text)
    # Emphasis and bullet asterisks; only horizontal whitespace is consumed
    text = re.sub(r'[ \t]*\*+[ \t]*', ' ', text)
    # "- " bullet marker at the start of a line (intra-word hyphens untouched)
    text = re.sub(r'^[ \t]*-[ \t]+', '', text, flags=re.MULTILINE)
    # Drop the blanks that marker removal left next to line breaks
    text = re.sub(r'[ \t]*\n[ \t]*', '\n', text)
    # Collapse runs of blank lines to a single paragraph break
    text = re.sub(r'\n{2,}', '\n\n', text).strip()
    return text

# File types handed to the document processor
VALID_EXTENSIONS = ('.pdf', '.docx', '.txt')


def _split_windows(text, size, limit):
    """Return up to `limit` consecutive slices of `text`, each at most `size` chars.

    A slice ends on the last space inside the window when there is one, so
    words are not cut in half; without a space the slice is cut at `size`.
    The slices are contiguous, so the sum of their lengths is the number of
    characters covered.
    """
    windows = []
    start = 0
    while start < len(text) and len(windows) < limit:
        end = min(start + size, len(text))
        if end < len(text):
            cut = text.rfind(" ", start + 1, end + 1)
            if cut > start:
                end = cut
        windows.append(text[start:end])
        start = end
    return windows


def _response_to_text(response):
    """Return the text carried by an agent response (str, dict or object); never None."""
    if isinstance(response, str):
        return response
    if isinstance(response, dict):
        return response.get("text") or ""
    if hasattr(response, "content"):
        content = response.content
        return "" if content is None else str(content)
    return str(response)


def _merge_short_chunks(segments, min_size, max_size):
    """Merge neighbouring segments shorter than `min_size` into shared chunks.

    Segments of at least `min_size` characters are kept as their own chunk.
    Shorter ones are buffered and joined with a space; the buffer is emitted
    before it would exceed `max_size`, so a long run of short segments no
    longer collapses into one oversized chunk. Empty segments are dropped.
    """
    merged = []
    buffer = ""
    for segment in segments:
        segment = segment.strip()
        if not segment:
            continue
        if len(segment) >= min_size:
            if buffer:
                merged.append(buffer)
                buffer = ""
            merged.append(segment)
            continue
        if buffer and len(buffer) + 1 + len(segment) > max_size:
            merged.append(buffer)
            buffer = ""
        buffer = f"{buffer} {segment}" if buffer else segment
    if buffer:
        merged.append(buffer)
    return merged


# Process every supported file in DATA_PATH: extract text, let the agent
# segment it window by window, merge short segments, then record the chunks
# in `extracted_docs` and on disk. Sorted so runs are reproducible.
for filename in sorted(os.listdir(DATA_PATH)):
    if not filename.lower().endswith(VALID_EXTENSIONS):
        continue

    filepath = os.path.join(DATA_PATH, filename)

    try:
        with open(filepath, "rb") as f:
            document = f.read()
        result = docs.process_document(document, filename)

        if not result or len(result) < 4:
            print(f"[WARNING] Gagal memproses {filename}, melewati file ini.")
            continue

        plain_text = clean_text(result[3])
        print(f"[INFO] {filename} - Panjang teks sebelum pemrosesan: {len(plain_text)}")

        if not plain_text:
            print(f"[WARNING] No text extracted from {filename}, skipping this file.")
            continue

        # Let the agent segment the text one window at a time
        windows = _split_windows(plain_text, CHUNK_SIZE, MAX_CHUNKS)
        covered = sum(len(window) for window in windows)
        if covered < len(plain_text):
            # MAX_CHUNKS bounds the number of agent calls; report what is skipped
            print(
                f"[WARNING] {filename}: only the first {covered} of {len(plain_text)} "
                f"characters were processed (MAX_CHUNKS={MAX_CHUNKS})."
            )

        segments = []
        for chunk_text in windows:
            response = agent.run(
                f"Split the following text into meaningful segments ensuring logical separation:\n{chunk_text}",
                max_tokens=8000
            )
            segments.append(clean_agent_output(_response_to_text(response)))

        structured_text = "\n\n".join(segments).strip()
        chunked_texts = structured_text.split("\n\n")

        # Merge segments that are too short to stand alone
        optimized_chunks = _merge_short_chunks(chunked_texts, MIN_CHUNK_SIZE, CHUNK_SIZE)
        if len(optimized_chunks) > MAX_CHUNKS:
            print(
                f"[WARNING] {filename}: keeping {MAX_CHUNKS} of "
                f"{len(optimized_chunks)} chunks (MAX_CHUNKS limit)."
            )

        # Empty chunks were already dropped, so the ids are contiguous from 1
        chunk_data = [
            {"chunk_id": i, "text": chunk}
            for i, chunk in enumerate(optimized_chunks[:MAX_CHUNKS], start=1)
        ]

        metadata = {
            "filename": filename,
            "total_chunks": len(chunk_data),
            "total_length": len(plain_text)
        }

        extracted_docs.extend(
            Document(page_content=chunk["text"], metadata={"chunk_id": chunk["chunk_id"], **metadata})
            for chunk in chunk_data
        )

        # Save the chunks to a text file for inspection
        chunked_filepath = os.path.join(CHUNKED_DATA_PATH, f"chunked_{filename}.txt")
        with open(chunked_filepath, "w", encoding="utf-8") as chunked_file:
            for chunk in chunk_data:
                chunked_file.write(f"Chunk {chunk['chunk_id']}:\n{chunk['text']}\n\n---\n\n")

        metadata_filepath = os.path.join(METADATA_PATH, f"metadata_{filename}.json")
        with open(metadata_filepath, "w", encoding="utf-8") as metadata_file:
            json.dump(metadata, metadata_file, indent=4)

        print(f"[INFO] Total chunks generated for {filename}: {len(chunk_data)}")

    except Exception as e:
        # Per-file boundary: report the failure and continue with the next file
        print(f"[ERROR] Error processing {filename}: {e}")

# Build and persist the FAISS index, but only when at least one chunk exists
if not extracted_docs:
    print("[INFO] Tidak ada dokumen yang berhasil diproses.")
else:
    vector_store = FAISS.from_documents(extracted_docs, embedding_model)
    vector_store.save_local(INDEX_PATH)
    print("[SUCCESS] Proses chunking selesai. Hasilnya disimpan dalam 'chunked_data' dan metadata di 'metadata'.")


[INFO] Dokumen (1).pdf - Panjang teks sebelum pemrosesan: 3843
[INFO] Total chunks generated for Dokumen (1).pdf: 4
[SUCCESS] Proses chunking selesai. Hasilnya disimpan dalam 'chunked_data' dan metadata di 'metadata'.


In [20]:
# Retrieval step of the RAG flow: embed the question and fetch the 3 nearest chunks
query = "Apa isi dokumen tentang topik X?"
query_embedding = embedding_model.embed_query(query)
retrieved_docs = vector_store.similarity_search_by_vector(query_embedding, k=3)

# Join the retrieved chunk texts, separated by blank lines, to form the prompt context
retrieved_text = "\n\n".join(doc.page_content for doc in retrieved_docs)

TEST OLLAMA MODEL

In [21]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM

In [22]:
# Ollama model tag passed to OllamaLLM in the test cell below
OLLAMA_MODEL = "llama3.2"
# NOTE(review): not referenced in any visible cell, and "vectore" looks like a
# typo for "vector" — confirm nothing depends on this value before renaming
COLLECTION_NAME = "ollama_vectore_test"

In [23]:
# Prompt text: the model is told to rely only on the retrieved chunk text
# that is substituted for {document}
prompt_text = """
Anda adalah asisten AI yang ahli dalam menganalisis dokumen.  
Berdasarkan dokumen berikut, identifikasi perubahan nama, restrukturisasi, dan strategi ekspansi PT Pertamina.  
Gunakan hanya informasi yang terdapat dalam dokumen.  

Dokumen:
"{document}"
Ringkasan:
"""

# Prompt template piped into the local Ollama model
template = ChatPromptTemplate.from_template(prompt_text)
model = OllamaLLM(model=OLLAMA_MODEL)
chain = template | model

# Run the chain on the retrieved chunks and render the answer as markdown
response = chain.invoke({"document": retrieved_text})
Markdown(response)


Dari dokumen yang disediakan, saya dapat melihat bahwa terdapat beberapa perubahan nama, restrukturisasi, dan strategi ekspansi PT Pertamina sebagai berikut:

Perubahan Nama:

* PT Eksploitasi Tambang Minyak Sumatera Utara menjadi PT Perusahaan Minyak Nasional (PERMINA) pada 10 Desember 1957
* PERMINA menjadi PN Pertambangan Minyak Nasional (Permina) pada 1 Juli 1961
* PN Permina bergabung dengan PN Pertamin menjadi PN Pertambangan Minyak dan Gas Bumi Negara (Pertamina) pada 20 Agustus 1968

Restrukturisasi:

* Pada tahun 2003, PT Pertamina (Persero) didirikan melalui Peraturan Pemerintah No. 31 Tahun 2003
* Pada tahun 2011, PT Pertamina menyempurnakan visinya menjadi Perusahaan Energi Nasional Kelas Dunia

Strategi Ekspansi:

* PT Pertamina mengacuikan visinya untuk menjadi Perusahaan Minyak Nasional Kelas Dunia pada tahun 2007
* Pada tahun 2017, PT Pertamina berhasil menuntaskan akuisisi 72,65% saham perusahaan migas Prancis Maurel et Prom (M&P) yang memungkinkan PT Pertamina memiliki akses operasi di 12 negara di 4 benua
* Pada tahun 2018, PT Pertamina berhasil menyelesaikan proses pengambilalihan kepemilikan 51% saham PT Pertamina Gas (Pertagas) oleh PT Perusahaan Gas Negara (PGN)

Dalam keseluruhan, perubahan nama, restrukturisasi, dan strategi ekspansi PT Pertamina menunjukkan komitmen perusahaan untuk menjadi salah satu penyedia energi terbesar di Indonesia.

TEST OPENAI MODEL

In [24]:
from dotenv import load_dotenv
import os
import openai
# Read the OPENAI_* settings from the environment (.env is loaded first) and
# copy each one onto the matching attribute of the openai module
load_dotenv()

for setting in ("api_type", "api_base", "api_version", "api_key"):
    setattr(openai, setting, os.getenv(f"OPENAI_{setting.upper()}"))

# Passed as `engine`; presumably a deployment name — confirm against the account
model = "gpt-35-turbo"

# System instruction: summarise using only what the document contains
system_prompt = """
        You are a helpful assistant for text summarization.
        Only include information that is part of the document. 
        Do not include your own opinion or analysis.
      """

# NOTE(review): plain_text is whatever the processing loop assigned last,
# i.e. the cleaned full text of the last file it handled
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": plain_text},
]

response = openai.ChatCompletion.create(engine=model, messages=messages)

Markdown(response.choices[0].message.content)

PT. Pertamina (Persero) journey began in the 1950s with the establishment of PT. Eksploitasi Tambang Minyak Sumatera Utara. In 1957, it changed its name to become PT. Perusahaan Minyak Nasional (PERMINA), which was later renamed to PN Pertambangan Minyak Nasional (PERMINA) and eventually PN Pertambangan Minyak dan Gas Bumi Negara (Pertamina) in 1968. In 1971, the government established the role of Pertamina as a company to produce and process oil and gas and changed its name to Perusahaan Pertambangan Minyak dan Gas Bumi Negara. In 2003, it changed its name to PT Pertamina (Persero). In 2007, Pertamina changed its vision to become a world-class national oil company. In 2011, it further improved its vision to become a world-class national energy company. Pertamina acquired Maurel et Prom (M&P) in 2017, which allowed it to operate in 12 countries across four continents. In 2018, Pertamina's position as the first line of defense for national energy sovereignty was strengthened after PT Pertamina Gas was acquired by PT Perusahaan Gas Negara (PGN). Pertamina formed six subholdings to achieve energy sovereignty, namely Upstream Subholding, Gas Subholding, Refinery and Petrochemical Subholding, Power & NRE Subholding, Commercial and Trading Subholding, and Integrated Marine Logistics Subholding. This restructuring was completed in September 2021, and Pertamina aims to achieve its "One Energy, One Pertamina" vision through more focused and directed energy procurement activities.