PROCESS DOCUMENT

In [28]:
# Import modul yang diperlukan
from IPython.display import Markdown
from utils.document_processor import DocumentProcessor
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM
#from sentence_transformers import SentenceTransformer
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from typing import List
import os

# Inisialisasi DocumentProcessor dan OllamaLLM
docs = DocumentProcessor()
DATA_PATH = "./data"
INDEX_PATH = "faiss_index"
OLLAMA_MODEL = "llama3.2"
COLLECTION_NAME = "ollama_vectore_test"
EMBEDDING_MODEL = OllamaLLM(model=OLLAMA_MODEL)

# Inisialisasi embedding model menggunakan HuggingFace
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

def get_embedding(text: str) -> List[float]:
    """
    Menghasilkan embedding dari teks menggunakan HuggingFace.
    """
    return embedding_model.encode(text).tolist()

# Langkah 1: Proses dokumen
extracted_docs = []
for filename in os.listdir(DATA_PATH):
    valid_extensions = ('.pdf', '.docx', '.txt')
    if not filename.lower().endswith(valid_extensions):
        continue
    
    filepath = os.path.join(DATA_PATH, filename)
    with open(filepath, "rb") as f:
        document = f.read()
        result = docs.process_document(document, filename)
        extracted_docs.append(result)

# Ambil konten plain_text untuk vektorisasi
documents = [{"text": doc[3], "metadata": {"filename": filename}} for doc, filename in zip(extracted_docs, os.listdir(DATA_PATH))]

  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)


In [29]:
# Langkah 2: Buat dokumen menjadi vektor
texts = [doc["text"] for doc in documents]
metadatas = [doc["metadata"] for doc in documents]

# Buat FAISS Vectorstore dengan memberikan embedding instance
vectorstore = FAISS.from_texts(
    texts=texts,
    embedding=embedding_model,
    metadatas=metadatas
)

# Simpan index lokal
vectorstore.save_local(INDEX_PATH)

# Langkah 3: Query dengan RAG
query = "Apa isi dokumen tentang topik X?"
query_embedding = embedding_model.embed_query(query)
retrieved_docs = vectorstore.similarity_search_by_vector(query_embedding, k=3)

# Gabungkan dokumen hasil pencarian untuk input ke model
retrieved_text = "\n\n".join([doc.page_content for doc in retrieved_docs])


In [30]:
# Template prompt
template = """
  You are a helpful assistant for text summarization. 
  Only include information that is part of the document. 
  Do not include your own opinion or analysis.

  Document: 
  "{document}"
  Summary:
"""

prompt = ChatPromptTemplate.from_template(template)

model = OllamaLLM(model=OLLAMA_MODEL)

chain = prompt | model

response = chain.invoke({"document": retrieved_text})
Markdown(response)

Here is a summary of the text:

The individual has worked as a technician in the telecommunications industry, providing internet services with the name "Ofon O3" in the Kepulauan Riau region. They have gained experience in configuring and optimizing network equipment, including optical fiber modems. Their work involves high initiative, particularly when installing internet services for customers, with an average customer satisfaction rate of 70-95%.

The individual has also completed their studies at Universitas Riau, expected to graduate in August 2026, majoring in Informatics Engineering. During their studies, they have participated in various campus activities, including organizations and events, as well as independent research projects.

In addition to their work experience and academic background, the individual has gained skills in areas such as:

* Cloud computing
* Cybersecurity
* Programming languages (JavaScript, HTML5)
* Database management (SQL)
* Graphic design (Figma)
* Network equipment configuration (Mikrotik)

Soft skills include:

* Public speaking
* Fluent English language skills
* Teamwork and collaboration
* Adaptability
* Time management

Some of the individual's notable achievements include:

* Winning a short story competition for Malay prose among SMA schools in the province of Kepulauan Riau.
* Completing an independent research project on diabetes prevention solutions (Glusity).
* Designing a new website landing page for Universitas Riau.

Overall, the individual has gained a strong foundation in technical skills and soft skills, with experience in working in the telecommunications industry and participating in various academic and extracurricular activities.

TEST OPENAI MODEL

In [31]:
from dotenv import load_dotenv
import os
import openai
load_dotenv()

openai.api_type = os.getenv("OPENAI_API_TYPE")
openai.api_base = os.getenv("OPENAI_API_BASE")
openai.api_version = os.getenv("OPENAI_API_VERSION")
openai.api_key = os.getenv("OPENAI_API_KEY")

model = "gpt-35-turbo"
response = openai.ChatCompletion.create(
  engine=model,
  messages=[
    {
      "role": "system", 
      "content": """
        You are a helpful assistant for text summarization.
        Only include information that is part of the document. 
        Do not include your own opinion or analysis.
      """
    },
    {
      "role": "user",
      "content": retrieved_text
    }
  ],
)

Markdown(response.choices[0].message.content)



AttributeError: 'NoneType' object has no attribute 'lower'