In [1]:
# Directory that holds the input PDF. The trailing slash is required because
# the PDF file name is appended to this value with plain string concatenation.
# NOTE(review): absolute, machine-specific location — adjust before running elsewhere.
path = "/home/x/data/"

In [2]:
from typing import Any

#import nltk
#import nltk.internals
#nltk.download('punkt')

from pydantic import BaseModel
from unstructured.partition.pdf import partition_pdf

# Parse the PDF into chunked elements
pdf_file = path + "DOKUMEN_TES.pdf"

# Chunking parameters: group text under its section title, then size the chunks.
chunking_options = dict(
    # Post-processing that aggregates text once the titles are known
    chunking_strategy="by_title",
    # Upper limit on the size of a single chunk, in characters
    max_characters=4000,
    # Try to start a new chunk once the current one reaches 3800 characters
    new_after_n_chars=3800,
    # Try to keep chunks above 2000 characters by combining smaller text blocks
    combine_text_under_n_chars=2000,
)

raw_pdf_elements = partition_pdf(
    filename=pdf_file,
    # Embedded image blocks are not extracted
    extract_images_in_pdf=False,
    # Use the layout model (YOLOX) to get bounding boxes (for tables) and find titles;
    # titles are any sub-section of the document
    infer_table_structure=True,
    image_output_dir_path=path,
    **chunking_options,
)

Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Tally the parsed elements per element class (keyed by the class repr string)
category_counts = {}

for element in raw_pdf_elements:
    category = str(type(element))
    # A class seen for the first time starts from 0
    category_counts[category] = category_counts.get(category, 0) + 1

# The distinct element classes that occurred
unique_categories = set(category_counts)
category_counts

{"<class 'unstructured.documents.elements.CompositeElement'>": 40,
 "<class 'unstructured.documents.elements.Table'>": 40}

In [4]:
class Element(BaseModel):
    """A parsed PDF chunk reduced to a kind label ("table" or "text") and its content."""

    type: str
    text: Any


# (class-repr marker, label) pairs, checked in this order.
# Elements whose class repr contains neither marker are left out.
_LABEL_BY_CLASS_MARKER = (
    ("unstructured.documents.elements.Table", "table"),
    ("unstructured.documents.elements.CompositeElement", "text"),
)

# Categorize by type
categorized_elements = []
for element in raw_pdf_elements:
    class_repr = str(type(element))
    for marker, label in _LABEL_BY_CLASS_MARKER:
        if marker in class_repr:
            categorized_elements.append(Element(type=label, text=str(element)))
            break

# Tables
table_elements = list(filter(lambda e: e.type == "table", categorized_elements))
print(len(table_elements))

# Text
text_elements = list(filter(lambda e: e.type == "text", categorized_elements))
print(len(text_elements))

40
40


In [5]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
#from langchain_openai import ChatOpenAI
from langchain_community.llms.ollama import Ollama

In [6]:
#from langchain import hub
#obj = hub.pull("rlm/multi-vector-retriever-summarization")

In [7]:
#from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
#from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
#import torch
#model = HuggingFacePipeline.from_model_id(
#    model_id="mistralai/Mistral-7B-Instruct-v0.2",
#   task="text-generation",
#    device=0,  # -1 for CPU
#    batch_size=2,  # adjust as needed based on GPU map and model size.
#    model_kwargs={"temperature": 0, "max_length": 4096, "torch_dtype":torch.bfloat16},
#)

In [8]:
#model = Ollama(model = "llama3.1:8b-instruct-q8_0")
#print(model)

In [9]:
# Summarization prompt; {element} receives one table or text chunk
prompt_text = (
    "You are an assistant tasked with summarizing tables and text. \n"
    "Give a concise summary of the table or text. Table or text chunk: {element} "
)
prompt = ChatPromptTemplate.from_template(prompt_text)

# Model used for summarization (temperature 0)
#model = ChatOpenAI(temperature=0, model="gpt-4")
model = Ollama(model="llama3.1:8b-instruct-q8_0", temperature=0)

# Summary chain: chunk -> {"element": chunk} -> prompt -> model -> plain string
wrap_as_element = {"element": lambda chunk: chunk}
summarize_chain = wrap_as_element | prompt | model | StrOutputParser()
print(model)

[1mOllama[0m
Params: {'model': 'llama3.1:8b-instruct-q8_0', 'format': None, 'options': {'mirostat': None, 'mirostat_eta': None, 'mirostat_tau': None, 'num_ctx': None, 'num_gpu': None, 'num_thread': None, 'num_predict': None, 'repeat_last_n': None, 'repeat_penalty': None, 'temperature': 0.0, 'stop': None, 'tfs_z': None, 'top_k': None, 'top_p': None}, 'system': None, 'template': None, 'keep_alive': None, 'raw': None}


In [10]:
# Summarize every table chunk, with at most 5 requests in flight at once
tables = [table_element.text for table_element in table_elements]
table_summaries = summarize_chain.batch(tables, config={"max_concurrency": 5})

In [None]:
# Summarize every text chunk, with at most 5 requests in flight at once
texts = [text_element.text for text_element in text_elements]
text_summaries = summarize_chain.batch(texts, config={"max_concurrency": 5})

In [None]:
import uuid

from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_chroma import Chroma
from langchain_core.documents import Document
#from langchain_openai import OpenAIEmbeddings
#from langchain_community.embeddings.ollama import OllamaEmbeddings
#def get_embedding_function():
#    embeddings = OllamaEmbeddings(model="nomic-embed-text")
#    return embeddings
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings

# Vector store that indexes the summaries (the "child" documents)
#vectorstore = Chroma(collection_name="summaries", embedding_function=OpenAIEmbeddings())
#vectorstore = Chroma(collection_name="summaries", embedding_function=get_embedding_function())
vectorstore = Chroma(collection_name="summaries", embedding_function=FastEmbedEmbeddings())

# Storage layer for the original chunks (the "parent" documents)
store = InMemoryStore()
# Metadata key on each summary document that holds the id of its original chunk
id_key = "doc_id"

# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)

# Add texts: one fresh id per original text chunk, attached to its summary
doc_ids = [str(uuid.uuid4()) for _ in texts]
summary_texts = [
    Document(page_content=summary, metadata={id_key: doc_id})
    for doc_id, summary in zip(doc_ids, text_summaries)
]
# Only index when there is something to index: a document without text chunks
# would otherwise send an empty batch to the vector store, which may reject it.
if summary_texts:
    retriever.vectorstore.add_documents(summary_texts)
    retriever.docstore.mset(list(zip(doc_ids, texts)))

# Add tables: same scheme as for the text chunks
table_ids = [str(uuid.uuid4()) for _ in tables]
summary_tables = [
    Document(page_content=summary, metadata={id_key: table_id})
    for table_id, summary in zip(table_ids, table_summaries)
]
# Same guard for documents in which no table was detected
if summary_tables:
    retriever.vectorstore.add_documents(summary_tables)
    retriever.docstore.mset(list(zip(table_ids, tables)))


In [None]:
# # We can also add the original chunks to the vectorstore if we so want
    #for i, doc in enumerate(docs):
     #   doc.metadata[id_key] = doc_ids[i]
#retriever.vectorstore.add_documents(docs)

In [None]:
from langchain_core.runnables import RunnablePassthrough

# Prompt template for answering a question from the retrieved chunks.
# Note: this rebinds the notebook-level names `prompt` and `model`.
template = (
    "Answer the question based only on the following context, which can include text and tables:\n"
    "{context}\n"
    "Question: {question}\n"
)
prompt = ChatPromptTemplate.from_template(template)

# LLM
#model = ChatOpenAI(temperature=0, model="gpt-4")
model = Ollama(model="llama3.1:8b-instruct-q8_0", temperature=0)

# RAG pipeline: the question goes to the retriever for context and is also
# passed through unchanged; then prompt -> model -> plain string
rag_inputs = {"context": retriever, "question": RunnablePassthrough()}
chain = rag_inputs | prompt | model | StrOutputParser()

In [None]:
# Question (Indonesian): "In which regency is the Rimau Block working area located?"
chain.invoke("Wilayah Kerja Blok Rimau ada di kabupaten mana?")

In [None]:
# Question (Indonesian): "What is the contract type of the Rimau Block working area?"
chain.invoke("apa tipe kontrak wilayah kerja Blok Rimau?")

In [None]:
import ipywidgets

# Question (Indonesian): "What is the amount of CO2 in the Rimau Block gas composition data, before treatment?"
chain.invoke("berapa jumlah CO2 pada Data Komposisi Gas Blok Rimau Before Treatment?")

In [None]:
# Question (Indonesian): "Who is the highest-ranking official for Rimau Block production operations?"
chain.invoke("siapa penjabat tertinggi operasi produksi Blok Rimau?")

In [None]:
# Question (Indonesian): "What is the contract type of the Seram Non Bula working area?"
chain.invoke("apa tipe kontrak Wilayah Kerja Seram Non Bula?")

In [None]:
# Question (Indonesian): "What is the contract type of the Bentu working area?"
chain.invoke("apa tipe kontrak Wilayah Kerja Bentu?")

In [None]:
# Question (Indonesian): "What are the entries in the Bentu lifting facility directory?"
chain.invoke("apa saja direktori fasilitas Lifting Bentu?")

In [None]:
# Question (Indonesian): "What are the production facilities of Kaji Station, Rimau Block?"
chain.invoke("apa saja FASILITAS PRODUKSI Kaji Station BLOK RIMAU?")

In [None]:
# Question (Indonesian): "List the Rimau Block production facilities, Kaji Station?"
# (a rephrasing of the Kaji Station production-facilities question)
chain.invoke("Sebutkan FASILITAS PRODUKSI BLOK RIMAU, Kaji Station?")

In [None]:
# Question (Indonesian): "List the Rimau Block production facilities, Semoga Station?"
chain.invoke("Sebutkan FASILITAS PRODUKSI BLOK RIMAU, Semoga Station?")

In [None]:
# Question (Indonesian): "List the Rimau Block production facilities, 2. Semoga Station?"
# Same question as the plain "Semoga Station" variant, with a "2." prefix on the station name.
chain.invoke("Sebutkan FASILITAS PRODUKSI BLOK RIMAU, 2. Semoga Station?")

In [None]:
# Question (Indonesian): "List the types of chemical used at Langkap Station!"
chain.invoke("Sebutkan jenis chemical yang digunakan di Langkap Station!")

In [None]:
# Question (Indonesian): "State what its usage rate is?"
# NOTE(review): the question does not name what "its" refers to, and this string
# is the only input passed to the chain in this call.
chain.invoke("Sebutkan berapa rate penggunaannya?")

In [None]:
# Question (Indonesian): "State the usage rate of the types of chemical used at Langkap Station!"
chain.invoke("Sebutkan rate penggunaan dari jenis chemical yang digunakan di Langkap Station!")

In [None]:
# Question (Indonesian): "State the coordinates of Bangka Marine Terminal!"
chain.invoke("Sebutkan berapa koordinat Bangka Marine Terminal!")

In [None]:
# Question (Indonesian): "List the types of vessel that Bangka Marine Terminal can accept!"
chain.invoke("Sebutkan jenis kapal yang dapat diterima Bangka Marine Terminal!")

In [None]:
# Question (Indonesian): "Based on the SERAM NON BULA Natural Gas Production table
# (Status: January 2024), state the production for 2018!"
chain.invoke("berdasarkan tabel Produksi Gas Bumi SERAM NON BULA (Status: Januari 2024), sebutkan produksi tahun 2018!")

In [None]:
# Question (Indonesian): "Based on the SERAM NON BULA Natural Gas Production table
# (Status: January 2024), state the production for 2009!"
chain.invoke("berdasarkan tabel Produksi Gas Bumi SERAM NON BULA (Status: Januari 2024), sebutkan produksi tahun 2009!")

In [None]:
# Question (Indonesian): "Based on the SERAM NON BULA Natural Gas Production table
# (Status: January 2024), state the WP&B for 2018!"
chain.invoke("berdasarkan tabel Produksi Gas Bumi SERAM NON BULA (Status: Januari 2024), sebutkan WP&B tahun 2018!")

In [None]:
# Question (Indonesian): "Based on the SERAM NON BULA Natural Gas Production table
# (Status: January 2024), the WP&B for 2018 is 1.16, please re-check your answer!"
# The expected value is supplied inside the question itself.
chain.invoke("berdasarkan tabel Produksi Gas Bumi SERAM NON BULA (Status: Januari 2024), WP&B tahun 2018 adalah 1.16, coba periksa kembali jawaban anda!")

In [None]:
# Question (Indonesian): "Based on the SERAM NON BULA Natural Gas Production table
# (Status: January 2024), state the WP&B for 2009!"
chain.invoke("berdasarkan tabel Produksi Gas Bumi SERAM NON BULA (Status: Januari 2024), sebutkan WP&B tahun 2009!")

In [None]:
# Question (Indonesian): "Based on the SERAM NON BULA Natural Gas Production table
# (Status: January 2024), state the production for 2010!"
chain.invoke("berdasarkan tabel Produksi Gas Bumi SERAM NON BULA (Status: Januari 2024), sebutkan produksi tahun 2010!")

In [None]:
# Question (Indonesian): "Based on the SERAM NON BULA Natural Gas Production table
# (Status: January 2024), state the production for 2011!"
chain.invoke("berdasarkan tabel Produksi Gas Bumi SERAM NON BULA (Status: Januari 2024), sebutkan produksi tahun 2011!")

In [None]:
# Question (Indonesian): "State the number of SERAM NON BULA production wells in the Oseil field"
chain.invoke("Sebutkan jumlah sumur produksi SERAM NON BULA di lapangan Oseil")

In [None]:
# Question (Indonesian): "In table 3, SERAM NON BULA Production Well Status,
# state the number of wells in the Oseil field!"
# The question text was garbled: a stray fragment ("Sebutkan jumlah sumur Lapangan
# Oseil ") had been pasted into the middle of the leading word "Pada"; it is removed
# so that the retriever and the model receive a well-formed question.
chain.invoke("Pada tabel 3. Status Sumur Produksi SERAM NON BULA, sebutkan jumlah sumur Lapangan Oseil!")

In [None]:
# Question (Indonesian): "In table 3, SERAM NON BULA Production Well Status, list the field names!"
chain.invoke("Pada tabel 3. Status Sumur Produksi SERAM NON BULA, sebutkan nama-nama Lapangan!")

In [None]:
# Question (Indonesian): "At CSEL BULA TERMINAL, what is the crude/condensate production per day?"
# Two typos in the question text are corrected ("berap" -> "berapa", "produkis" -> "produksi")
# so that the text used for retrieval matches the wording it is meant to find.
chain.invoke("Pada CSEL BULA TERMINAL, berapa produksi crude/kondensat per hari?")

In [None]:
# Question (Indonesian): "At CSEL BULA TERMINAL, what is the operating tank capacity?"
chain.invoke("Pada CSEL BULA TERMINAL, berapa kapasitas tangki operasi?")

In [None]:
# Question (Indonesian): "At CSEL BULA TERMINAL, what is the amount of dead stock?"
chain.invoke("Pada CSEL BULA TERMINAL, berapa jumlah dead stock?")

In [None]:
# Question (Indonesian): "For the Bentu working area, state the contract period!"
chain.invoke("Pada Wilayah Kerja Bentu, sebutkan masa kontrak!")