In [None]:
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.basic import chunk_elements

In [1]:
import sys
# Put the parent directory on the import path before importing the
# project-local ``utils`` package below (presumably ``utils`` lives one level
# up from this notebook -- confirm against the repository layout).
sys.path.append('../')

from utils.img_convertor.cpm_convertor import CPMConvertor

In [None]:
import os

# Source PDF to ingest. ``filename`` is reused by later cells to name the
# per-document output folders.
file_path = "./3900911.pdf"  # mandatory
filename = os.path.basename(file_path)
# ``splitext`` strips only the final extension, so "report.v2.pdf" maps to
# "report.v2" rather than being truncated to "report" at the first dot.
output_img_dir = os.path.join("./images", os.path.splitext(filename)[0])

# Parse the PDF into elements and write the crops of Image/Table blocks to
# ``output_img_dir`` (they are written to disk, not embedded in the payload).
raw_docs = partition_pdf(
    filename=file_path,  # mandatory
    strategy="hi_res",  # mandatory to use ``hi_res`` strategy
    extract_images_in_pdf=True,  # mandatory to set as ``True``
    extract_image_block_types=["Image", "Table"],  # optional
    extract_image_block_to_payload=False,  # optional
    extract_image_block_output_dir=output_img_dir,  # optional
)

In [None]:
# Display the dict form of one parsed element to inspect its layout.
# The index is hard-coded; indexing fails if ``raw_docs`` has fewer than
# three elements.
raw_docs[2].to_dict()

In [None]:
# Collect the ``element_id`` of every element typed "Image" or "Table".
img_ele_idx = []
for doc in raw_docs:
    # Serialize each element once instead of once per field access.
    doc_dict = doc.to_dict()
    if doc_dict['type'] in ('Image', 'Table'):
        img_ele_idx.append(doc_dict['element_id'])

img_ele_idx

In [None]:
# Prompt passed with every image, and the converter instance that receives it
# in the conversion loop.
question = "Please convert the image to a markdown syntax table"
converter = CPMConvertor()

In [None]:
from tqdm import tqdm
import pandas as pd
import os

# Root folder for derived artefacts. ``exist_ok`` avoids the check-then-create
# race and makes the cell safe to re-run.
data_path = "./data"
os.makedirs(data_path, exist_ok=True)

documents = []     # all elements, in original order
img_captions = []  # one record per converted image: {"image_path", "caption"}

for doc in tqdm(raw_docs, desc="Converting documents"):
    # Serialize once; the dict is only used to read the type and image path.
    doc_dict = doc.to_dict()
    # ``image_path`` may be missing from the metadata (e.g. no crop was saved
    # for this element); such elements are kept with their original text
    # instead of aborting the whole run with a KeyError.
    img_path = doc_dict.get("metadata", {}).get("image_path")
    if doc_dict["type"] in ("Table", "Image") and img_path:
        converted_text = converter.convert(question=question, image_path=img_path)
        img_captions.append({"image_path": img_path, "caption": converted_text})
        # NOTE: this overwrites the text of the element held in ``raw_docs``
        # too -- the element is modified in place, not copied.
        doc.text = converted_text
    documents.append(doc)

# Fixed columns keep the frame's schema stable even when nothing was converted.
img_captions_df = pd.DataFrame(img_captions, columns=["image_path", "caption"])

In [None]:
# Per-document output folder: <data_path>/<pdf name without its extension>.
# ``splitext`` removes only the final extension, so multi-dot file names are
# not truncated at the first dot.
img_caption_path = os.path.join(data_path, os.path.splitext(filename)[0])
# ``exist_ok`` replaces the exists()/makedirs() pair and is safe on re-run.
os.makedirs(img_caption_path, exist_ok=True)

# Persist the image -> caption table (the DataFrame index is written as well).
img_captions_df.to_csv(os.path.join(img_caption_path, 'img_captions.csv'))

In [None]:
# Chunk the converted elements: at most 512 characters per chunk, 50 overlap.
# NOTE: ``documents`` is rebound to llama-index ``Document`` objects in a later
# cell, so this cell must run before that one.
chunks = chunk_elements(
    documents,
    overlap=50,
    max_characters=512,
)

In [None]:
# Display the dict form of the first chunk to inspect its layout
# (indexing fails if ``chunks`` is empty).
chunks[0].to_dict()

In [None]:
from llama_index.core import Document

# NOTE: rebinds ``documents`` -- from here on it holds llama-index ``Document``
# objects built from ``chunks``, not the elements collected earlier.
documents = []

for chunk in chunks:
    # Serialize each chunk once; every field below is read from this dict.
    chunk_dict = chunk.to_dict()
    chunk_meta = chunk_dict["metadata"]
    document = Document(
        doc_id=chunk_dict["element_id"],
        text=chunk_dict["text"],
        metadata={"page_number": chunk_meta["page_number"],
                  "filename": chunk_meta["filename"]}
    )
    documents.append(document)

In [None]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.dashscope import DashScope, DashScopeGenerationModels
from llama_index.llms.ollama import Ollama
import os

# Generator and critic are two separately constructed clients configured with
# the same Ollama model and timeout.
generator_llm = Ollama(model="qwen2", request_timeout=60.0)
critic_llm = Ollama(model="qwen2", request_timeout=60.0)

# Embedding model location. The machine-specific absolute path remains the
# default, but can be overridden via BGE_M3_MODEL_PATH so the notebook runs
# on other hosts without editing this cell.
embedding_path = os.environ.get(
    "BGE_M3_MODEL_PATH", "/home/project/data/jc/mmRAG/model/bge-m3"
)
embeddings = HuggingFaceEmbedding(embedding_path)

# Test-set generator built from the two LLM clients and the embedding model.
generator = TestsetGenerator.from_llama_index(
    generator_llm=generator_llm,
    critic_llm=critic_llm,
    embeddings=embeddings
)

In [None]:
# Share of each question evolution in the generated test set.
question_mix = {simple: 0.5, reasoning: 0.25, multi_context: 0.25}

# Generate 20 test samples from the llama-index documents; exceptions are not
# raised (``raise_exceptions=False``) and debug logging is enabled.
testset = generator.generate_with_llamaindex_docs(
    documents,
    test_size=20,
    distributions=question_mix,
    raise_exceptions=False,
    with_debugging_logs=True,
)

In [None]:
# Display the generated test set as a pandas DataFrame.
testset.to_pandas()

In [None]:
# Write the generated test set as CSV into the per-document output folder.
testset_csv_path = os.path.join(img_caption_path, 'testset.csv')
testset.to_pandas().to_csv(testset_csv_path)

In [None]:
# Convert the test set with its ``to_dataset()`` method; ``ds`` is turned into
# a plain dict in the following cell.
ds = testset.to_dataset()

In [None]:
# Dict form of the dataset; passed as ``dataset=`` to ``evaluate`` later on.
ds_dict = ds.to_dict()

In [None]:
from llama_index.vector_stores.milvus import MilvusVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.postprocessor.flag_embedding_reranker import (
    FlagEmbeddingReranker,
)
from llama_index.core import (
    Document,
    VectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
    ServiceContext,
)
from llama_index.llms.dashscope import DashScope, DashScopeGenerationModels
from llama_index.llms.ollama import Ollama

import os
from utils.system_prompt import EXPERT_Q_AND_A_SYSTEM


# LLM used for answering and, later, for evaluation.
llm = Ollama(model="qwen2", request_timeout=60.0)

# Embedding model location. The machine-specific absolute path remains the
# default, but can be overridden via BGE_M3_MODEL_PATH.
embed_path = os.environ.get(
    "BGE_M3_MODEL_PATH", "/home/project/data/jc/mmRAG/model/bge-m3"
)
embed_model = HuggingFaceEmbedding(embed_path)

# NOTE(review): ``ServiceContext`` is, as far as I know, deprecated in recent
# llama-index releases in favour of ``Settings`` -- confirm against the
# installed version before upgrading.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
    system_prompt=EXPERT_Q_AND_A_SYSTEM,
)

# Milvus connection settings. The previous literals stay as defaults, but the
# endpoint and credentials can now be supplied through the environment instead
# of being committed in the notebook.
vector_store = MilvusVectorStore(
    uri=os.environ.get("MILVUS_URI", "http://localhost:19530/"),
    token=os.environ.get("MILVUS_TOKEN", "root:Milvus"),
    collection_name='demo',
    dim=1024,
    # Passed through as before; presumably re-creates the 'demo' collection on
    # every run -- confirm against the MilvusVectorStore documentation.
    overwrite=True,
    enable_sparse=True,
    hybrid_ranker="RRFRanker",
    hybrid_ranker_params={"k": 60},
)

storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Embed and index the llama-index documents into the Milvus-backed store.
index = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
    embed_model=embed_model,
    storage_context=storage_context,
)

# Retrieve 10 candidates per query, then keep the reranker's top 5.
# NOTE(review): sparse vectors and a hybrid ranker are configured on the store,
# but no hybrid query mode is requested here -- verify whether
# ``vector_store_query_mode="hybrid"`` was intended.
rerank = FlagEmbeddingReranker(model="BAAI/bge-reranker-large", top_n=5)
query_engine = index.as_query_engine(
    similarity_top_k=10, node_postprocessors=[rerank]
)

In [None]:
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)
from ragas.metrics.critique import harmfulness

from ragas.integrations.llama_index import evaluate

# Metrics computed for every test sample.
metrics = [faithfulness, answer_relevancy, context_precision, context_recall, harmfulness]

# Run the query engine over the generated test set and score it with the same
# LLM and embedding model used to build the index. Exceptions are not raised
# (``raise_exceptions=False``).
result = evaluate(
    dataset=ds_dict,
    query_engine=query_engine,
    metrics=metrics,
    embeddings=embed_model,
    llm=llm,
    raise_exceptions=False,
)

In [None]:
# Display the evaluation result as a pandas DataFrame.
result.to_pandas()

In [None]:
# Write the evaluation result as CSV into the per-document output folder.
result_csv_path = os.path.join(img_caption_path, 'result.csv')
result.to_pandas().to_csv(result_csv_path)