In [8]:
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from pprint import pprint

In [9]:
# Qdrant server URL
URL ="localhost"
# Qdrant dimension of the collection
DIMENSION = 384
# Qdrant collection name
COLLECTION_NAME = "data" #"incident data"
METRIC_NAME ="COSINE"

In [10]:
client = QdrantClient(URL,port=6333)

In [11]:
vector_store = QdrantVectorStore(client=client, collection_name=COLLECTION_NAME)

In [12]:
import fitz
doc = fitz.open('data/civil_code.pdf')

In [13]:
from llama_index.core.node_parser import SentenceSplitter

In [14]:
text_splitter = SentenceSplitter(
    chunk_size=512
)

In [24]:
text_chunks = []
# maintain relationship with source doc index, to help inject doc metadata in (3)
doc_idxs = []
for doc_idx, page in enumerate(doc):
    page_text = page.get_text("text")
    cur_text_chunks = text_splitter.split_text(page_text)
    text_chunks.extend(cur_text_chunks)
    doc_idxs.extend([doc_idx] * len(cur_text_chunks))

In [25]:
len(doc_idxs)

56

In [26]:
pprint(text_chunks[1])

('c) non-registered organisational entity – a union provided for by the\n'
 'legislation of Georgia or other jurisdiction (a partnership of flat\n'
 'owners, a non-registered union, a partnership, etc.), which has an\n'
 'internal organisational structure and acts in its own right in\n'
 'relations with a third party and, at the same time, is not registered\n'
 'as a legal person;\n'
 'd) beneficial owner – a natural person as determined by Article 13\n'
 'of this Law;\n'
 'e) UN Sanctions Committee – a respective sanctions committee\n'
 'established on the basis of the resolutions of the United Nations\n'
 'Security Council;\n'
 'f) UN Security Council resolution – a respective resolution of the\n'
 'United Nations Security Council adopted on the basis of Chapter VII\n'
 'of the Charter of the United Nations, which aims at preventing,\n'
 'detecting and suppressing the financing of terrorism or the\n'
 'proliferation of weapons of mass destruction;\n'
 'g) transaction – a transaction 

In [27]:
from llama_index.core.schema import TextNode

In [28]:
nodes = []
for idx, text_chunk in enumerate(text_chunks):
    node = TextNode(
        text=text_chunk,
    )
    src_doc_idx = doc_idxs[idx]
    src_page = doc[src_doc_idx]
    nodes.append(node)

In [29]:
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings
from llama_index.core  import VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

llm = OpenAI(api_base="http://localhost:1234/v1",
             api_key="lm-studio",
            #  model = 'lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf',
             )
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
Settings.llm = llm


In [30]:
from llama_index.core.extractors import (
    QuestionsAnsweredExtractor,
    TitleExtractor,
)
from llama_index.core.ingestion import IngestionPipeline

extractors = [
    TitleExtractor(nodes=5, llm=llm),
    QuestionsAnsweredExtractor(questions=3, llm=llm),
]

In [31]:
pipeline = IngestionPipeline(
    transformations=extractors,
)
nodes = await pipeline.arun(nodes=nodes, in_place=False)

100%|██████████| 5/5 [00:42<00:00,  8.50s/it]
100%|██████████| 56/56 [11:54<00:00, 12.76s/it]


In [32]:
pprint(nodes[0].metadata)

{'document_title': 'Based on the provided context, I would suggest a '
                   'comprehensive title that captures the essence of the '
                   "document. Here's a potential title:\n"
                   '\n'
                   '"Georgia\'s Financial Regulation and Compliance Framework: '
                   'Definitions, Obligations, and Concepts for Anti-Money '
                   'Laundering and Combating Terrorism Financing"\n'
                   '\n'
                   'This title incorporates key elements from the candidate '
                   'titles, including:\n'
                   '\n'
                   "* Georgia's financial regulation and compliance framework\n"
                   '* Definitions of various terms related to anti-money '
                   'laundering (AML) and combating terrorism financing (CFT)\n'
                   '* Obligations and requirements for entities subject to '
                   'AML/CFT regulations in Georgia\n'
          

In [33]:
for node in nodes:
    node_embedding = Settings.embed_model.get_text_embedding(
        node.get_content(metadata_mode="all")
    )
    node.embedding = node_embedding

In [34]:
pprint(nodes[0])

TextNode(id_='8020aa03-7453-4dbc-b86b-e6122b2ba1d1', embedding=[-0.00614240113645792, -0.018232472240924835, -0.004203591030091047, 0.0234830379486084, 0.027193590998649597, -0.016421686857938766, 0.016526786610484123, -0.015487439930438995, -0.03571461886167526, -0.024261096492409706, -0.013850330375134945, 0.019820384681224823, -0.02937014400959015, -0.0027103081811219454, 0.016687119379639626, -0.014176814816892147, -0.0075833494774997234, -0.03670141473412514, -0.035148605704307556, 0.07207450270652771, 0.037949491292238235, -0.019267741590738297, 0.02659335918724537, 0.031268682330846786, 0.09716033935546875, 0.02058488503098488, -0.0452427975833416, -0.0320291705429554, -0.038427263498306274, -0.1752973049879074, -0.02957446686923504, -0.03430212661623955, -0.047572050243616104, 0.02262840047478676, -0.008386263623833656, -0.0036717483308166265, -0.05559055507183075, 0.032905787229537964, 0.043053217232227325, 0.03986918181180954, 0.047329775989055634, 0.01739349775016308, -0.029

In [35]:
vector_store.add(nodes)

['8020aa03-7453-4dbc-b86b-e6122b2ba1d1', 'b830bbf5-bca6-4796-b552-ff64ad4fb05a', '4affa93a-1eae-49e6-8d9f-359b615e8fe1', '1b74ede0-3fb3-434b-b190-367c15fd2909', '887ddf80-ec1e-46c9-8989-f011eec9a0ba', '95ac4d42-4ace-42b9-94eb-99941096f860', 'f2f2f7de-c563-49a2-9c12-b882fd2547bd', 'b34a3d9e-c95a-4bba-af66-4c833a2fd788', '84624b19-9b21-44e5-99ba-090c724cf6f3', '70ce99e7-b1b8-469f-bdc8-34d7cf257125', 'b19a610a-dbc5-4b1b-b6ea-a89d8deea42f', 'ffd85b92-eb34-4763-b1eb-bc3c7082e2c1', '23453b9e-691b-4cfb-8b4d-832682852e13', '64fed1f3-5ab0-4e02-aa6d-9797e084d6bb', 'a193736a-4838-4b55-a5b1-8b80814ff011', '23c9a2d8-8098-46ab-a0ce-e84f5a723782', 'b97c907a-9e80-42d2-b1ab-abb6e2308fa7', 'f35415bc-8419-49ae-b7c3-4ace3bb577bf', '3f67276d-c290-48c9-ba82-2b70a3ca19e6', '0a82734a-a64a-4750-b7d7-985e9c550618', '43609729-ceed-4638-adac-4fdb8c06c4a4', 'fd22fb18-da0f-4d4d-8f68-73069172abd9', 'e17a0a61-fe2e-41da-94d3-cc6760b6e212', '8989494d-274e-444b-bf53-7e092469dd15', '04a913a2-4ce0-4312-88a4-298130fa3f0c',

In [2]:
from llama_index.core  import VectorStoreIndex

  from .autonotebook import tqdm as notebook_tqdm


In [36]:
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

In [37]:
query_engine = index.as_query_engine()

In [38]:
query = "Which code includes Significance of form for the validity of transactions? In which article, book and chapter is it written?"

In [39]:
response = query_engine.query(query)

In [77]:
print("query was:", query)
print("answer was:", response)

> Source (Doc id: a7388c7d-0245-42ba-9a72-d286b0638a71): k) verification – obtaining information (documents) which enables
an obliged entity to verify the...

> Source (Doc id: 5693df76-6e6a-4bf2-a90a-b3d1186df8c7): report on a suspicious transaction or the attempt to prepare, make,
or complete such a transactio...
query was: Which code includes Significance of form for the validity of transactions? In which article, book and chapter is it written?
answer was: The code that includes the significance of form for the validity of transactions is Article 19(1) of this Law.

This information can be found in the excerpt provided.


In [78]:
pprint(str(response))

('The code that includes the significance of form for the validity of '
 'transactions is Article 19(1) of this Law.\n'
 '\n'
 'This information can be found in the excerpt provided.')
