In [18]:
from langchain_qdrant import Qdrant
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from qdrant_client import QdrantClient

In [19]:
# Load PDF gpt technical report and split
loader = PyPDFLoader("gpt-4.pdf")
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,
                                                   chunk_overlap=50)
texts = text_splitter.split_documents(documents)

In [20]:
# Load the embedding model 
model_name="BAAI/bge-m3"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/15.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [21]:
# Create the Collection in Qdrant
url = "http://localhost:6333"

qdrant = Qdrant.from_documents(
    texts,
    embeddings,
    url=url,
    prefer_grpc=False,
    collection_name="vector_db"
)

print("Vector DB Successfully Created!")

Vector DB Successfully Created!


In [22]:
client = QdrantClient(
    url=url, prefer_grpc=False
)

db = Qdrant(client=client, embeddings=embeddings, collection_name="vector_db")

In [23]:
# Do similarity search in db and print score
query = "How is the performance of GPT-4 compared to GPT-3 or GPT-3.5?"

docs = db.similarity_search_with_score(query=query, k=5)
for i in docs:
    doc, score = i
    print({"score": score, "content": doc.page_content, "metadata": doc.metadata})

{'score': 0.7190241, 'content': 'gpt-4 (no vision)\ngpt3.5Figure 4. GPT performance on academic and professional exams. In each case, we simulate the\nconditions and scoring of the real exam. Exams are ordered from low to high based on GPT-3.5\nperformance. GPT-4 outperforms GPT-3.5 on most exams tested. To be conservative we report the\nlower end of the range of percentiles, but this creates some artifacts on the AP exams which have very', 'metadata': {'page': 5, 'source': 'gpt-4.pdf', '_id': '5cd1c1eb-0855-4d33-91dc-ae410f9595a0', '_collection_name': 'vector_db'}}
{'score': 0.71419555, 'content': 'GPT-4 signiﬁcantly outperforms both GPT-3.5 and Askell et al [100].ﬁxes to plot legend and title\n65', 'metadata': {'page': 64, 'source': 'gpt-4.pdf', '_id': '0839131a-c396-40b6-933f-70207a6c16d3', '_collection_name': 'vector_db'}}
{'score': 0.6699581, 'content': 'Figure 11: Results on IF evaluations across GPT3.5, GPT3.5-Turbo, GPT-4-launch\n98', 'metadata': {'page': 97, 'source': 'gpt-4.p

In [24]:
query = "Imageprocessing in GPT-4"

docs = db.similarity_search_with_score(query=query, k=3)
for i in docs:
    doc, score = i
    print({"score": score, "content": doc.page_content, "metadata": doc.metadata})

{'score': 0.64725494, 'content': 'post [ 65]. We plan to release more information about GPT-4’s visual capabilities in follow-up work.\n8', 'metadata': {'page': 7, 'source': 'gpt-4.pdf', '_id': '0243d33d-96c2-4a2f-8c18-f5f2cadd7f07', '_collection_name': 'vector_db'}}
{'score': 0.5994881, 'content': 'GPT-4.\n1 Introduction\nThis technical report presents GPT-4, a large multimodal model capable of processing image and\ntext inputs and producing text outputs. Such models are an important area of study as they have the\npotential to be used in a wide range of applications, such as dialogue systems, text summarization,\nand machine translation. As such, they have been the subject of substantial interest and progress in\nrecent years [1–34].', 'metadata': {'page': 0, 'source': 'gpt-4.pdf', '_id': '85ca532c-ccfb-48f5-803f-824755937b8c', '_collection_name': 'vector_db'}}
{'score': 0.59122264, 'content': 'highest probability from the model.\nG Examples of GPT-4 Visual Input\n29', 'metadata': {'

In [25]:
# Anfrage auch in Deutsch möglich Ähliches Ergebnis zum oberen englischem
query = "Bildverarbeitung in GPT-4"

docs = db.similarity_search_with_score(query=query, k=3)
for i in docs:
    doc, score = i
    print({"score": score, "content": doc.page_content, "metadata": doc.metadata})

{'score': 0.6275759, 'content': 'post [ 65]. We plan to release more information about GPT-4’s visual capabilities in follow-up work.\n8', 'metadata': {'page': 7, 'source': 'gpt-4.pdf', '_id': '0243d33d-96c2-4a2f-8c18-f5f2cadd7f07', '_collection_name': 'vector_db'}}
{'score': 0.5793121, 'content': 'highest probability from the model.\nG Examples of GPT-4 Visual Input\n29', 'metadata': {'page': 28, 'source': 'gpt-4.pdf', '_id': '4d0ddd4d-a746-4a51-baa2-c083bdcc8a13', '_collection_name': 'vector_db'}}
{'score': 0.57928073, 'content': 'GPT-4.\n1 Introduction\nThis technical report presents GPT-4, a large multimodal model capable of processing image and\ntext inputs and producing text outputs. Such models are an important area of study as they have the\npotential to be used in a wide range of applications, such as dialogue systems, text summarization,\nand machine translation. As such, they have been the subject of substantial interest and progress in\nrecent years [1–34].', 'metadata': {'

In [26]:
embedding_vector = embeddings.embed_query(query)
docs =  db.similarity_search_by_vector(embedding_vector)
for i in docs:
    doc = i
    print({"score": score, "content": doc.page_content, "metadata": doc.metadata})
print("################################################")
print("Embedded QUERY:",embedding_vector)

{'score': 0.57928073, 'content': 'post [ 65]. We plan to release more information about GPT-4’s visual capabilities in follow-up work.\n8', 'metadata': {'page': 7, 'source': 'gpt-4.pdf', '_id': '0243d33d-96c2-4a2f-8c18-f5f2cadd7f07', '_collection_name': 'vector_db'}}
{'score': 0.57928073, 'content': 'highest probability from the model.\nG Examples of GPT-4 Visual Input\n29', 'metadata': {'page': 28, 'source': 'gpt-4.pdf', '_id': '4d0ddd4d-a746-4a51-baa2-c083bdcc8a13', '_collection_name': 'vector_db'}}
{'score': 0.57928073, 'content': 'GPT-4.\n1 Introduction\nThis technical report presents GPT-4, a large multimodal model capable of processing image and\ntext inputs and producing text outputs. Such models are an important area of study as they have the\npotential to be used in a wide range of applications, such as dialogue systems, text summarization,\nand machine translation. As such, they have been the subject of substantial interest and progress in\nrecent years [1–34].', 'metadata': 