In [6]:
%pip install -U sentence_transformers


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [23]:
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer

In [24]:
# Sentence-embedding model shared by the whole notebook: it vectorises both the
# PDF texts and the search queries, and its output dimension is used to size
# the Qdrant collection.
encoder = SentenceTransformer("all-MiniLM-L6-v2")

In [25]:
import pypdf

def read_pdf(pdf_path, chunk_size=1000):
    """
    Read a PDF file and return its document metadata together with its full text.

    The text of every page is extracted and the pages are joined with a
    newline, so the last word of one page never fuses with the first word of
    the next one.

    Args:
        pdf_path (str): The path to the PDF file.
        chunk_size (int): Currently unused. It is accepted only so that
            existing callers keep working; no chunking is performed and the
            text is returned as a single string.

    Returns:
        dict | None: ``{'metadata': <reader.metadata>, 'data': <full text>}``
        on success, or ``None`` if the file could not be opened or parsed
        (the error is printed instead of being raised).
    """
    try:
        with open(pdf_path, 'rb') as pdf_file:
            reader = pypdf.PdfReader(pdf_file)
            # `or ""` keeps a page that yields no text from aborting the
            # whole document.
            page_texts = [page.extract_text() or "" for page in reader.pages]
            # Read the metadata while the underlying file is still open.
            metadata = reader.metadata
    except Exception as e:
        # Deliberate best-effort: one unreadable PDF must not stop a batch
        # load, so report the problem and signal failure with None.
        print(f"Error reading PDF: {e}")
        return None

    return {
        'metadata': metadata,
        'data': "\n".join(page_texts),
    }

In [26]:
import os

# Parse every PDF found in `folder_path`.
# The directory listing is sorted because os.listdir() returns entries in
# arbitrary order, and the position of a document in `documents` is later used
# as its point / record id; sorting keeps those ids stable between runs.
documents = []
folder_path = './Other'
for filename in sorted(os.listdir(folder_path)):
    # Case-insensitive match so files such as "PAPER.PDF" are not skipped.
    if filename.lower().endswith('.pdf'):
        pdf_path = os.path.join(folder_path, filename)
        pdf_data = read_pdf(pdf_path)
        # read_pdf returns None when a file cannot be parsed; skip those.
        if pdf_data:
            documents.append(pdf_data)

In [27]:
# Number of PDFs that were parsed successfully (files for which read_pdf
# returned None were not appended).
len(documents)

7

In [20]:
# Qdrant client used by the following Qdrant cells.
# NOTE(review): ":memory:" presumably selects qdrant-client's local,
# non-persistent mode — confirm against the qdrant-client documentation.
client = QdrantClient(":memory:")

In [21]:
# Describe the vector space first, then create the "my_papers" collection with it.
paper_vectors = models.VectorParams(
    # The embedding model dictates the dimensionality of the stored vectors.
    size=encoder.get_sentence_embedding_dimension(),
    distance=models.Distance.COSINE,
)
client.create_collection(collection_name="my_papers", vectors_config=paper_vectors)

True

In [22]:
# Store one point per parsed PDF in "my_papers": the id is the document's
# position in `documents`, the vector is the embedding of its full text, and
# the payload is the whole dict returned by read_pdf (metadata + text).
# NOTE(review): each document is encoded as a single input. Sentence-embedding
# models presumably truncate input beyond their maximum sequence length, so
# only the beginning of each PDF may influence its vector — confirm
# `encoder.max_seq_length` and consider embedding per chunk instead.
client.upload_points(
    collection_name="my_papers",
    points=[
        models.PointStruct(
            id=idx, vector=encoder.encode(doc["data"]).tolist(), payload=doc
        )
        for idx, doc in enumerate(documents)
    ],
)

In [None]:
# Embed the search phrase with the same encoder used for the documents and ask
# the "my_papers" collection for its top 3 matches; `.points` takes the list
# of matched points out of the response object.
hits = client.query_points(
    collection_name="my_papers",
    query=encoder.encode("Levenshtein distance").tolist(),
    limit=3,
).points

{'metadata': {'/Creator': 'Aspose Pty Ltd.', '/ModDate': "D:20250325022231-07'00'", '/CreationDate': "D:20210913202330+05'30'", '/Producer': 'Aspose.PDF for .NET 23.2.0; modified using iText 4.2.0 by 1T3XT', '/Subject': 'Security and Communication Networks 2021.2021:9923234', '/WPS-PROCLEVEL': '3', '/WPS-JOURNALDOI': '10.1155/2037', '/Title': 'PBDT: Python Backdoor Detection Model Based on Combined Features', '/WPS-ARTICLEDOI': '10.1155/2021/9923234'}, 'data': "Research Article\nPBDT: Python Backdoor Detection Model Based on\nCombined Features\nYong Fang, Mingyu Xie, and Cheng Huang\nSchool of Cyber Science and Engineering, Sichuan University, Chengdu, China\nCorrespondence should be addressed to Cheng Huang; opcodesec@gmail.com\nReceived 1 April 2021; Accepted 31 August 2021; Published 14 September 2021\nAcademic\nEditor: Shah Nazir\nCopyright © 2021 Yong Fang et al. 'is is an open access article distributed under the Creative Commons Attribution License,\nwhich permits unrestricted u

In [25]:
import json

# Show every match: its stored payload as indented JSON, then its score.
for point in hits:
    payload_json = json.dumps(point.payload, indent=2)
    print(payload_json)
    print("score:", point.score)

{
  "metadata": {
    "/Creator": "Aspose Pty Ltd.",
    "/ModDate": "D:20250325022231-07'00'",
    "/CreationDate": "D:20210913202330+05'30'",
    "/Producer": "Aspose.PDF for .NET 23.2.0; modified using iText 4.2.0 by 1T3XT",
    "/Subject": "Security and Communication Networks 2021.2021:9923234",
    "/WPS-PROCLEVEL": "3",
    "/WPS-JOURNALDOI": "10.1155/2037",
    "/Title": "PBDT: Python Backdoor Detection Model Based on Combined Features",
    "/WPS-ARTICLEDOI": "10.1155/2021/9923234"
  },
  "data": "Research Article\nPBDT: Python Backdoor Detection Model Based on\nCombined Features\nYong Fang, Mingyu Xie, and Cheng Huang\nSchool of Cyber Science and Engineering, Sichuan University, Chengdu, China\nCorrespondence should be addressed to Cheng Huang; opcodesec@gmail.com\nReceived 1 April 2021; Accepted 31 August 2021; Published 14 September 2021\nAcademic\nEditor: Shah Nazir\nCopyright \u00a9 2021 Yong Fang et al. 'is is an open access article distributed under the Creative Commons 

In [28]:
import chromadb

# NOTE(review): this rebinds `client`, which earlier in the notebook holds the
# QdrantClient. After this cell runs, re-running any of the Qdrant cells will
# operate on the Chroma client instead — consider a distinct name here and in
# the cell that calls `client.get_or_create_collection`.
# For an in-memory instance (data is not persisted)
client = chromadb.Client()

# For a persistent instance (data is saved to disk)
# client = chromadb.PersistentClient(path="path/to/your/chroma_db")

In [29]:
# Fetch the "my_documents" collection from the Chroma client, creating it if
# it does not exist yet (so this cell can be re-run safely).
collection_name = "my_documents"
collection = client.get_or_create_collection(name=collection_name)

In [30]:
# One embedding per parsed PDF, in the same order as `documents`; each vector
# is converted to a plain Python list of floats.
embeddings = list(
    map(lambda doc: encoder.encode(doc["data"]).tolist(), documents)
)

In [31]:
# Inspect the embedding of the first document (displays the full vector).
embeddings[0]

[-0.10231786966323853,
 0.03884272277355194,
 -0.009886211715638638,
 -0.02548011764883995,
 -0.00843384675681591,
 -0.060136012732982635,
 0.07843732833862305,
 0.029126381501555443,
 0.0380822978913784,
 -0.00318942335434258,
 0.012428926303982735,
 -0.0063712457194924355,
 0.1191607341170311,
 0.03740200400352478,
 0.014361702837049961,
 0.014668512158095837,
 -0.022604146972298622,
 0.044772665947675705,
 0.024940093979239464,
 -0.05348007380962372,
 -0.0474286749958992,
 -0.010265951976180077,
 -0.04467985779047012,
 0.041803862899541855,
 -0.06506053358316422,
 -0.041189778596162796,
 -0.06850238144397736,
 0.016066089272499084,
 -0.023639196529984474,
 -0.012441636994481087,
 -0.023678215220570564,
 0.07648281008005142,
 0.005355120170861483,
 0.018500324338674545,
 0.04129360243678093,
 0.05991631746292114,
 -0.028266189619898796,
 -0.035140667110681534,
 0.045688338577747345,
 -0.07624939829111099,
 0.016479967162013054,
 -0.060813456773757935,
 -0.054565638303756714,
 0.04367

In [32]:
# Split the parsed PDFs into the parallel lists that `collection.add` takes:
# one metadata entry, one text and one string id per document, aligned with
# the precomputed `embeddings`.
# The texts get their own name (`texts`) instead of rebinding `documents`:
# overwriting `documents` with a list of strings makes this cell fail on a
# second run, and breaks every earlier cell that indexes doc["data"].
metadatas = [doc['metadata'] for doc in documents]
texts = [doc['data'] for doc in documents]
ids = [str(idx) for idx in range(len(documents))]
collection.add(
    embeddings=embeddings,
    documents=texts,
    metadatas=metadatas,
    ids=ids,
)

In [33]:
# Search phrase and its embedding, produced by the same encoder that embedded
# the documents so that both live in the same vector space.
query = 'Levenshtein distance'
query_embedding = encoder.encode(query).tolist()

In [34]:
# Query the Chroma collection with the precomputed query embedding (passed as
# a one-element list, i.e. a single query).
results = collection.query(
    query_embeddings=[query_embedding],
    n_results=2,  # Number of most similar results to return
    include=["documents", "metadatas", "distances"]  # What information to include in the results
)

In [35]:
import json

# Pretty-print the raw query result as indented JSON. This includes the full
# text of every returned document, so the output can be very long.
print(json.dumps(results,indent=2))

{
  "ids": [
    [
      "2",
      "0"
    ]
  ],
  "embeddings": null,
  "documents": [
    [
      "Research Article\nPBDT: Python Backdoor Detection Model Based on\nCombined Features\nYong Fang, Mingyu Xie, and Cheng Huang\nSchool of Cyber Science and Engineering, Sichuan University, Chengdu, China\nCorrespondence should be addressed to Cheng Huang; opcodesec@gmail.com\nReceived 1 April 2021; Accepted 31 August 2021; Published 14 September 2021\nAcademic\nEditor: Shah Nazir\nCopyright \u00a9 2021 Yong Fang et al. 'is is an open access article distributed under the Creative Commons Attribution License,\nwhich permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.\nApplication security is essential in today\u2019s highly development period. Backdoor is a means by which attackers can invade the\nsystem to achieve illegal purposes and damage users\u2019 rights. It has posed a serious threat to network security. 'us, it is u