In [1]:
# !pip install langchain
# !pip install weaviate-client
# !pip install openai
# !pip install unstructured
# !pip install "unstructured[pdf]"
# !pip install PyMuPDF
# !pip install sentence-transformers

In [4]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.chroma import Chroma
from langchain.vectorstores import Weaviate
# Local sentence-embedding model; the same checkpoint name is configured for
# the Weaviate vectorizer in the PDFDocument schema.
EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [5]:
import weaviate 
import os
from dotenv import load_dotenv, find_dotenv
# Load variables from the nearest .env file; override=True lets values from
# the file replace ones already present in the process environment.
_ = load_dotenv(find_dotenv(), override=True)

# API keys forwarded to Weaviate's vectorizer modules.
# The PDFDocument class is vectorized by text2vec-huggingface, so a Hugging
# Face key is forwarded as well; with only the OpenAI header, inference calls
# presumably fall back to the anonymous Hugging Face quota (the 429 "free
# usage limit" errors seen during import are consistent with that).
# NOTE(review): HUGGINGFACE_API_KEY is an assumed variable name -- align it
# with the name used in your .env file.
_api_key_headers = {
    "X-OpenAI-Api-Key": os.getenv("OPEN_AI_API_KEY"),
    "X-HuggingFace-Api-Key": os.getenv("HUGGINGFACE_API_KEY"),
}

client = weaviate.Client(
    "http://localhost:8081",
    # Drop keys that are not configured so no header carries a None value.
    additional_headers={
        header: value for header, value in _api_key_headers.items() if value
    },
)

In [6]:
from weaviate.gql.get import HybridFusion

In [12]:
# Clear the schema so the class can be recreated from scratch.
# WARNING: delete_all() removes every class on this Weaviate instance, not
# only PDFDocument.
client.schema.delete_all()

# Schema for PDF chunks: one object per text chunk, vectorized server-side by
# the text2vec-huggingface module.
pdf_schema = {
    "class": "PDFDocument",
    "description": "A collection of PDF documents",
    "vectorizer": "text2vec-huggingface",
    "moduleConfig": {
        "text2vec-huggingface": {
            "model": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
            "type": "text"
        }
    },
    "properties": [
        {
            "name": "source",
            "description": "File path to the PDF document",
            # The deprecated "string" type is stored by the server as "text"
            # with whitespace tokenization (see the schema echoed back for
            # this property), so declare exactly that.
            "dataType": ["text"],
            "tokenization": "whitespace"
        },
        {
            "name": "content",
            "description": "Contents of the PDF document",
            "dataType": ["text"]
        },
        {
            "name": "page",
            "description": "Page number of PDF document",
            # Same replacement for the deprecated "string" type as `source`.
            "dataType": ["text"],
            "tokenization": "whitespace"
        }
    ]
}

# Add the PDFDocument schema
client.schema.create_class(pdf_schema)

# Get the schema to verify that it worked
client.schema.get()


{'classes': [{'class': 'PDFDocument',
   'description': 'A collection of PDF documents',
   'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
    'cleanupIntervalSeconds': 60,
    'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},
   'moduleConfig': {'text2vec-huggingface': {'model': 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
     'type': 'text',
     'vectorizeClassName': True}},
   'multiTenancyConfig': {'enabled': False},
   'properties': [{'dataType': ['text'],
     'description': 'File path to the PDF document',
     'indexFilterable': True,
     'indexSearchable': True,
     'moduleConfig': {'text2vec-huggingface': {'skip': False,
       'vectorizePropertyName': False}},
     'name': 'source',
     'tokenization': 'whitespace'},
    {'dataType': ['text'],
     'description': 'Contents of the PDF document',
     'indexFilterable': True,
     'indexSearchable': True,
     'moduleConfig': {'text2vec-huggingface': {'skip': False,
       've

In [13]:
# Hybrid search against the PDFDocument class (use the class name defined in
# your own schema), requesting each hit's fields plus its score.
pdf_fields = ["content", "source", "page"]
hybrid_query = (
    client.query
    .get(class_name="PDFDocument", properties=pdf_fields)
    .with_additional(properties=["score"])
    .with_autocut(2)
)

# Swap "INT3170" for your own search text; `content` is passed as the only
# hybrid search property.
query_results = hybrid_query.with_hybrid(
    query="INT3170",
    fusion_type=HybridFusion.RELATIVE_SCORE,
    properties=["content"],
).do()

In [14]:
# Show the raw response of the hybrid query; at this point in the notebook no
# PDF has been imported yet, so the PDFDocument list comes back empty.
query_results


{'data': {'Get': {'PDFDocument': []}}}

In [15]:
from langchain.docstore.document import Document
import fitz
from langchain.text_splitter import RecursiveCharacterTextSplitter
import time

def split_document(docs, chunk_size=1000, chunk_overlap=20):
    """Split one text string into LangChain documents.

    Args:
        docs: The text to split. It is wrapped in a one-element list before
            being handed to the splitter.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        The documents produced by ``create_documents`` for the given text.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.create_documents([docs])

def insert_pdf_to_db(client, file_path):
    """Chunk a PDF page by page and send every chunk to the PDFDocument class.

    Args:
        client: Weaviate client whose ``batch`` interface receives the objects.
        file_path: Path of the PDF to open with PyMuPDF (``fitz``).

    Returns:
        list[dict]: One dict per chunk with the keys ``content``, ``source``
        and ``page`` (``source`` and ``page`` are strings). Every prepared
        chunk is returned, including any that could not be queued.
    """
    chunks = []

    # Open the PDF as a context manager so the document is closed even if
    # text extraction fails part-way through.
    with fitz.open(file_path) as pages:
        source = str(pages.name)
        for page in pages:
            # Flatten line breaks and lower-case the page text before splitting.
            page_text = page.get_text().replace('\n', ' ').lower()
            # split_document returns a list of LangChain Documents.
            for doc in split_document(page_text):
                chunks.append({
                    'content': doc.page_content,
                    'source': source,
                    'page': str(page.number),  # page index as reported by PyMuPDF
                })

    # Queue the chunks on the client's batch.
    # NOTE(review): the 429 errors captured when this notebook ran were printed
    # in a different format than either message below, so they presumably come
    # from the batch result callback rather than from an exception raised
    # here -- confirm whether this handler is ever reached for rate limits.
    for chunk in chunks:
        try:
            client.batch.add_data_object(chunk, class_name="PDFDocument")
        except Exception as e:
            error_message = str(e)
            if "rate_limit_exceeded" in error_message:
                # Back off once, then retry; a second failure is reported
                # instead of aborting the whole import.
                print("Rate limit exceeded. Waiting and retrying...")
                time.sleep(60)  # Wait for 60 seconds
                try:
                    client.batch.add_data_object(chunk, class_name="PDFDocument")
                except Exception as retry_error:
                    print(f"Error adding chunk: {chunk}. Error: {retry_error}")
            else:
                print(f"Error adding chunk: {chunk}. Error: {error_message}")

    # Send whatever is still buffered: automatic batching only submits full
    # batches, so without this the trailing objects would never be written.
    client.batch.flush()

    return chunks

In [16]:
# Batch import settings, applied to the shared client before any PDF is loaded.
batch_settings = {
    "batch_size": 10,
    "dynamic": True,
    "timeout_retries": 3,
    # "callback": None,
}
client.batch.configure(**batch_settings)

<weaviate.batch.crud_batch.Batch at 0x215e4fa3f90>

In [17]:
# PDFs to ingest; the chunk dicts returned for each file are gathered in
# all_chunks.
sample_pdf_path = ["storage/LNCT800SoftwareApplicationManual.pdf"]
all_chunks, all_docs = [], []

for path in sample_pdf_path:
    chunk = insert_pdf_to_db(client, path)
    if chunk is None:
        continue
    all_chunks += chunk

{'error': [{'message': 'update vector: failed with status: 429 error: Rate limit reached. You reached free usage limit (reset hourly). Please subscribe to a plan at https://huggingface.co/pricing to use the API at this rate'}]}
{'error': [{'message': 'update vector: failed with status: 429 error: Rate limit reached. You reached free usage limit (reset hourly). Please subscribe to a plan at https://huggingface.co/pricing to use the API at this rate'}]}
{'error': [{'message': 'update vector: failed with status: 429 error: Rate limit reached. You reached free usage limit (reset hourly). Please subscribe to a plan at https://huggingface.co/pricing to use the API at this rate'}]}
{'error': [{'message': 'update vector: failed with status: 429 error: Rate limit reached. You reached free usage limit (reset hourly). Please subscribe to a plan at https://huggingface.co/pricing to use the API at this rate'}]}
{'error': [{'message': 'update vector: failed with status: 429 error: Rate limit reached

In [18]:
# Ask Weaviate for the number of objects stored in the PDFDocument class.
count_query = client.query.aggregate("PDFDocument").with_fields("meta { count }")
result = count_query.do()

pdf_aggregate = result["data"]["Aggregate"]["PDFDocument"]
print("Object count: ", pdf_aggregate, "\n")

Object count:  [{'meta': {'count': 299}}] 



In [37]:
# Hybrid search for "INT3170" with alpha=0.5, limited to 10 results.
hybrid_response = (
    client.query
    .get("PDFDocument", ["content", "source", "page"])
    .with_hybrid(query="INT3170", alpha=0.5)
    .with_limit(10)
    .do()
)
test_article = hybrid_response["data"]["Get"]["PDFDocument"]

print(test_article)


[{'content': 'lnc-t800  system parameters    62  lnc technology co., ltd.  no.  type  description  active  level  page 9310  hardware [s2] 1st spd signal  format(0:a/b,1:cw/ccw,2:p/d,3:da10v,4:da+-10v)  ☉  maker  118 9311  hardware [s2] 2nd spd signal  format(0:a/b,1:cw/ccw,2:p/d,3:da10v,4:da+-10v)  ☉  maker  118 9312  hardware [s2] 3rd spd signal  format(0:a/b,1:cw/ccw,2:p/d,3:da10v,4:da+-10v)  ☉  maker  118 9320  hardware [s2] 1st spindle pulse command inverse(0:no,1:yes)  ☉  maker  119 9321  hardware [s2] 2nd spindle pulse command inverse(0:no,1:yes)  ☉  maker  119 9322  hardware [s2] 3rd spindle pulse command inverse(0:no,1:yes)  ☉  maker  119 9330  hardware [s2] 1st spindle pulse output width(0~25000ns)  ☉  maker  119 9331  hardware [s2] 2nd spindle pulse output width(0~25000ns)  ☉  maker  119 9332  hardware [s2] 3rd spindle pulse output width(0~25000ns)  ☉  maker  119 9340  hardware [s2] 1st spindle position loop gain(1/s)  ☉  maker  120 9341  hardware [s2] 2nd spindle position l

In [26]:
import json

def hybrid_query_weaviate(query, collection_name, alpha_val):
    """Run a hybrid search on a Weaviate class and return the matching objects.

    Args:
        query: Search text.
        collection_name: Name of the Weaviate class to query.
        alpha_val: Passed to ``with_hybrid`` as ``alpha``.

    Returns:
        The list found under ``data -> Get -> <collection_name>`` in the
        response; the query is limited to 10 results. Each object carries
        ``source``, ``content``, ``page`` and ``_additional.score``.
    """
    properties = [
        "source", "content", "page",
        "_additional { score }"
    ]

    # NOTE(review): json.dumps turns the query into a JSON string literal, so
    # the text handed to with_hybrid includes surrounding double quotes. The
    # stand-alone cells in this notebook pass the raw string instead --
    # confirm which form is intended.
    result = (
        client.query
        .get(collection_name, properties)
        .with_hybrid(json.dumps(query), fusion_type=HybridFusion.RELATIVE_SCORE, alpha=alpha_val)
        .with_limit(10)
        .do()
    )

    return result["data"]["Get"][collection_name]

def kw_query_weaviate(query, collection_name, alpha_val):
    """Run a BM25 keyword search on a Weaviate class and return the matches.

    Args:
        query: Search text.
        collection_name: Name of the Weaviate class to query.
        alpha_val: Unused by this function; it mirrors the parameter list of
            ``hybrid_query_weaviate``.

    Returns:
        The list found under ``data -> Get -> <collection_name>`` in the
        response; the query is limited to 10 results. Each object carries
        ``source``, ``content``, ``page`` and ``_additional.score``.
    """
    properties = [
        "source", "content", "page",
        "_additional { score }"
    ]

    # NOTE(review): json.dumps turns the query into a JSON string literal, so
    # the text handed to with_bm25 includes surrounding double quotes. The
    # stand-alone cells in this notebook pass the raw string instead --
    # confirm which form is intended.
    result = (
        client.query
        .get(collection_name, properties)
        .with_bm25(json.dumps(query))
        .with_limit(10)
        .do()
    )

    # NOTE(review): the response is not checked for an "errors" entry; a
    # response without "data" fails on the lookup below.
    return result["data"]["Get"][collection_name]

def vector_query_weaviate(query, collection_name, alpha_val):
    """Run a nearText vector search on a Weaviate class and return the matches.

    Args:
        query: Text used as the single nearText concept.
        collection_name: Name of the Weaviate class to query.
        alpha_val: Unused by this function; it mirrors the parameter list of
            ``hybrid_query_weaviate``.

    Returns:
        The list found under ``data -> Get -> <collection_name>`` in the
        response; the query is limited to 10 results.
    """
    properties = [
        "source", "content", "page",
        "_additional { score }"
    ]

    # Only the concept is sent: no distance threshold is part of the filter.
    # NOTE(review): an earlier, never-used dict in this function carried
    # "distance": 0.7 -- confirm whether a cutoff was intended.
    result = (
        client.query
        .get(collection_name, properties)
        .with_near_text({"concepts": [query]})
        .with_limit(10)
        .do()
    )

    return result["data"]["Get"][collection_name]

In [64]:
# Hybrid search for "INT3170" using relative-score fusion, limited to 10
# results with autocut set to 2.
hybrid_request = client.query.get("PDFDocument", ["content", "source", "page"])
hybrid_request = hybrid_request.with_hybrid(
    query="INT3170",
    alpha=0.5,
    fusion_type=HybridFusion.RELATIVE_SCORE,
)
hybrid_request = hybrid_request.with_limit(10).with_autocut(2)

test_article = hybrid_request.do()["data"]["Get"]["PDFDocument"]

print(test_article)


[{'content': 'lnc-t800  system parameters    62  lnc technology co., ltd.  no.  type  description  active  level  page 9310  hardware [s2] 1st spd signal  format(0:a/b,1:cw/ccw,2:p/d,3:da10v,4:da+-10v)  ☉  maker  118 9311  hardware [s2] 2nd spd signal  format(0:a/b,1:cw/ccw,2:p/d,3:da10v,4:da+-10v)  ☉  maker  118 9312  hardware [s2] 3rd spd signal  format(0:a/b,1:cw/ccw,2:p/d,3:da10v,4:da+-10v)  ☉  maker  118 9320  hardware [s2] 1st spindle pulse command inverse(0:no,1:yes)  ☉  maker  119 9321  hardware [s2] 2nd spindle pulse command inverse(0:no,1:yes)  ☉  maker  119 9322  hardware [s2] 3rd spindle pulse command inverse(0:no,1:yes)  ☉  maker  119 9330  hardware [s2] 1st spindle pulse output width(0~25000ns)  ☉  maker  119 9331  hardware [s2] 2nd spindle pulse output width(0~25000ns)  ☉  maker  119 9332  hardware [s2] 3rd spindle pulse output width(0~25000ns)  ☉  maker  119 9340  hardware [s2] 1st spindle position loop gain(1/s)  ☉  maker  120 9341  hardware [s2] 2nd spindle position l

In [75]:
# Same hybrid search as a single chain; the full response dict is printed
# rather than only the PDFDocument list.
query_results = (
    client.query
    .get("PDFDocument", ["content", "source", "page"])
    .with_limit(10)
    .with_autocut(2)
    .with_hybrid(
        query="INT3170",
        alpha=0.5,
        fusion_type=HybridFusion.RELATIVE_SCORE,
    )
    .do()
)

print(query_results)


{'data': {'Get': {'PDFDocument': [{'content': 'lnc-t800  system parameters    62  lnc technology co., ltd.  no.  type  description  active  level  page 9310  hardware [s2] 1st spd signal  format(0:a/b,1:cw/ccw,2:p/d,3:da10v,4:da+-10v)  ☉  maker  118 9311  hardware [s2] 2nd spd signal  format(0:a/b,1:cw/ccw,2:p/d,3:da10v,4:da+-10v)  ☉  maker  118 9312  hardware [s2] 3rd spd signal  format(0:a/b,1:cw/ccw,2:p/d,3:da10v,4:da+-10v)  ☉  maker  118 9320  hardware [s2] 1st spindle pulse command inverse(0:no,1:yes)  ☉  maker  119 9321  hardware [s2] 2nd spindle pulse command inverse(0:no,1:yes)  ☉  maker  119 9322  hardware [s2] 3rd spindle pulse command inverse(0:no,1:yes)  ☉  maker  119 9330  hardware [s2] 1st spindle pulse output width(0~25000ns)  ☉  maker  119 9331  hardware [s2] 2nd spindle pulse output width(0~25000ns)  ☉  maker  119 9332  hardware [s2] 3rd spindle pulse output width(0~25000ns)  ☉  maker  119 9340  hardware [s2] 1st spindle position loop gain(1/s)  ☉  maker  120 9341  har

In [76]:
# Keyword-only (BM25) search for "INT3170", limited to 10 results.
bm25_request = client.query.get("PDFDocument", ["content", "source", "page"])
bm25_response = bm25_request.with_bm25("INT3170").with_limit(10).do()
test_article = bm25_response["data"]["Get"]["PDFDocument"]

print(test_article)

[]


In [79]:
# Vector-only (nearText) search with "INT3170" as the single concept, limited
# to 10 results.
near_text_filter = {"concepts": ["INT3170"]}
near_text_request = client.query.get("PDFDocument", ["content", "source", "page"])
near_text_response = near_text_request.with_near_text(near_text_filter).with_limit(10).do()
test_article = near_text_response["data"]["Get"]["PDFDocument"]

print(test_article)


[{'content': 'lnc-t800  system parameters    62  lnc technology co., ltd.  no.  type  description  active  level  page 9310  hardware [s2] 1st spd signal  format(0:a/b,1:cw/ccw,2:p/d,3:da10v,4:da+-10v)  ☉  maker  118 9311  hardware [s2] 2nd spd signal  format(0:a/b,1:cw/ccw,2:p/d,3:da10v,4:da+-10v)  ☉  maker  118 9312  hardware [s2] 3rd spd signal  format(0:a/b,1:cw/ccw,2:p/d,3:da10v,4:da+-10v)  ☉  maker  118 9320  hardware [s2] 1st spindle pulse command inverse(0:no,1:yes)  ☉  maker  119 9321  hardware [s2] 2nd spindle pulse command inverse(0:no,1:yes)  ☉  maker  119 9322  hardware [s2] 3rd spindle pulse command inverse(0:no,1:yes)  ☉  maker  119 9330  hardware [s2] 1st spindle pulse output width(0~25000ns)  ☉  maker  119 9331  hardware [s2] 2nd spindle pulse output width(0~25000ns)  ☉  maker  119 9332  hardware [s2] 3rd spindle pulse output width(0~25000ns)  ☉  maker  119 9340  hardware [s2] 1st spindle position loop gain(1/s)  ☉  maker  120 9341  hardware [s2] 2nd spindle position l