In [8]:
weaviate_api_key = "MtQ11Yo0rOO1ECdh3bLU7ilSHvYIsELI61SI", weaviate_url = "https://aiwriter-3ekmzu2w.weaviate.network"

In [56]:
import os
import json
import requests
import weaviate
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

class WeaviateClient:
    def __init__(self, weaviate_url=None, weaviate_api_key=None, openai_api_key=None):
        # Use environment variables if the values are not provided explicitly
        self.__weaviate_url = weaviate_url or os.getenv("WEAVIATE_URL")
        self.__weaviate_api_key = weaviate_api_key or os.getenv("WEAVIATE_API_KEY")
        self.__openai_api_key = openai_api_key or os.getenv("OPENAI_API_KEY")
        
        # Initialize Weaviate client
        self.client = weaviate.Client(
            url=self.__weaviate_url,
            auth_client_secret=weaviate.AuthApiKey(api_key=self.__weaviate_api_key),
            additional_headers={
                "X-OpenAI-Api-Key": self.__openai_api_key
            }
        )

    def create_collection(self, collection_name, embedding_model):
        
        if embedding_model == 'openai': #uses text-embedding-ada-002 by default
            vectorizer = "text2vec-openai"
            
        class_obj = {
            "class": collection_name,
            "vectorizer":vectorizer ,
            "moduleConfig": {
                
                vectorizer: {}, #class level module configs
            }
        }
        self.client.schema.create_class(class_obj)

        
    
    def embed(self, collection_name, data):
        self.client.batch.configure(batch_size=100)  # Configure batch
        with self.client.batch as batch:  # Initialize a batch process
            for i, d in enumerate(data):  # Batch import data
                print(f"Importing question: {i + 1}")
                
                properties = {
                    "text": d.page_content,
                    "file_path": d.metadata['file_path'],
                    "page" : d.metadata['page']
                }
                
                batch.add_data_object(
                    data_object=properties,
                    class_name= collection_name
                )
                
                
                
    def semantic_search(self, collection_name, query_text, K=5):

        response = (
            self.client.query
            .get(collection_name, ["text", "file_path", "page"])
            .with_near_text({"concepts": query_text })
            .with_limit(K)
            .do()
        )

        return response

    def hybrid_search(self, collection_name, query_text, K=5):
        
        response = (
            self.client.query
            .get(collection_name, ["text", "file_path", "page"])
            .with_hybrid(
                query= query_text
            )
            .with_limit(K)
            .do()
        )
            
        return response


# weaviate_instance = WeaviateClient()




In [57]:
weaviate_client = WeaviateClient(weaviate_api_key = "MtQ11Yo0rOO1ECdh3bLU7ilSHvYIsELI61SI", weaviate_url = "https://aiwriter-3ekmzu2w.weaviate.network")

In [11]:
weaviate_client.create_collection("sample_test", "openai")

In [55]:
weaviate_client.embed("sample_test", chunks)

Importing question: 1
Importing question: 2
Importing question: 3
Importing question: 4
Importing question: 5
Importing question: 6
Importing question: 7
Importing question: 8
Importing question: 9
Importing question: 10
Importing question: 11
Importing question: 12
Importing question: 13
Importing question: 14
Importing question: 15
Importing question: 16
Importing question: 17
Importing question: 18
Importing question: 19
Importing question: 20
Importing question: 21
Importing question: 22
Importing question: 23
Importing question: 24
Importing question: 25
Importing question: 26
Importing question: 27
Importing question: 28
Importing question: 29
Importing question: 30
Importing question: 31
Importing question: 32
Importing question: 33
Importing question: 34
Importing question: 35
Importing question: 36
Importing question: 37
Importing question: 38
Importing question: 39
Importing question: 40
Importing question: 41
Importing question: 42
Importing question: 43
Importing question: 

In [58]:
ans = weaviate_client.semantic_search('sample_test', 'investing in bonds')

In [59]:
ans

{'data': {'Get': {'Sample_test': [{'file_path': 'sec-guide-to-savings-and-investing.pdf',
     'page': 15,
     'text': 'in ten years, plus pay you interest twice a year at the rate of 8% a year.\nIf you buy the stock, you take on the risk of potentially los-\ning a portion or all of your initial investment if the company \ndoes poorly or the stock market drops in value. But you also \nmay see the stock increase in value beyond what you could \nearn from the bonds. If you buy the stock, you become an \n“owner” of the company.\nYou wrestle with the decision. If you buy the bonds, you'},
    {'file_path': 'sec-guide-to-savings-and-investing.pdf',
     'page': 15,
     'text': 'may see the stock increase in value beyond what you could \nearn from the bonds. If you buy the stock, you become an \n“owner” of the company.\nYou wrestle with the decision. If you buy the bonds, you \nwill get your money back plus the 8% interest a year. And you \nthink the company will be able to honor its promi

In [15]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("example_data/layout-parser-paper.pdf")
pages = loader.load_and_split()

ModuleNotFoundError: No module named 'langchain_community'

In [16]:
from langchain.document_loaders import PyMuPDFLoader

In [17]:
loader = PyMuPDFLoader("sec-guide-to-savings-and-investing.pdf")

In [18]:
doc = loader.load()

In [20]:
len(doc)

32

[Document(page_content='A ROADMAP TO YOUR JOURNEY TO FINANCIAL SECURITY  |  1\nSaving and Investing\nA Roadmap To Your Financial Security \nThrough Saving and Investing\n', metadata={'source': 'sec-guide-to-savings-and-investing.pdf', 'file_path': 'sec-guide-to-savings-and-investing.pdf', 'page': 0, 'total_pages': 32, 'format': 'PDF 1.7', 'title': 'SEC Saving and Investing ', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Adobe InDesign CC 2017 (Macintosh)', 'producer': 'Adobe PDF Library 15.0', 'creationDate': "D:20170703135544-04'00'", 'modDate': "D:20170719104906-04'00'", 'trapped': ''}),
 Document(page_content='2  |  SAVING AND INVESTING\n', metadata={'source': 'sec-guide-to-savings-and-investing.pdf', 'file_path': 'sec-guide-to-savings-and-investing.pdf', 'page': 1, 'total_pages': 32, 'format': 'PDF 1.7', 'title': 'SEC Saving and Investing ', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Adobe InDesign CC 2017 (Macintosh)', 'producer': 'Adobe PDF Library 15.0'

In [None]:
splitter = RecursiveCharacterTextSplitter(chunk_size = self.size, chunk_overlap = self.chunk_overlap)
text = splitter.create_documents([doc], metadatas = [{'file_path' : file_path, 'page' : page}])

return text

In [24]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [47]:

chunks = []
for each_page in doc:
    splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 200)
    text = splitter.create_documents([each_page.page_content], metadatas = [{'file_path' : each_page.metadata['file_path'], 'page' : each_page.metadata['page']}])
    chunks += text

In [48]:
len(chunks)

144

In [50]:
chunks

[Document(page_content='A ROADMAP TO YOUR JOURNEY TO FINANCIAL SECURITY  |  1\nSaving and Investing\nA Roadmap To Your Financial Security \nThrough Saving and Investing', metadata={'file_path': 'sec-guide-to-savings-and-investing.pdf', 'page': 0}),
 Document(page_content='2  |  SAVING AND INVESTING', metadata={'file_path': 'sec-guide-to-savings-and-investing.pdf', 'page': 1}),
 Document(page_content='A ROADMAP TO YOUR JOURNEY TO FINANCIAL SECURITY  |  1\nDear Reader\nWhile money doesn’t grow on trees, it can grow when \nyou save and invest wisely.\nKnowing how to secure your financial well-being is one \nof the most important things you’ll ever need in life. You \ndon’t have to be a genius to do it. You just need to know \na few basics, form a plan, and be ready to stick to it. No \nmatter how much or little money you have, the important \nthing is to educate yourself about your opportunities. In', metadata={'file_path': 'sec-guide-to-savings-and-investing.pdf', 'page': 2}),
 Document(