In [None]:
!pip install -qU langchain-community pypdf

## Data Reading

In [None]:
!mkdir data

In [None]:
# from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader

# # 1. Define the path to your folder
# path = "/content/data"

# # 2. Setup the DirectoryLoader
# # glob="./*.pdf" ensures we only grab PDF files
# # loader_cls=PyPDFLoader tells LangChain to use the PDF parser for each file found
# loader = DirectoryLoader(
#     path,
#     glob="./*.pdf",
#     loader_cls=PyPDFLoader
# )

# # 3. Load the documents
# # documents = loader.load()

# documents = loader.lazy_load()

# documents

In [None]:
# documents[106]

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# PyPDFLoader `mode` decides how the PDF maps onto Document objects:
#   "single" -> 1 Document for the whole file (e.g. 1 object for 107 pages)
#   "page"   -> 1 Document per page           (e.g. 107 objects for 107 pages)
file_path = "/content/data/yolov7paper.pdf"
loader = PyPDFLoader(file_path, mode="page")
data = loader.load()

# Number of Document objects that were loaded.
print(len(data))

In [None]:
# Preview only the first pages; displaying the full list floods the cell output.
data[:2]

## Text Splitting

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded pages into chunks of at most 1000 characters, with 20 characters
# of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
docs = text_splitter.split_documents(data)

# Preview a couple of chunks instead of dumping every chunk into the output.
docs[:2]

In [None]:
# Inspect the first chunk returned by the text splitter.
docs[0]

In [None]:
# Total number of chunks produced by the split.
len(docs)

In [None]:
!pip install langchain langchain-openai

In [None]:
!pip install -U weaviate-client[agents]

In [None]:
from google.colab import userdata

# Read the three credentials from the Colab secrets store, in a fixed order.
weaviate_url, weaviate_api_key, OPENAI_API_KEY = (
    userdata.get(secret_name)
    for secret_name in ("WEAVIATE_URL", "WEAVIATE_API_KEY", "OPENAI_API_KEY")
)

In [None]:
# print(weaviate_url)
# print(weaviate_api_key)

In [None]:
# import weaviate
# from weaviate.classes.init import Auth
# # import os

# # Recommended: save sensitive data as environment variables
# # openai_key = os.getenv("OPENAI_APIKEY")

# headers = {
#     "X-OpenAI-Api-Key": OPENAI_API_KEY,
# }

# client = weaviate.connect_to_weaviate_cloud(
#     cluster_url=weaviate_url,                       # `weaviate_url`: your Weaviate URL
#     auth_credentials=Auth.api_key(weaviate_api_key),      # `weaviate_key`: your Weaviate API key
#     headers=headers
# )

# # Work with Weaviate
# if client.is_ready():
#     print("Weaviate is ready to accept requests.")
# else:
#     print("Weaviate is not ready yet.")

# client.close()

In [None]:
# Look at the first chunk again (content and metadata) before building the pairs below.
docs[0]

In [None]:
# First two chunks of the split document.
docs[:2]

In [None]:
# Build one (page_content, metadata) tuple per chunk, ready for loading into the vectorstore.
text_meta_pair = list(map(lambda chunk: (chunk.page_content, chunk.metadata), docs))

# Show the first pair.
text_meta_pair[0]

In [None]:
# zip(*pairs) transposes the pairs into one tuple of texts and one tuple of metadata;
# show only the first two entries of each instead of the whole dataset.
[column[:2] for column in zip(*text_meta_pair)]

In [None]:
# Transpose the (text, metadata) pairs into two parallel tuples.
# With no chunks zip(*[]) yields nothing and unpacking would raise ValueError,
# so fall back to two empty tuples in that case.
texts, meta = zip(*text_meta_pair) if text_meta_pair else ((), ())

In [None]:
# Container types of the two unpacked sequences.
type(texts), type(meta)

In [None]:
# Metadata of the first two chunks.
meta[:2]

In [None]:
# Text of the first two chunks.
texts[:2]

In [None]:
# Both sequences come from the same pairs, so the two lengths should match.
len(texts), len(meta)

In [None]:
# Transpose the (text, metadata) pairs into two parallel tuples.
# With no chunks zip(*[]) yields nothing and unpacking would raise ValueError,
# so fall back to two empty tuples in that case.
texts, meta = zip(*text_meta_pair) if text_meta_pair else ((), ())
# vectorstore.add_texts(texts, meta)

In [None]:
# import weaviate
# from weaviate.classes.init import Auth
# import os

# # Recommended: save sensitive data as environment variables
# # openai_key = os.getenv("OPENAI_APIKEY")

# headers = {
#     "X-OpenAI-Api-Key": OPENAI_API_KEY,
# }

# client = weaviate.connect_to_weaviate_cloud(
#     cluster_url=weaviate_url,                       # `weaviate_url`: your Weaviate URL
#     auth_credentials=Auth.api_key(weaviate_api_key),      # `weaviate_key`: your Weaviate API key
#     headers=headers
# )

# # Work with Weaviate
# if client.is_ready():
#     print("Weaviate is ready to accept requests.")
# else:
#     print("Weaviate is not ready yet.")

# client.close()

## Vector Database Storage

In [None]:
import weaviate
# import weaviate.classes as wvc
# from weaviate.classes.init import Auth
from weaviate.classes.config import Configure, Property, DataType
from weaviate.util import generate_uuid5
import os

COLLECTION_NAME = "YoloPaper"

# 1. Connect to Weaviate Cloud. The OpenAI key is sent as a request header so the
#    text2vec-openai vectorizer configured below can create the embeddings.
headers = {
    "X-OpenAI-Api-Key": OPENAI_API_KEY,
}

# The context manager closes the connection even when a step below raises,
# so a failed run does not leave an open client behind.
with weaviate.connect_to_weaviate_cloud(
    cluster_url=weaviate_url,                       # `weaviate_url`: your Weaviate URL
    auth_credentials=weaviate_api_key,              # `weaviate_api_key`: your Weaviate API key
    headers=headers,
) as client:

    if client.is_ready():
        print("Weaviate is ready to accept requests.")
    else:
        print("Weaviate is not ready yet.")

    # 2. Define the collection with the OpenAI vectorizer.
    #    Only create it when it is missing: creating an existing collection fails,
    #    which would break every re-run of this cell.
    # NOTE(review): recent weaviate-client releases appear to deprecate `vectorizer_config`
    # in favour of `vector_config` - confirm against the installed version before migrating.
    if not client.collections.exists(COLLECTION_NAME):
        client.collections.create(
            name=COLLECTION_NAME,
            vectorizer_config=Configure.Vectorizer.text2vec_openai(
                model="text-embedding-3-small",
                vectorize_collection_name=False
            ),
            properties=[
                Property(name="content", data_type=DataType.TEXT),
                Property(name="source", data_type=DataType.TEXT),
                Property(name="page", data_type=DataType.INT),
            ]
        )

    # Get the collection reference
    paper_collection = client.collections.get(COLLECTION_NAME)

    # 3. Import the parallel `texts` / `meta` sequences with a dynamic batch.
    with paper_collection.batch.dynamic() as batch:
        for chunk_index, (text_body, metadata) in enumerate(zip(texts, meta)):

            # Prepare the object properties
            properties = {
                "content": text_body,
                "source": metadata.get("source"),
                "page": int(metadata.get("page", 0))
            }

            # A deterministic UUID (derived from the chunk position and its properties)
            # means a re-run writes the same objects again instead of adding duplicates.
            object_uuid = generate_uuid5({"chunk_index": chunk_index, **properties})

            # Add to the batch (Weaviate & OpenAI handle the embedding automatically)
            batch.add_object(properties=properties, uuid=object_uuid)

    # Batch errors are collected rather than raised, so check them before claiming success.
    failed_objects = paper_collection.batch.failed_objects
    if failed_objects:
        print(f"Import finished with {len(failed_objects)} failed objects.")
        print(f"First failure: {failed_objects[0]}")
    else:
        print("Import Complete!")

## Semantic (vector) search
```
Semantic search finds results based on meaning rather than exact keywords. In Weaviate this is called nearText. The following example searches for the 2 objects (limit) whose meaning is most similar to the query text.
```

In [None]:
import weaviate
import os, json

# Best practice: store your credentials in environment variables
# weaviate_url = os.environ["WEAVIATE_URL"]
# weaviate_api_key = os.environ["WEAVIATE_API_KEY"]
from google.colab import userdata
weaviate_url = userdata.get("WEAVIATE_URL")
weaviate_api_key = userdata.get("WEAVIATE_API_KEY")
# Fetch the OpenAI key here as well, so this cell does not depend on a variable
# left behind by an earlier cell.
OPENAI_API_KEY = userdata.get("OPENAI_API_KEY")

# Step 1: Connect to your Weaviate Cloud instance.
# The context manager closes the connection when the block ends.
with weaviate.connect_to_weaviate_cloud(
    cluster_url=weaviate_url,
    auth_credentials=weaviate_api_key,
    headers={
        "X-OpenAI-Api-Key": OPENAI_API_KEY,
    },
) as client:

    # Step 2: Use this collection
    paper = client.collections.use("YoloPaper")

    # Step 3: Perform a semantic search with NearText
    response = paper.query.near_text(
        query="what is yolo",
        limit=2
    )

    print(response)
    print("*"*20)
    for obj in response.objects:
        # default=str keeps the dump from failing on property values that are not
        # natively JSON-serializable.
        print(json.dumps(obj.properties, indent=2, default=str))  # Inspect the results