In [3]:
# Import Weaviate and Connect to Client
import weaviate

# client = weaviate.connect_to_local()  # Connect to local host
client = weaviate.connect_to_local()

In [10]:
client.collections.delete_all()

In [11]:
import weaviate.classes.config as wvcc

collection = client.collections.create(
    name="WeaviateRecipesChunk",
    vectorizer_config=wvcc.Configure.Vectorizer.text2vec_cohere
    (
        model="embed-multilingual-v3.0"
    ),
    properties=[
            wvcc.Property(name="content", data_type=wvcc.DataType.TEXT),
            wvcc.Property(name="filename", data_type=wvcc.DataType.TEXT),
      ]
)

In [12]:
import os
import re
from pydantic import BaseModel

class FileChunk(BaseModel):
    filename: str
    content: str

def chunk_list(lst, chunk_size):
    """Break a list into chunks of the specified size."""
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]

def split_into_sentences(text):
    """Split text into sentences using regular expressions."""
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    return [sentence.strip() for sentence in sentences if sentence.strip()]

def read_and_chunk_files(folder_path, chunk_size=5):
    file_chunks = []
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith((".py", ".md", ".ipynb")):
                file_path = os.path.join(root, file)
                with open(file_path, "r") as f:
                    content = f.read()
                    sentences = split_into_sentences(content)
                    sentence_chunks = chunk_list(sentences, chunk_size)
                    for chunk in sentence_chunks:
                        file_chunks.append(FileChunk(
                            filename=file,
                            content=' '.join(chunk)
                        ))
    return file_chunks

# Specify the folder paths
integrations_folder = "../../../integrations"
weaviate_features_folder = "../../../weaviate-features"

# Read and chunk files in the "integrations" folder
integrations_chunks = read_and_chunk_files(integrations_folder)

# Read and chunk files in the "weaviate-features" folder
weaviate_features_chunks = read_and_chunk_files(weaviate_features_folder)

# Concatenate the file chunks from both folders
all_chunks = integrations_chunks + weaviate_features_chunks

# Access the file chunks
for chunk in all_chunks:
    print(f"File: {chunk.filename}")
    print(f"Content: {chunk.content}")
    print("---")
    break

print(f"\nRead {len(all_chunks)} chunks!")

File: README.md
Content: ## Integrations 🤝
Learn about the various [Integrations](https://github.com/weaviate/recipes/tree/main/integrations) with Weaviate! A few demos we have are:
1. [DSPy](https://github.com/weaviate/recipes/tree/main/integrations/dspy) - Getting started with DSPy, Query to Blog Post demo, and more
2. [LlamaIndex](https://github.com/weaviate/recipes/tree/main/integrations/llamaindex) - Indexes, Query Engines, Advanced RAG, and more
3. [Nomic](https://github.com/weaviate/recipes/tree/main/integrations/nomic/vector-space-visualization) - Visualize your embeddings
4. [Ragas](https://github.com/weaviate/recipes/tree/main/integrations/ragas) - Evaluate your RAG application
---

Read 2553 chunks!


In [13]:
from weaviate.util import get_valid_uuid
from uuid import uuid4
import time

weaviate_recipes = client.collections.get("WeaviateRecipesChunk")


start = time.time()
for idx, file_chunk in enumerate(all_chunks):
    upload = weaviate_recipes.data.insert(
        properties={
            "filename": file_chunk.filename,
            "content": file_chunk.content
        }
    )

print(f"Uploaded {idx} file chunks to Weaviate in {time.time() - start} seconds!")

Uploaded 2552 file chunks to Weaviate in 420.52195405960083 seconds!


In [15]:
collection = client.collections.create(
    name="MemGPTMemory",
    vectorizer_config=wvcc.Configure.Vectorizer.text2vec_cohere(model="embed-english-v3.0"),
    properties=[
        wvcc.Property(name="memory", data_type=wvcc.DataType.TEXT)
    ]
)