In [6]:
import langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [44]:
# Read the scraped dump with an explicit encoding. The text contains non-ASCII
# characters (en dashes, zero-width spaces, math symbols), so relying on the
# platform's default locale encoding can fail or produce mojibake.
with open("scraped_content.txt", encoding="utf-8") as f:
    scraped_content = f.read()

# Individual pages in the dump are separated by form-feed characters.
pages = scraped_content.split("\f")
len(pages)

23

In [54]:
# Splitter settings: chunk length is measured with `len`, chunks are limited
# to 1000 with an overlap of 10 between neighbours, and separators are taken
# as literal strings rather than regular expressions.
splitter_options = {
    "chunk_size": 1000,
    "chunk_overlap": 10,
    "length_function": len,
    "is_separator_regex": False,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

In [55]:
def _page_body(page):
    """Return `page` without its scrape header lines.

    The page is stripped and split on newlines; lines that start with "URL:"
    or with a run of 80 "=" characters are dropped, and the remaining lines
    are joined back together with newlines.
    """
    banner = "=" * 80
    kept = []
    for line in page.strip().split("\n"):
        if line.startswith("URL:") or line.startswith(banner):
            continue
        kept.append(line)
    return "\n".join(kept)


# One inner list of chunk strings per page, in page order.
chunks = [text_splitter.split_text(_page_body(page)) for page in pages]

chunks


[['MeTTa\nA language for cognitive computations\nOpenCog Hyperon, towards AGI\nMultiparadigmality\nMeTTa programs organically combine elements of functional, logical and probabilistic programming providing a synergetic framework for representing declarative and procedural knowledge.\nAtomspace\nEach MeTTa program is represented as a subgraph of an Atomspace metagraph, and operates centrally by querying and rewriting portions of Atomspaces.\nSelf-modification\nMeTTa handles highly abstract constructs like run-time self-modifying code simply and naturally. Programs are fully self-reflective – we can read/modify the code inside the programs.\nGradual dependent types\nType system is one of the most important features in terms of application of MeTTa language. Built-in mathematical reasoning by supporting a state-of-the-art type system.\nNeural-symbolic integration\nMeTTa is capable to support neural-symbolic reasoning and handling uncertainties, using probabilistic logical reasoning.\nInfe

In [56]:
# Collapse the per-page chunk lists into one flat list, preserving order.
flattened_chunks = []
for page_chunks in chunks:
    flattened_chunks.extend(page_chunks)
flattened_chunks

['MeTTa\nA language for cognitive computations\nOpenCog Hyperon, towards AGI\nMultiparadigmality\nMeTTa programs organically combine elements of functional, logical and probabilistic programming providing a synergetic framework for representing declarative and procedural knowledge.\nAtomspace\nEach MeTTa program is represented as a subgraph of an Atomspace metagraph, and operates centrally by querying and rewriting portions of Atomspaces.\nSelf-modification\nMeTTa handles highly abstract constructs like run-time self-modifying code simply and naturally. Programs are fully self-reflective – we can read/modify the code inside the programs.\nGradual dependent types\nType system is one of the most important features in terms of application of MeTTa language. Built-in mathematical reasoning by supporting a state-of-the-art type system.\nNeural-symbolic integration\nMeTTa is capable to support neural-symbolic reasoning and handling uncertainties, using probabilistic logical reasoning.\nInfer

In [63]:
import chromadb
from chromadb.utils import embedding_functions

# Configuration for the vector store used in the rest of the notebook.
CHROMA_DATA_PATH = "chroma/"  # path handed to the Chroma client below
EMBED_MODEL = "llmrails/ember-v1"  # model name given to the embedding function
COLLECTION_NAME = "chunksn"  # name of the collection that holds the chunks

# Chroma client rooted at CHROMA_DATA_PATH (presumably persisted on disk
# there, going by the client class name — confirm against the Chroma docs).
client = chromadb.PersistentClient(path=CHROMA_DATA_PATH)

In [64]:
# Embedding function backed by the SentenceTransformer model named in
# EMBED_MODEL; the collection uses it to embed documents and query texts.
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBED_MODEL
)

# Use get_or_create_collection rather than create_collection: the client is
# persistent, so on any run after the first the collection already exists and
# create_collection would raise, breaking "Restart & Run All".
collection = client.get_or_create_collection(
    name=COLLECTION_NAME,
    embedding_function=embedding_func,
    # Compare embeddings by cosine distance.
    metadata={"hnsw:space": "cosine"},
)

In [65]:
documents = flattened_chunks

# Ids are positional ("id0", "id1", ...). Because the collection is persisted
# between runs, use upsert so that re-running this cell replaces whatever is
# already stored under an id instead of leaving stale text from an earlier
# run. On an empty collection this behaves the same as add.
collection.upsert(
    documents=documents,
    ids=[f"id{i}" for i in range(len(documents))],
)

In [66]:
# Ask the collection for the five closest chunks to a sample question,
# returning the chunk texts together with their distances.
query_text = "atom kinds and types in metta"
query_results = collection.query(
    query_texts=[query_text],
    n_results=5,
    include=["documents", "distances"],
)

In [67]:
# Display only the matched chunk texts; the result is a list of lists
# (the output below shows one inner list for the single query text).
query_results["documents"]

[['In an Atomspace, an Atom is a fundamental building block of all the data. In the context of graph representation, an Atom can be either a node or a link. In an Atomspace as metagraph, links can connect not only nodes, but other links, that is, they connect atoms, and they can connect any number of atoms (in contrast to ordinary graphs). In MeTTa as a programming language, atoms play the role of terms.\nIn the context of AI, Atoms can represent anything from objects, to concepts, to processes, functions or relationships. This enables the creation of rich, complex models of knowledge and reasoning.\nAtom kinds and types\n\u200b\nThere are 4 kinds of Atoms in MeTTa:\n    - Symbol , which represents some idea or concept. Two symbols having the same name are considered equal and representing the same concept. Names of symbols can be arbitrary strings. Nearly anything can be a symbol, e.g., A , f , known? , replace-me , ≱ , etc.',
  "Concrete types\n\u200b\nTypes of symbols\n\u200b\nAtoms