# Prepare Knowledge Base Documents

# Load Documents

In [1]:
#!pip install langchain==0.1.0

In [2]:
from langchain_community.document_loaders import TextLoader
# Read the text file into LangChain Document objects; for this file the
# result is a one-element list holding the whole file text.
sentence_loader = TextLoader("sentences_doc.txt")
sentence_documents = sentence_loader.load()
# Display the loaded documents as the cell output.
sentence_documents

[Document(page_content='The baby cried for milk.\n\nThe car drove away.\n\nDog lives in kennel.\n\nThe baby laughed.\n\nThe kid was playing.', metadata={'source': 'sentences_doc.txt'})]

In [3]:
from langchain.text_splitter import CharacterTextSplitter

# Sentences in the file are separated by blank lines and each one is shorter
# than 30 characters, so chunk_size=30 yields one chunk per sentence.
# chunk_overlap is 0: the chunks are independent sentences, and an overlap
# equal to chunk_size (the previous value) is degenerate.
text_splitter = CharacterTextSplitter(chunk_size=30, chunk_overlap=0)
docs_after_splitting = text_splitter.split_documents(sentence_documents)

# Display the resulting chunks as the cell output.
docs_after_splitting

[Document(page_content='The baby cried for milk.', metadata={'source': 'sentences_doc.txt'}),
 Document(page_content='The car drove away.', metadata={'source': 'sentences_doc.txt'}),
 Document(page_content='Dog lives in kennel.', metadata={'source': 'sentences_doc.txt'}),
 Document(page_content='The baby laughed.', metadata={'source': 'sentences_doc.txt'}),
 Document(page_content='The kid was playing.', metadata={'source': 'sentences_doc.txt'})]

# Initiate Embedding Model

In [4]:
# all-MiniLM-L6-v2 is a very fast sentence-embedding model

In [None]:
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)

# Embedding model shared by every vector store in this notebook.
embedder = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [6]:
# Embed a sample string to inspect what the model returns (a list of floats).
embedding_test = embedder.embed_query('hello')
embedding_test

[-0.06277172267436981,
 0.05495871976017952,
 0.05216478183865547,
 0.08578995615243912,
 -0.08274895697832108,
 -0.07457294315099716,
 0.06855467706918716,
 0.01839648187160492,
 -0.08201128244400024,
 -0.03738482668995857,
 0.01212485134601593,
 0.00351826217956841,
 -0.004134283866733313,
 -0.04378452152013779,
 0.02180730365216732,
 -0.005102727096527815,
 0.019546622410416603,
 -0.04234866797924042,
 -0.11035970598459244,
 0.0054245200008153915,
 -0.05573476478457451,
 0.02805246412754059,
 -0.023158758878707886,
 0.028481444343924522,
 -0.053709667176008224,
 -0.052601609379053116,
 0.033939335495233536,
 0.04538872465491295,
 0.023718371987342834,
 -0.07312075048685074,
 0.054777730256319046,
 0.017047351226210594,
 0.0813603475689888,
 -0.0028626741841435432,
 0.011958112940192223,
 0.07355856150388718,
 -0.09423747658729553,
 -0.0813620463013649,
 0.04001549631357193,
 0.0006922045140527189,
 -0.013393291272222996,
 -0.054538048803806305,
 0.005151511635631323,
 -0.02613976225

In [7]:
# Number of components in the embedding vector.
len(embedding_test)

384

Dimension of embeddings produced by all-MiniLM-L6-v2 is 384

# Vector DB 1. Chroma DB

In [8]:
#!pip install chromadb 

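Chroma runs in-memory here: no persist directory or server is configured, so the data lives only in this Python process.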

### Create Chroma Collection and add documents and embeddings

In [9]:
from langchain_community.vectorstores import Chroma

# Embed each chunk with `embedder` and store text + vector in a collection
# named 'chroma_test'.
chroma_db = Chroma.from_documents(
    documents=docs_after_splitting,
    embedding=embedder,
    collection_name='chroma_test',
)


### Search ChromaDB for a similar sentence

In [10]:

# Free-text query to match against the stored sentences.
user_query = "The truck was fast"
relevant_docs = chroma_db.similarity_search(user_query)


# Show only the text of the first returned document.
print(relevant_docs[0].page_content)

The car drove away.


#### Some other useful ChromaDB Commands



In [11]:
# Dump what the collection holds: ids, metadatas and documents
# (the 'embeddings' entry comes back as None in the output below).
chroma_db.get()

{'ids': ['2d8aefd6-b56d-11ee-a8f8-2ec3c8cd2c6e',
  '2d8af094-b56d-11ee-a8f8-2ec3c8cd2c6e',
  '2d8af0a8-b56d-11ee-a8f8-2ec3c8cd2c6e',
  '2d8af0bc-b56d-11ee-a8f8-2ec3c8cd2c6e',
  '2d8af0d0-b56d-11ee-a8f8-2ec3c8cd2c6e'],
 'embeddings': None,
 'metadatas': [{'source': 'sentences_doc.txt'},
  {'source': 'sentences_doc.txt'},
  {'source': 'sentences_doc.txt'},
  {'source': 'sentences_doc.txt'},
  {'source': 'sentences_doc.txt'}],
 'documents': ['The baby cried for milk.',
  'The car drove away.',
  'Dog lives in kennel.',
  'The baby laughed.',
  'The kid was playing.'],
 'uris': None,
 'data': None}

#### Alternative: using the native `chromadb` client instead of LangChain

In [12]:
import chromadb
# Look up, through the native chromadb client, the 'chroma_test' collection
# that the LangChain cell above populated.
chroma_client = chromadb.Client()
collection = chroma_client.get_collection(name='chroma_test')
collection

Collection(name=chroma_test)

In [13]:
# Number of records stored in the collection.
collection.count()

5

In [14]:
# Preview the stored records, including their embedding vectors.
collection.peek()

{'ids': ['2d8aefd6-b56d-11ee-a8f8-2ec3c8cd2c6e',
  '2d8af094-b56d-11ee-a8f8-2ec3c8cd2c6e',
  '2d8af0a8-b56d-11ee-a8f8-2ec3c8cd2c6e',
  '2d8af0bc-b56d-11ee-a8f8-2ec3c8cd2c6e',
  '2d8af0d0-b56d-11ee-a8f8-2ec3c8cd2c6e'],
 'embeddings': [[0.026743043214082718,
   0.009990915656089783,
   -0.0037782862782478333,
   0.12095930427312851,
   0.07037460058927536,
   0.02908904105424881,
   -0.033766765147447586,
   0.10197418183088303,
   -0.012217715382575989,
   -0.0066671292297542095,
   -0.028279952704906464,
   -0.037938348948955536,
   -0.05967285484075546,
   -0.0005077590467408299,
   -0.015095002017915249,
   0.024383505806326866,
   -0.030351392924785614,
   0.0133482301607728,
   0.03766992688179016,
   -0.06918719410896301,
   0.06810162961483002,
   0.001881241099908948,
   0.012283035553991795,
   0.03555252403020859,
   0.07695236802101135,
   0.06214265152812004,
   0.05314602702856064,
   -0.015133118256926537,
   0.031958818435668945,
   0.013643025420606136,
   -0.03987084329

In [15]:
# Length of the first stored embedding vector.
len(collection.peek()['embeddings'][0])

384

In [16]:
# Embed the query with the same `embedder` that produced the stored vectors.
# Passing query_texts instead would make Chroma embed the query with the
# collection's own default embedding function, which only gives meaningful
# distances if that default happens to be the same model.
search_results = collection.query(
    query_embeddings=[embedder.embed_query(user_query)],
    n_results=1,
)
search_results

{'ids': [['2d8af094-b56d-11ee-a8f8-2ec3c8cd2c6e']],
 'distances': [[1.312137484550476]],
 'metadatas': [[{'source': 'sentences_doc.txt'}]],
 'embeddings': None,
 'documents': [['The car drove away.']],
 'uris': None,
 'data': None}

In [17]:
# Clean up: drop the demo collection.
chroma_client.delete_collection(name='chroma_test')

# Vector DB 2. Pinecone

In [18]:
# The cells below use pinecone.init() and pinecone.Index(), which belong to the
# 2.x client API, so pin the major version when installing:
#!pip install "pinecone-client==2.*"

https://www.pinecone.io/ - Sign Up for Free

app.pinecone.io - Create Project 

app.pinecone.io - Create API Keys

### Create Pinecone Index

In [19]:
import os
import pinecone

# Read the API key from the environment so it never lands in the notebook,
# and fail right here with a clear message if it is missing.
api_key = os.getenv("PINECONE_API_KEY")
if not api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; create a key at app.pinecone.io and "
        "export it before running this cell."
    )
# Starter-plan indexes are hosted in the gcp-starter environment.
env = 'gcp-starter'

pinecone.init(api_key=api_key, environment=env)


In [22]:
index_name = 'pinecone-test'

# Create the index only when it does not exist yet, so re-running this cell
# does not fail on an already-existing index.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        index_name,
        dimension=384,  # Dimension of embeddings produced by all-MiniLM-L6-v2 is 384
        metric='cosine',
    )

In [23]:
# Open a handle to the index by name.
index = pinecone.Index(index_name)
# Display the index statistics (dimension, fullness, vector counts).
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

### Add Documents and Embeddings

In [24]:
# Import from langchain_community, like the Chroma and Weaviate cells do;
# the langchain.vectorstores path is a deprecated alias.
from langchain_community.vectorstores import Pinecone

# Embed every chunk with `embedder` and upsert the vectors into the index.
pinecone_db = Pinecone.from_documents(docs_after_splitting, embedder, index_name=index_name)

In [31]:
# It may take a minute or two before the upserted vectors become visible.
# Re-run this cell until total_vector_count is 5 (one vector per document
# chunk) before running the search below.
# Open a handle to the index by name.
index = pinecone.Index(index_name)
# Display the index statistics.
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 5e-05,
 'namespaces': {'': {'vector_count': 5}},
 'total_vector_count': 5}

### Search Pinecone for similar sentence

In [32]:
# k=1: fetch only the single best match; the result is a list of
# (Document, score) pairs. The score is presumably the cosine similarity,
# given the index was created with metric='cosine' -- TODO confirm.
relevant_documents = pinecone_db.similarity_search_with_score(user_query, k=1)
print(relevant_documents)

[(Document(page_content='The car drove away.', metadata={'source': 'sentences_doc.txt'}), 0.344002873)]


In [33]:
# Clean up: delete the demo index.
pinecone.delete_index(index_name)

# Vector DB 3. Weaviate

https://weaviate.io/developers/wcs/quickstart

https://console.weaviate.cloud/ - sign up for free

Create cluster in Free Sandbox

Get cluster URL and api key

In [None]:
#!pip install "weaviate-client==3.*"

### Create Weaviate Collection/Class and add objects/Embeddings

In [34]:
from langchain_community.vectorstores import Weaviate

# Read the API key from the environment so it never lands in the notebook.
WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY")

# Pass the key explicitly instead of relying on LangChain's implicit
# fallback to the WEAVIATE_API_KEY environment variable.
# by_text=False: search by the vectors computed with `embedder`.
weaviate_db = Weaviate.from_documents(
    docs_after_splitting,
    embedder,
    weaviate_url="https://test-sand-cluster.weaviate.network/",
    weaviate_api_key=WEAVIATE_API_KEY,
    by_text=False,
)


### Search Weaviate for similar sentence

In [35]:
# Same query text as used in the Chroma section above.
user_query = "The truck was fast"
docs = weaviate_db.similarity_search(user_query)
# Display the first returned document as the cell output.
docs[0]

Document(page_content='The car drove away.', metadata={'source': 'sentences_doc.txt'})

### Alternative: using the native `weaviate` client instead of LangChain

In [36]:
import weaviate
import os


# Connect directly to the cluster with the native weaviate client,
# authenticating with the API key read in the LangChain cell above.
client = weaviate.Client(
    url="https://test-sand-cluster.weaviate.network/",  
    auth_client_secret=weaviate.AuthApiKey(api_key=WEAVIATE_API_KEY), 
)

# Display the schema (the classes defined on the cluster).
client.schema.get()

{'classes': [{'class': 'LangChain_840eb00fc5684bf4beeca234d8f02e58',
   'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
    'cleanupIntervalSeconds': 60,
    'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},
   'multiTenancyConfig': {'enabled': False},
   'properties': [{'dataType': ['text'],
     'indexFilterable': True,
     'indexSearchable': True,
     'name': 'text',
     'tokenization': 'word'},
    {'dataType': ['text'],
     'description': "This property was generated by Weaviate's auto-schema feature on Wed Jan 17 06:02:30 2024",
     'indexFilterable': True,
     'indexSearchable': True,
     'name': 'source',
     'tokenization': 'word'}],
   'replicationConfig': {'factor': 1},
   'shardingConfig': {'virtualPerPhysical': 128,
    'desiredCount': 1,
    'actualCount': 1,
    'desiredVirtualCount': 128,
    'actualVirtualCount': 128,
    'key': '_id',
    'strategy': 'hash',
    'function': 'murmur3'},
   'vectorIndexConfig': {'skip': False,
    'cle

In [37]:
import json
# Fetch stored objects and pretty-print them as JSON. Judging by the output,
# objects from classes created by earlier runs are included as well.
class_objects = client.data_object.get()
print(json.dumps(class_objects,indent=3))

{
   "deprecations": null,
   "objects": [
      {
         "class": "LangChain_629437e597d74821bd24f24e7af461e4",
         "creationTimeUnix": 1705518669377,
         "id": "7d2c7645-a002-446d-8033-40e5bc2ca73c",
         "lastUpdateTimeUnix": 1705518669377,
         "properties": {
            "source": "sentences_doc.txt",
            "text": "The baby cried for milk."
         },
         "vectorWeights": null
      },
      {
         "class": "LangChain_629437e597d74821bd24f24e7af461e4",
         "creationTimeUnix": 1705518669377,
         "id": "9157b570-9d13-494c-a42f-5166b42ea187",
         "lastUpdateTimeUnix": 1705518669377,
         "properties": {
            "source": "sentences_doc.txt",
            "text": "Dog lives in kennel."
         },
         "vectorWeights": null
      },
      {
         "class": "LangChain_629437e597d74821bd24f24e7af461e4",
         "creationTimeUnix": 1705518669377,
         "id": "95c8ff88-1a00-474a-a74c-f81ee131722d",
         "lastUpdateTi

In [38]:
from langchain_core.embeddings import Embeddings

# Embed the query with the same model that produced the stored vectors.
query_result = embedder.embed_query(user_query)

vector = {"vector": query_result}

# LangChain generates a fresh class name ("LangChain_<uuid hex>") on every
# from_documents() call, so a hard-coded name goes stale on the next run
# (and the previous literal was truncated, so it matched no class at all).
# Read the name back from the store object instead.
# NOTE(review): _index_name is a private attribute of the LangChain wrapper.
class_name = weaviate_db._index_name

search_response = (
    client.query
    .get(class_name, ["source", "text"])
    .with_near_vector(vector)
    .with_limit(1)
    .do()
)

print(json.dumps(search_response, indent=3))

{
   "data": {
      "Get": {
         "LangChain_840eb00fc5684bf4beeca234d8f02e58": [
            {
               "source": "sentences_doc.txt",
               "text": "The car drove away."
            }
         ]
      }
   }
}
