In [1]:
from langchain_community.document_loaders import Docx2txtLoader
from langchain_text_splitters.markdown import MarkdownHeaderTextSplitter
from langchain_text_splitters.character import CharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
import numpy as np

In [2]:
loader_docx = Docx2txtLoader("Introduction_to_Data_and_Data_Science_2.docx")
pages = loader_docx.load()

md_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on = [("#", "Course Title"), 
                           ("##", "Lecture Title")]
)

pages_md_split = md_splitter.split_text(pages[0].page_content)

for i in range(len(pages_md_split)):
    pages_md_split[i].page_content = ' '.join(pages_md_split[i].page_content.split())
    
char_splitter = CharacterTextSplitter(
    separator = ".",
    chunk_size = 500,
    chunk_overlap  = 50
)

pages_char_split = char_splitter.split_documents(pages_md_split)

In [3]:
pages_char_split

[Document(metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Analysis vs Analytics'}, page_content='Alright! So… Let’s discuss the not-so-obvious differences between the terms analysis and analytics. Due to the similarity of the words, some people believe they share the same meaning, and thus use them interchangeably. Technically, this isn’t correct. There is, in fact, a distinct difference between the two. And the reason for one often being used instead of the other is the lack of a transparent understanding of both. So, let’s clear this up, shall we? First, we will start with analysis'),
 Document(metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Analysis vs Analytics'}, page_content='Consider the following… You have a huge dataset containing data of various types. Instead of tackling the entire dataset and running the risk of becoming overwhelmed, you separate it into easier to digest chunks and study them individu

In [4]:
from langchain_huggingface import HuggingFaceEndpointEmbeddings

In [5]:
import dotenv

dotenv.load_dotenv()
import os 

model = "sentence-transformers/all-mpnet-base-v2"
api_key = os.getenv("HUGGINGFACE_API_KEY")
embedding = HuggingFaceEndpointEmbeddings(
    model=model,
    huggingfacehub_api_token=api_key,
)

In [6]:
vector1 = embedding.embed_query(pages_char_split[3].page_content)

In [7]:
vector2 = embedding.embed_query(pages_char_split[5].page_content)

In [8]:
vector3 = embedding.embed_query(pages_char_split[18].page_content)

In [9]:
vector1

[-0.037844218313694,
 0.022497206926345825,
 -0.035434793680906296,
 -0.004900737199932337,
 -0.03452989459037781,
 0.03454287350177765,
 -0.052174948155879974,
 -0.008026188239455223,
 0.001340832095593214,
 0.029432328417897224,
 0.07249563932418823,
 -0.016341660171747208,
 -0.013320174999535084,
 0.10202054679393768,
 0.014350945129990578,
 0.032770786434412,
 -0.007778957020491362,
 0.012298909947276115,
 -0.011187851428985596,
 0.0510597787797451,
 0.005111318081617355,
 0.02080385573208332,
 -0.020406518131494522,
 0.015749573707580566,
 -0.006115148309618235,
 0.010165884159505367,
 -0.024700241163372993,
 -0.038533780723810196,
 -0.005882441531866789,
 -0.020718161016702652,
 -0.013437260873615742,
 -0.03268495947122574,
 0.007872928865253925,
 0.04784652218222618,
 2.0745580968650756e-06,
 -0.015162941068410873,
 -0.04918700084090233,
 0.0047182198613882065,
 0.026523571461439133,
 0.0067448364570736885,
 0.055647097527980804,
 0.0142807736992836,
 0.005431660450994968,
 0.03

In [10]:
len(vector1), len(vector2), len(vector3)

(768, 768, 768)

In [11]:
np.dot(vector1, vector2)

0.6236003858041939

In [12]:
np.dot(vector2, vector3)

0.24200016840788152

In [13]:
len(pages_char_split)

20

In [33]:
pip install chromadb

^C
Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  × Building wheel for chroma-hnswlib (pyproject.toml) did not run successfully.
  │ exit code: 1
  ╰─> [5 lines of output]
      running bdist_wheel
      running build
      running build_ext
      building 'hnswlib' extension
      error: Microsoft Visual C++ 14.0 or greater is required. Get it with "Microsoft C++ Build Tools": https://visualstudio.microsoft.com/visual-cpp-build-tools/
      [end of output]
  
  note: This error originates from a subprocess, and is likely not a problem with pip.
  ERROR: Failed building wheel for chroma-hnswlib
ERROR: ERROR: Failed to build installable wheels for some pyproject.toml based projects (chroma-hnswlib)


Collecting chromadb
  Downloading chromadb-0.6.2-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6.tar.gz (32 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.7.5-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.20.1-cp312-cp312-win_amd64.whl.metadata (4.7 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.29.0-py3-none-any.whl.metadata (1.4 kB)
Collecting openteleme

In [22]:
from langchain_chroma import Chroma

vectorstore = Chroma.from_documents(documents = pages_char_split, 
                                    embedding = embedding, 
                                    persist_directory = "./intro-to-ds-lectures")

In [23]:
vectorstore_from_directory = Chroma(persist_directory = "./intro-to-ds-lectures", 
                                    embedding_function = embedding)

In [24]:
vectorstore_from_directory.get()

{'ids': ['d96fc952-1cf5-4921-81e9-8dea3500867d',
  '1188b65c-5ecc-4acc-9201-035996987ff7',
  '0dea4731-3bf7-43a5-8f82-78e382eb7d23',
  '7563f202-2bfa-4614-8cae-df6c7f6d9b23',
  'fc16ff70-0a75-4c0e-95ae-67e62bd7bbe4',
  '67cf13a7-2113-4514-8352-6ffeaa8a3677',
  '93e84ad9-f18c-46fa-a432-6054f1be86f8',
  '027ab76f-44f3-42fe-a90b-331b70068690',
  '9335260c-c081-4cee-94b4-ae31bb3d2141',
  '46ed308c-eca0-45f9-9657-0de167607346',
  '5716771c-2107-46ad-bca3-9f7175438d8c',
  '5111c262-11e3-4acf-8c41-7a6690223f12',
  '249dd8e4-6785-4503-be5e-e1e96d15baee',
  '36f1aeab-4145-42c1-9a67-e821934995f8',
  '4dff88a4-e7a9-4ffc-8f4e-c2a1365d6beb',
  '58c69337-15fc-4752-997e-fcbbe2da06dd',
  'c9c539a0-58d2-4638-8a80-34c220206636',
  'f4cb71dd-b881-4fc9-8b8e-da6bb0487167',
  'f2cc578b-bf9d-4692-b2ed-ffaee48d5db2',
  '12a2c431-871b-4b74-8e47-31fc1efe4101'],
 'embeddings': None,
 'documents': ['Alright! So… Let’s discuss the not-so-obvious differences between the terms analysis and analytics. Due to the simi

In [25]:
vectorstore_from_directory.get(ids = "d96fc952-1cf5-4921-81e9-8dea3500867d", 
                               include = ["embeddings"])

{'ids': ['d96fc952-1cf5-4921-81e9-8dea3500867d'],
 'embeddings': array([[-2.84305606e-02,  2.27939561e-02, -4.66530174e-02,
         -1.15311956e-02, -3.24488729e-02,  7.83926807e-03,
          6.56666001e-03, -4.84436192e-02, -1.26556838e-02,
          1.58783291e-02,  5.58935590e-02, -3.92562002e-02,
          9.51971021e-03,  1.14860594e-01,  2.05441788e-02,
          3.32319736e-02, -4.96172626e-03,  3.49077163e-03,
         -4.59013991e-02,  5.28328866e-02,  3.89843173e-02,
          9.59076826e-03, -5.81804896e-03,  2.61757001e-02,
          7.05781393e-04,  3.29818875e-02, -1.82292331e-02,
         -5.84849007e-02, -1.14883678e-02, -1.56401005e-02,
         -1.29413158e-02, -3.67891267e-02,  3.25516798e-03,
          4.68611047e-02,  2.09809036e-06, -1.18870446e-02,
         -2.85230223e-02,  2.69843284e-02,  5.08713648e-02,
          2.53947601e-02,  9.50021446e-02,  4.86994907e-03,
          9.52032395e-03,  2.78564207e-02, -2.02703979e-02,
          3.02826371e-02, -1.0787537

# CRUD

In [27]:
from langchain_core.documents import Document
added_document = Document(page_content='Alright! So… Let’s discuss the not-so-obvious differences between the terms analysis and analytics. Due to the similarity of the words, some people believe they share the same meaning, and thus use them interchangeably. Technically, this isn’t correct. There is, in fact, a distinct difference between the two. And the reason for one often being used instead of the other is the lack of a transparent understanding of both. So, let’s clear this up, shall we? First, we will start with analysis', 
                          metadata={'Course Title': 'Introduction to Data and Data Science', 
                                    'Lecture Title': 'Analysis vs Analytics'})

In [29]:
vectorstore_from_directory.add_documents([added_document])

['529b8234-7828-4faa-80a4-9f0bf8787762']

In [30]:
vectorstore_from_directory.get('529b8234-7828-4faa-80a4-9f0bf8787762')

{'ids': ['529b8234-7828-4faa-80a4-9f0bf8787762'],
 'embeddings': None,
 'documents': ['Alright! So… Let’s discuss the not-so-obvious differences between the terms analysis and analytics. Due to the similarity of the words, some people believe they share the same meaning, and thus use them interchangeably. Technically, this isn’t correct. There is, in fact, a distinct difference between the two. And the reason for one often being used instead of the other is the lack of a transparent understanding of both. So, let’s clear this up, shall we? First, we will start with analysis'],
 'uris': None,
 'data': None,
 'metadatas': [{'Course Title': 'Introduction to Data and Data Science',
   'Lecture Title': 'Analysis vs Analytics'}],
 'included': [<IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [31]:
updated_document = Document(page_content='Great! We hope we gave you a good idea about the level of applicability of the most frequently used programming and software tools in the field of data science. Thank you for watching!', 
                            metadata={'Course Title': 'Introduction to Data and Data Science', 
                                     'Lecture Title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need'})

In [32]:
vectorstore_from_directory.update_document(document_id = "529b8234-7828-4faa-80a4-9f0bf8787762", 
                                           document = updated_document)

In [33]:
vectorstore_from_directory.get('529b8234-7828-4faa-80a4-9f0bf8787762')

{'ids': ['529b8234-7828-4faa-80a4-9f0bf8787762'],
 'embeddings': None,
 'documents': ['Great! We hope we gave you a good idea about the level of applicability of the most frequently used programming and software tools in the field of data science. Thank you for watching!'],
 'uris': None,
 'data': None,
 'metadatas': [{'Course Title': 'Introduction to Data and Data Science',
   'Lecture Title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need'}],
 'included': [<IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [34]:
vectorstore_from_directory.delete("529b8234-7828-4faa-80a4-9f0bf8787762")

In [35]:
vectorstore_from_directory.get('529b8234-7828-4faa-80a4-9f0bf8787762')


{'ids': [],
 'embeddings': None,
 'documents': [],
 'uris': None,
 'data': None,
 'metadatas': [],
 'included': [<IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}