In [4]:
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEndpointEmbeddings
from langchain_chroma import Chroma


In [2]:
from langchain_huggingface import HuggingFaceEndpointEmbeddings

In [3]:
import dotenv

dotenv.load_dotenv()
import os 

model = "sentence-transformers/all-mpnet-base-v2"
api_key = os.getenv("HUGGINGFACE_API_KEY")
embedding = HuggingFaceEndpointEmbeddings(
    model=model,
    huggingfacehub_api_token=api_key,
)

In [5]:
vectorstore = Chroma(persist_directory = "./intro-to-ds-lectures", 
                     embedding_function = embedding)

In [6]:
vectorstore.get()

{'ids': ['d96fc952-1cf5-4921-81e9-8dea3500867d',
  '1188b65c-5ecc-4acc-9201-035996987ff7',
  '0dea4731-3bf7-43a5-8f82-78e382eb7d23',
  '7563f202-2bfa-4614-8cae-df6c7f6d9b23',
  'fc16ff70-0a75-4c0e-95ae-67e62bd7bbe4',
  '67cf13a7-2113-4514-8352-6ffeaa8a3677',
  '93e84ad9-f18c-46fa-a432-6054f1be86f8',
  '027ab76f-44f3-42fe-a90b-331b70068690',
  '9335260c-c081-4cee-94b4-ae31bb3d2141',
  '46ed308c-eca0-45f9-9657-0de167607346',
  '5716771c-2107-46ad-bca3-9f7175438d8c',
  '5111c262-11e3-4acf-8c41-7a6690223f12',
  '249dd8e4-6785-4503-be5e-e1e96d15baee',
  '36f1aeab-4145-42c1-9a67-e821934995f8',
  '4dff88a4-e7a9-4ffc-8f4e-c2a1365d6beb',
  '58c69337-15fc-4752-997e-fcbbe2da06dd',
  'c9c539a0-58d2-4638-8a80-34c220206636',
  'f4cb71dd-b881-4fc9-8b8e-da6bb0487167',
  'f2cc578b-bf9d-4692-b2ed-ffaee48d5db2',
  '12a2c431-871b-4b74-8e47-31fc1efe4101'],
 'embeddings': None,
 'documents': ['Alright! So… Let’s discuss the not-so-obvious differences between the terms analysis and analytics. Due to the simi

In [7]:
added_document = Document(page_content='Alright! So… How are the techniques used in data, business intelligence, or predictive analytics applied in real life? Certainly, with the help of computers. You can basically split the relevant tools into two categories—programming languages and software. Knowing a programming language enables you to devise programs that can execute specific operations. Moreover, you can reuse these programs whenever you need to execute the same action', 
                          metadata={'Course Title': 'Introduction to Data and Data Science', 
                                    'Lecture Title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need'})

In [8]:
vectorstore.add_documents([added_document])

['a2586a75-492c-4818-844c-f8ba1742f774']

In [9]:
question = "What programming languages do data scientists use?"

In [10]:
retrieved_docs = vectorstore.similarity_search(query = question, 
                                               k = 5)

In [11]:
retrieved_docs

[Document(id='36f1aeab-4145-42c1-9a67-e821934995f8', metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need'}, page_content='Thus, we need a lot of computational power, and we can expect people to use the languages similar to those in the big data column. Apart from R, Python, and MATLAB, other, faster languages are used like Java, JavaScript, C, C++, and Scala. Cool. What we said may be wonderful, but that’s not all! By using one or more programming languages, people create application software or, as they are sometimes called, software solutions, that are adjusted for specific business needs'),
 Document(id='46ed308c-eca0-45f9-9657-0de167607346', metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need'}, page_content='As you can see from the infographic, R, and P

In [12]:
for i in retrieved_docs:
    print(f"Page Content: {i.page_content}\n----------\nLecture Title:{i.metadata['Lecture Title']}\n")

Page Content: Thus, we need a lot of computational power, and we can expect people to use the languages similar to those in the big data column. Apart from R, Python, and MATLAB, other, faster languages are used like Java, JavaScript, C, C++, and Scala. Cool. What we said may be wonderful, but that’s not all! By using one or more programming languages, people create application software or, as they are sometimes called, software solutions, that are adjusted for specific business needs
----------
Lecture Title:Programming Languages & Software Employed in Data Science - All the Tools You Need

Page Content: As you can see from the infographic, R, and Python are the two most popular tools across all columns. Their biggest advantage is that they can manipulate data and are integrated within multiple data and data science software platforms. They are not just suitable for mathematical and statistical computations. In other words, R, and Python are adaptable. They can solve a wide variety of

# MMR Search

In [13]:
question = "What software do data scientists use?"

In [23]:
retrieved_docs = vectorstore.max_marginal_relevance_search(query = question, 
                                               k = 3,
                                               lambda_mult = 0.5,
                                               filter = {'Lecture Title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need'} )

In [24]:
for i in retrieved_docs:
    print(f"Page Content: {i.page_content}\n----------\nLecture Title:{i.metadata['Lecture Title']}\n")

Page Content: As you can see from the infographic, R, and Python are the two most popular tools across all columns. Their biggest advantage is that they can manipulate data and are integrated within multiple data and data science software platforms. They are not just suitable for mathematical and statistical computations. In other words, R, and Python are adaptable. They can solve a wide variety of business and data-related problems from beginning to the end
----------
Lecture Title:Programming Languages & Software Employed in Data Science - All the Tools You Need

Page Content: Alright! So… How are the techniques used in data, business intelligence, or predictive analytics applied in real life? Certainly, with the help of computers. You can basically split the relevant tools into two categories—programming languages and software. Knowing a programming language enables you to devise programs that can execute specific operations. Moreover, you can reuse these programs whenever you need 