In [9]:
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEndpointEmbeddings
from langchain_chroma import Chroma


In [10]:
import os

import dotenv

# Load variables from a .env file (if any) before the API key is read.
dotenv.load_dotenv()

# Identifier of the embedding model requested from the Hugging Face endpoint.
model = "sentence-transformers/all-mpnet-base-v2"
api_key = os.environ.get("HUGGINGFACE_API_KEY")

# Embedding client; later cells hand it to the Chroma vector store.
embedding = HuggingFaceEndpointEmbeddings(model=model, huggingfacehub_api_token=api_key)

In [11]:
# Open the Chroma store persisted under ./intro-to-ds-lectures, using
# `embedding` as its embedding function.
vectorstore = Chroma(
    embedding_function=embedding,
    persist_directory="./intro-to-ds-lectures",
)

In [13]:
# Sanity check: number of documents currently held in the vector store.
len(vectorstore.get()['documents'])

21

In [23]:
# Retriever using the 'mmr' search type, returning k=3 documents per query.
# lambda_mult is forwarded to the MMR search (relevance/diversity trade-off
# per the LangChain docs).
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs=dict(k=3, lambda_mult=0.5),
)

In [15]:
# Display the retriever's repr to confirm its search_type and search_kwargs.
# NOTE(review): the saved output shows lambda_mult=0.7 while the defining cell
# sets 0.5 -- it looks stale from out-of-order execution; re-run cells in order.
retriever

VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEndpointEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x000001E3F7F8FE30>, search_type='mmr', search_kwargs={'k': 3, 'lambda_mult': 0.7})

In [78]:
# Sample query passed to the retriever.
question = "What software do data scientists use?"

In [83]:
# Run the retriever on the question and keep the returned documents.
retrieved_docs = retriever.invoke(question)

In [18]:
# Show the raw Document objects (id, metadata, page_content).
retrieved_docs

[Document(id='46ed308c-eca0-45f9-9657-0de167607346', metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need'}, page_content='As you can see from the infographic, R, and Python are the two most popular tools across all columns. Their biggest advantage is that they can manipulate data and are integrated within multiple data and data science software platforms. They are not just suitable for mathematical and statistical computations. In other words, R, and Python are adaptable. They can solve a wide variety of business and data-related problems from beginning to the end'),
 Document(id='12a2c431-871b-4b74-8e47-31fc1efe4101', metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need'}, page_content='Great! We hope we gave you a good idea about the level of applicability 

In [84]:
# Print each retrieved chunk followed by the title of the lecture it belongs to.
for doc in retrieved_docs:
    lecture_title = doc.metadata['Lecture Title']
    print(f"Page Content: {doc.page_content}\n----------\nLecture Title:{lecture_title}\n")

Page Content: As you can see from the infographic, R, and Python are the two most popular tools across all columns. Their biggest advantage is that they can manipulate data and are integrated within multiple data and data science software platforms. They are not just suitable for mathematical and statistical computations. In other words, R, and Python are adaptable. They can solve a wide variety of business and data-related problems from beginning to the end
----------
Lecture Title:Programming Languages & Software Employed in Data Science - All the Tools You Need

Page Content: Great! We hope we gave you a good idea about the level of applicability of the most frequently used programming and software tools in the field of data science. Thank you for watching!
----------
Lecture Title:Programming Languages & Software Employed in Data Science - All the Tools You Need

Page Content: Alright! So… How are the techniques used in data, business intelligence, or predictive analytics applied i

# Generating an answer with a chain

In [32]:
from langchain_huggingface import HuggingFaceEndpointEmbeddings

import os

import dotenv

# NOTE(review): this repeats the embedding setup from the top of the notebook --
# presumably so this section can be run on its own; confirm before deduplicating.
dotenv.load_dotenv()

# Identifier of the embedding model requested from the Hugging Face endpoint.
model = "sentence-transformers/all-mpnet-base-v2"
api_key = os.environ.get("HUGGINGFACE_API_KEY")

embedding = HuggingFaceEndpointEmbeddings(model=model, huggingfacehub_api_token=api_key)

In [33]:
from langchain_chroma import Chroma

# Open the Chroma store persisted under ./intro-to-ds-lectures, using
# `embedding` as its embedding function.
vectorstore = Chroma(
    embedding_function=embedding,
    persist_directory="./intro-to-ds-lectures",
)

In [110]:
# Retriever feeding the generation chain: 'mmr' search type, k=5 documents.
# NOTE(review): lambda_mult=0.588 appears hand-tuned; consider naming it as a constant.
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs=dict(k=5, lambda_mult=0.588),
)

In [134]:
from langchain_core.prompts import PromptTemplate


# Prompt sent to the chat model. It has two placeholders:
#   {question} - the user's question
#   {context}  - the material the answer must be based on
# The model is asked for one ~50-word paragraph followed by a "Resources:" line
# naming the source lecture(s).
# Fix: removed a stray "c" after "50 words" that was being sent to the model.
TEMPLATE = '''
Answer the following question:
{question}

To answer the question, use only the following context:
{context}

Give proper answer consisting of exactly one small summarised paragraph of about 50 words which completely summarizes the context by shortening big sentences and at the end of the response, specify the name of the lecture this context is taken from in the format:
Resources: *Lecture Title*
where *Lecture Title* should be substituted with the title of all resource lectures.
'''

# Build the prompt object from TEMPLATE; it is filled with `question` and
# `context` when the chain runs.
prompt_template = PromptTemplate.from_template(TEMPLATE)

In [118]:
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace

# Text-generation endpoint for Llama 3 8B Instruct. Sampling is turned off,
# output is limited to 250 new tokens, and a mild repetition penalty is applied.
llm = HuggingFaceEndpoint(
    task="text-generation",
    repo_id="meta-llama/Meta-Llama-3-8B-Instruct",
    do_sample=False,
    max_new_tokens=250,
    repetition_penalty=1.03,
)

# Chat-model wrapper around the endpoint; this is what the chain invokes.
chat = ChatHuggingFace(verbose=True, llm=llm)

In [35]:
# Question fed to the generation chain.
question = "What software do data scientists use?"

In [135]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables import RunnableParallel
from langchain_core.output_parsers import StrOutputParser

# First step: fan the input question out into the two prompt variables.
#   context  <- documents returned by the retriever for the question
#   question <- the question itself, passed through unchanged
prompt_inputs = RunnableParallel(
    {'context': retriever, 'question': RunnablePassthrough()}
)

# Full pipeline: build inputs -> fill prompt -> call chat model -> plain string.
chain = prompt_inputs | prompt_template | chat | StrOutputParser()

In [136]:
# Run the whole chain on the question and print the returned string.
print(chain.invoke(question))

Data scientists commonly use R and Python, as they can manipulate data and are integrated with multiple data platforms. Other tools include EViews for econometric time-series models, Stata for statistical research, and Apache Hadoop, Apache Hbase, and Mongo DB for big data.

Resources: Programming Languages & Software Employed in Data Science - All the Tools You Need


# Test zone

In [100]:
# Scratch experiment: try different MMR settings (k=4, lambda_mult=0.7) and
# inspect what comes back for the current question.
# NOTE(review): this rebinds the global `retriever` used when defining `chain`.
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs=dict(k=4, lambda_mult=0.7),
)
retrieved_docs = retriever.invoke(question)

# Print each retrieved chunk followed by the title of the lecture it belongs to.
for doc in retrieved_docs:
    lecture_title = doc.metadata['Lecture Title']
    print(f"Page Content: {doc.page_content}\n----------\nLecture Title:{lecture_title}\n")



Page Content: As you can see from the infographic, R, and Python are the two most popular tools across all columns. Their biggest advantage is that they can manipulate data and are integrated within multiple data and data science software platforms. They are not just suitable for mathematical and statistical computations. In other words, R, and Python are adaptable. They can solve a wide variety of business and data-related problems from beginning to the end
----------
Lecture Title:Programming Languages & Software Employed in Data Science - All the Tools You Need

Page Content: Great! We hope we gave you a good idea about the level of applicability of the most frequently used programming and software tools in the field of data science. Thank you for watching!
----------
Lecture Title:Programming Languages & Software Employed in Data Science - All the Tools You Need

Page Content: Thus, we need a lot of computational power, and we can expect people to use the languages similar to those