In [1]:
#%pip install -U -q faiss-cpu tiktoken

In [8]:
from operator import itemgetter

from langchain_community.vectorstores import Weaviate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings



#for vector store

from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader

In [2]:
from dotenv import load_dotenv
load_dotenv()
import os

In [None]:
import tiktoken
encoder = tiktoken.get_encoding("cl100k_base")


In [3]:
import weaviate

client = weaviate.Client(url=os.environ['WEAVIATE_URL'])

## This cell is for local use only

In [45]:
from langchain_community.embeddings import OllamaEmbeddings
embeddings = OllamaEmbeddings(model='mistral')

db = Weaviate.from_documents(docs, embeddings, client=client,)


#This cell can be execute4d locally but will take huge time

## Didnt do the cell below due to rate limit

In [48]:
from langchain_community.document_loaders import TextLoader


loader = TextLoader("aws.txt")
documents = loader.load()
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=100, chunk_overlap=10)
docs = text_splitter.split_documents(documents)


embeddings = OpenAIEmbeddings()

Created a chunk of size 103, which is longer than the specified 100
Created a chunk of size 145, which is longer than the specified 100
Created a chunk of size 162, which is longer than the specified 100
Created a chunk of size 160, which is longer than the specified 100
Created a chunk of size 114, which is longer than the specified 100
Created a chunk of size 115, which is longer than the specified 100
Created a chunk of size 150, which is longer than the specified 100
Created a chunk of size 204, which is longer than the specified 100
Created a chunk of size 147, which is longer than the specified 100
Created a chunk of size 102, which is longer than the specified 100
Created a chunk of size 102, which is longer than the specified 100
Created a chunk of size 204, which is longer than the specified 100
Created a chunk of size 124, which is longer than the specified 100
Created a chunk of size 335, which is longer than the specified 100


In [49]:
vectorstore = Weaviate.from_documents(docs, embeddings, client=client)

In [52]:
query = "what is a data lake and how it is different from database"
vectorstore.similarity_search_with_score(query)

[(Document(page_content='Connect with data sharing\nLake Formation provides a straightforward way to share Data Catalog objects like databases and tables with internal and external users. This mechanism empowers organizations with quick and secure access to data and speeds up their business decision-making. Let’s review the new features and enhancements made in 2023 under this theme.', metadata={'_additional': {'vector': [-0.006786916, 0.006372448, 0.01845765, -0.03102985, -0.027244374, 0.022312203, -0.03832449, -0.0030152558, -0.031941682, -0.024799012, 0.012924499, 0.004973618, 0.0042172135, 0.0017675341, -0.0033002028, 0.026981877, 0.021994444, -0.04929408, 0.031886417, -0.02030894, 0.0063793557, -0.03210747, -0.0023745573, -0.007888711, -0.005578051, -0.0013876049, 0.013035024, -0.01128735, 0.020115523, 0.018941196, 0.0206267, -0.020695778, -0.0042931996, -0.0028132028, 0.00012801023, 0.0016319684, -0.005011611, -0.019853026, 0.008717648, 0.005643675, 0.0039754407, -0.01860962, 0.0

In [53]:
#also can use maximal marginal relevance search(MMR)
retriever = vectorstore.as_retriever(search_type="mmr")
retriever.get_relevant_documents(query)[0]
#the highest score vector gets displayed

Document(page_content='Connect with data sharing\nLake Formation provides a straightforward way to share Data Catalog objects like databases and tables with internal and external users. This mechanism empowers organizations with quick and secure access to data and speeds up their business decision-making. Let’s review the new features and enhancements made in 2023 under this theme.', metadata={'source': 'aws.txt'})

mini Rag

In [54]:
from langchain_core.prompts import ChatPromptTemplate


template = """You are a helpful assistant that helps people to find solutions regarding cloud computing in AWS

question:{question}
context:{context}

Answer:
"""
prompt = ChatPromptTemplate.from_template(template=template)
print(prompt)

input_variables=['context', 'question'] messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template='You are a helpful assistant that helps people to find solutions regarding cloud computing in AWS\n\nquestion:{question}\ncontext:{context}\n\nAnswer:\n'))]


In [55]:
#initialize the chatbot

llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=0)



In [57]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Now initialize the retriever

doc_search = Weaviate.from_documents(docs,
                                     client=client,
                                     embedding=embeddings,
                                     )
retriever = doc_search.as_retriever()

chain = (
    {'context':retriever, "question":RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)


chain.invoke("What is a data lake?")

'A data lake is a centralized repository that allows you to store and analyze large amounts of structured, semi-structured, and unstructured data. It is built on Amazon Simple Storage Service (Amazon S3) and can integrate with various AWS analytics services. With a data lake, you can store data in its raw format without the need for a predefined schema, enabling you to perform flexible and scalable analysis on the data. It provides a cost-effective solution for storing and processing big data, and allows for easy data ingestion, data exploration, and data sharing across different teams and departments within an organization.'

### RAG

In [92]:
#new template
template = """Answer the question based on the context:{context},If the the answer is not found in the context just say I have no Idea

Question: {question}

Answer in the following language: {language}
"""


#init the prompt

prompt = ChatPromptTemplate.from_template(template=template)


In [94]:
#init the chain

chain = (
    {
        "context":itemgetter('question')|retriever,
        "question":itemgetter("question"),
        "language":itemgetter("language")
    }
    | prompt
    | llm
    | StrOutputParser()
)

In [97]:
chain.invoke({"question":"How can I build a simple datalake","language":"English"})

'I have no idea.'

In [100]:
chain.invoke({"question":"What is a Data Lake?", "language":"English"})

'A Data Lake is a centralized repository that allows organizations to store and analyze large volumes of structured, semi-structured, and unstructured data. It is built on Amazon Simple Storage Service (Amazon S3) and integrates with multiple AWS analytics services. The Data Lake enables organizations to catalog, discover, share, and govern data stored across AWS, on premises, and third-party sources. It provides a foundation for data governance and facilitates quick and secure access to data, speeding up business decision-making.'