In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
#OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Data Ingestion Techniques


In [2]:
# txt loader...
from langchain_community.document_loaders import TextLoader
reader = TextLoader(file_path="./data/data.txt", encoding="utf-8")
text = reader.load()
text

[Document(metadata={'source': './data/data.txt'}, page_content='What is Retrieval-Augmented Generation?\nRetrieval-Augmented Generation (RAG) is the process of optimizing the output of a large language model, so it references an authoritative knowledge base outside of its training data sources before generating a response. Large Language Models (LLMs) are trained on vast volumes of data and use billions of parameters to generate original output for tasks like answering questions, translating languages, and completing sentences. RAG extends the already powerful capabilities of LLMs to specific domains or an organization\'s internal knowledge base, all without the need to retrain the model. It is a cost-effective approach to improving LLM output so it remains relevant, accurate, and useful in various contexts.\n\nWhy is Retrieval-Augmented Generation important?\nLLMs are a key artificial intelligence (AI) technology powering intelligent chatbots and other natural language processing (NLP

In [3]:
# web based loader...
from langchain_community.document_loaders import WebBaseLoader
import bs4 # Beautiful Soup is a library that makes it easy to scrape information from web pages.
reader = WebBaseLoader(web_path="https://python.langchain.com/docs/introduction/", bs_kwargs=dict(parse_only = bs4.SoupStrainer(class_="theme-doc-markdown markdown")))
text = reader.load()
text

USER_AGENT environment variable not set, consider setting it to identify your requests.


[Document(metadata={'source': 'https://python.langchain.com/docs/introduction/'}, page_content='Introduction\nLangChain is a framework for developing applications powered by large language models (LLMs).\nLangChain simplifies every stage of the LLM application lifecycle:\n\nDevelopment: Build your applications using LangChain\'s open-source components and third-party integrations.\nUse LangGraph to build stateful agents with first-class streaming and human-in-the-loop support.\nProductionization: Use LangSmith to inspect, monitor and evaluate your applications, so that you can continuously optimize and deploy with confidence.\nDeployment: Turn your LangGraph applications into production-ready APIs and Assistants with LangGraph Platform.\n\n\n\nLangChain implements a standard interface for large language models and related\ntechnologies, such as embedding models and vector stores, and integrates with\nhundreds of providers. See the integrations page for\nmore.\n\nSelect chat model:OpenA

In [4]:
from langchain_community.document_loaders import PyPDFLoader 
reader = PyPDFLoader(file_path="./data/AI_Governance.pdf")
doc = reader.load()
doc

[Document(metadata={'source': './data/AI_Governance.pdf', 'page': 0, 'page_label': 'i'}, page_content='Research \nPaper\nAI governance \nand\xa0human\xa0rights\nResetting the relationship\nKate Jones\nInternational Law \nProgramme  \nJanuary 2023\n'),
 Document(metadata={'source': './data/AI_Governance.pdf', 'page': 1, 'page_label': 'ii'}, page_content='Chatham House, the Royal Institute of International  \nAffairs, is a world-leading policy institute based in\xa0London. \nOur mission is to help governments and societies build \na\xa0sustainably secure, prosperous and\xa0just world.'),
 Document(metadata={'source': './data/AI_Governance.pdf', 'page': 2, 'page_label': '1'}, page_content='1 Chatham House\nContents\n Summary 2\n01 Introduction 3\n02 What is AI? 5\n03 Governing AI: why human rights? 9\n04 Principles of AI governance: the contribution of human rights 21\n05 Processes of AI governance: the contribution of human rights 34\n06 Remedies in AI\xa0governance: the\xa0contribution 

# Chunking or Splitting...

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splitted_doc = text_splitter.split_documents(documents=doc)
splitted_doc

[Document(metadata={'source': './data/AI_Governance.pdf', 'page': 0, 'page_label': 'i'}, page_content='Research \nPaper\nAI governance \nand\xa0human\xa0rights\nResetting the relationship\nKate Jones\nInternational Law \nProgramme  \nJanuary 2023'),
 Document(metadata={'source': './data/AI_Governance.pdf', 'page': 1, 'page_label': 'ii'}, page_content='Chatham House, the Royal Institute of International  \nAffairs, is a world-leading policy institute based in\xa0London. \nOur mission is to help governments and societies build \na\xa0sustainably secure, prosperous and\xa0just world.'),
 Document(metadata={'source': './data/AI_Governance.pdf', 'page': 2, 'page_label': '1'}, page_content='1 Chatham House\nContents\n Summary 2\n01 Introduction 3\n02 What is AI? 5\n03 Governing AI: why human rights? 9\n04 Principles of AI governance: the contribution of human rights 21\n05 Processes of AI governance: the contribution of human rights 34\n06 Remedies in AI\xa0governance: the\xa0contribution of

# Embeddings

##Storing embeddings by using Chroma DB 

In [6]:
from langchain_community.embeddings import OpenAIEmbeddings, OllamaEmbeddings
from langchain_community.vectorstores import Chroma
db = Chroma.from_documents(documents=doc[:20], embedding=OpenAIEmbeddings())

  db = Chroma.from_documents(documents=doc[:20], embedding=OpenAIEmbeddings())


In [7]:
query = "what is AI?"
result = db.similarity_search(query=query)
result

[Document(metadata={'page': 6, 'page_label': '5', 'source': './data/AI_Governance.pdf'}, page_content='5 Chatham House\n02 \nWhat is AI?\nAI has capacity to transform human life\xa0– \nboth\xa0for\xa0better and for worse.\nAI is increasingly present in our lives, and its impact will expand significantly \nin the\xa0coming years. From predictive text, to social media news feeds, to virtual \nhomes and mobile phone voice assistants, AI is already a part of everyday life. \nAI\xa0offers automated translation, assists shoppers buying online and recommends \nthe fastest route on the drive home. It is also a key component of much-debated, \nrapidly developing technologies such as facial recognition and self-driving vehicles.\nThere is no single agreed definition of AI: it is a general term referring to \nmachines’\xa0evolving capacity to take on tasks requiring some form of intelligence. \nThe tasks that AI performs can include generating predictions, making decisions \nand providing recomme

In [8]:
result[0].page_content

'5 Chatham House\n02 \nWhat is AI?\nAI has capacity to transform human life\xa0– \nboth\xa0for\xa0better and for worse.\nAI is increasingly present in our lives, and its impact will expand significantly \nin the\xa0coming years. From predictive text, to social media news feeds, to virtual \nhomes and mobile phone voice assistants, AI is already a part of everyday life. \nAI\xa0offers automated translation, assists shoppers buying online and recommends \nthe fastest route on the drive home. It is also a key component of much-debated, \nrapidly developing technologies such as facial recognition and self-driving vehicles.\nThere is no single agreed definition of AI: it is a general term referring to \nmachines’\xa0evolving capacity to take on tasks requiring some form of intelligence. \nThe tasks that AI performs can include generating predictions, making decisions \nand providing recommendations.1 This means that AI may make decisions itself, \nor\xa0provide information for use in human 

##Storing embeddings by using faiss vector DB

In [9]:
from langchain_community.vectorstores import FAISS
faiss_db = FAISS.from_documents(documents=doc[:20], embedding=OpenAIEmbeddings())

In [10]:
query = "what is AI?"
result = faiss_db.similarity_search(query=query)
result

[Document(id='11e627a8-1ea8-45f8-9bb6-3d94f1b80bc2', metadata={'source': './data/AI_Governance.pdf', 'page': 6, 'page_label': '5'}, page_content='5 Chatham House\n02 \nWhat is AI?\nAI has capacity to transform human life\xa0– \nboth\xa0for\xa0better and for worse.\nAI is increasingly present in our lives, and its impact will expand significantly \nin the\xa0coming years. From predictive text, to social media news feeds, to virtual \nhomes and mobile phone voice assistants, AI is already a part of everyday life. \nAI\xa0offers automated translation, assists shoppers buying online and recommends \nthe fastest route on the drive home. It is also a key component of much-debated, \nrapidly developing technologies such as facial recognition and self-driving vehicles.\nThere is no single agreed definition of AI: it is a general term referring to \nmachines’\xa0evolving capacity to take on tasks requiring some form of intelligence. \nThe tasks that AI performs can include generating prediction

In [11]:
result[0].page_content

'5 Chatham House\n02 \nWhat is AI?\nAI has capacity to transform human life\xa0– \nboth\xa0for\xa0better and for worse.\nAI is increasingly present in our lives, and its impact will expand significantly \nin the\xa0coming years. From predictive text, to social media news feeds, to virtual \nhomes and mobile phone voice assistants, AI is already a part of everyday life. \nAI\xa0offers automated translation, assists shoppers buying online and recommends \nthe fastest route on the drive home. It is also a key component of much-debated, \nrapidly developing technologies such as facial recognition and self-driving vehicles.\nThere is no single agreed definition of AI: it is a general term referring to \nmachines’\xa0evolving capacity to take on tasks requiring some form of intelligence. \nThe tasks that AI performs can include generating predictions, making decisions \nand providing recommendations.1 This means that AI may make decisions itself, \nor\xa0provide information for use in human 

##Storing embeddings by using Lance vector DB

In [12]:
from langchain_community.vectorstores import LanceDB
lance_db = LanceDB.from_documents(documents=doc[:20], embedding=OpenAIEmbeddings())

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
query = "what is AI?"
result = faiss_db.similarity_search(query=query)
result

[Document(id='11e627a8-1ea8-45f8-9bb6-3d94f1b80bc2', metadata={'source': './data/AI_Governance.pdf', 'page': 6, 'page_label': '5'}, page_content='5 Chatham House\n02 \nWhat is AI?\nAI has capacity to transform human life\xa0– \nboth\xa0for\xa0better and for worse.\nAI is increasingly present in our lives, and its impact will expand significantly \nin the\xa0coming years. From predictive text, to social media news feeds, to virtual \nhomes and mobile phone voice assistants, AI is already a part of everyday life. \nAI\xa0offers automated translation, assists shoppers buying online and recommends \nthe fastest route on the drive home. It is also a key component of much-debated, \nrapidly developing technologies such as facial recognition and self-driving vehicles.\nThere is no single agreed definition of AI: it is a general term referring to \nmachines’\xa0evolving capacity to take on tasks requiring some form of intelligence. \nThe tasks that AI performs can include generating prediction

In [14]:
result[0].page_content

'5 Chatham House\n02 \nWhat is AI?\nAI has capacity to transform human life\xa0– \nboth\xa0for\xa0better and for worse.\nAI is increasingly present in our lives, and its impact will expand significantly \nin the\xa0coming years. From predictive text, to social media news feeds, to virtual \nhomes and mobile phone voice assistants, AI is already a part of everyday life. \nAI\xa0offers automated translation, assists shoppers buying online and recommends \nthe fastest route on the drive home. It is also a key component of much-debated, \nrapidly developing technologies such as facial recognition and self-driving vehicles.\nThere is no single agreed definition of AI: it is a general term referring to \nmachines’\xa0evolving capacity to take on tasks requiring some form of intelligence. \nThe tasks that AI performs can include generating predictions, making decisions \nand providing recommendations.1 This means that AI may make decisions itself, \nor\xa0provide information for use in human 