In [4]:
from langchain_community.document_loaders import TextLoader

# Data ingestion: read the local speech transcript through LangChain's
# TextLoader and display the loaded result (a list of Document objects).
speech_path = "speech.txt"
loader = TextLoader(speech_path)
text_document = loader.load()
text_document


[Document(page_content='I\'m sorry, but I don\'t want to be an emperor\nThat\'s not my business\nI don\'t want to rule or conquer anyone\nI should like to help everyone if possible\nJew, Gentile, Black Man, White\nWe all want to help one another, human beings are like that\nWe want to live by each other\'s happiness, not by each other\'s misery\nWe don\'t want to hate and despise one another\nAnd this world has room for everyone, and the good Earth is rich and can provide for everyone\nThe way of life can be free and beautiful, but we have lost the way\nGreed has posioned men\'s souls, has barricaded the world with hate, has goose-stepped us into misery and bloodshed\nWe have developed speed, but we have shut ourselves in\nMachinery that gives us abundance has left us in want\nOur knowledge has made us cynincal\nOur cleverness, hard and unkind\nWe think too much, and feel too little\nMore than machinery, we need humanity\nMore that cleverness, we need kindness and gentleness\nWithout t

In [None]:
import os
from dotenv import load_dotenv

# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()

# Validate that the OpenAI credential is available before any later cell needs it.
#
# The previous form, `os.environ[k] = os.getenv(k)`, had two problems:
#   * when the key was missing, os.getenv returned None and the assignment
#     failed with an unhelpful "TypeError: str expected, not NoneType";
#   * when the key was present, it only wrote the same value back to itself.
# An explicit check keeps the fail-fast behaviour but explains what to fix.
# An empty value is treated the same as a missing one.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Define it in a .env file or export it in "
        "the environment before running the embedding cells."
    )

In [17]:
# Web page loader
from langchain_community.document_loaders import WebBaseLoader
import bs4

# Only parse the elements whose CSS class is one of the post's title,
# content, or header classes.
post_sections = bs4.SoupStrainer(
    class_=("post-title", "post-content", "post-header")
)

loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={"parse_only": post_sections},
)

# Fetch the page and display the loaded documents.
text_documents = loader.load()
text_documents


In [23]:
# PDF loader: read the local PDF into `docs` for the splitting step below.

from langchain_community.document_loaders import PyPDFLoader

pdf_path = "alice_in_wonderland.pdf"
loader = PyPDFLoader(pdf_path)
docs = loader.load()

In [25]:
# Split the loaded PDF documents into overlapping chunks and preview the first five.
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter settings kept together so they are easy to tune.
splitter_options = {"chunk_size": 2000, "chunk_overlap": 200}

text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
documents = text_splitter.split_documents(docs)
documents[:5]


[Document(page_content="The Project Gutenberg eBook of Alice's Adventures in W onderland\nThis ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with\nalmost no restrictions whatsoever . You may copy it, give it away or re-use it under the terms of the Project Gutenberg\nLicense included with this ebook or online at www .gutenberg.org  (http://www .gutenberg.org) . If you are not located in the\nUnited States, you will have to check the laws of the country where you are located before using this eBook.\nTitle: Alice's Adventures in W onderland\nAuthor: Lewis Carroll\nRelease date: June 27, 2008 [eBook #1 1] Most recently updated: March 30, 2021\nLanguage: English\nCredits: Arthur DiBianca and David Widger\n_ START OF THE PROJECT GUTENBERG EBOOK ALICE'S ADVENTURES IN WONDERLAND _  [Illustration]\nAlice’ s Adventures in W onderland\nby Lewis Carroll\nTHE MILLENNIUM FULCRUM EDITION 3.0\nContents\nCHAPTER I. Down the Rabbit-Hole C

In [28]:
# Vectorization: embed the chunks and index them in a Chroma vector store.

# Note: OpenAIEmbeddings is imported from langchain_openai here, not from
# langchain_community.embeddings.
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# Only the first 20 chunks are embedded and indexed.
chroma_chunks = documents[:20]
db1 = Chroma.from_documents(documents=chroma_chunks, embedding=OpenAIEmbeddings())

<langchain_community.vectorstores.chroma.Chroma at 0x1ede5447f70>

In [33]:
# Query the Chroma vector store with a similarity search.

query = "Mad hatter saying intersting things"
result = db1.similarity_search(query)

# Show the first document in the returned list in its JSON-style form.
chroma_first_hit = result[0]
chroma_first_hit.to_json()

{'lc': 1,
 'type': 'constructor',
 'id': ['langchain', 'schema', 'document', 'Document'],
 'kwargs': {'page_content': 'whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when\nsuddenly a White Rabbit with pink eyes ran close by her .\nThere was nothing so very remarkable in that; nor did Alice think it so very much out of the way to hear the Rabbit say to\nitself, “Oh dear! Oh dear! I shall be late!” (when she thought it over afterwards, it occurred to her that she ought to have\nwondered at this, but at the time it all seemed quite natural); but when the Rabbit actually took a watch out of its\nwaistcoat-pocket , and looked at it, and then hurried on, Alice started to her feet, for it flashed across her mind that she\nhad never before seen a rabbit with either a waistcoat-pocket, or a watch to take out of it, and burning with curiosity , she\nran across the field after it, and fortunately was just in time to see it pop down a

In [34]:
from langchain_community.vectorstores import FAISS

# Build a FAISS index over the same first 20 chunks, embedded with OpenAI embeddings.
faiss_chunks = documents[:20]
db2 = FAISS.from_documents(documents=faiss_chunks, embedding=OpenAIEmbeddings())


In [35]:
# Query the FAISS vector store with a similarity search.

query = "Mad hatter saying intersting things"
result = db2.similarity_search(query)

# Show the first document in the returned list in its JSON-style form.
faiss_first_hit = result[0]
faiss_first_hit.to_json()

{'lc': 1,
 'type': 'constructor',
 'id': ['langchain', 'schema', 'document', 'Document'],
 'kwargs': {'page_content': 'whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when\nsuddenly a White Rabbit with pink eyes ran close by her .\nThere was nothing so very remarkable in that; nor did Alice think it so very much out of the way to hear the Rabbit say to\nitself, “Oh dear! Oh dear! I shall be late!” (when she thought it over afterwards, it occurred to her that she ought to have\nwondered at this, but at the time it all seemed quite natural); but when the Rabbit actually took a watch out of its\nwaistcoat-pocket , and looked at it, and then hurried on, Alice started to her feet, for it flashed across her mind that she\nhad never before seen a rabbit with either a waistcoat-pocket, or a watch to take out of it, and burning with curiosity , she\nran across the field after it, and fortunately was just in time to see it pop down a