In [None]:
# Data ingestion: read a plain-text file through LangChain's TextLoader.
from langchain_community.document_loaders import TextLoader

# "speech.txt" is a relative path, so it is resolved against the
# kernel's current working directory.
loader = TextLoader(file_path="speech.txt")
text_documents = loader.load()

# Bare last expression so the notebook renders what was loaded.
text_documents

In [None]:
import os
import warnings

from dotenv import load_dotenv

# load_dotenv() reads a local .env file (if one exists) and copies its
# entries into os.environ, so OPENAI_API_KEY does not need to be
# re-assigned by hand afterwards.
load_dotenv()

# Do NOT fall back to an empty string when the key is missing: an empty
# OPENAI_API_KEY hides the misconfiguration and only surfaces later as a
# less obvious error when an OpenAI call is made. Warn here instead, so the
# cells that do not need the key can still run.
if not os.getenv('OPENAI_API_KEY'):
    warnings.warn(
        "OPENAI_API_KEY is not set. Add it to a .env file or export it "
        "before running the embedding / vector-store cells.",
        stacklevel=2,
    )

In [None]:
# Web based loader
import bs4
from langchain_community.document_loaders import WebBaseLoader

# Load the content of the HTML page. The SoupStrainer passed as
# `parse_only` restricts BeautifulSoup to elements whose CSS class is one of
# the three listed below. This cell only loads the page; it does not chunk
# or index anything.
loader = WebBaseLoader(
    web_path="https://lilianweng.github.io/posts/2023-06-23-agent/",
    bs_kwargs={
        "parse_only": bs4.SoupStrainer(
            class_=("post-title", "post-content", "post-header"),
        ),
    },
)
web_document = loader.load()

# Bare last expression so the notebook renders the loaded page.
web_document


In [None]:
# PDF reader: load a local PDF through LangChain's PyPDFLoader.
from langchain_community.document_loaders import PyPDFLoader

# Relative path, resolved against the kernel's current working directory.
loader = PyPDFLoader(file_path='std7-bb-english.pdf')
pdf_document = loader.load()

# Bare last expression so the notebook renders the loaded documents.
pdf_document

In [None]:
# Document chunking: split the loaded PDF documents into overlapping pieces.
from langchain.text_splitter import RecursiveCharacterTextSplitter

# chunk_size / chunk_overlap are measured with the splitter's default
# length function; 200 units of overlap are requested between neighbours.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = text_splitter.split_documents(pdf_document)

# Bare last expression so the notebook renders the resulting chunks.
documents

In [None]:
# Chroma vector store built from OpenAI embeddings.
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OpenAIEmbeddings

# Only the first 10 chunks are embedded and stored here.
# NOTE(review): OpenAIEmbeddings presumably reads OPENAI_API_KEY from the
# environment; confirm the key is configured before running this cell.
db = Chroma.from_documents(
    documents=documents[:10],
    embedding=OpenAIEmbeddings(),
)

# Bare last expression so the notebook shows the store object.
db

In [None]:
# Chroma similarity search

# NOTE(review): the query is an empty placeholder; fill in the text to
# search for, since an empty string cannot yield a meaningful ranking.
query = ""
result = db.similarity_search(query=query)

# Text of the first document returned by the search.
result[0].page_content


In [None]:
# FAISS vector store built from OpenAI embeddings.
from langchain_community.vectorstores import FAISS

# Every chunk in `documents` is embedded here, using a fresh
# OpenAIEmbeddings instance.
db1 = FAISS.from_documents(
    documents=documents,
    embedding=OpenAIEmbeddings(),
)

# Bare last expression so the notebook shows the store object.
db1

In [None]:
# FAISS similarity search

# NOTE(review): the query is an empty placeholder; fill in the text to
# search for, since an empty string cannot yield a meaningful ranking.
query = ""
result1 = db1.similarity_search(query=query)

# Text of the first document returned by the search.
result1[0].page_content