In [1]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import AsyncChromiumLoader
from langchain.document_transformers import Html2TextTransformer
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import nest_asyncio

In [2]:
nest_asyncio.apply()

In [3]:
articles = ["https://docs.python.org/3.13/whatsnew/3.13.html"]

In [4]:
# Scrapes the blogs above
loader = AsyncChromiumLoader(articles)
docs = loader.load()

In [5]:
# Converts HTML to plain text 
html2text = Html2TextTransformer()
docs_transformed = html2text.transform_documents(docs)

In [11]:
# Chunk text
text_splitter = CharacterTextSplitter(chunk_size=500, 
                                      chunk_overlap=0)
chunked_documents = text_splitter.split_documents(docs_transformed)

Created a chunk of size 1698, which is longer than the specified 500
Created a chunk of size 540, which is longer than the specified 500
Created a chunk of size 708, which is longer than the specified 500
Created a chunk of size 662, which is longer than the specified 500
Created a chunk of size 515, which is longer than the specified 500
Created a chunk of size 1698, which is longer than the specified 500


In [12]:
# Load chunked documents into the FAISS index
db = FAISS.from_documents(chunked_documents, 
                          HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2'))


In [10]:
db.search("Python", search_type='mmr')

 Document(page_content='### tkinter¶', metadata={'source': 'https://docs.python.org/3.13/whatsnew/3.13.html'}),
 Document(page_content='* index\n  * modules |\n  * next |\n  * previous |\n  *   * Python »\n  * EnglishSpanishFrenchJapaneseKoreanBrazilian PortugueseTurkishSimplified ChineseTraditional Chinese', metadata={'source': 'https://docs.python.org/3.13/whatsnew/3.13.html'}),
 Document(page_content='* Python 3.13 and later have two years of full support, followed by three years of security fixes.', metadata={'source': 'https://docs.python.org/3.13/whatsnew/3.13.html'})]